<a href="https://colab.research.google.com/github/feniltailor22/Natural-Language-Processing/blob/main/Spam_SMS_Classification_Using_BERT_%26_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

!pip install tensorflow_text
import tensorflow_text as tf_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 12.4 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3


In [2]:
df= pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.groupby(by='label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [5]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
747/4825

0.15481865284974095

In [7]:
#15% spam emails, 85% ham emails: This indicates class imbalance

In [8]:
#To handle the class imbalance we are doing Down Sampling here.
#That is reducing the Ham Messages to 747 and descarding the rest of messages.

In [9]:
df_spam= df[df['label']=='spam']

In [10]:
df_spam.head()

Unnamed: 0,label,message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [11]:
df_spam.shape

(747, 2)

In [12]:
df_ham= df[df['label']=='ham']
df_ham.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [13]:
df_ham.shape

(4825, 2)

In [14]:
df_ham_downsampled= df_ham.sample(df_spam.shape[0])
#df_spam.shape[0]= 747

In [15]:
df_ham_downsampled.head()

Unnamed: 0,label,message
2232,ham,"K, wen ur free come to my home and also tel vi..."
5344,ham,When you guys planning on coming over?
3608,ham,I have no idea where you are
2933,ham,Only 2% students solved this CAT question in '...
1552,ham,In e msg jus now. U said thanks for gift.


In [16]:
df_ham_downsampled.shape

(747, 2)

In [17]:
df_balanced= pd.concat([df_spam, df_ham_downsampled])

In [18]:
df_balanced.shape

(1494, 2)

In [19]:
df_balanced['label'].describe()

count     1494
unique       2
top        ham
freq       747
Name: label, dtype: object

In [20]:
df_balanced['label'].value_counts()

ham     747
spam    747
Name: label, dtype: int64

In [21]:
df_balanced['spam']= df_balanced['label'].apply(lambda x:1 if x=='spam' else 0)

In [22]:
df_balanced

Unnamed: 0,label,message,spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1
...,...,...,...
1588,ham,"Dont search love, let love find U. Thats why i...",0
5544,ham,"I'm taking derek &amp; taylor to walmart, if I...",0
5040,ham,Pls clarify back if an open return ticket that...,0
4978,ham,"A boy was late 2 home. His father: ""POWER OF F...",0


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced['message'], df_balanced['spam'], stratify=df_balanced['spam'])

#stratify= data is split in a stratified fashion, using this as the class labels

In [24]:
#Now lets import BERT model and get embeding vectors for few sample statements

In [25]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [26]:
X_train.head()

2670    we tried to contact you re your response to ou...
2823    ROMCAPspam Everyone around should be respondin...
4834    New Mobiles from 2004, MUST GO! Txt: NOKIA to ...
1683    HI BABE U R MOST LIKELY TO BE IN BED BUT IM SO...
2032    I noe la... U wana pei bf oso rite... K lor, o...
Name: message, dtype: object

In [27]:
#Creating a function that takes a sentence as an input and returns 768 size vector as an output.

def get_sentence_embedding(sentences):
  preprocess_text= bert_preprocess(sentences)
  return bert_encoder(preprocess_text)['pooled_output'] #word embedding for entire sentence 

In [28]:
get_sentence_embedding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435166 , -0.5132724 , -0.88845706, ..., -0.7474883 ,
        -0.7531471 ,  0.91964483],
       [-0.8720836 , -0.50544   , -0.9444667 , ..., -0.8584748 ,
        -0.71745366,  0.88082993]], dtype=float32)>

In [29]:
#Get embeding vectors for few sample words. Compare them using cosine similarity

In [30]:
e = get_sentence_embedding([
    "banana", 
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates"
])

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
cosine_similarity([e[0]],[e[1]])

array([[0.99110895]], dtype=float32)

In [33]:
#Values near to 1 means they are similar. 0 means they are very different. Above you can use comparing "banana" vs "grapes" you get 0.99 similarity as they both are fruits

In [34]:
cosine_similarity([e[3]],[e[4]])

array([[0.9872036]], dtype=float32)

In [35]:
#Jeff bezos and Elon musk are more similar then Jeff bezos and banana as indicated above

In [36]:
cosine_similarity([e[0]],[e[3]])

array([[0.8470383]], dtype=float32)

In [37]:
#Comparing banana with jeff bezos you still get 0.84 but it is not as close as 0.99 that we got with grapes.

In [38]:
#Build Model

#There are two types of models you can build in tensorflow.
# (1) Sequential (2) Functional

# below we will build functional model

In [39]:
# Bert layers
text_input= tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocess_text =bert_preprocess(text_input)
outputs =bert_encoder(preprocess_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [40]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [41]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [42]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f52d75d2410>

In [43]:
model.evaluate(X_test, y_test)



[0.27335962653160095,
 0.9117646813392639,
 0.8811880946159363,
 0.9518716335296631]

In [44]:
y_predicted = model.predict(X_test)
y_predicted

array([[0.9646564 ],
       [0.979935  ],
       [0.9017511 ],
       [0.9525932 ],
       [0.86457944],
       [0.9709551 ],
       [0.12718144],
       [0.86304724],
       [0.9366654 ],
       [0.03119833],
       [0.6926738 ],
       [0.9400277 ],
       [0.1100765 ],
       [0.11097459],
       [0.48077586],
       [0.24476038],
       [0.11124694],
       [0.9167892 ],
       [0.30973598],
       [0.9448888 ],
       [0.900297  ],
       [0.95106757],
       [0.87551576],
       [0.012553  ],
       [0.03685003],
       [0.66998655],
       [0.7722783 ],
       [0.8199729 ],
       [0.92830986],
       [0.89065987],
       [0.93163353],
       [0.86623895],
       [0.9028459 ],
       [0.8729961 ],
       [0.02319634],
       [0.06123212],
       [0.3601931 ],
       [0.6190118 ],
       [0.6625245 ],
       [0.9379399 ],
       [0.8601962 ],
       [0.280581  ],
       [0.82447785],
       [0.8899642 ],
       [0.5535648 ],
       [0.3781192 ],
       [0.1599547 ],
       [0.066

In [45]:
y_predicted = y_predicted.flatten()
y_predicted

array([0.9646564 , 0.979935  , 0.9017511 , 0.9525932 , 0.86457944,
       0.9709551 , 0.12718144, 0.86304724, 0.9366654 , 0.03119833,
       0.6926738 , 0.9400277 , 0.1100765 , 0.11097459, 0.48077586,
       0.24476038, 0.11124694, 0.9167892 , 0.30973598, 0.9448888 ,
       0.900297  , 0.95106757, 0.87551576, 0.012553  , 0.03685003,
       0.66998655, 0.7722783 , 0.8199729 , 0.92830986, 0.89065987,
       0.93163353, 0.86623895, 0.9028459 , 0.8729961 , 0.02319634,
       0.06123212, 0.3601931 , 0.6190118 , 0.6625245 , 0.9379399 ,
       0.8601962 , 0.280581  , 0.82447785, 0.8899642 , 0.5535648 ,
       0.3781192 , 0.1599547 , 0.06650181, 0.10281222, 0.7636468 ,
       0.06170337, 0.9395171 , 0.96429074, 0.06323265, 0.7297861 ,
       0.10847269, 0.04620627, 0.9647804 , 0.29735985, 0.98758674,
       0.8875626 , 0.40787077, 0.95139825, 0.94649327, 0.84868014,
       0.82262355, 0.31835884, 0.5107794 , 0.9622724 , 0.9395088 ,
       0.16475621, 0.63935804, 0.95842564, 0.941246  , 0.19556

In [46]:
import numpy as np

y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,

In [47]:
from sklearn.metrics import confusion_matrix, classification_report

In [48]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91       187
           1       0.88      0.95      0.92       187

    accuracy                           0.91       374
   macro avg       0.91      0.91      0.91       374
weighted avg       0.91      0.91      0.91       374



In [49]:
print(confusion_matrix(y_test, y_predicted))

[[163  24]
 [  9 178]]


In [50]:
#Inference

In [51]:
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]

In [52]:
model.predict(reviews)

array([[0.80428255],
       [0.89321536],
       [0.8488006 ],
       [0.25549832],
       [0.12810624]], dtype=float32)