In [80]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [81]:
import pandas as pd

# Absolute location of the spam spreadsheet on the author's machine; edit this
# single constant when running the notebook somewhere else.
SPAM_DATA_PATH = "C:/Users/Dickson/Downloads/Data Science tutorials/spam_data.xlsx"

# Read the workbook into a DataFrame and preview its first rows.
df = pd.read_excel(SPAM_DATA_PATH)
df.head()

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [82]:
# Remove the three 'Unnamed' columns (they appear empty in the preview above).
# `columns=` names the axis explicitly; the original passed `axis=True`, which
# only worked because True compares equal to 1.
cols_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df1 = df.drop(columns=cols_to_drop)
df1

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [83]:
# Explore the data: per-category summary of the messages (count, unique, top, freq).
df1.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [84]:
# Encode the label numerically in a new 'spam' column:
# 1 where Category is 'spam', 0 for every other value.
df1['spam'] = df1['Category'].eq('spam').astype('int64')
df1.head(5)


Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [105]:
# Split into train and test sets (80% / 20%). `stratify` keeps the spam/ham ratio
# the same in both parts.
# `random_state` pins the shuffle so re-running this cell reproduces the same split.
# Without it, re-executing the cell after training reshuffles the rows, so messages
# the model was trained on can land in X_test and inflate the evaluation score.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df1['Message'],
    df1['spam'],
    test_size=0.2,
    stratify=df1['spam'],
    random_state=42,
)

# Class counts in the training split (0 = ham, 1 = spam).
y_train.value_counts()


0    3859
1     598
Name: spam, dtype: int64

In [86]:
# Class counts in the test split (0 = ham, 1 = spam).
y_test.value_counts()

0    966
1    149
Name: spam, dtype: int64

In [87]:
# Load the two pretrained BERT layers from TensorFlow Hub (this downloads the models):
# a text-preprocessing layer and the uncased L-12 / H-768 / A-12 encoder.
# NOTE(review): the original note was cut off after "wikipedia and bo..."; it presumably
# referred to the Wikipedia + BooksCorpus pre-training data - confirm on the model page.
# These layers are used below to generate the embedding vectors.
bert_preprocess = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [88]:
# Helper that turns a batch of sentences into BERT embedding vectors.
def get_sentence_embending(sentences):
    """Return the BERT 'pooled_output' embedding for each input sentence.

    The sentences are first run through the module-level `bert_preprocess`
    layer and the result is fed to the module-level `bert_encoder` layer.

    Args:
        sentences: batch of sentences, e.g. a list of strings.

    Returns:
        The 'pooled_output' entry of the encoder's output mapping (in the
        recorded run: a float32 tensor with one 768-wide row per sentence).
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

In [89]:
# Quick check on two sample sentences; the result holds one embedding row per sentence.
get_sentence_embending([
    '500$ discount. hurry up',
    'Bhavin, are you up for a volleyball game tomorrow?'
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351707, -0.5132728 , -0.8884573 , ..., -0.74748874,
        -0.7531474 ,  0.91964495],
       [-0.82059705, -0.52094984, -0.9524536 , ..., -0.88041943,
        -0.70536864,  0.8252004 ]], dtype=float32)>

In [90]:
# Simple demonstration of how related words end up with similar embeddings:
# embed three fruits (indices 0-2) and three people (indices 3-5).
e = get_sentence_embending([
    'banana',
    'grapes',
    'mango',
    'jeff bezos',
    'elon musk',
    'bill gates' 
])

In [91]:
# Use cosine similarity to compare two embeddings: e[0] ('banana') vs e[1] ('grapes').
# Each embedding is wrapped in a list so that it is passed as a one-row 2-D input.
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([e[0]],[e[1]])

array([[0.9911088]], dtype=float32)

In [92]:
# Build the classifier with the Keras functional API (as opposed to a Sequential model).

# BERT part: raw strings -> preprocessing layer -> encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the pooled sentence embedding:
# 10% dropout followed by a single sigmoid unit.
pooled_embedding = outputs['pooled_output']
regularized = tf.keras.layers.Dropout(0.1, name='dropout')(pooled_embedding)
spam_probability = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(regularized)

# Tie input and output together and print the layer overview.
model = tf.keras.Model(inputs=[text_input], outputs=[spam_probability])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_3 (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [93]:
# Compile the model: Adam optimizer, binary cross-entropy loss (the model ends in a
# single sigmoid unit) and accuracy as the reported metric.
model.compile(optimizer ='adam',
              loss = 'binary_crossentropy',
              metrics =['accuracy'])

In [94]:
# Train the model on the raw message strings for 5 epochs.
# No NumPy float32 cast of the inputs is needed (an earlier, disabled attempt tried one):
# the model's input layer is declared with dtype tf.string.
model.fit(X_train, y_train, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x286258cca60>

In [106]:
# Evaluate on the test split; with the compile settings above this returns [loss, accuracy].
# NOTE(review): the execution counters show the split cell was re-run (In [105]) after
# training (In [94]) and just before this cell (In [106]). That split is unseeded, so this
# X_test presumably overlaps the rows the model was trained on - treat the score as optimistic.
model.evaluate(X_test, y_test)



[0.14789451658725739, 0.9372197389602661]

In [109]:
# Predict on a few hand-picked messages; each output is the sigmoid score for "spam".
# NOTE(review): in the recorded run every score is below 0.03, including the first
# message, which matches a row labelled spam in the dataset preview - the model trained
# on the imbalanced data appears to under-detect spam.
reviews = [
    'Free entry in 2 a wkly comp to win FA Cup final tkts ',
    'WINNER!! As a valued network customer you have ',
    'Hey Sam, are you coming for a cricket game tomorrow?',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
model.predict(reviews)

array([[0.02360621],
       [0.01251957],
       [0.02150384],
       [0.01095766]], dtype=float32)

In [110]:
# ---------------------------------------------------
# Second part, following "Text Classification Using BERT & Tensorflow" (detailed version).
# Collect every row labelled 'spam' into its own DataFrame and report its size.
df_spam = df1.loc[df1['Category'].eq('spam')]
df_spam.shape

(747, 3)

In [111]:
# Likewise, collect every row labelled 'ham' and report its size.
df_ham = df1.loc[df1['Category'].eq('ham')]
df_ham.shape

(4825, 3)

In [112]:
# Downsample the ham rows to the number of spam rows so both classes are the same size.
# `random_state` fixes which ham rows are drawn, so the balanced dataset (and every
# result derived from it) is the same on each re-run.
df_ham_downsampled = df_ham.sample(df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

# Both frames now have the same number of rows (747 in the recorded run).

(747, 3)

In [113]:
# Concatenate the spam rows and the downsampled ham rows into one balanced dataset.
df_balanced = pd.concat([df_spam, df_ham_downsampled])
df_balanced.shape

(1494, 3)

In [114]:
# Check the class counts of the balanced dataset: both categories should be equal.
df_balanced['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [115]:
# Peek at five random rows of the balanced dataset (unseeded, so the rows differ per run).
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
2652,ham,No need for the drug anymore.,0
3835,ham,Then Ì_ come n pick me at 530 ar?,0
4158,ham,Now got tv 2 watch meh? U no work today?,0
175,ham,Let me know when you've got the money so carlo...,0
1262,spam,"Hungry gay guys feeling hungry and up 4 it, no...",1


In [117]:
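#Re-encoding Category into a numeric 'spam' column is not needed here: df_balanced
#already carries the 'spam' column that was created on df1 (see the sample above).
#The original code is kept below, disabled, for reference.
#df_balanced['spam'] = df_balanced['Category'].apply(lambda x: 1 if x == 'spam' else 0)
#df_balanced.sample(5)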

In [118]:
# Split the balanced data into train and test sets (default 75% / 25%). `stratify`
# keeps the spam/ham ratio the same in both parts.
# `random_state` pins the shuffle so re-running this cell reproduces the same split
# and never moves already-trained-on messages into X_test.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

# Class counts in the training split (0 = ham, 1 = spam).
y_train.value_counts()
#y_test.value_counts()

0    560
1    560
Name: spam, dtype: int64

In [None]:
#The BERT preprocessing and encoder layers were already loaded from TensorFlow Hub earlier in this notebook,
#so the two lines below stay disabled; re-enable them only when running this second part on its own.
#bert_preprocess = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
#bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [123]:
# Function that takes a batch of sentences and returns one embedding vector per sentence.
# NOTE(review): this repeats the identical definition from earlier in the notebook;
# the later definition silently replaces the earlier one.
def get_sentence_embending(sentences):
    """Embed sentences with BERT and return the encoder's 'pooled_output'.

    Args:
        sentences: batch of sentences, e.g. a list of strings.

    Returns:
        The 'pooled_output' entry produced by `bert_encoder` for the
        `bert_preprocess`-ed sentences (768 values per sentence in the
        recorded run).
    """
    model_inputs = bert_preprocess(sentences)
    encoded = bert_encoder(model_inputs)
    return encoded['pooled_output']

# Quick check on two sample sentences; the result holds one embedding row per sentence.
get_sentence_embending([
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleyball game tomorrow?"
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351707, -0.5132728 , -0.8884573 , ..., -0.74748874,
        -0.7531474 ,  0.91964495],
       [-0.82059705, -0.52094984, -0.9524536 , ..., -0.88041943,
        -0.70536864,  0.8252004 ]], dtype=float32)>

In [124]:
# Try the function on a few words: three fruits (indices 0-2) and three people (indices 3-5).
e = get_sentence_embending([
    'banana',
    'grapes',
    'mango',
    'jeff bezos',
    'elon musk',
    'bill gates' 
])

In [125]:
# Use cosine similarity to compare two embeddings: e[0] ('banana') vs e[1] ('grapes').
# Each embedding is wrapped in a list so that it is passed as a one-row 2-D input.
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([e[0]],[e[1]])

array([[0.9911088]], dtype=float32)

In [127]:
# Compare e[0] ('banana') with e[4] ('elon musk'); in the recorded run this pair
# scores lower than banana vs grapes.
cosine_similarity([e[0]],[e[4]])

array([[0.89336264]], dtype=float32)

In [131]:
# Back to building the classification model, again with the Keras functional API
# (as opposed to a Sequential model).

# BERT layers: raw strings -> preprocessing layer -> encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers: 10% dropout on the pooled sentence embedding,
# then a single sigmoid unit.
pooled_embedding = outputs['pooled_output']
regularized = tf.keras.layers.Dropout(0.1, name='dropout')(pooled_embedding)
spam_probability = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(regularized)

# Construct the final model and print the layer overview.
model = tf.keras.Model(inputs=[text_input], outputs=[spam_probability])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_3 (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [134]:
# Compile the model, this time tracking precision and recall alongside accuracy.
METRICS =  [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]

# Same optimizer and loss as the first model; only the metric list differs.
model.compile(optimizer ='adam',
              loss = 'binary_crossentropy',
              metrics = METRICS)

In [135]:
# Train the model on the balanced training split for 5 epochs.
model.fit(X_train, y_train, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2865ebf3460>

In [None]:
# Evaluate the model on the balanced test split; reports the loss followed by the
# compiled metrics (accuracy, precision, recall).
model.evaluate(X_test,y_test)


In [None]:
# Predict spam scores for the test messages. `predict` returns one row per message;
# flatten to a 1-D array so it can be compared element-wise with y_test.
# The misspelled intermediate `y_predictd` is dropped: it differed from
# `y_predicted` by a single letter and was easy to mix up.
y_predicted = model.predict(X_test).flatten()

In [None]:
# y_predicted holds sigmoid scores (floats), so turn them into hard labels:
# above 0.5 -> 1 (spam), otherwise 0 (ham).
# The original compared against 0.05, contradicting its own comment and labelling
# almost every message with a non-trivial score as spam.
import numpy as np
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

In [None]:
# Build the confusion matrix from the true labels (y_test) and the thresholded
# predictions (y_predicted).
# The package is `sklearn`; the original `sk.learn.metrics` is not an importable module,
# which also left `classification_report` undefined for the report cell further down.
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_predicted)
cm

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn

# Draw the confusion matrix as an annotated heatmap with integer cell labels,
# then label the axes through the returned Axes object.
ax = sn.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

In [None]:
# Print the classification report for the thresholded predictions against y_test.
print(classification_report(y_test, y_predicted))

In [None]:
# Score a few hand-written messages with the model trained on the balanced data;
# each output is the sigmoid score for "spam".
reviews = [
    'Enter a chance to win %5000, hurry up, offer valid untill march 31, 2021',
    'You are awarded a SiPix Digital camera! call 090661221061 from landline. Delivery within 28 days. T Cs Box177. M2211BP',
    'Free entry in 2 a wkly comp to win FA Cup final tkts ',
    'WINNER!! As a valued network customer you have ',
    'Hey Sam, are you coming for a cricket game tomorrow?',
    "Why don't you wait 'til at least wednesday to see if you get your."
]
model.predict(reviews)