In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_text as text

from sklearn.metrics.pairwise import cosine_similarity

from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.metrics import Precision, BinaryAccuracy, Recall

In [None]:
# Load the raw SMS spam dataset and preview its first rows.
SPAM_CSV_PATH = "/content/spam.csv"
df = pd.read_csv(SPAM_CSV_PATH)
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

(5572, 2)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Class distribution of the raw data (most frequent label first).
df.loc[:, 'Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Separate the rows by class label so each class can be sized on its own.
df_ham = df.loc[df['Category'].eq('ham')]
df_spam = df.loc[df['Category'].eq('spam')]

In [None]:
# (rows, columns) of the ham subset, then of the spam subset.
for class_frame in (df_ham, df_spam):
    print(class_frame.shape)

(4825, 2)
(747, 2)


In [None]:
# Oversample the minority class (spam) with replacement until it has as many
# rows as the ham class. The draw is seeded (same seed as the train/test split)
# so the balanced dataset -- and every metric computed from it -- is
# reproducible after a kernel restart.
df_spam_balance = df_spam.sample(n=len(df_ham), replace=True, random_state=1)

In [None]:
# Dimensions of the oversampled spam frame as a (rows, columns) tuple.
df_spam_balance.shape

(4825, 2)

In [None]:
# Stack the oversampled spam rows on top of the ham rows to form the
# class-balanced dataset, then show its dimensions.
frames_to_stack = [df_spam_balance, df_ham]
dataset = pd.concat(frames_to_stack, axis=0)
dataset.shape

(9650, 2)

In [None]:
# Class distribution after balancing.
dataset.loc[:, 'Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
spam,4825
ham,4825


In [None]:
# Preview the first five rows of the balanced dataset.
dataset.head(n=5)

Unnamed: 0,Category,Message
3571,spam,Customer Loyalty Offer:The NEW Nokia6650 Mobil...
839,spam,We tried to contact you re our offer of New Vi...
947,spam,Ur cash-balance is currently 500 pounds - to m...
4154,spam,URGENT!! Your 4* Costa Del Sol Holiday or £500...
5147,spam,Get your garden ready for summer with a FREE s...


In [None]:
# Label encoding: ham -> 0, spam -> 1.
# A vectorised comparison replaces the per-row Python lambda; any category
# other than 'spam' still maps to 0 and the column stays int64.
dataset['Mail'] = dataset['Category'].eq('spam').astype('int64')

In [None]:
# Preview the first five rows, now including the numeric 'Mail' label.
dataset.head(n=5)

Unnamed: 0,Category,Message,Mail
3571,spam,Customer Loyalty Offer:The NEW Nokia6650 Mobil...,1
839,spam,We tried to contact you re our offer of New Vi...,1
947,spam,Ur cash-balance is currently 500 pounds - to m...,1
4154,spam,URGENT!! Your 4* Costa Del Sol Holiday or £500...,1
5147,spam,Get your garden ready for summer with a FREE s...,1


In [None]:
# Distribution of the encoded label.
dataset.loc[:, 'Mail'].value_counts()

Unnamed: 0_level_0,count
Mail,Unnamed: 1_level_1
1,4825
0,4825


In [None]:
# Split the data into train & test.
#
# `dataset` contains spam rows that were duplicated with replacement. Splitting
# it directly would place copies of the same message on both sides of the
# split, so the model would be validated on text it has already been trained
# on and every reported metric would be inflated. To avoid that leakage:
#   1. keep one row per distinct message,
#   2. do a stratified split so both partitions keep the same class ratio,
#   3. re-balance the classes inside the training partition only.
# The test partition keeps the natural class distribution.
unique_messages = dataset.drop_duplicates(subset='Message')
train_part, test_part = train_test_split(
    unique_messages,
    test_size=0.25,
    random_state=1,
    stratify=unique_messages['Mail'],
)

# Oversample every class that is smaller than the largest one (seeded for
# reproducibility), then shuffle so the classes are interleaved.
majority_size = train_part['Mail'].value_counts().max()
balanced_parts = [
    group if len(group) == majority_size
    else group.sample(n=majority_size, replace=True, random_state=1)
    for _, group in train_part.groupby('Mail')
]
train_balanced = pd.concat(balanced_parts).sample(frac=1, random_state=1)

x_train, y_train = train_balanced['Message'], train_balanced['Mail']
x_test, y_test = test_part['Message'], test_part['Mail']

In [None]:
# Sizes of the training and test feature sets, in that order.
for split_features in (x_train, x_test):
    print(split_features.shape)

(7237,)
(2413,)


#### BERT Model

In [None]:
# TF-Hub handles for the BERT preprocessing model and the matching encoder.
BERT_PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both models as Keras layers: the preprocessor first, then the encoder.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_model = hub.KerasLayer(BERT_ENCODER_HANDLE)

In [None]:
# Example on how BERT works
def sentence_embeddings(sentences):
  """Return the BERT pooled-output embedding for a batch of sentences.

  Args:
    sentences: batch of strings, passed as-is to the `bert_preprocess` layer.

  Returns:
    The 'pooled_output' entry of the result of calling `bert_model` on the
    preprocessed batch.
  """
  encoder_inputs = bert_preprocess(sentences)
  encoder_outputs = bert_model(encoder_inputs)
  return encoder_outputs['pooled_output']

demo_sentences = [
  'Germany, officially the Federal Republic of Germany, is a country in Central Europe',
  'It lies between the Baltic and North Sea to the north and the Alps to the south.',
]
print(sentence_embeddings(demo_sentences))

# Get embedding vectors for a few sample words
samples = sentence_embeddings(['Germany', 'Italy', 'France', 'Mercedes', 'Ford', 'Benz', 'Shoe', 'Shirt', 'Trump', 'Biden'])

# Cosine similarity for selected pairs of samples (indices into `samples`):
# Germany/Italy, Germany/Mercedes, Mercedes/Benz, Trump/Biden.
index_pairs = [(0, 1), (0, 3), (3, 5), (-2, -1)]
for position, (first, second) in enumerate(index_pairs):
  if position > 0:
    print("**")
  print(cosine_similarity([samples[first]], [samples[second]]))

tf.Tensor(
[[-0.74103254 -0.0109413  -0.38677603 ... -0.37466368 -0.24527836
   0.37626195]
 [-0.9370649  -0.49418983 -0.91205996 ... -0.85579956 -0.6421839
   0.91424906]], shape=(2, 768), dtype=float32)
[[0.9965784]]
**
[[0.79787934]]
**
[[0.99600446]]
**
[[0.98565125]]


#### Building Transformer Deep Learning Model

In [None]:
# Build model: raw text -> BERT preprocessing -> BERT encoder.
text_input = Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_model(preprocessed_text)

# Classification head (Functional API): dropout on the pooled sentence
# embedding, followed by a single sigmoid unit.
dropout_layer = Dropout(rate=0.1, name='Dropout')
output_layer = Dense(units=1, activation='sigmoid', name='output')
x1 = dropout_layer(outputs['pooled_output'])
x2 = output_layer(x1)

model = Model(inputs=[text_input], outputs=[x2])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer_2 (KerasLayer)  {'input_type_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                            

In [None]:
# Report accuracy, precision and recall alongside the loss during training.
metrics = [
    BinaryAccuracy(name='accuracy'),
    Precision(name='precision'),
    Recall(name='recall'),
]
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=metrics,
)

In [None]:
# Train for 5 epochs in batches of 32, evaluating on the held-out split after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5