In [2]:
# A dependency of the preprocessing model
!pip install "tensorflow-text"

Collecting tensorflow-text
  Downloading tensorflow_text-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<2.14,>=2.13.0 (from tensorflow-text)
  Downloading tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (524.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.1/524.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting keras<2.14,>=2.13.1 (from tensorflow<2.14,>=2.13.0->tensorflow-text)
  Downloading keras-2.13.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.14,>=2.13 (from tensorflow<2.14,>=2.13.0->tensorflow-text)
  Downloading tensorboard-2.13.0-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 

In [3]:
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow as tf
from tensorflow import keras

In [4]:
import pandas as pd

In [5]:
import sys

print("Python Version:", sys.version)
print("Python Version Info:", sys.version_info)
print(tf.__version__)
print(hub.__version__)
print(text.__version__)


Python Version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
Python Version Info: sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)
2.13.0
0.14.0
2.13.0


# import dataset

In [8]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.shape

(5572, 2)

In [11]:
df['is_spam'] = df.Category.apply(lambda cat : 1 if cat == 'spam' else 0)
df['is_spam'].value_counts()

0    4825
1     747
Name: is_spam, dtype: int64

In [12]:
df.drop('Category' , axis = 1 , inplace = True)

In [13]:
df.head()

Unnamed: 0,Message,is_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
print('fraction of spam classes : ' , 747 / (4825 + 747) )
print('fraction of not spam classes : ' , 4825 / (4825 + 747) )


fraction of spam classes :  0.13406317300789664
fraction of not spam classes :  0.8659368269921034


***Target class is imbalanced and majority class is not spam i.e 0***


*   Handling imbalanced data with Random Oversampling technique
* majority class : 0
* minorty class : 1



In [21]:
df_class_1 = df[df.is_spam == 1]
df_class_0 = df[df.is_spam == 0]

In [22]:
df_1_over = df_class_1.sample( len(df_class_0)   , replace = True)

In [23]:
df_oversampled = pd.concat( [ df_1_over , df_class_0] , axis = 0)
df_oversampled.is_spam.value_counts()

1    4825
0    4825
Name: is_spam, dtype: int64

In [27]:
X = df_oversampled['Message']
y = df_oversampled['is_spam']

In [29]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(X , y, test_size=0.3 ,random_state=25 , stratify = y )
x_train.shape , x_test.shape

((6755,), (2895,))

# Preprocess text with Bert

In [30]:
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [32]:
bert_preprocess_model = hub.KerasLayer(preprocess_url)
bert_encode_model = hub.KerasLayer(encoder_url)


In [58]:
sentences = [
    "banana",
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates" ,
      'virat',
      'dhoni',
        'ronaldo' ,
            'messi',
            ]

sen_dic = {}
i = 0
for sen in sentences:
    sen_dic[sen] = i
    i = i + 1

In [39]:
def get_bert_model_output( text ):
    text_preprocessed =  bert_preprocess_model(text)
    bert_results = bert_encode_model(text_preprocessed)
    return bert_results['pooled_output']


In [59]:
output = get_bert_model_output(sentences)

In [60]:
output.shape

TensorShape([10, 768])

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

def get_prediction(wv1 , wv2):
  return cosine_similarity([wv1],[wv2])[0][0]

In [65]:
get_prediction( output[sen_dic['elon musk']] , output[sen_dic['bill gates']] )

0.9164165

# Build Model
There are two types of models you can build in tensorflow.

* (1) Sequential
* (2) Functional

So far we have built sequential model. But below we will build functional model. More information on these two is here: https://becominghuman.ai/sequential-vs-functional-model-in-keras-20684f766057

In [92]:
# Bert layers
input_layer = tf.keras.layers.Input(shape=( ), dtype=tf.string, name='text')
outputs = get_bert_model_output(input_layer)

In [93]:
# Neural network layers
dropout_layer = tf.keras.layers.Dropout(0.5, name="dropout")(outputs)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(dropout_layer)

In [94]:
# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[input_layer], outputs = [output_layer])

In [95]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer_2 (KerasLayer)  {'input_type_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_word_ids': (None,                                            
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             }                                                              

In [101]:
Metrics = [
    tf.keras.metrics.BinaryAccuracy( name = 'accuracy'),
    tf.keras.metrics.Precision( name = 'precision'),
    tf.keras.metrics.Recall( name = 'recall'),
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=Metrics)

In [103]:
model.fit(x_train, y_train, epochs=1)

 17/212 [=>............................] - ETA: 56:06 - loss: 0.7795 - accuracy: 0.5202 - precision: 0.5068 - recall: 0.4242

KeyboardInterrupt: ignored

In [None]:
model.evaluate(x_test, y_test)

In [None]:
reviews = [
    'Reply to win Â£100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
model.predict(reviews)