# **BERT Tutorial**

In [None]:
!pip3 install --quiet "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 26.8 MB/s 
[K     |████████████████████████████████| 497.9 MB 4.4 kB/s 
[K     |████████████████████████████████| 1.4 MB 51.7 MB/s 
[K     |████████████████████████████████| 462 kB 47.4 MB/s 
[K     |████████████████████████████████| 5.8 MB 59.0 MB/s 
[?25h

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Location of the labelled message dataset (columns: label, text).
SPAM_DATA_URL = 'https://raw.githubusercontent.com/alexvaroz/data_science_alem_do_basico/master/spamdata.csv'

# Read the CSV straight from the URL and preview the first rows.
df = pd.read_csv(SPAM_DATA_URL)
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class distribution of the label column (shows how imbalanced the data is).
df['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [None]:
# Stratified split so both partitions keep the original label ratio.
# random_state is fixed so the split (and every metric computed from it)
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    stratify=df['label'],
    random_state=42,
)

In [None]:
# TF-Hub handles for the uncased English BERT preprocessing model and encoder.
BERT_PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
BERT_ENCODER_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# Wrap both hub models as Keras layers so they can be used in a Keras model.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)

In [None]:
# Scalar string input: one raw message per example.
text_input = tf.keras.layers.Input(
    shape=(),
    dtype=tf.string,
    name='text',
)
# Preprocess the raw text, then run the resulting tensors through the encoder.
encoder_inputs = bert_preprocess(text_input)
outputs = bert_encoder(encoder_inputs)

In [None]:
# Classification head on the encoder's pooled output:
# dropout (rate 0.1) followed by a single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")

l = dropout_layer(outputs['pooled_output'])
l = output_layer(l)

In [None]:
# Assemble the end-to-end model: raw text in, sigmoid score out.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[l],
)

In [None]:
# Print the layer-by-layer summary of the model: layer name/type, output
# shape, parameter count and incoming connections.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [None]:
# Metrics reported during training and evaluation. Precision and recall are
# tracked alongside accuracy because the labels are imbalanced.
accuracy_metric = tf.keras.metrics.BinaryAccuracy(name='accuracy')
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')
METRICS = [accuracy_metric, precision_metric, recall_metric]

# Binary classification: Adam optimizer with binary cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)

In [None]:
# Train for 4 epochs. The returned History object is kept so the per-epoch
# loss/metric values stay available (history.history); assigning it also
# avoids leaving a bare `<keras.callbacks.History ...>` repr as cell output.
history = model.fit(X_train, y_train, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f2f4041d510>

In [None]:
# Evaluate on the held-out split. return_dict=True labels each value
# (loss, accuracy, precision, recall) instead of returning an anonymous list
# whose order the reader has to infer from the compile() call.
model.evaluate(X_test, y_test, return_dict=True)



[0.1337115317583084, 0.9605168700218201, 0.9342105388641357, 0.759358286857605]

In [None]:
# Sigmoid scores for the test messages, flattened into a 1-D array.
y_predicted = model.predict(X_test).flatten()

In [None]:
import numpy as np

# Turn scores into hard labels: 1 when the score is above 0.5, otherwise 0.
above_threshold = y_predicted > 0.5
y_predicted = np.where(above_threshold, 1, 0)
y_predicted

array([0, 0, 1, ..., 1, 0, 0])

In [None]:
# Hand-written messages used as a quick sanity check of the trained model.
sample_dataset = [
    'You can win alot of money, register in the link below',
    'You have an iphone 10, spin the image below to claim your prize and it willl be delivered in your door step',
    'You have an offer, the company will give you 50% off in every item purchased.',
    'Hey Bravin, dont be late for the meeting tomorrow, it will start at exactly 10:30am',
    'See you monday, we have alot to talk about the future of this company .',
]

# One sigmoid score per message.
sample_scores = model.predict(sample_dataset)
sample_scores

array([[0.49847984],
       [0.5758853 ],
       [0.21482512],
       [0.26953387],
       [0.03625413]], dtype=float32)

## Balanceando o dataset (balancing the dataset)

In [None]:
# Balance the classes by undersampling: keep every label-1 (spam) row and draw
# an equally sized random subset of the label-0 (ham) rows.
df_spam = df[df['label'] == 1]
df_ham = df[df['label'] == 0]

# random_state is fixed so the same ham rows are drawn on every run; without
# it the balanced dataset (and all results below) changes on each execution.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

(747, 2)

In [None]:
# Stack the downsampled ham rows on top of the spam rows.
balanced_parts = [df_ham_downsampled, df_spam]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

(1494, 2)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split of the balanced data. random_state is fixed so the
# partitions are identical on every run and the results can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'],
    df_balanced['label'],
    stratify=df_balanced['label'],
    random_state=42,
)

In [None]:
# Load fresh copies of the preprocessing and encoder layers for the second
# model, which is trained on the balanced data.
preprocess_handle = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_handle = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

bert_preprocess = hub.KerasLayer(preprocess_handle)
bert_encoder = hub.KerasLayer(encoder_handle)

In [None]:
# Scalar string input feeding the preprocessing layer and then the encoder.
text_input = tf.keras.layers.Input(
    shape=(),
    dtype=tf.string,
    name='text',
)
outputs = bert_encoder(bert_preprocess(text_input))

In [None]:
# Same head as before: dropout (rate 0.1) on the pooled output, then one
# sigmoid unit.
pooled_output = outputs['pooled_output']
l = tf.keras.layers.Dropout(0.1, name="dropout")(pooled_output)
l = tf.keras.layers.Dense(
    1,
    activation='sigmoid',
    name="output",
)(l)

In [None]:
# Rebuild the model from the new input and head; this replaces the earlier
# `model` variable.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[l],
)

In [None]:
# Fresh metric objects for the new model (accuracy, precision, recall).
METRICS = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in (
        (tf.keras.metrics.BinaryAccuracy, 'accuracy'),
        (tf.keras.metrics.Precision, 'precision'),
        (tf.keras.metrics.Recall, 'recall'),
    )
]

# Same optimizer and loss as the first model.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)

In [None]:
# Train on the balanced data for 10 epochs. The History object is kept so the
# per-epoch values stay available (history_balanced.history) and no bare
# `<keras.callbacks.History ...>` repr is left as cell output.
history_balanced = model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2eef3af350>

In [None]:
# Same hand-written messages as before, now scored by the model trained on
# the balanced data.
sample_dataset = [
    'You can win alot of money, register in the link below',
    'You have an iphone 10, spin the image below to claim your prize and it willl be delivered in your door step',
    'You have an offer, the company will give you 50% off in every item purchased.',
    'Hey Bravin, dont be late for the meeting tomorrow, it will start at exactly 10:30am',
    'See you monday, we have alot to talk about the future of this company .',
]

balanced_scores = model.predict(sample_dataset)
balanced_scores

array([[0.8341657 ],
       [0.9097381 ],
       [0.5292153 ],
       [0.7576493 ],
       [0.13838603]], dtype=float32)