<a href="https://colab.research.google.com/github/faridelya/Deep-Learning/blob/main/dbert_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import all packages ###

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

###  Preprocessing and cleaning functions  ###

In [None]:
def unicode_to_ascii(s):
    """Strip diacritics from ``s``.

    The string is decomposed with Unicode NFD normalisation and every
    non-spacing combining mark (category ``Mn``) is discarded; all other
    characters are kept in their original order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def clean_stopwords_shortwords(w):
    """Remove English stopwords and words of one or two characters.

    Args:
        w: Text whose words are separated by whitespace. Stopwords are
            matched exactly (case-sensitively) against NLTK's English list.

    Returns:
        The surviving words joined by single spaces.
    """
    # Load the stopword list once and keep it as a set on the function object:
    # the original re-read the corpus on every call and searched a list.
    stopword_set = getattr(clean_stopwords_shortwords, '_stopword_set', None)
    if stopword_set is None:
        try:
            stopword_set = frozenset(stopwords.words('english'))
        except LookupError:
            # The corpus is not installed (e.g. on a fresh runtime): fetch it and retry.
            nltk.download('stopwords', quiet=True)
            stopword_set = frozenset(stopwords.words('english'))
        clean_stopwords_shortwords._stopword_set = stopword_set

    return " ".join(
        word for word in w.split()
        if len(word) > 2 and word not in stopword_set
    )

def preprocess_sentence(w):
    """Normalise one raw message into a cleaned, space-separated string.

    Steps: lower-case and trim, strip diacritics, drop ``@mentions``, replace
    every run of non-letters with a space, then remove stopwords and words
    shorter than three characters.

    Args:
        w: Raw text of a single message.

    Returns:
        The cleaned text (may be an empty string).
    """
    w = unicode_to_ascii(w.lower().strip())
    # Mentions must be removed BEFORE the non-letter stripping below; afterwards
    # the '@' is already gone and the pattern can never match.
    w = re.sub(r'@\w+', ' ', w)
    # Punctuation, digits, quotes and repeated spaces all collapse to one space.
    w = re.sub(r"[^a-zA-Z]+", " ", w)
    return clean_stopwords_shortwords(w)

###  Reading and Cleaning the Dataset  ###

In [None]:
# Path of the raw CSV, relative to the notebook's working directory.
data_file = './data/spam.csv'
# The file is read with the ISO-8859-1 codec rather than pandas' default.
data = pd.read_csv(data_file, encoding='ISO-8859-1')

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


>  Removing the unnamed columns, dropping rows that contain NaN values, resetting the index, and finally shuffling the dataset

In [None]:
# Drop the "Unnamed: 2/3/4" columns in one pass instead of three copy-pasted filters.
data = data.loc[:, ~data.columns.str.contains(r'Unnamed: [234]', case=False)]
print('File has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
data = data.dropna()
data = data.reset_index(drop=True)
print('File has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
# A fixed seed keeps the row order identical across kernel restarts.
data = shuffle(data, random_state=42)

data.head()

File has 5572 rows and 2 columns
File has 5572 rows and 2 columns


Unnamed: 0,v1,v2
2011,ham,Do whatever you want. You know what the rules ...
2763,ham,"Say this slowly.? GOD,I LOVE YOU &amp; I NEED ..."
388,spam,4mths half price Orange line rental & latest c...
1938,ham,Excellent! Are you ready to moan and scream in...
1903,spam,Free entry in 2 a weekly comp for a chance to ...


 > Rename v1:label and v2:text, converting 'ham' labels to '0' and 'spam' to '1', saving it to the 'gt' (ground truth) column and applying the preprocess function to the dataset

In [None]:
# Give the two remaining columns descriptive names.
data = data.rename(columns={'v1': 'label', 'v2': 'text'})

# Numeric ground truth: ham -> 0, spam -> 1.
data['gt'] = data['label'].map({'ham': 0, 'spam': 1})

print('Available labels: ', data['label'].unique())
# Clean every message with the preprocessing function defined earlier.
data['text'] = data['text'].map(preprocess_sentence)

num_classes = len(data['label'].unique())

data.head()

Available labels:  ['ham' 'spam']


Unnamed: 0,label,text,gt
2011,ham,whatever want know rules talk earlier week sta...,0
2763,ham,say slowly god love amp need clean heart blood...,0
388,spam,mths half price orange line rental latest came...,1
1938,ham,excellent ready moan scream ecstasy,0
1903,spam,free entry weekly comp chance win ipod txt pod...,1


###  Loading DistilBERT Tokenizer and the DistilBERT model  ###

In [None]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Load the pretrained TensorFlow DistilBERT model for the same checkpoint as the tokenizer.
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

###  Preparing input for the model  ###

In [None]:
# Token sequence length used for the model inputs (padding/truncation target).
max_len = 32
sentences = data['text']  # cleaned message text
labels = data['gt']       # 0 = ham, 1 = spam
len(sentences), len(labels)

(5572, 5572)

####  Let's take a sentence from the dataset and understand the input and output of the DistilBERT  #### 

> Tokenized sentence 

In [None]:
# NOTE: `sentences` still carries the pre-shuffle row labels, so `sentences[0]`
# selects the row labelled 0, not the first row of the shuffled frame.
dbert_tokenizer.tokenize(sentences[0])

['ju',
 '##rong',
 'point',
 'crazy',
 'available',
 'bug',
 '##is',
 'great',
 'world',
 'buffet',
 'ci',
 '##ne',
 'got',
 'amore',
 'wat']

> Input ids and the attention masks from the tokenizer 

In [None]:
# Encode one sentence to a fixed length of 20 tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
dbert_inp = dbert_tokenizer.encode_plus(
    sentences[0],
    add_special_tokens=True,
    max_length=20,
    padding='max_length',
    truncation=True,
)
dbert_inp

{'input_ids': [101,
  18414,
  17583,
  2391,
  4689,
  2800,
  11829,
  2483,
  2307,
  2088,
  28305,
  25022,
  2638,
  2288,
  26297,
  28194,
  102,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0]}

In [None]:
# Token ids only (the encoding above also holds the attention mask).
dbert_inp['input_ids']

[101,
 18414,
 17583,
 2391,
 4689,
 2800,
 11829,
 2483,
 2307,
 2088,
 28305,
 25022,
 2638,
 2288,
 26297,
 28194,
 102,
 0,
 0,
 0]

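>  DistilBERT model output: pass the input_ids and the attention_mask obtained from the tokenizer. The output is a tuple whose first element is a tensor of shape (1, max_length, 768), where max_length is the padded sequence length (20 in this example)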

In [None]:
id_inp = np.asarray(dbert_inp['input_ids'])
mask_inp = np.asarray(dbert_inp['attention_mask'])
# Add a leading batch axis so each 1-D vector becomes shape (1, seq_len).
out = dbert_model([id_inp[np.newaxis, :], mask_inp[np.newaxis, :]])
type(out), out

(tuple,
 (<tf.Tensor: shape=(1, 20, 768), dtype=float32, numpy=
  array([[[ 0.603038  , -0.87843955, -0.27702317, ...,  0.34013888,
           -0.31951576, -0.02768148],
          [ 0.8059453 , -1.2426811 , -0.3692848 , ..., -0.00915013,
           -0.2044661 , -0.12683335],
          [ 0.7864537 , -0.9070081 , -0.44475678, ..., -0.00204397,
           -0.31890398, -0.23745532],
          ...,
          [ 0.56349653, -1.0353185 , -0.26982975, ...,  0.37219822,
           -0.30490598, -0.09034443],
          [ 0.5596978 , -1.0259491 , -0.29068822, ...,  0.33829993,
           -0.29505792, -0.1154284 ],
          [ 0.70843756, -0.9754098 , -0.21341297, ...,  0.42030877,
           -0.3028642 , -0.06653835]]], dtype=float32)>,))

> Obtain the embeddings of a sentence from the output

In [None]:
# First element of the output tuple, sliced at token position 0 (the [CLS] token)
# for every sequence in the batch; this vector is used as the sentence embedding.
out[0][:,0,:]

> Decode the original sentence from the tokenizer 

In [None]:
# Map the token ids back to text; special tokens are included in the result.
dbert_tokenizer.decode(dbert_inp['input_ids'])

'[CLS] jurong point crazy available bugis great world buffet cine got amore wat [SEP] [PAD] [PAD] [PAD]'

###  Create a basic NN model using DistilBERT embeddings to get the predictions  ###

In [None]:
def create_model():
    """Build the DistilBERT-based classifier and print its summary.

    Reads the module-level ``max_len`` (input length), ``dbert_model``
    (pretrained encoder) and ``num_classes`` (size of the output layer).

    Returns:
        A ``tf.keras.Model`` that takes ``[input_ids, attention_mask]`` and
        outputs softmax class probabilities. Because the outputs are already
        probabilities, pair it with a loss configured with ``from_logits=False``.
    """
    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')
    # [0] is the encoder's output tensor; [:, 0, :] keeps the vector at token
    # position 0 of every sequence as the sentence representation.
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:, 0, :]
    dense = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout = Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()
    return model

> Feel free to add more Dense and Dropout layers with variable units and the regularizers

In [None]:
# Build the classifier that will be fine-tuned below (also prints its summary).
model=create_model()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_4 (TFDisti ((None, 32, 768),)   66362880    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 768)]        0           tf_distil_bert_model_4

> Prepare the model input 

In [None]:
input_ids = []
attention_masks = []

# Encode every sentence to exactly `max_len` tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
for sent in sentences:
    dbert_inps = dbert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

# Convert to arrays so they can be split and fed to Keras.
input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)

In [None]:
# Sanity check: ids, masks and labels should all have the same number of rows.
len(input_ids),len(attention_masks),len(labels)

(5572, 5572, 5572)

> Save the model input in the pickle files to use it later without performing the above steps

In [None]:
print('Preparing the pickle file.....')

pickle_inp_path = './data/dbert_inp.pkl'
pickle_mask_path = './data/dbert_mask.pkl'
pickle_label_path = './data/dbert_label.pkl'

# Use context managers so every file is flushed and closed; the original
# passed bare open(...) handles that were never closed.
with open(pickle_inp_path, 'wb') as pkl_file:
    pickle.dump(input_ids, pkl_file)
with open(pickle_mask_path, 'wb') as pkl_file:
    pickle.dump(attention_masks, pkl_file)
with open(pickle_label_path, 'wb') as pkl_file:
    pickle.dump(labels, pkl_file)

print('Pickle files saved as ', pickle_inp_path, pickle_mask_path, pickle_label_path)

Preparing the pickle file.....
Pickle files saved as  ./data/dbert_inp.pkl ./data/dbert_mask.pkl ./data/dbert_label.pkl


In [None]:
print('Loading the saved pickle files..')

# Context managers close each file after reading; the original left them open.
# These pickles are the ones written by this notebook; do not point the paths
# at untrusted files, since unpickling can execute arbitrary code.
with open(pickle_inp_path, 'rb') as pkl_file:
    input_ids = pickle.load(pkl_file)
with open(pickle_mask_path, 'rb') as pkl_file:
    attention_masks = pickle.load(pkl_file)
with open(pickle_label_path, 'rb') as pkl_file:
    labels = pickle.load(pkl_file)

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape, attention_masks.shape, labels.shape))

Loading the saved pickle files..
Input shape (5572, 32) Attention mask shape (5572, 32) Input label shape (5572,)


In [None]:
# Class id -> human-readable name.
label_class_dict = {0: 'ham', 1: 'spam'}
# Materialise as a list: a dict view cannot be indexed and is not a plain list.
target_names = list(label_class_dict.values())

> Train Test split and setting up the loss function, accuracy and optimizer for the model. 

In [None]:
# Seeded, stratified split: the classes are imbalanced, so stratifying keeps the
# ham/spam ratio equal in both parts, and the seed makes the split reproducible.
train_inp, val_inp, train_label, val_label, train_mask, val_mask = train_test_split(
    input_ids,
    labels,
    attention_masks,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp.shape, val_inp.shape, train_label.shape, val_label.shape, train_mask.shape, val_mask.shape))

log_dir = 'dbert_model'
model_save_path = './dbert_model.h5'

# Keep only the weights of the epoch with the lowest validation loss, and log to TensorBoard.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=model_save_path,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
    keras.callbacks.TensorBoard(log_dir=log_dir),
]

# The model's last layer applies softmax, so it outputs probabilities, not logits.
# from_logits must therefore be False; True would apply softmax a second time.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

Train inp shape (4457, 32) Val input shape (1115, 32)
Train label shape (4457,) Val label shape (1115,)
Train attention mask shape (4457, 32) Val attention mask shape (1115, 32)


In [None]:
# Rebuilds the same callbacks and recompiles with the loss, optimizer and metric
# objects defined in the previous cell.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=model_save_path,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
    keras.callbacks.TensorBoard(log_dir=log_dir),
]
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

### **Training**

In [None]:
# Fine-tune on the training split and evaluate on the validation split after each epoch.
history = model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size=16,
    epochs=5,
    validation_data=([val_inp, val_mask], val_label),
    callbacks=callbacks,
)

Epoch 1/5
1/279 [..............................] - ETA: 0s - loss: 6.7663 - accuracy: 0.8750
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Tensorboard visualization (Training-Testing curve) ###





In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir {log_dir}

> Increase the number of epochs in order to decrease the loss further

### Use the saved model for predictions and calculating the evaluation metrics ###

In [None]:
# Rebuild the architecture, compile it with the same settings as in training,
# then restore the checkpointed weights saved by ModelCheckpoint.
trained_model = create_model()
trained_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)
trained_model.load_weights(model_save_path)

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 32)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_4 (TFDisti ((None, 32, 768),)   66362880    input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_2 (Te [(None, 768)]        0           tf_distil_bert_model_4

In [None]:
# Per-class scores for the validation split, one row per example.
preds = trained_model.predict([val_inp, val_mask], batch_size=16)
# The predicted class is the column with the highest score.
pred_labels = np.argmax(preds, axis=1)
f1 = f1_score(val_label, pred_labels)
f1



0.9424460431654677

In [None]:
# Class names in label-id order (0 = ham, 1 = spam) for the report rows.
target_names = ['ham', 'spam']
print('F1 score', f1)
print('Classification Report')
report = classification_report(val_label, pred_labels, target_names=target_names)
print(report)

# NOTE(review): this message is printed after evaluation; nothing is trained or saved in this cell.
print('Training and saving built model.....')

F1 score 0.9424460431654677
Classification Report
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       973
        spam       0.96      0.92      0.94       142

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Training and saving built model.....
