In [None]:
!pip install transformers

In [None]:
import torch
import pandas as pd 
import numpy as np 

import string 
import spacy
nlp = spacy.load("en_core_web_sm")
from textblob import Word, TextBlob


from tensorflow import keras
import tensorflow as tf
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Bidirectional, Attention, Concatenate
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import transformers
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset


In [None]:
from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers.embeddings import Embedding
from keras.models import Model

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Loading and Cleaning

In [None]:
root_dir = '/content/drive/MyDrive/FYP Model Training'
train_data = pd.read_csv(f'{root_dir}/Data/App_Training.csv')
test_data = pd.read_csv(f'{root_dir}/Data/App_Test_Labeled.csv')
eval_data = pd.read_csv(f'{root_dir}/Data/SubtaskA_EvaluationData_labeled.csv', header = None, encoding = 'latin1')
eval_data.columns = ['id', 'sentence', 'label']

train_data.drop(labels = ['0'], axis = 1, inplace = True)
test_data.drop(labels = ['0'], axis = 1, inplace = True)

In [None]:
train_data = pd.concat([train_data, test_data]).copy().reset_index(drop = True)
train_data

Unnamed: 0,id,sentence,label
0,663_3,Please enable removing language code from the...,1
1,663_4,Note in your csproj file there is a Supported...,0
2,664_1,Wich means the new version not fully replaced...,0
3,664_2,Some of my users will still receive the old x...,0
4,664_3,The store randomly gives the old xap or the n...,0
...,...,...,...
9087,1658_3,we should have small tiles instead of a long l...,1
9088,1658_7,An app should be able to publish a service tha...,1
9089,1658_8,For example if I have an app that can process ...,1
9090,1659_1,I would like access to a stream for music play...,1


In [None]:
train_data['sentence'] = train_data['sentence'].apply(lambda x: " ".join([word.lower() for word in x.split()]))
eval_data['sentence'] = eval_data['sentence'].apply(lambda x: " ".join([word.lower() for word in x.split()]))

train_data['sentence'] = train_data['sentence'].apply(lambda x: x.translate(str.maketrans('','', string.punctuation)))
eval_data['sentence'] = eval_data['sentence'].apply(lambda x: x.translate(str.maketrans('','', string.punctuation)))


In [None]:
train_data['sentence'][0]

'please enable removing language code from the dev center language history for example if you ever selected ru and ru ru laguages and you published this xap to the store then it causes tile localization to show the en us default tile localization which is bad'

In [None]:
#Run this cell to apply auto spell check according to TextBlob's correct() function
train_data['sentence'] = train_data['sentence'].apply(lambda x: TextBlob(x).correct().raw)
eval_data['sentence'] = eval_data['sentence'].apply(lambda x: TextBlob(x).correct().raw)

In [None]:
display(train_data.head())
display(eval_data.head())

Unnamed: 0,id,sentence,label
0,663_3,please enable removing language code from the ...,1
1,663_4,note in your csproj file there is a supportedc...,0
2,664_1,with means the new version not fully replaced ...,0
3,664_2,some of my users will still receive the old ca...,0
4,664_3,the store random gives the old cap or the new ...,0


Unnamed: 0,id,sentence,label
0,9566,this would enable live traffic aware apes,0
1,9569,please try other forgetting like bold italian ...,1
2,9576,since computers were invented to save time i s...,1
3,9577,allow rearranging if the user wants to change ...,1
4,9579,add sird instructions for better use of arm no...,1


In [None]:
train_data.to_csv(f'{root_dir}/Data/App_Training_Corrected.csv')
eval_data.to_csv(f'{root_dir}/Data/Eval_data_Corrected.csv')


### Model Training 2
1. Baseline LSTM model
2. Bidirectional LSTM Model
3. LSTM with attention
4. Impact of adding more layers


In [None]:
train_df, validate_df = train_test_split(train_data, random_state = 42)
X_train,Y_train = train_df['sentence'], train_df['label']
X_val, Y_val = validate_df['sentence'], validate_df['label']



In [None]:
vect = CountVectorizer()
vect.fit(X_train)
len(vect.get_feature_names_out())

8058

In [None]:
tokenizer = Tokenizer(num_words=8000)
tokenizer.fit_on_texts(X_train)
words_to_index = tokenizer.word_index
vocab_len = len(words_to_index)


In [None]:
with open(f'{root_dir}/Data/glove.6B.100d.txt', 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
        w_line = line.split()
        curr_word = w_line[0]
        word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
MAX_LENGTH = 150

In [None]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector


In [None]:
emb_matrix.shape

(8086, 100)

In [None]:
def create_model(input_shape):
    X_indices = Input(input_shape)
    embeddings = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=150, weights = [emb_matrix], trainable=True)(X_indices)
    X = LSTM(128, dropout = 0.3)(embeddings)
    X = Dense(1, activation='sigmoid')(X)
    model = Model(inputs=X_indices, outputs=X)
    return model

def create_bilstm_model(input_shape):
    X_indices = Input(input_shape)
    embeddings = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=150, weights = [emb_matrix], trainable=True)(X_indices)
    X = Bidirectional(LSTM(128, dropout = 0.3))(embeddings)
    X = Dense(1, activation='sigmoid')(X)
    model = Model(inputs=X_indices, outputs=X)
    return model

def prediction_pipeline(model, X_test, Y_test):
    X_test_indices = tokenizer.texts_to_sequences(X_test)
    X_test_indices = pad_sequences(X_test_indices, maxlen=150, padding='pre')
    Y_pred = model.predict(X_test_indices) > 0.5
    Y_pred = Y_pred.squeeze()
    Y_pred = np.array([1 if x else 0 for x in Y_pred ])
    accuracy = accuracy_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred) 
    recall = recall_score(Y_test, Y_pred)
    return accuracy, f1, precision, recall, Y_pred

def prediction_pipeline_attn(model, X_test):
    X_test_indices = tokenizer.texts_to_sequences(X_test)
    X_test_indices = pad_sequences(X_test_indices, maxlen=150, padding='pre')
    Y_pred = model.predict(X_test_indices)
    return X_test_indices, Y_pred

def create_bilstm_attn_model_dot(input_shape):
    X_indices = Input(input_shape)
    embeddings = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=150, weights = [emb_matrix], trainable=True)(X_indices)
    lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(128, dropout = 0.3, return_sequences = True, return_state = True))(embeddings)
    final_hidden_state = Concatenate()([forward_h, forward_c])
    final_hidden_state = tf.expand_dims(final_hidden_state, 2)
    mat_matrix = tf.matmul(lstm, final_hidden_state)
    attention_weights = tf.nn.softmax(mat_matrix, axis=1)
    context_vector = attention_weights * lstm
    context_vector = tf.reduce_sum(context_vector, axis=1)
    X = Dense(1, activation='sigmoid')(context_vector)
    model = Model(inputs=X_indices, outputs=X)  
    attn_weights_intermediate = Model(inputs=X_indices, outputs = attention_weights)
    return model, attn_weights_intermediate

def create_bilstm_attn_model_dot_embeddings(input_shape):
    X_indices = Input(input_shape)
    embeddings = Embedding(input_dim=vocab_len, output_dim=200, input_length=150, trainable=True)(X_indices)
    lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(128, dropout = 0.3, return_sequences = True, return_state = True))(embeddings)
    final_hidden_state = Concatenate()([forward_h, forward_c])
    final_hidden_state = tf.expand_dims(final_hidden_state, 2)
    mat_matrix = tf.matmul(lstm, final_hidden_state)
    attention_weights = tf.nn.softmax(mat_matrix, axis=1)
    context_vector = attention_weights * lstm
    context_vector = tf.reduce_sum(context_vector, axis=1)
    X = Dense(1, activation='sigmoid')(context_vector)
    model = Model(inputs=X_indices, outputs=X)  
    attn_weights_intermediate = Model(inputs=X_indices, outputs = attention_weights)
    return model, attn_weights_intermediate

In [None]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = pad_sequences(X_train_indices, maxlen=150, padding='pre')
X_val_indices = tokenizer.texts_to_sequences(X_val)
X_val_indices = pad_sequences(X_val_indices, maxlen=150, padding='pre')

In [None]:
X_test = eval_data['sentence']
Y_test = eval_data['label']

In [None]:
model = create_model(150)

early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 5, restore_best_weights = True)
adam = keras.optimizers.Adam(learning_rate = 0.0001)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=64, epochs=30, callbacks = early_stopping)

In [None]:
accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")


array([[713,  33],
       [ 29,  58]])

Accuracy: 0.9255702280912365
Precision: 0.6373626373626373
Recall: 0.6666666666666666
F1: 0.651685393258427


In [None]:
model_bidirectionalLSTM = create_bilstm_model(150)

early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 5, restore_best_weights = True)
adam = keras.optimizers.Adam(learning_rate = 0.0001)
#Hyperparameters to tune:
#Learning rate, batch_size, vocabsize with oov words 
model_bidirectionalLSTM.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model_bidirectionalLSTM.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=64, epochs=40, callbacks = early_stopping)

In [None]:
accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model_bidirectionalLSTM, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

array([[703,  43],
       [ 28,  59]])

Accuracy: 0.9147659063625451
Precision: 0.5784313725490197
Recall: 0.6781609195402298
F1: 0.6243386243386243


In [None]:
#Attention layer implemented from https://matthewmcateer.me/blog/getting-started-with-attention-for-classification/
#This is created using a neural network to simulate, try using dot product instead 
def create_bilstm_attn_model(input_shape):
    X_indices = Input(input_shape)
    embeddings = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=150, weights = [emb_matrix], trainable=True)(X_indices)
    lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(128, dropout = 0.3, return_sequences = True, return_state = True))(embeddings)
    final_hidden_state = Concatenate()([forward_h, forward_c])
    final_hidden_state = tf.expand_dims(final_hidden_state, 1)

    feature_nn = Dense(10, activation = 'relu')(lstm)
    final_hidden_state_nn = Dense(10, activation = 'relu')(final_hidden_state)
    score = tf.nn.tanh(feature_nn + final_hidden_state_nn) ## w[x, h]
    final_scores = Dense(1)(score)
    attention_weights = tf.nn.softmax(final_scores, axis=1, name = 'attn_weights')
    context_vector = attention_weights * lstm
    context_vector = tf.reduce_sum(context_vector, axis=1)
    X = Dense(1, activation='sigmoid')(context_vector)
    model = Model(inputs=X_indices, outputs=X)
    attn_weights_intermediate = Model(inputs=X_indices, outputs = attention_weights)
    return model, attn_weights_intermediate

model_bidirectionalLSTM_attn, model_attn_weights_intermediate = create_bilstm_attn_model(150)

early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 5, restore_best_weights = True)
adam = keras.optimizers.Adam(learning_rate = 0.0001)

model_bidirectionalLSTM_attn.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model_bidirectionalLSTM_attn.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=64, epochs=40, callbacks = early_stopping)


In [None]:
accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model_bidirectionalLSTM_attn, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

array([[704,  42],
       [ 31,  56]])

Accuracy: 0.9123649459783914
Precision: 0.5714285714285714
Recall: 0.6436781609195402
F1: 0.6054054054054054


In [None]:
model_bidirectionalLSTM_attn, model_attn_weights_intermediate = create_bilstm_attn_model_dot(150)
early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 4, restore_best_weights = True)
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model_bidirectionalLSTM_attn.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model_bidirectionalLSTM_attn.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=64, epochs=40, callbacks = early_stopping)

In [None]:
accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model_bidirectionalLSTM_attn, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

array([[703,  43],
       [ 28,  59]])

Accuracy: 0.9147659063625451
Precision: 0.5784313725490197
Recall: 0.6781609195402298
F1: 0.6243386243386243


### New Model Fine-tuning

In [None]:
model, model_attn_weights_intermediate = create_bilstm_attn_model_dot(150)
early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 4, restore_best_weights = True)
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=32, epochs=40, callbacks = early_stopping)

accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40


array([[701,  45],
       [ 25,  62]])

Accuracy: 0.9159663865546218
Precision: 0.5794392523364486
Recall: 0.7126436781609196
F1: 0.6391752577319586


In [None]:
model, model_attn_weights_intermediate = create_bilstm_attn_model_dot(150)
early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 4, restore_best_weights = True)
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=16, epochs=40, callbacks = early_stopping)

accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40


array([[687,  59],
       [ 18,  69]])

Accuracy: 0.907563025210084
Precision: 0.5390625
Recall: 0.7931034482758621
F1: 0.641860465116279


In [None]:
#Lets now use BS of 16 with a linear learning rate decay 
model, model_attn_weights_intermediate = create_bilstm_attn_model_dot(150)
early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 4, restore_best_weights = True)

starter_learning_rate = 0.001
end_learning_rate = 0.0000001
decay_steps = len(train_df)//16 * 20
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    starter_learning_rate,
    decay_steps,
    end_learning_rate,
    power=1)

adam = keras.optimizers.Adam(learning_rate = learning_rate_fn)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=16, epochs=20, callbacks = early_stopping)

accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


array([[717,  29],
       [ 23,  64]])

Accuracy: 0.9375750300120048
Precision: 0.6881720430107527
Recall: 0.735632183908046
F1: 0.7111111111111111


In [None]:
#Lets now use BS of 16 with a linear learning rate decay 
model, model_attn_weights_intermediate = create_bilstm_attn_model_dot(150)
early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 4, restore_best_weights = True)

starter_learning_rate = 0.001
end_learning_rate = 0.0000001
decay_steps = len(train_df)//32 * 20
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    starter_learning_rate,
    decay_steps,
    end_learning_rate,
    power=1)

adam = keras.optimizers.Adam(learning_rate = learning_rate_fn)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=32, epochs=20, callbacks = early_stopping)

accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


array([[718,  28],
       [ 29,  58]])

Accuracy: 0.9315726290516206
Precision: 0.6744186046511628
Recall: 0.6666666666666666
F1: 0.6705202312138728


In [None]:
#Lets now use BS of 16 with a linear learning rate decay 
model, model_attn_weights_intermediate = create_bilstm_attn_model_dot_embeddings(150)
early_stopping = keras.callbacks.EarlyStopping(monitor ='val_loss', patience = 4, restore_best_weights = True)

starter_learning_rate = 0.001
end_learning_rate = 0.0000001
decay_steps = len(train_df)//16 * 20
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    starter_learning_rate,
    decay_steps,
    end_learning_rate,
    power=1)

adam = keras.optimizers.Adam(learning_rate = learning_rate_fn)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_indices, Y_train, validation_data = (X_val_indices, Y_val), batch_size=16, epochs=20, callbacks = early_stopping)

accuracy, f1, precision, recall, Y_pred = prediction_pipeline(model, X_test, Y_test)
display(confusion_matrix(Y_test, Y_pred))
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


array([[704,  42],
       [ 27,  60]])

Accuracy: 0.9171668667466987
Precision: 0.5882352941176471
Recall: 0.6896551724137931
F1: 0.6349206349206349


### Extract out attention weights from the text and key features

In [None]:
X_series, attn_weights = prediction_pipeline_attn(model_attn_weights_intermediate, X_test)

In [None]:
def highlight_attn_weights(X_series, attn_weights, tokenizer):
    #Given a set of attention weights, return the top 5 tokens per word and decode them
    #Returns a list of text that aligns with X_series 
    attn_weights = attn_weights.squeeze()
    top_5_indexes = attn_weights.argsort(axis = 1)[:, -1:-6:-1]
    top_5_tokens_arr = []
    for i in range(len(X_series)):
        top_5_words = X_series[i][top_5_indexes[0]]
        top_5_tokens_arr.append(top_5_words)
    top_5_tokens_arr = np.array(top_5_tokens_arr)
    return tokenizer.sequences_to_texts(top_5_tokens_arr)

In [None]:
eval_data['top_5_tokens'] = highlight_attn_weights(X_series, attn_weights, tokenizer)
eval_data

Unnamed: 0,id,sentence,label,top_5_tokens
0,9566,this would enable live traffic aware apps,0,live aware traffic apps enable
1,9569,please try other formatting like bold italics ...,1,shadow from to content like
2,9576,since computers were invented to save time i s...,1,in right the order them
3,9577,allow rearranging if the user wants to change ...,1,wants change to them user
4,9579,add simd instructions for better use of arm ne...,1,instructions and for games arm
...,...,...,...,...
828,6340,it could be something like,0,could something be like it
829,6341,for input nodes,0,input for nodes
830,6351,it would be very very appreciated,0,be very very appreciated would
831,6357,i have made an app when i search for it 10 app...,0,app of any them my


In [None]:
pos_eval_data = eval_data[eval_data['label'] == 1]
vectorizer = CountVectorizer()
count_vectors = vectorizer.fit_transform(pos_eval_data['top_5_tokens'])
pd.DataFrame(count_vectors.toarray(), columns = vectorizer.get_feature_names_out()).sum(axis = 0).sort_values(ascending = False)[:20]

the        27
to         23
in         12
for        12
and        11
app         8
it          7
is          6
like        5
of          5
by          4
one         4
windows     4
be          4
camera      3
from        3
data        3
user        3
just        3
should      3
dtype: int64

### Model Training with BERT based models

In [None]:
from transformers import BertModel, BertTokenizer

class BertForClassification(nn.Module):
    def __init__(self):
        super(BertForClassification, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        X = self.bert(input_ids, attention_mask, return_dict=False)[1]
        X = self.dropout(X)
        X = self.classifier(X)
        return X

    def extract_attn_weights(self, input_ids, attention_mask):
        attentions = self.bert(input_ids, attention_mask, output_attentions = True)['attentions ']
        return attentions

class BertDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.labels = torch.tensor(df['label'].tolist(), dtype = torch.float64)
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 256, truncation=True,
                                return_tensors="pt") for text in df['sentence']]
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [None]:
def train_loop(dataloader, loss_fn, optimizer, scheduler, model, device):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        input_ids = X['input_ids'].squeeze(1).to(device)
        attn_mask = X['attention_mask'].to(device)
        y = y.to(device)
        pred = model(input_ids, attn_mask).squeeze()
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def eval_loop(dataloader, loss_fn, model, device):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            input_ids = X['input_ids'].squeeze(1).to(device)
            attn_mask = X['attention_mask'].to(device)
            y = y.to(device)
            pred = model(input_ids, attn_mask).squeeze()
            try:
                test_loss += loss_fn(pred, y).item()
            except:
                print(pred)
                print(input_ids, attn_mask, y)
                raise ValueError("Error")
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

def generate_train_val_dataloaders(train_df, validation_df, tokenizer):
    train_dataset = BertDataset(train_df, tokenizer)
    validation_dataset = BertDataset(validation_df, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last = True)
    validation_dataloader = DataLoader(validation_dataset, batch_size = 16, shuffle = True, drop_last = True)
    return train_dataloader, validation_dataloader

def full_train_cycle(num_epochs, train_df, validation_df, device, tokenizer):
    model = BertForClassification()
    model.to(device)
    train_dataloader, test_dataloader = generate_train_val_dataloaders(train_df, validation_df, tokenizer)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-5, steps_per_epoch=len(train_dataloader), epochs=num_epochs, pct_start = 0.1, anneal_strategy  = 'linear')
    for t in range(num_epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_dataloader, loss_fn, optimizer, scheduler, model, device)
        eval_loop(test_dataloader, loss_fn, model, device)
    return model 

def make_prediction(test_df, model, tokenizer):
    model.eval()
    model.to('cpu')
    sentence_idx = test_df.columns.get_loc('sentence') + 1 
    res = []
    for row in test_df.itertuples():
        sentence = row[sentence_idx]
        tokenized = tokenizer(sentence, padding='max_length', max_length = 256, truncation=True, return_tensors="pt")
        pred = model(tokenized['input_ids'], tokenized['attention_mask']).squeeze()
        pred = torch.nn.functional.sigmoid(pred).item() 
        res.append(pred)
    sigmoid_res = [1 if logit >= 0.5 else 0 for logit in res]
    return sigmoid_res

In [None]:
train_df, validation_df = train_test_split(train_data, random_state = 42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = full_train_cycle(3, train_df, validation_df, device, tokenizer)

In [None]:
trained_model = BertForClassification()
trained_model.load_state_dict(torch.load(f"{root_dir}/Data/BertWeights.pt"))
trained_model.eval()

In [None]:
preds = make_prediction(eval_data, trained_model, tokenizer)
labels = eval_data['label'].tolist()
confusion_matrix(labels, preds)
print(f"F1 Score: {f1_score(labels, preds)}")
print(f"Recall Score: {recall_score(labels, preds)}")
print(f"Precision Score: {precision_score(labels, preds)}")
print(f"Accuracy Score: {accuracy_score(labels, preds)}")

F1 Score: 0.7567567567567568
Recall Score: 0.8045977011494253
Precision Score: 0.7142857142857143
Accuracy Score: 0.9459783913565426


### New set of improvements for BERT models: 
1. BERT With more dense layers before classification
2. BERT with LSTM 

In [None]:
class BertForClassificationMultiple(nn.Module):
    def __init__(self):
        super(BertForClassificationMultiple, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, 512)
        self.dropout2 = nn.Dropout(0.3)
        self.l2 = nn.Linear(512, 256)
        self.classifier = nn.Linear(256, 1)
         

    def forward(self, input_ids, attention_mask):
        X = self.bert(input_ids, attention_mask, return_dict=False)[1]
        X = self.dropout(X)
        X = self.l1(X)
        X = self.dropout2(X)
        X = self.l2(X)
        X = self.classifier(X)
        return X

    def extract_attn_weights(self, input_ids, attention_mask):
        attentions = self.bert(input_ids, attention_mask, output_attentions = True)['attentions ']
        return attentions

def full_train_cycle_multiple(num_epochs, train_df, validation_df, device, tokenizer):
    model = BertForClassificationMultiple()
    model.to(device)
    train_dataloader, test_dataloader = generate_train_val_dataloaders(train_df, validation_df, tokenizer)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-5, steps_per_epoch=len(train_dataloader), epochs=num_epochs, pct_start = 0.1, anneal_strategy  = 'linear')
    for t in range(num_epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_dataloader, loss_fn, optimizer, scheduler, model, device)
        eval_loop(test_dataloader, loss_fn, model, device)
    return model 

In [None]:
train_df, validation_df = train_test_split(train_data, random_state = 42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = full_train_cycle_multiple(3, train_df, validation_df, device, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1
-------------------------------
loss: 0.691674  [    0/ 6819]
loss: 0.416721  [  300/ 6819]
loss: 0.252449  [  600/ 6819]
loss: 0.181144  [  900/ 6819]
loss: 0.260524  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.238401 

Epoch 2
-------------------------------
loss: 0.129126  [    0/ 6819]
loss: 0.252260  [  300/ 6819]
loss: 0.196877  [  600/ 6819]
loss: 0.552070  [  900/ 6819]
loss: 0.032336  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.244936 

Epoch 3
-------------------------------
loss: 0.155832  [    0/ 6819]
loss: 0.019675  [  300/ 6819]
loss: 0.011135  [  600/ 6819]
loss: 0.213412  [  900/ 6819]
loss: 0.017594  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.278703 



In [None]:
torch.save(trained_model.state_dict(), f"{root_dir}/Data/BertWeightsMultipleDenseLayers.pt")

In [None]:
preds = make_prediction(eval_data, trained_model, tokenizer)
labels = eval_data['label'].tolist()
print(f"F1 Score: {f1_score(labels, preds)}")
print(f"Recall Score: {recall_score(labels, preds)}")
print(f"Precision Score: {precision_score(labels, preds)}")
print(f"Accuracy Score: {accuracy_score(labels, preds)}")



F1 Score: 0.7582417582417583
Recall Score: 0.7931034482758621
Precision Score: 0.7263157894736842
Accuracy Score: 0.9471788715486195


In [None]:
confusion_matrix(labels, preds)

array([[720,  26],
       [ 18,  69]])

In [None]:
train_df, validation_df = train_test_split(train_data, random_state = 42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = full_train_cycle_multiple(3, train_df, validation_df, device, tokenizer)

preds = make_prediction(eval_data, trained_model, tokenizer)
labels = eval_data['label'].tolist()
print(f"F1 Score: {f1_score(labels, preds)}")
print(f"Recall Score: {recall_score(labels, preds)}")
print(f"Precision Score: {precision_score(labels, preds)}")
print(f"Accuracy Score: {accuracy_score(labels, preds)}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1
-------------------------------
loss: 0.733559  [    0/ 6819]
loss: 0.597525  [  300/ 6819]
loss: 0.243157  [  600/ 6819]
loss: 0.229948  [  900/ 6819]
loss: 0.258119  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.237608 

Epoch 2
-------------------------------
loss: 0.166411  [    0/ 6819]
loss: 0.351938  [  300/ 6819]
loss: 0.116235  [  600/ 6819]
loss: 0.110033  [  900/ 6819]
loss: 0.247821  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.236538 

Epoch 3
-------------------------------
loss: 0.078914  [    0/ 6819]
loss: 0.009478  [  300/ 6819]
loss: 0.107530  [  600/ 6819]
loss: 0.065381  [  900/ 6819]
loss: 0.011523  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.263963 





F1 Score: 0.7301587301587301
Recall Score: 0.7931034482758621
Precision Score: 0.6764705882352942
Accuracy Score: 0.9387755102040817


In [None]:
class BertForClassificationLSTM(nn.Module):
    def __init__(self):
        super(BertForClassificationLSTM, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.LSTM = nn.LSTM(input_size = 768, hidden_size = 256, batch_first = True)
        self.classifier = nn.Linear(256, 1)
         
    def forward(self, input_ids, attention_mask):
        X = self.bert(input_ids, attention_mask, return_dict=False)[0]
        X = self.dropout(X)
        X, _ = self.LSTM(X)
        X = self.classifier(X[:,-1,:])
        return X

    def extract_attn_weights(self, input_ids, attention_mask):
        attentions = self.bert(input_ids, attention_mask, output_attentions = True)['attentions ']
        return attentions

def full_train_cycle_BERTLstm(num_epochs, train_df, validation_df, device, tokenizer):
    model = BertForClassificationLSTM()
    model.to(device)
    train_dataloader, test_dataloader = generate_train_val_dataloaders(train_df, validation_df, tokenizer)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-5, steps_per_epoch=len(train_dataloader), epochs=num_epochs, pct_start = 0.1, anneal_strategy  = 'linear')
    for t in range(num_epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_dataloader, loss_fn, optimizer, scheduler, model, device)
        eval_loop(test_dataloader, loss_fn, model, device)
    return model 

In [None]:
train_df, validation_df = train_test_split(train_data, random_state = 42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = full_train_cycle_BERTLstm(2, train_df, validation_df, device, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1
-------------------------------
loss: 0.709174  [    0/ 6819]
loss: 0.336621  [  300/ 6819]
loss: 0.345218  [  600/ 6819]
loss: 0.471101  [  900/ 6819]
loss: 0.447193  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.237060 

Epoch 2
-------------------------------
loss: 0.185294  [    0/ 6819]
loss: 0.114724  [  300/ 6819]
loss: 0.145153  [  600/ 6819]
loss: 0.280575  [  900/ 6819]
loss: 0.030968  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.230080 



In [None]:
preds = make_prediction(eval_data, trained_model, tokenizer)
labels = eval_data['label'].tolist()
display(confusion_matrix(labels, preds))
print(f"F1 Score: {f1_score(labels, preds)}")
print(f"Recall Score: {recall_score(labels, preds)}")
print(f"Precision Score: {precision_score(labels, preds)}")
print(f"Accuracy Score: {accuracy_score(labels, preds)}")



array([[715,  31],
       [ 17,  70]])

F1 Score: 0.7446808510638299
Recall Score: 0.8045977011494253
Precision Score: 0.693069306930693
Accuracy Score: 0.9423769507803121


In [None]:
torch.save(trained_model.state_dict(), f"{root_dir}/Data/BertWithLSTM.pt")

In [None]:
class BertForClassificationBiLSTM(nn.Module):
    def __init__(self):
        super(BertForClassificationBiLSTM, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.LSTM = nn.LSTM(input_size = 768, hidden_size = 256, batch_first = True, bidirectional = True)
        self.classifier = nn.Linear(512, 1)
         
    def forward(self, input_ids, attention_mask):
        X = self.bert(input_ids, attention_mask, return_dict=False)[0]
        X = self.dropout(X)
        X, _ = self.LSTM(X)
        X = self.classifier(X[:,-1,:])
        return X

    def extract_attn_weights(self, input_ids, attention_mask):
        attentions = self.bert(input_ids, attention_mask, output_attentions = True)['attentions ']
        return attentions

def full_train_cycle_BERTBiLstm(num_epochs, train_df, validation_df, device, tokenizer):
    model = BertForClassificationBiLSTM()
    model.to(device)
    train_dataloader, test_dataloader = generate_train_val_dataloaders(train_df, validation_df, tokenizer)
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-5, steps_per_epoch=len(train_dataloader), epochs=num_epochs, pct_start = 0.1, anneal_strategy  = 'linear')
    for t in range(num_epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loop(train_dataloader, loss_fn, optimizer, scheduler, model, device)
        eval_loop(test_dataloader, loss_fn, model, device)
    return model 

In [None]:
train_df, validation_df = train_test_split(train_data, random_state = 42)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model = full_train_cycle_BERTBiLstm(2, train_df, validation_df, device, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1
-------------------------------
loss: 0.684836  [    0/ 6819]
loss: 0.479770  [  300/ 6819]
loss: 0.346597  [  600/ 6819]
loss: 0.236227  [  900/ 6819]
loss: 0.188925  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.235080 

Epoch 2
-------------------------------
loss: 0.118277  [    0/ 6819]
loss: 0.067105  [  300/ 6819]
loss: 0.067803  [  600/ 6819]
loss: 0.043720  [  900/ 6819]
loss: 0.438511  [ 1200/ 6819]
Test Error: 
 Accuracy: 0.0%, Avg loss: 0.236919 



In [None]:
preds = make_prediction(eval_data, trained_model, tokenizer)
labels = eval_data['label'].tolist()
display(confusion_matrix(labels, preds))
print(f"F1 Score: {f1_score(labels, preds)}")
print(f"Recall Score: {recall_score(labels, preds)}")
print(f"Precision Score: {precision_score(labels, preds)}")
print(f"Accuracy Score: {accuracy_score(labels, preds)}")



array([[717,  29],
       [ 17,  70]])

F1 Score: 0.7526881720430108
Recall Score: 0.8045977011494253
Precision Score: 0.7070707070707071
Accuracy Score: 0.9447779111644657


In [None]:
torch.save(trained_model.state_dict(), f"{root_dir}/Data/BertWithBiLSTM.pt")

### Extracting attention weights from the BERT Model

In [None]:
#Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

def get_attention_weights(model, text, tokenizer):
    model.eval()
    tokenized = tokenizer(text, padding='max_length', max_length = 256, truncation=True, return_tensors="pt")
    input_ids, attention_mask = tokenized['input_ids'], tokenized['attention_mask']
    attentions = model.bert(input_ids, attention_mask, output_attentions = True)
    return attentions

a = get_attention_weights(trained_model, eval_data['sentence'][0], tokenizer)['attentions']