## DATA 255 - Lab 2 - Part 2 - Modeling (Follow-up to the Pre-processing Notebook)

In [1]:
import pandas as pd

import numpy as np

import re

from collections import Counter

In [2]:
# Names of the seven target columns, in the order used throughout the notebook.
labels = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]

**Importing the pre-processed data saved as pickle files**

In [3]:
import pickle

# Load the pre-processed training frame produced by the pre-processing notebook.
# NOTE(review): pickle.load executes arbitrary code from the file; only use trusted inputs.
train_pickle_path = '/kaggle/input/toxic-class-stemmed/cleaned_data_stemmed.pkl'
with open(train_pickle_path, 'rb') as train_file:
    train_data = pickle.load(train_file)

In [4]:
# Load the pre-processed test frame (same pickle format as the training frame).
# NOTE(review): pickle.load executes arbitrary code from the file; only use trusted inputs.
test_pickle_path = '/kaggle/input/toxic-class-stemmed/cleaned_testdata_stemmed.pkl'
with open(test_pickle_path, 'rb') as test_file:
    test_data = pickle.load(test_file)

In [5]:
# Count missing values per target column before the label matrix is built.
target_columns = [
    'toxicity', 'severe_toxicity', 'obscene', 'threat',
    'insult', 'identity_attack', 'sexual_explicit',
]
train_data[target_columns].isna().sum()

toxicity           0
severe_toxicity    0
obscene            0
threat             0
insult             0
identity_attack    0
sexual_explicit    0
dtype: int64

**Setting up the target variable**

In [6]:
# Pull the seven target columns out of the training frame as one NumPy array
# (one row per comment, one column per target).
target_frame = train_data[
    ['toxicity', 'severe_toxicity', 'obscene', 'threat',
     'insult', 'identity_attack', 'sexual_explicit']
]
y = target_frame.to_numpy()

In [7]:
# Display the target array; NumPy abbreviates large arrays, so only the edges are shown.
y

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.6212121 , 0.03030303, 0.03030303, ..., 0.6212121 , 0.04545455,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

**Setting up the parameters for tokenization and modeling**

In [8]:
# Tokenization / model hyper-parameters.
max_features = 100_000   # passed to Tokenizer(num_words=...)
maxpadlen = 200          # fixed sequence length after padding/truncation
embedding_dim = 300      # width of each embedding vector

In [9]:
# Extract the 'text' column of each frame as a plain Python list for the tokenizer.
processed_train_data, processed_test_data = (
    frame['text'].tolist() for frame in (train_data, test_data)
)

In [10]:
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

**Performing Tokenization**

In [11]:
# Fit the vocabulary on the training texts only, then encode both splits with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(processed_train_data)

list_tokenized_train, list_tokenized_test = map(
    tokenizer.texts_to_sequences,
    (processed_train_data, processed_test_data),
)

In [12]:
# Full word -> index mapping learned by the tokenizer, and its size.
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print("Words in Vocabulary: ", vocabulary_size)

Words in Vocabulary:  494342


**Ensuring all input sequences have the same length by padding shorter sequences and truncating longer ones**

In [13]:
# Bring every sequence to exactly `maxpadlen` tokens, padding short ones at the end.
pad_options = dict(maxlen=maxpadlen, padding='post')
X_t = pad_sequences(list_tokenized_train, **pad_options)
X_te = pad_sequences(list_tokenized_test, **pad_options)

In [15]:
# Spot-check one padded sequence together with its target row.
sample_row = 10
print('Tokenized sentences: \n', X_t[sample_row])
print('One hot label: \n', y[sample_row])

Tokenized sentences: 
 [871   3 416 179   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
One hot label: 
 [0. 0. 0. 0. 0. 0. 0.]


In [16]:
# Random row order used to shuffle features and targets together before the
# train/validation split. A dedicated, seeded generator makes the split
# reproducible across "Restart & Run All" and leaves NumPy's global RNG untouched.
shuffle_rng = np.random.default_rng(42)
indices = shuffle_rng.permutation(X_t.shape[0])

In [17]:
# Apply the shuffled order in `indices` to the features and to the targets.
# NOTE(review): this cell is not idempotent. Running it a second time permutes
# X_t again while `labels` is rebuilt from the unshuffled `y`, so rows and
# targets would no longer line up. Run it exactly once after `indices` is built.
X_t = X_t[indices]

# NOTE(review): this rebinds `labels`, which earlier held the list of target
# column names; from here on it is the shuffled target array.
labels = y[indices]

**Splitting the train set into train and validation set**

In [18]:
# Hold out the last 10% of the (already shuffled) rows for validation.
validation_fraction = 0.1
num_validation_samples = int(validation_fraction * X_t.shape[0])

# Slice by an explicit split position instead of a negative offset: with fewer
# than 10 rows the validation count is 0, and `X_t[:-0]` would silently produce
# an EMPTY training set while `X_t[-0:]` would put every row into validation.
split_index = X_t.shape[0] - num_validation_samples

x_train, x_val = X_t[:split_index], X_t[split_index:]
y_train, y_val = labels[:split_index], labels[split_index:]

In [19]:
# Column-wise sum of the targets in each split, to compare their label mass.
print('Number of entries in each category:')
for split_name, split_targets in (('training: ', y_train), ('validation: ', y_val)):
    print(split_name, split_targets.sum(axis=0))

Number of entries in each category:
training:  [167338.58942525   7428.59912898  22532.20269267  15135.91872203
 131853.58218503  36765.44823257  10725.00810806]
validation:  [18594.64483177   841.51189412  2514.41148111  1669.75293831
 14616.87802179  4089.16120737  1197.94193788]


**Fast Text Embeddings**

In [20]:
# Build a word -> vector lookup from the pretrained fastText text file.
embeddings_index_fasttext = {}

# `with` guarantees the (very large) file is closed even if parsing raises.
with open('/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec', encoding='utf8') as f:
    for line in f:
        # Split from the right so exactly `embedding_dim` numeric fields are peeled
        # off; a token that itself contains whitespace stays intact in values[0].
        values = line.rstrip().rsplit(' ', embedding_dim)

        # Skip anything that is not "<word> + embedding_dim floats". This drops the
        # "<count> <dim>" header line that .vec files start with, which would
        # otherwise be stored as a bogus one-element vector under a numeric key.
        if len(values) != embedding_dim + 1:
            continue

        word = values[0]
        embeddings_index_fasttext[word] = np.asarray(values[1:], dtype='float32')

In [21]:
# Embedding matrix with one row per tokenizer index (plus one extra row).
# Every row starts as uniform random values in [0, 1); rows whose word has a
# pretrained fastText vector are then overwritten with that vector.
vocabulary_rows = len(word_index) + 1
embedding_matrix_fasttext = np.random.random((vocabulary_rows, embedding_dim))

for token, row in word_index.items():
    pretrained_vector = embeddings_index_fasttext.get(token)
    if pretrained_vector is None:
        # No pretrained vector for this word: keep its random initialisation.
        continue
    embedding_matrix_fasttext[row] = pretrained_vector

print(" Completed!")

 Completed!


In [22]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

from keras.layers import Bidirectional, GlobalMaxPool1D

from keras.models import Model

from keras.models import Sequential

from keras.layers import Conv1D, MaxPooling1D

from keras.layers import BatchNormalization

from keras import initializers, regularizers, constraints, optimizers, layers

**Model Building - BiLSTM**

In [23]:
# BiLSTM multi-label classifier:
# pretrained embeddings -> bidirectional LSTM -> max-pool over time -> dense head.
inp_text = Input(shape=(maxpadlen, ))

# The vocabulary size is read from the embedding matrix itself so the layer and
# its initial weights can never disagree. `input_length` is intentionally not
# passed: the sequence length is already fixed by `Input`, and newer Keras
# releases deprecate that argument.
x = Embedding(
    embedding_matrix_fasttext.shape[0],
    embedding_dim,
    weights=[embedding_matrix_fasttext],
)(inp_text)

# return_sequences=True keeps the per-timestep outputs so they can be max-pooled.
x = Bidirectional(LSTM(100, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)

x = Dense(75, activation="relu")(x)
x = Dropout(0.1)(x)

# One independent sigmoid output per target column.
x = Dense(7, activation="sigmoid")(x)

model = Model(inputs=[inp_text], outputs=x)

# NOTE(review): the targets are fractional scores rather than 0/1 flags, and how
# the 'accuracy' string is resolved for a 7-unit output depends on the Keras
# version. Treat the reported accuracy with caution and rely on val_loss.
# TODO confirm which accuracy variant is actually being computed.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



**Define the early stopping Criteria**

In [24]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the weights from its best epoch.
early_stopping_options = {
    'monitor': 'val_loss',
    'patience': 3,
    'restore_best_weights': True,
}
early_stopping = EarlyStopping(**early_stopping_options)

**Model Training**

In [25]:
# Train for at most 10 epochs; the early-stopping callback may end the run sooner.
fit_options = dict(
    epochs=10,
    batch_size=256,
    validation_data=(x_val, y_val),
    verbose=1,
    callbacks=[early_stopping],
)
history = model.fit(x_train, y_train, **fit_options)

Epoch 1/10




[1m6346/6346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m892s[0m 140ms/step - accuracy: 0.8010 - loss: 0.1030 - val_accuracy: 0.9249 - val_loss: 0.0924
Epoch 2/10
[1m6346/6346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m884s[0m 139ms/step - accuracy: 0.9185 - loss: 0.0922 - val_accuracy: 0.9406 - val_loss: 0.0919
Epoch 3/10
[1m6346/6346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m884s[0m 139ms/step - accuracy: 0.9254 - loss: 0.0905 - val_accuracy: 0.9400 - val_loss: 0.0920
Epoch 4/10
[1m6346/6346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m890s[0m 140ms/step - accuracy: 0.9258 - loss: 0.0887 - val_accuracy: 0.9538 - val_loss: 0.0926
Epoch 5/10
[1m6346/6346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m890s[0m 140ms/step - accuracy: 0.9245 - loss: 0.0872 - val_accuracy: 0.9474 - val_loss: 0.0933


In [26]:
# Alias the padded test sequences under the name used by the prediction cell.
X_test = X_te

**Getting predictions for the padded test-set sequences**

In [27]:
# Score the test sequences, then turn each probability into a 0/1 flag.
decision_threshold = 0.5
predictions = model.predict(X_test, batch_size=512)

# A probability at or above the threshold counts as a positive label.
binary_predictions = np.greater_equal(predictions, decision_threshold).astype(int)

print(binary_predictions)

[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 41ms/step
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


**Getting actual text of the test set**

In [28]:
# Load the competition test CSV; the prediction columns are attached to this frame.
# NOTE(review): predictions are matched to these rows purely by position, which
# presumes the pickled test data kept the same row order -- confirm.
test=pd.read_csv('/kaggle/input/toxicity-classification/test.csv')

In [29]:
# (rows, columns) of the test frame; the row count should equal the number of prediction rows.
test.shape

(97320, 2)

In [30]:
# (rows, label columns) of the thresholded prediction matrix.
binary_predictions.shape

(97320, 7)

In [31]:
# Append one column per model output; columns are numbered in output order.
for label_position, label_flags in enumerate(binary_predictions.T):
    test[f'prediction_label_{label_position}'] = label_flags

In [32]:
# Preview the frame with the numbered prediction columns attached.
test.head()

Unnamed: 0,id,text,prediction_label_0,prediction_label_1,prediction_label_2,prediction_label_3,prediction_label_4,prediction_label_5,prediction_label_6
0,0,[ Integrity means that you pay your debts.]\n\...,0,0,0,0,0,0,0
1,1,This is malfeasance by the Administrator and t...,0,0,0,0,0,0,0
2,2,@Rmiller101 - Spoken like a true elitist. But ...,0,0,0,0,0,0,0
3,3,"Paul: Thank you for your kind words. I do, in...",0,0,0,0,0,0,0
4,4,Sorry you missed high school. Eisenhower sent ...,0,0,0,0,0,0,0


In [33]:
# Replace the numbered prediction column names with the target names.
# The assignment is positional, so the list must cover every column in order.
identifier_columns = ['id', 'text']
prediction_columns = [
    'toxicity', 'severe_toxicity', 'obscene', 'threat',
    'insult', 'identity_attack', 'sexual_explicit',
]
test.columns = identifier_columns + prediction_columns

In [34]:
# Preview the frame again after the prediction columns were renamed.
test.head()

Unnamed: 0,id,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,0,[ Integrity means that you pay your debts.]\n\...,0,0,0,0,0,0,0
1,1,This is malfeasance by the Administrator and t...,0,0,0,0,0,0,0
2,2,@Rmiller101 - Spoken like a true elitist. But ...,0,0,0,0,0,0,0
3,3,"Paul: Thank you for your kind words. I do, in...",0,0,0,0,0,0,0
4,4,Sorry you missed high school. Eisenhower sent ...,0,0,0,0,0,0,0


In [35]:
# Write the labelled test frame to disk (presumably the file that is submitted).
# NOTE(review): to_csv also writes the DataFrame index as an extra unnamed first
# column by default; pass index=False if only the named columns are wanted -- confirm.
test.to_csv('predictions_lstm5.csv')

**With this trained model, we received an accuracy of 97.73 on the Kaggle public leaderboard for the available dataset.**

## Thank you