## DATA 255- Lab 2- Part 2 - Modeling PyTorch  (Follow up from the Pre-processing File)

In [1]:
import pandas as pd

import numpy as np

import re

from collections import Counter

In [2]:
labels = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 

          'insult', 'identity_attack', 'sexual_explicit']

**Importing the pre-processed data saved as pickle files**

In [3]:
import pickle
with open('/kaggle/input/toxic-class-stemmed/cleaned_data_stemmed.pkl', 'rb') as f:
   train_data = pickle.load(f)

In [4]:
with open('/kaggle/input/toxic-class-stemmed/cleaned_testdata_stemmed.pkl', 'rb') as f:
    test_data = pickle.load(f)

In [5]:
train_data[['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']].isnull().sum()

toxicity           0
severe_toxicity    0
obscene            0
threat             0
insult             0
identity_attack    0
sexual_explicit    0
dtype: int64

**Setting up the target variable**

In [6]:
y = train_data[['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']].to_numpy()

In [7]:
y

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.6212121 , 0.03030303, 0.03030303, ..., 0.6212121 , 0.04545455,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

**Setting up the parameters for tokenization and modeling**

In [8]:
max_features=100000      

maxpadlen = 200

embedding_dim = 300

In [9]:
processed_train_data = train_data['text'].tolist()

processed_test_data = test_data['text'].tolist()

**Performing Tokenization**

In [10]:
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

In [11]:
tokenizer = Tokenizer(num_words=max_features)

tokenizer.fit_on_texts(list(processed_train_data))

list_tokenized_train = tokenizer.texts_to_sequences(processed_train_data)

list_tokenized_test = tokenizer.texts_to_sequences(processed_test_data)

In [12]:
word_index=tokenizer.word_index

print("Words in Vocabulary: ",len(word_index))

Words in Vocabulary:  494342


**Ensuring all input sequences have the same length by padding shorter sequences and truncating longer ones**

In [13]:
X_t=pad_sequences(list_tokenized_train, maxlen=maxpadlen, padding = 'post')

X_te=pad_sequences(list_tokenized_test, maxlen=maxpadlen, padding = 'post')

In [14]:
print('Tokenized sentences: \n', X_t[10])

print('One hot label: \n', y[10])

Tokenized sentences: 
 [871   3 416 179   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]
One hot label: 
 [0. 0. 0. 0. 0. 0. 0.]


In [15]:
indices = np.arange(X_t.shape[0])

np.random.shuffle(indices)

In [16]:
X_t = X_t[indices]

labels = y[indices]

**Splitting the train set into train and validation set**

In [17]:
num_validation_samples = int(0.1*X_t.shape[0])

x_train = X_t[: -num_validation_samples]

y_train = labels[: -num_validation_samples]

x_val = X_t[-num_validation_samples: ]

y_val = labels[-num_validation_samples: ]

In [18]:
print('Number of entries in each category:')

print('training: ', y_train.sum(axis=0))

print('validation: ', y_val.sum(axis=0))

Number of entries in each category:
training:  [167287.07053601   7459.65449701  22524.66422764  15152.66360899
 131780.33927305  36696.50232994  10711.73198483]
validation:  [18646.16372101   810.45652608  2521.94994614  1653.00805135
 14690.12093377  4158.10711     1211.21806111]


**Converting training and validation data to PyTorch tensors**

In [19]:
import torch
import torch.nn as nn

x_train_tensor = torch.tensor(x_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float)
x_val_tensor = torch.tensor(x_val, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.float)

**Creating TensorDataset objects**

In [20]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

**Creating Data Loaders**

In [21]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

**Fast Text Embeddings**

In [23]:
embeddings_index_fasttext = {}

f = open('/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec', encoding='utf8')

for line in f:

    values = line.split()

    word = values[0]

    embeddings_index_fasttext[word] = np.asarray(values[1:], dtype='float32')

f.close()

In [24]:
embedding_matrix_fasttext = np.random.random((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():

    embedding_vector = embeddings_index_fasttext.get(word)

    if embedding_vector is not None:

        embedding_matrix_fasttext[i] = embedding_vector

print(" Completed!")

 Completed!


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gensim.downloader as api
from sklearn.metrics import accuracy_score, classification_report

**BiLSTM Model Architecure**

In [26]:
class BiLSTMMultiLabelModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_units, num_classes, 
                 embedding_weights=None, dropout_rate=0.1):
        super(BiLSTMMultiLabelModel, self).__init__()

        if embedding_weights is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_weights), 
                padding_idx=0, 
                freeze=False
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.bilstm = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=lstm_units, 
            num_layers=1, 
            bidirectional=True,
            batch_first=True
        )

        lstm_output_dim = lstm_units * 2
        

        self.classifier = nn.Sequential(
            # Global Max Pooling is handled in forward method
            nn.Linear(lstm_output_dim, 75),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(75, num_classes),
            nn.Sigmoid()  #for multi-label probability output
        )
        
    def forward(self, x):
        embedded = self.embedding(x)

        lstm_out, _ = self.bilstm(embedded)

        if self.bilstm.bidirectional:
            # Split forward and backward outputs
            forward = lstm_out[:, :, :self.bilstm.hidden_size]
            backward = lstm_out[:, :, self.bilstm.hidden_size:]
            
            # Concatenate forward and backward
            lstm_out = torch.cat((forward, backward), dim=-1)
        
        # Global Max Pooling
        pooled, _ = torch.max(lstm_out, dim=1)

        output = self.classifier(pooled)
        
        return output

**Model Training and Validation Phase**

In [42]:
import tqdm
import torch
import torch.nn.functional as F

def train_model(model, train_loader, val_loader, criterion, optimizer, 
                device, num_epochs=10):
    training_metrics = {
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        with tqdm.tqdm(train_loader, desc=f"Epoch {epoch + 1} Train", leave=False) as tepoch:
            for batch_x, batch_y in tepoch:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                
                optimizer.zero_grad()

                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)  # BCE loss
                
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()

                predicted = (torch.sigmoid(outputs) > 0.5).float()

                train_correct += (predicted == batch_y).float().sum().item()
                train_total += batch_y.numel()

                tepoch.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Acc': f'{(train_correct/train_total)*100:.2f}%'
                })

        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            with tqdm.tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation", leave=False) as tepoch:
                for batch_x, batch_y in tepoch:
                    batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                    
                    # Forward pass
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)  
                    
                    val_loss += loss.item()
s
                    predicted = (torch.sigmoid(outputs) > 0.5).float()

                    val_correct += (predicted == batch_y).float().sum().item()
                    val_total += batch_y.numel()

                    tepoch.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Acc': f'{(val_correct/val_total)*100:.2f}%'
                    })
        
        train_epoch_loss = train_loss / len(train_loader)
        train_epoch_accuracy = 100 * train_correct / train_total
        val_epoch_loss = val_loss / len(val_loader)
        val_epoch_accuracy = 100 * val_correct / val_total

        training_metrics['train_loss'].append(train_epoch_loss)
        training_metrics['train_accuracy'].append(train_epoch_accuracy)
        training_metrics['val_loss'].append(val_epoch_loss)
        training_metrics['val_accuracy'].append(val_epoch_accuracy)

        print(f"Epoch {epoch + 1}: Train Loss: {train_epoch_loss:.4f}, "
                   f"Train Acc: {train_epoch_accuracy:.2f}%, "
                   f"Val Loss: {val_epoch_loss:.4f}, "
                   f"Val Acc: {val_epoch_accuracy:.2f}%")
    
    return training_metrics

**Defining the parameters for Model Training**

In [29]:
lstm_units = 100  
num_classes = 7  
batch_size = 64
learning_rate = 0.001
num_epochs = 10

**Device Mode. Cuda if available**

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


**Model Compilation**

In [31]:
model = BiLSTMMultiLabelModel(
        vocab_size=len(word_index) + 1,
        embedding_dim=embedding_dim, 
        lstm_units=lstm_units, 
        num_classes=num_classes,
        embedding_weights=embedding_matrix_fasttext,
        dropout_rate=0.1
    ).to(device)

**Defining optimizer and Loss function**

In [40]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

**Model Training Starts**

In [43]:
training_metrics = train_model(
        model, train_loader, val_loader, 
        criterion, optimizer, device, 
        num_epochs=num_epochs
    )

                                                                                                 

Epoch 1: Train Loss: 0.7667, Train Acc: 76.68%, Val Loss: 0.7667, Val Acc: 76.94%


                                                                                                 

Epoch 2: Train Loss: 0.7364, Train Acc: 80.11%, Val Loss: 0.6925, Val Acc: 86.69%


                                                                                                 

Epoch 3: Train Loss: 0.6924, Train Acc: 86.22%, Val Loss: 0.6925, Val Acc: 86.67%


                                                                                                 

Epoch 4: Train Loss: 0.6924, Train Acc: 86.67%, Val Loss: 0.6925, Val Acc: 86.89%


                                                                                                 

Epoch 5: Train Loss: 0.6923, Train Acc: 86.73%, Val Loss: 0.6925, Val Acc: 86.75%


Epoch 6 Train:  70%|███████   | 17800/25382 [11:27<04:53, 25.84it/s, Loss=0.6931, Acc=86.77%]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 6 Train:  90%|█████████ | 22867/25382 [14:43<01:36, 25.97it/s, Loss=0.6932, Acc=86.78%]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

                                                                                                 

Epoch 6: Train Loss: 0.6923, Train Acc: 86.79%, Val Loss: 0.6925, Val Acc: 86.89%


Epoch 7 Train:   7%|▋         | 1879/25382 [01:12<15:08, 25.87it/s, Loss=0.6908, Acc=86.86%]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 7 Train:  29%|██▉       | 7339/25382 [04:43<11:36, 25.92it/s, Loss=0.6920, Acc=86.78%]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 7 Train:  49%|████▊     | 12331/25382 [07:56<08:23, 25.93it/s, Loss=0.6922, Acc=86.80%]IOPub message rate exceeded.
The notebook server will temporarily stop se

Epoch 9: Train Loss: 0.6922, Train Acc: 86.87%, Val Loss: 0.6925, Val Acc: 86.88%


Epoch 10 Train:   0%|          | 112/25382 [00:04<16:35, 25.38it/s, Loss=0.6926, Acc=87.00%]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 10 Train:  29%|██▉       | 7477/25382 [04:52<11:39, 25.60it/s, Loss=0.6904, Acc=86.86%]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Epoch 10 Train:  53%|█████▎    | 13492/25382 [08:48<07:46, 25.50it/s, Loss=0.6933, Acc=86.88%]IOPub message rate exceeded.
The notebook server will temporarily stop 

We see that for both training and validation the accuracy has increased with the epochs. The highest increase we observed till epoch 3, and then it increased with minor updates.

**Saving the model**

In [44]:
torch.save(model, 'model_full.pth')

**Converting Test Dataset to pytorch tensors, Dataset creation and Creating a Data Loader. Preparing Test set for evaluation**

In [52]:
x_test_tensor = torch.tensor(X_te, dtype=torch.long)
test_dataset = TensorDataset(x_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

**Evaluating Trained Model on Test Set**

In [62]:
model.eval()
predictions = []
probabilities = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch[0].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        predictions.extend(preds.tolist())
        probabilities.extend(outputs.tolist())

In [63]:
labels = ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']
df = pd.DataFrame(probabilities, columns=labels)

**converting probabilities to binary labels**

In [65]:
threshold = 0.5
binary_df = df.applymap(lambda x: 1 if x > threshold else 0)

print(binary_df)

       toxicity  severe_toxicity  obscene  threat  insult  identity_attack  \
0             0                0        0       0       0                0   
1             0                0        0       0       0                0   
2             0                0        0       0       0                0   
3             0                0        0       0       0                0   
4             0                0        0       0       0                0   
...         ...              ...      ...     ...     ...              ...   
97315         0                0        0       0       0                0   
97316         0                0        0       0       0                0   
97317         0                0        0       0       0                0   
97318         0                0        0       0       0                0   
97319         0                0        0       0       0                0   

       sexual_explicit  
0                    0  
1            

  binary_df = df.applymap(lambda x: 1 if x > threshold else 0)


In [68]:
test=pd.read_csv('/kaggle/input/toxicity-classification/test.csv')

In [69]:
test.shape

(97320, 2)

In [70]:
binary_df.shape

(97320, 7)

In [71]:
test.reset_index(drop=True, inplace=True)
binary_df.reset_index(drop=True, inplace=True)

final_df = pd.concat([test, binary_df], axis=1)

print(final_df.head())

   id                                               text  toxicity  \
0   0  [ Integrity means that you pay your debts.]\n\...         0   
1   1  This is malfeasance by the Administrator and t...         0   
2   2  @Rmiller101 - Spoken like a true elitist. But ...         0   
3   3  Paul: Thank you for your kind words.  I do, in...         0   
4   4  Sorry you missed high school. Eisenhower sent ...         0   

   severe_toxicity  obscene  threat  insult  identity_attack  sexual_explicit  
0                0        0       0       0                0                0  
1                0        0       0       0                0                0  
2                0        0       0       0                0                0  
3                0        0       0       0                0                0  
4                0        0       0       0                0                0  


In [72]:
display(final_df.head())

Unnamed: 0,id,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,0,[ Integrity means that you pay your debts.]\n\...,0,0,0,0,0,0,0
1,1,This is malfeasance by the Administrator and t...,0,0,0,0,0,0,0
2,2,@Rmiller101 - Spoken like a true elitist. But ...,0,0,0,0,0,0,0
3,3,"Paul: Thank you for your kind words. I do, in...",0,0,0,0,0,0,0
4,4,Sorry you missed high school. Eisenhower sent ...,0,0,0,0,0,0,0


In [73]:
final_df = final_df.drop(columns='text')
display(final_df.head())

Unnamed: 0,id,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0


In [74]:
final_df.to_csv('predictions_lstm_5.csv')

**With this model training, we recieved 95.73 accuracy on the available dataset on Kaggle public leaderboard.**

## Thank you