In [None]:
# Pre-trained FastText model required by this notebook (download and unzip once):
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# !gunzip cc.en.300.bin.gz


Importing all libraries

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import json
import os, sys, optparse, gzip, re, logging, string
import json
import pandas as pd
import numpy as np
import torch 
from sklearn.utils import shuffle
from torch.optim import Adam
from torch import nn
import torch
import torch.nn as nn
import torch.optim as optim
import fasttext
import numpy as np
import nltk
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Loading dataset: the preprocessed data is used here.  
Loading the pre-trained FastText embeddings used to vectorize the text.

In [39]:
# Preprocessed dataset, read from the repository's data folder
processed_df = pd.read_csv("../data/processed_df.csv")

# Pre-trained FastText vectors; cc.en.300.bin is looked up in the working directory
fasttext_model = fasttext.load_model('cc.en.300.bin')

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')



The text_to_embedding function takes a string of text and a maximum length, and returns a numpy array of word embeddings for the text.

The function first tokenizes the text into individual words using the word_tokenize function from the nltk package. It then truncates the list of tokens to the maximum length specified.

Next, the function uses a pre-trained FastText model to obtain the word embedding for each token in the list. No vocabulary check is made: `get_word_vector` is called for every token, and FastText composes vectors for out-of-vocabulary tokens from their character n-grams, so such tokens are not replaced with zero vectors.

If the list of embeddings is shorter than the maximum length, the function pads the list with additional zero vectors to match the maximum length.

Finally, the function returns the list of embeddings as a numpy array.

In [8]:
def text_to_embedding(text, max_length):
    """Convert ``text`` into a fixed-size matrix of FastText word vectors.

    The text is tokenized with ``nltk.word_tokenize`` and cut to at most
    ``max_length`` tokens. Each kept token is looked up with
    ``fasttext_model.get_word_vector``; rows after the last token stay zero.

    Args:
        text: String to embed.
        max_length: Number of rows in the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape
        ``(max_length, fasttext_model.get_dimension())`` and dtype ``float32``.
    """
    tokens = nltk.word_tokenize(text)[:max_length]  # Truncate to max_length

    # Pre-allocate the padded matrix. Building a list and padding it with
    # np.zeros(...) (float64 by default) made the result dtype depend on
    # whether padding was needed, and gave shape (0,) when max_length was 0.
    embeddings = np.zeros((max_length, fasttext_model.get_dimension()), dtype=np.float32)
    for row, token in enumerate(tokens):
        embeddings[row] = fasttext_model.get_word_vector(token)
    return embeddings

Fasttext + CNN implementation

In [9]:
class FastTextCNN(nn.Module):
    """Multi-branch, two-stage CNN classifier over pre-computed word embeddings.

    One branch is created per entry of ``filter_sizes``. A branch applies a
    convolution covering ``fs`` tokens and the full embedding width, then a
    ``(fs, 1)`` convolution, and finally max-pools over the remaining sequence
    positions. The pooled branches are concatenated, passed through dropout and
    projected to ``num_classes`` logits.
    """

    def __init__(self, embedding_dim, num_filters, filter_sizes, num_classes, dropout_prob):
        super(FastTextCNN, self).__init__()

        # First stage: collapses the embedding axis
        first_stage = [
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(width, embedding_dim))
            for width in filter_sizes
        ]
        self.convs1 = nn.ModuleList(first_stage)

        # Second stage: convolves along the sequence axis only
        second_stage = [
            nn.Conv2d(in_channels=num_filters, out_channels=num_filters, kernel_size=(width, 1))
            for width in filter_sizes
        ]
        self.convs2 = nn.ModuleList(second_stage)

        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes) for ``x``."""
        # Add channel dimension (batch_size, 1, max_length, embedding_dim)
        images = x.unsqueeze(1)

        branch_features = []
        for first_conv, second_conv in zip(self.convs1, self.convs2):
            hidden = torch.relu(first_conv(images))
            hidden = torch.relu(second_conv(hidden)).squeeze(3)
            # Global max over whatever sequence length is left in this branch
            pooled = torch.max_pool1d(hidden, hidden.size(2)).squeeze(2)
            branch_features.append(pooled)

        merged = torch.cat(branch_features, 1)
        logits = self.fc(self.dropout(merged))
        return logits


Fasttext + LSTM implementation

In [10]:
class FastTextBiLSTM(nn.Module):
    """Bidirectional LSTM classifier over pre-computed word embeddings.

    The last layer's final forward and backward hidden states are concatenated,
    passed through dropout and projected to ``num_classes`` logits.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the final linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, num_classes, dropout):
        super(FastTextBiLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes).

        ``x`` is batch-first: (batch_size, sequence_length, embedding_dim).
        """
        # Allocate the zero initial states on the input's own device and dtype
        # instead of a notebook-global `device`, so the module works wherever
        # the input lives and outside this notebook.
        state_shape = (self.num_layers * 2, x.size(0), self.hidden_dim)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        _, (h_n, _) = self.lstm(x, (h0, c0))

        # Concatenate the hidden states of the forward and backward LSTM layers
        out = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        out = self.dropout(out)
        out = self.fc(out)

        return out

In [11]:
def data_split(processed_df, train=.75, test=.15):
    """Shuffle ``processed_df`` and split it into train / validation / test.

    Args:
        processed_df: DataFrame to split. It is not modified.
        train: Fraction of rows placed in the training split.
        test: Fraction of rows placed in the test split. The validation split
            receives the rows in between (``1 - train - test`` of the data).

    Returns:
        Tuple ``(df_train, df_val, df_test)``.
    """
    # Kept from the original implementation: resets NumPy's global seed.
    # The shuffle itself is fixed by random_state below.
    np.random.seed(111)

    # Use the argument rather than a notebook-global `df`
    shuffled = processed_df.sample(frac=1, random_state=40)
    n_rows = len(shuffled)
    train_end = int(train * n_rows)
    val_end = int((1 - test) * n_rows)

    # Positional slices give the same three pieces as np.split(..., [a, b])
    # without going through NumPy's DataFrame handling.
    df_train = shuffled.iloc[:train_end]
    df_val = shuffled.iloc[train_end:val_end]
    df_test = shuffled.iloc[val_end:]

    return df_train, df_val, df_test

Training and test functions for model training and evaluation

In [12]:
def training(model, train_loader, val_loader, criterion, optimizer, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), num_epochs=20, modeltype='CNN', save_dir='../data/fasttext'):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    mean validation loss improves, ``model.state_dict()`` is written to
    ``<save_dir>/fasttext_best_model_<modeltype>.pth``. One summary line is
    printed per epoch.

    Args:
        model: Module to train; it must already be on ``device``.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        val_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer built over ``model``'s parameters.
        device: Device each batch is moved to.
        num_epochs: Number of passes over ``train_loader``.
        modeltype: Tag inserted into the checkpoint file name.
        save_dir: Directory for the checkpoint; created if it does not exist.

    Returns:
        None.
    """
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, 'fasttext_best_model_{}.pth'.format(modeltype))

    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        # Training loop
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation loop
        model.eval()
        val_loss = 0
        val_correct = 0
        val_seen = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_predictions = torch.argmax(outputs, dim=1)
                # Count samples rather than averaging per-batch accuracies, so
                # a shorter final batch is not over-weighted.
                val_correct += (val_predictions == labels).sum().item()
                val_seen += labels.size(0)

        # Epoch-level averages
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_accuracy = val_correct / val_seen

        # Save the best model based on validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

        print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy * 100:.2f}%')


In [13]:

def test(model, test_loader, label_encoder=None, device=None):
    """Evaluate ``model`` on ``test_loader`` and print summary metrics.

    Prints the overall accuracy and the weighted precision / recall / F1.

    Args:
        model: Trained module to evaluate.
        test_loader: Iterable of ``(inputs, labels)`` batches whose labels are
            integer class indices. For the returned names to be meaningful the
            indices must follow ``label_encoder.classes_`` ordering.
        label_encoder: Fitted encoder exposing ``classes_``. Defaults to the
            notebook-level ``le``.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Tuple ``(true_labels_name, predicted_labels_name)``: two lists of class
        names, one entry per evaluated sample.
    """
    if label_encoder is None:
        label_encoder = le
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    true_labels = []
    predicted_labels = []
    correct = 0
    seen = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            test_predictions = torch.argmax(outputs, dim=1)
            # Count samples rather than averaging per-batch accuracies, so a
            # shorter final batch is not over-weighted.
            correct += (test_predictions == labels).sum().item()
            seen += labels.size(0)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(test_predictions.cpu().numpy())

    test_accuracy = correct / seen

    # Convert label indices to label names; classes_[i] is the name for index i
    classes = label_encoder.classes_
    true_labels_name = [classes[label_index] for label_index in true_labels]
    predicted_labels_name = [classes[label_index] for label_index in predicted_labels]
    precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')

    print(f'Test accuracy: {test_accuracy * 100:.2f}%')
    print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1-score: {f1_score:.2f}')

    return true_labels_name, predicted_labels_name


In [14]:
# `df` is a second name for the same DataFrame object as `processed_df` (no copy is made)
df=processed_df

Data Split

In [15]:

df_train, df_val, df_test = data_split(df)

# Raw texts and category labels for each split
train_texts = df_train['text'].astype(str).tolist()
train_labels = df_train['category'].tolist()
val_texts = df_val['text'].astype(str).tolist()
val_labels = df_val['category'].tolist()
test_texts = df_test['text'].astype(str).tolist()
test_labels = df_test['category'].tolist()

# Fit the label encoder on the full dataset. The encoded values are kept in
# `labels` and df['category'] is left untouched: `df` is the same object as
# `processed_df`, and overwriting the column made re-running this cell fit the
# encoder on integers instead of the category names.
le = LabelEncoder()
labels = le.fit_transform(df['category']).tolist()
texts = df['text'].tolist()

# Model hyper-parameters
embedding_dim = fasttext_model.get_dimension()
num_filters = 200
filter_sizes = [2, 3, 4]
num_classes = len(le.classes_)
print(num_classes)
max_length = 250
hidden_dim = 64
num_layers = 3

15


Dataloader

In [16]:
class DatasetLoader(Dataset):
    """Dataset pairing texts with labels; each text is embedded when it is accessed.

    Args:
        texts: Sequence of input texts.
        labels: Sequence of integer labels, aligned with ``texts``.
        text_to_embedding: Callable ``(text, max_length) -> array`` used to
            vectorize one text.
        max_length: Value forwarded to ``text_to_embedding``.
    """

    def __init__(self, texts, labels, text_to_embedding, max_length):
        self.texts, self.labels = texts, labels
        self.text_to_embedding = text_to_embedding
        self.max_length = max_length

    def __len__(self):
        """Number of samples (taken from ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(float32 embedding tensor, int64 label tensor)`` for sample ``idx``."""
        features = torch.tensor(
            self.text_to_embedding(self.texts[idx], self.max_length),
            dtype=torch.float32,
        )
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target


In [17]:
# Create a mapping of labels to integers.
# The mapping is taken from the fitted LabelEncoder: index i then means
# le.classes_[i], which is how test() turns predictions back into names.
# Enumerating set(train_labels) gave an arbitrary order that could change
# between kernel sessions, so indices stored in a saved checkpoint would not
# line up after a restart.
label_to_idx = {label: idx for idx, label in enumerate(le.classes_)}
train_labels_num = [label_to_idx[label] for label in train_labels]
val_labels_num = [label_to_idx[label] for label in val_labels]
test_labels_num = [label_to_idx[label] for label in test_labels]

train_dataset = DatasetLoader(train_texts, train_labels_num, text_to_embedding, max_length)
val_dataset = DatasetLoader(val_texts, val_labels_num, text_to_embedding, max_length)
test_dataset = DatasetLoader(test_texts, test_labels_num, text_to_embedding, max_length)

# Only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [18]:
# Loss, model and optimizer for the FastText + CNN run
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
model_cnn = FastTextCNN(
    embedding_dim=embedding_dim,
    num_filters=num_filters,
    filter_sizes=filter_sizes,
    num_classes=num_classes,
    dropout_prob=0.2,
).to(device)
optimizer = optim.Adam(model_cnn.parameters(), lr=learning_rate)

FASTTEXT+CNN Training

In [19]:
# Train the CNN for 5 epochs; modeltype='CNN' becomes part of the checkpoint file name written by training()
training(model_cnn, train_loader, val_loader, criterion, optimizer, device=device, num_epochs=5, modeltype='CNN')

Epoch 1/5, Training Loss: 1.2115, Validation Loss: 1.0172, Validation Accuracy: 68.70%
Epoch 2/5, Training Loss: 0.9819, Validation Loss: 0.9686, Validation Accuracy: 69.68%
Epoch 3/5, Training Loss: 0.8962, Validation Loss: 0.9737, Validation Accuracy: 69.64%
Epoch 4/5, Training Loss: 0.8181, Validation Loss: 0.9593, Validation Accuracy: 70.42%
Epoch 5/5, Training Loss: 0.7392, Validation Loss: 0.9818, Validation Accuracy: 70.42%


FASTTEXT+CNN Testing

In [20]:
# Evaluates the in-memory CNN (weights as left after the final epoch, not the saved best checkpoint)
true_labels_name_CNN, predicted_labels_name_CNN = test(model_cnn, test_loader)

Test accuracy: 70.30%
Precision: 0.70, Recall: 0.70, F1-score: 0.69


In [22]:
# Loss, model and optimizer for the FastText + BiLSTM run
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
model_lstm = FastTextBiLSTM(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
    dropout=0.2,
).to(device)
optimizer_lstm = torch.optim.Adam(model_lstm.parameters(), lr=learning_rate)

FASTTEXT+LSTM Training

In [23]:
# Train the BiLSTM for 5 epochs; modeltype='LSTM' becomes part of the checkpoint file name written by training()
training(model_lstm, train_loader, val_loader, criterion, optimizer_lstm, device=device, num_epochs=5, modeltype='LSTM')

Epoch 1/5, Training Loss: 1.4100, Validation Loss: 1.1280, Validation Accuracy: 65.54%
Epoch 2/5, Training Loss: 1.0595, Validation Loss: 0.9883, Validation Accuracy: 69.30%
Epoch 3/5, Training Loss: 0.9568, Validation Loss: 0.9229, Validation Accuracy: 71.27%
Epoch 4/5, Training Loss: 0.8851, Validation Loss: 0.8889, Validation Accuracy: 71.95%
Epoch 5/5, Training Loss: 0.8337, Validation Loss: 0.8670, Validation Accuracy: 72.59%


FASTTEXT+LSTM Testing

In [24]:
# Evaluates the in-memory BiLSTM (weights as left after the final epoch, not the saved best checkpoint)
true_labels_name_lstm, predicted_labels_name_lstm = test(model_lstm, test_loader)

Test accuracy: 72.45%
Precision: 0.72, Recall: 0.72, F1-score: 0.72


To run the best model for fasttext + CNN

In [34]:
saved_model_path = "../data/fasttext/fasttext_best_model_CNN.pth"
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
saved_model_state_dict = torch.load(saved_model_path, map_location=device)

# These must equal the values used when the checkpoint was trained;
# load_state_dict rejects tensors whose shapes differ.
num_filters = 200
filter_sizes = [2, 3, 4]
num_classes = 15
max_length = 250

# Model initialization
model = FastTextCNN(embedding_dim, num_filters, filter_sizes, num_classes, dropout_prob=0.2).to(device)

# Load the saved state dictionary into your model
model.load_state_dict(saved_model_state_dict)
true_labels_name_cnn, predicted_labels_name_cnn = test(model, test_loader)

Test accuracy: 70.66%
Precision: 0.71, Recall: 0.71, F1-score: 0.70


To run the best model for fasttext + LSTM

In [28]:
# Same directory training() writes its checkpoints to (the previous absolute
# Colab Drive path only existed in one environment).
saved_model_path = "../data/fasttext/fasttext_best_model_LSTM.pth"
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
saved_model_state_dict = torch.load(saved_model_path, map_location=device)

# These must equal the values used when the checkpoint was trained;
# load_state_dict rejects tensors whose shapes differ.
num_classes = 15
max_length = 250
hidden_dim = 64
num_layers = 3

# Model initialization
model = FastTextBiLSTM(embedding_dim, hidden_dim, num_layers, num_classes, dropout=0.2).to(device)
# Load the saved state dictionary into your model
model.load_state_dict(saved_model_state_dict)
true_labels_name_lstm, predicted_labels_name_lstm = test(model, test_loader)


Test accuracy: 72.45%
Precision: 0.72, Recall: 0.72, F1-score: 0.72
