## Text Classification on News Articles (GloVe)

The following notebook explains the methods and model used to develop the final version of the text classifier built based off of GloVe.

### Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import pandas as pd
import numpy as np
from txtclassifier_glove import *

### Load data

In [2]:
# Using a saved version of pre-processed file. 
df = pd.read_csv("../data/processed_df.csv", header=0).drop('Unnamed: 0', axis=1)
X = df['text']
y = df['category']
num_classes = len(y.unique())

### Loading glove word embeddings

The following code-snippet loads a pre-trained GloVe word embedding file, reads its contents, and creates a dictionary with each word as a key and its corresponding 100-dimensional vector as a value. The resulting `embedding_dict` can be used to look up the vector representation of any word in the GloVe embeddings file.

In [3]:
# Load GloVe embeddings
embedding_dict = {}
with open('./data/glove/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector

### Creating word to index mappings

The following code preprocesses text data, creates a word-to-index mapping for each unique word in the texts, and calculates the vocabulary size by counting the number of unique words in the **word-to-index** mapping. The resulting word_to_idx dictionary can be used to look up the index of each word in the vocabulary. The vocab_size variable holds the total number of unique words in the vocabulary.

In [4]:
# create word-to-index mapping
max_seq_len = 100
word_to_idx = {}
idx = 1
for text in X:
    words = text.split()
    for word in words:
        if word not in word_to_idx and word in embedding_dict:
            word_to_idx[word] = idx
            idx += 1
            
vocab_size = len(word_to_idx) + 1

### Splitting the data and loading the data

The below code converts the text data into sequences of word indices and one-hot encoded labels for the training, validation, and test sets. The resulting sequences of indices are truncated or padded with zeros to match the max_seq_len variable. The data is then converted to PyTorch tensors of appropriate data types.

In [5]:
df_train, df_val, df_test = data_split(df)
train_texts = df_train['text'].astype(str).tolist()
train_labels = df_train['category'].tolist()
val_texts = df_val['text'].astype(str).tolist()
val_labels = df_val['category'].tolist()
test_texts = df_test['text'].astype(str).tolist()
test_labels = df_test['category'].tolist()

train_texts = [[word_to_idx[word] for word in text.split() if word in word_to_idx][:max_seq_len] for text in train_texts]
train_texts = torch.tensor([xi + [0]*(max_seq_len - len(xi)) for xi in train_texts], dtype=torch.long)
train_labels = torch.tensor(pd.get_dummies(train_labels).values, dtype=torch.float32)

val_texts = [[word_to_idx[word] for word in text.split() if word in word_to_idx][:max_seq_len] for text in val_texts]
val_texts = torch.tensor([xi + [0]*(max_seq_len - len(xi)) for xi in val_texts], dtype=torch.long)
val_labels = torch.tensor(pd.get_dummies(val_labels).values, dtype=torch.float32)

test_texts = [[word_to_idx[word] for word in text.split() if word in word_to_idx][:max_seq_len] for text in test_texts]
test_texts = torch.tensor([xi + [0]*(max_seq_len - len(xi)) for xi in test_texts], dtype=torch.long)
test_labels = torch.tensor(pd.get_dummies(test_labels).values, dtype=torch.float32)

In [6]:
train_dataset = data_utils.TensorDataset(train_texts, train_labels)
val_dataset = data_utils.TensorDataset(val_texts, val_labels)
test_dataset = data_utils.TensorDataset(test_texts, test_labels)

batch_size = 32
train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = data_utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

###  Building different Models

### Simple CNN Model

In [7]:
class TextCNN(nn.Module):
    def __init__(self, embedding_dict, num_classes, max_seq_len):
        super(TextCNN, self).__init__()
        embedding_dim = 100
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(np.random.normal(0, 1, (vocab_size, embedding_dim))))
        for word, idx in word_to_idx.items():
            if word in embedding_dict:
                self.embedding.weight.data[idx] = torch.from_numpy(embedding_dict[word])
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=3)
        self.maxpool = nn.MaxPool1d(max_seq_len - 3 + 1)
        self.fc = nn.Linear(100, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = nn.functional.relu(self.conv1(x))
        x = self.maxpool(x)
        x = x.view(-1, 100)
        x = self.fc(x)
        return x

### Enhanced CNN model

In [8]:
class TextCNN_enhanced(nn.Module):
    def __init__(self, embedding_dict, num_classes, max_seq_len):
        super(TextCNN_enhanced, self).__init__()
        embedding_dim = 100
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(np.random.normal(0, 1, (vocab_size, embedding_dim))))
        for word, idx in word_to_idx.items():
            if word in embedding_dict:
                self.embedding.weight.data[idx] = torch.from_numpy(embedding_dict[word])
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=4)
        self.conv3 = nn.Conv1d(in_channels=embedding_dim, out_channels=100, kernel_size=5)
        self.maxpool = nn.MaxPool1d(max_seq_len - 3 - 4 - 5 + 3 + 1 + 1)
        self.fc = nn.Linear(300, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x1 = nn.functional.relu(self.conv1(x))
        x2 = nn.functional.relu(self.conv2(x))
        x3 = nn.functional.relu(self.conv3(x))
        x1 = self.maxpool(x1)
        x2 = self.maxpool(x2)
        x3 = self.maxpool(x3)
        x = torch.cat((x1, x2, x3), dim=1)
        x = x.view(-1, 300)
        x = self.fc(x)
        return x

### Training the model

In [9]:
def train(model, optimizer, loss_fn, train_loader, val_loader, num_epochs, save_every_n_epochs):
    train_losses = []
    val_losses = []

    best_model = None
    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_loss = total_loss / len(train_loader)
        train_losses.append(train_loss)
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        if val_loss < best_val_loss:
            best_model = model
            best_val_loss = val_loss

        if epoch % save_every_n_epochs == 0:
            # Save the model
            if type(model).__name__ == 'TextCNN_enhanced':
                best_model_path = f'../data/glove/glove_cnn_enhance_{epoch}.pth'
            elif type(model).__name__ == 'TextCNN':
                best_model_path = f'../data/glove/glove_cnn_{epoch}.pth'
            if best_model is not None:
                torch.save(best_model.state_dict(), best_model_path)

        print(f'Epoch {epoch}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}')

    return best_model, train_losses, val_losses

In [11]:
num_epochs = 5
learning_rate = 0.001

cnn_model = TextCNN(embedding_dict, num_classes = num_classes, max_seq_len = max_seq_len)
optimizer = torch.optim.Adam(cnn_model.parameters(), lr = learning_rate)

# Takes about 30 mins for a complete run
print("Training CNN model")
train(cnn_model, optimizer, nn.CrossEntropyLoss(), train_loader, val_loader, num_epochs,2)

Training CNN model
Epoch 0: train loss = 1.1170, val loss = 0.8215
Epoch 1: train loss = 0.8227, val loss = 0.6468
Epoch 2: train loss = 0.6557, val loss = 0.4648
Epoch 3: train loss = 0.4999, val loss = 0.3177
Epoch 4: train loss = 0.3579, val loss = 0.2135


(TextCNN(
   (embedding): Embedding(59959, 100)
   (conv1): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
   (maxpool): MaxPool1d(kernel_size=98, stride=98, padding=0, dilation=1, ceil_mode=False)
   (fc): Linear(in_features=100, out_features=15, bias=True)
 ),
 [1.1169984333701108,
  0.8227214720292825,
  0.655671860916362,
  0.4999406697196955,
  0.357872627222395],
 [0.821484368431652,
  0.6467710630609864,
  0.4647854510671623,
  0.31771604419155314,
  0.21345315998235936])

In [12]:
num_epochs = 3
learning_rate = 0.001

enhanced_cnn_model = TextCNN_enhanced(embedding_dict, num_classes = num_classes, max_seq_len = max_seq_len)
optimizer = torch.optim.Adam(enhanced_cnn_model.parameters(), lr = learning_rate)

# Takes about 30-45 mins for a complete run
print("Training Enhanced TextCNN model")
train(enhanced_cnn_model, optimizer, nn.CrossEntropyLoss(), train_loader, val_loader, num_epochs,2)

Training Enhanced TextCNN model
Epoch 0: train loss = 1.0784, val loss = 0.7910
Epoch 1: train loss = 0.7841, val loss = 0.5609
Epoch 2: train loss = 0.5899, val loss = 0.3664


(TextCNN_enhanced(
   (embedding): Embedding(59959, 100)
   (conv1): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
   (conv2): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
   (conv3): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
   (maxpool): MaxPool1d(kernel_size=93, stride=93, padding=0, dilation=1, ceil_mode=False)
   (fc): Linear(in_features=300, out_features=15, bias=True)
 ),
 [1.0783945596940665, 0.7840815012150182, 0.5899295413764144],
 [0.7910271767476003, 0.5608900474414482, 0.36639281172254484])

### Testing the model

In [16]:
def evaluate_model(test_model, test_loader):
    
    # Set the model to evaluation mode
    test_model.eval()

    # Calculate the accuracy on the test set
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        predicted_labels = []
        for X_batch, y_batch in test_loader:
            y_pred = test_model(X_batch)
            y_pred = y_pred.argmax(dim=1)
            predicted_labels.extend(y_pred.tolist())

        true_labels = []
        for X_batch, y_batch in test_loader:
            true_labels.extend(y_batch.tolist())

        true_labels = [torch.argmax(torch.tensor(batch_labels)) for batch_labels in true_labels]
        true_labels = torch.tensor(true_labels, dtype=torch.int64)
        predicted_labels = torch.tensor(predicted_labels)

        class_counts = torch.bincount(true_labels)
        correct_counts = torch.bincount(true_labels[predicted_labels == true_labels], minlength=len(class_counts))

        accuracy = float(correct_counts.sum()) / float(class_counts.sum())
        precision = float(correct_counts[1]) / float(class_counts[1])
        recall = float(correct_counts[1]) / float(class_counts[1] + class_counts[0])
        f1_score = 2 * (precision * recall) / (precision + recall)

    return accuracy, precision, recall, f1_score

### Evaluation Results 

#### Simple CNN model

In [17]:
test_model = TextCNN(embedding_dict, num_classes = num_classes, max_seq_len = max_seq_len)

# Load the saved model state dict
state_dict = torch.load('../data/glove/glove_cnn_4.pth')

# Load the state dict into the model
test_model.load_state_dict(state_dict)

# Set the model to evaluation mode
test_model.eval()

accuracy, precision, recall, f1_score = evaluate_model(test_model, test_loader)

print("Evaluation Results:\n"
      "Accuracy: {:.4f}\n"
      "Precision: {:.4f}\n"
      "Recall: {:.4f}\n"
      "F1 Score: {:.4f}\n".format(accuracy, precision, recall, f1_score))

Evaluation Results:
Accuracy: 0.6756
Precision: 0.5743
Recall: 0.4479
F1 Score: 0.5033



#### Enhanced CNN model

In [18]:
test_model = TextCNN_enhanced(embedding_dict, num_classes = num_classes, max_seq_len = max_seq_len)

# Load the saved model state dict
state_dict = torch.load('../data/glove/glove_cnn_enhance_2.pth')

# Load the state dict into the model
test_model.load_state_dict(state_dict)

# Set the model to evaluation mode
test_model.eval()

accuracy, precision, recall, f1_score = evaluate_model(test_model, test_loader)

print("Evaluation Results:\n"
      "Accuracy: {:.4f}\n"
      "Precision: {:.4f}\n"
      "Recall: {:.4f}\n"
      "F1 Score: {:.4f}\n".format(accuracy, precision, recall, f1_score))

Evaluation Results:
Accuracy: 0.7006
Precision: 0.5970
Recall: 0.4656
F1 Score: 0.5232

