In [1]:
from collections import Counter
import nltk, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

In [3]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\plang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
# Import data and labels
with open(r"Data/words.json", 'r') as f1:
    words = json.load(f1)
with open(r"Data/text.json", 'r') as f2:
    text = json.load(f2)
labels = np.load(r'Data/labels.npy')

In [3]:
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

# Looking up the mapping dictionary and assigning the index to the respective words
for i, sentence in enumerate(text):
    text[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
    
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

text = pad_input(text, 50)

num_classes = len(np.unique(labels))

In [4]:
# Splitting dataset
train_text, test_text, train_label, test_label = train_test_split(text, labels, test_size=0.1, random_state=42)
train_text, valid_text, train_label, valid_label = train_test_split(train_text, train_label, test_size=0.1, random_state=42)

train_data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_label).long())
valid_data = TensorDataset(torch.from_numpy(valid_text), torch.from_numpy(valid_label).long())
test_data = TensorDataset(torch.from_numpy(test_text), torch.from_numpy(test_label).long())

In [5]:
class CNNTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(CNNTextClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.cnn = nn.Conv1d(in_channels=embed_dim, out_channels=embed_dim, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(embed_dim, num_classes)
    def forward(self, text):
        embedded=self.embedding(text).permute(1,0)
        conved=F.relu(self.cnn(embedded))
        conved=conved.mean(dim=1)
        outputs=self.fc(conved)
        proba=F.softmax(outputs, dim=0)
        return(proba)

In [143]:
cnn=CNNTextClassifier(vocab_size=len(word2idx), embed_dim=10, num_classes=num_classes)

# Training loop
criterion = nn.CrossEntropyLoss()
another_criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)

for epoch in range(10):
    cnn.train()
    for text, label in train_data:
        optimizer.zero_grad()
        pred_proba = cnn(text)
        loss = criterion(pred_proba, label)
        loss.backward()
        optimizer.step()

    cnn.eval()
    with torch.no_grad():
        for text, label in valid_data:
            pred_proba=cnn(text)
            valid_loss = another_criterion(pred_proba, label)   
    print(f'Epoch: {epoch+1}, Training Loss: {loss.item()}, Validation Loss: {valid_loss.item()}')

Epoch: 1, Training Loss: 0.9119582772254944, Validation Loss: 1.5952026844024658
Epoch: 2, Training Loss: 0.9050331711769104, Validation Loss: 1.7598860263824463
Epoch: 3, Training Loss: 0.9048406481742859, Validation Loss: 1.8258984088897705
Epoch: 4, Training Loss: 0.9048325419425964, Validation Loss: 1.823211431503296
Epoch: 5, Training Loss: 0.9048324823379517, Validation Loss: 1.7896498441696167
Epoch: 6, Training Loss: 0.9048324823379517, Validation Loss: 1.7396031618118286
Epoch: 7, Training Loss: 0.9048324823379517, Validation Loss: 1.6988259553909302
Epoch: 8, Training Loss: 0.9048324823379517, Validation Loss: 1.6282529830932617
Epoch: 9, Training Loss: 0.9048324823379517, Validation Loss: 1.4629325866699219
Epoch: 10, Training Loss: 0.9048324823379517, Validation Loss: 1.2836328744888306


In [146]:
accuracy = Accuracy(task='multiclass', num_classes=num_classes, average=None)
precision = Precision(task='multiclass', num_classes=num_classes, average=None)
recall = Recall(task='multiclass', num_classes=num_classes, average=None)
f1 = F1Score(task='multiclass', num_classes=num_classes, average=None)

cnn_valid_outputs = torch.stack([cnn(features) for features, label in valid_data])
valid_labels =  torch.stack([label for features, label in valid_data])
print(accuracy(cnn_valid_outputs, valid_labels))
print(precision(cnn_valid_outputs, valid_labels))
print(recall(cnn_valid_outputs, valid_labels))
print(f1(cnn_valid_outputs, valid_labels))

tensor([0.6712, 0.7097, 0.8182, 0.8158, 0.8807])
tensor([0.5385, 0.8049, 0.8265, 0.8378, 0.9143])
tensor([0.6712, 0.7097, 0.8182, 0.8158, 0.8807])
tensor([0.5976, 0.7543, 0.8223, 0.8267, 0.8972])
