In [6]:
from models import *
from utils import *
from sentiment_data import *
from sentiment_classifier import *
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muhammadawais.naeem\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
train_exs = read_sentiment_examples("data/train.txt")
dev_exs = read_sentiment_examples("data/dev.txt")

In [8]:
word_embeddings = read_word_embeddings("data/glove.6B.300d-relativized.txt")

Read in 14923 vectors of size 300


In [48]:
class FFNN(nn.Module):
    def __init__(self, input_size, hidden_layers, output_classes, word_embeddings):
        super(FFNN, self).__init__()
        self.embed = word_embeddings.get_initialized_embedding_layer()
        self.linear1 = nn.Linear(input_size, hidden_layers)
        self.linear2 = nn.Linear(hidden_layers, 64)
        self.linear3 = nn.Linear(64, output_classes)
        # Initialize weights according to a formula due to Xavier Glorot.
        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.xavier_uniform_(self.linear2.weight)

    def forward(self, input_data):
        x = self.embed(input_data)
        x = torch.mean(x, dim=1)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        return x

In [61]:
# RUN TRAINING AND TEST
pad_length=40
num_epochs = 50
input_size = 300
hidden_layers = 128
output_classes = 2
batch_size = 16
total_samples = len(train_exs)
batch_indices = np.arange(0, total_samples, batch_size)

ffnn = FFNN(input_size, hidden_layers, output_classes, word_embeddings)
initial_learning_rate = 0.0001
optimizer = optim.Adam(ffnn.parameters(), lr=initial_learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    total_loss = 0.0
    random.seed(42)
    random.shuffle(train_exs)
    
    for i in range(0, len(batch_indices)-1):
        train_batch = train_exs[i:i+1]
        word_indices = []
        labels = []
        # Pad or truncate each sample in the current batch
        for train_sample in train_batch:
            word_list = train_sample.words
            index = []
            for word in word_list:
                word_index = word_embeddings.word_indexer.index_of(word)
                if word_index != -1:
                    index.append(word_index)
                else:
                    index.append(0)
            
            # Pad or truncate the index
            index_length = len(index)
            if index_length > pad_length:
                index = index[0:pad_length]
            else:
                for i in range(index_length, pad_length):
                    index.append(1)
            
            word_indices.append(index)
            labels.append([train_sample.label])

        x = torch.from_numpy(np.array(word_indices)).int()
        y = train_sample.label
        
        # # Build one-hot representation of y. Instead of the label 0 or 1, y_onehot is either [0, 1] or [1, 0]. This
        # # way we can take the dot product directly with a probability vector to get class probabilities.
        y_onehot = torch.zeros((len(labels), output_classes))
        # # scatter will write the value of 1 into the position of y_onehot given by y
        y_onehot.scatter_(1, torch.from_numpy(np.asarray(labels, dtype=np.int64)), 1)

        # Zero out the gradients from the FFNN object. *THIS IS VERY IMPORTANT TO DO BEFORE CALLING BACKWARD()*
        optimizer.zero_grad()
        output = ffnn(x)
        # Can also use built-in NLLLoss as a shortcut here but we're being explicit here
        loss = criterion(output, y_onehot)
        # Computes the gradient and takes the optimizer step
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Epoch 1/50, Loss: 296.9442
Epoch 2/50, Loss: 292.5028
Epoch 3/50, Loss: 281.8027
Epoch 4/50, Loss: 268.5917
Epoch 5/50, Loss: 262.5089
Epoch 6/50, Loss: 255.5067
Epoch 7/50, Loss: 238.7361
Epoch 8/50, Loss: 230.0239
Epoch 9/50, Loss: 216.9828
Epoch 10/50, Loss: 209.5482
Epoch 11/50, Loss: 211.5334
Epoch 12/50, Loss: 219.0288
Epoch 13/50, Loss: 200.4082
Epoch 14/50, Loss: 212.5712
Epoch 15/50, Loss: 203.6005
Epoch 16/50, Loss: 198.7287
Epoch 17/50, Loss: 208.8603
Epoch 18/50, Loss: 200.0419
Epoch 19/50, Loss: 201.3248
Epoch 20/50, Loss: 196.4376
Epoch 21/50, Loss: 197.1770
Epoch 22/50, Loss: 202.5271
Epoch 23/50, Loss: 206.6694
Epoch 24/50, Loss: 200.4968
Epoch 25/50, Loss: 208.2820
Epoch 26/50, Loss: 190.6926
Epoch 27/50, Loss: 213.7326
Epoch 28/50, Loss: 186.9994
Epoch 29/50, Loss: 198.2222
Epoch 30/50, Loss: 205.5710
Epoch 31/50, Loss: 203.9734
Epoch 32/50, Loss: 205.1412
Epoch 33/50, Loss: 187.0697
Epoch 34/50, Loss: 201.7696
Epoch 35/50, Loss: 196.5718
Epoch 36/50, Loss: 171.9337
E

In [62]:
from sklearn.metrics import classification_report, accuracy_score

In [63]:
batch_size = 16
total_samples = len(train_exs)
batch_indices = np.arange(0, total_samples, batch_size)
y_true = []
y_pred = []
for i in range(0, len(batch_indices)-1):
    train_batch = train_exs[i:i+1]
    word_indices = []
    # Pad or truncate each sample in the current batch
    for train_sample in train_batch:
        word_list = train_sample.words
        index = []
        for word in word_list:
            word_index = word_embeddings.word_indexer.index_of(word)
            if word_index != -1:
                index.append(word_index)
            else:
                index.append(1) #unknown 'UAK'
        
        # Pad or truncate the index
        index_length = len(index)
        if index_length > pad_length:
            index = index[0:pad_length]
        else:
            for i in range(index_length, pad_length):
                index.append(0)
        
        word_indices.append(index)
        y_true.append(train_sample.label)

    x = torch.from_numpy(np.array(word_indices)).int()
    output = ffnn(x)
    y_pred.append(torch.argmax(output).item())

In [64]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.75      0.77       204
           1       0.79      0.83      0.81       228

    accuracy                           0.79       432
   macro avg       0.79      0.79      0.79       432
weighted avg       0.79      0.79      0.79       432



In [65]:
batch_size = 16
total_samples = len(dev_exs)
batch_indices = np.arange(0, total_samples, batch_size)
y_true = []
y_pred = []
for i in range(0, len(batch_indices)-1):
    train_batch = dev_exs[i:i+1]
    word_indices = []
    # Pad or truncate each sample in the current batch
    for train_sample in train_batch:
        word_list = train_sample.words
        index = []
        for word in word_list:
            word_index = word_embeddings.word_indexer.index_of(word)
            if word_index != -1:
                index.append(word_index)
            else:
                index.append(0)
        
        # Pad or truncate the index
        index_length = len(index)
        if index_length > pad_length:
            index = index[0:pad_length]
        else:
            for i in range(index_length, pad_length):
                index.append(1)
        
        word_indices.append(index)
        y_true.append(train_sample.label)

    x = torch.from_numpy(np.array(word_indices)).int()
    output = ffnn(x)
    y_pred.append(torch.argmax(output).item())

In [66]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.18      0.67      0.29         3
           1       0.98      0.82      0.89        51

    accuracy                           0.81        54
   macro avg       0.58      0.75      0.59        54
weighted avg       0.93      0.81      0.86        54

