In [23]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt



# load the data
train = pd.read_csv('train.csv')
X_train = train['Text']
y_train = train['Verdict']

X_training, X_validation, y_training, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Map class labels to stert from 0 to fit NN model
y_training_mapped = y_training.map({-1: 0, 0: 1, 1: 2})
y_validation_mapped = y_validation.map({-1: 0, 0: 1, 1: 2})

bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

X_training_bow = bow_vectorizer.fit_transform(X_training)
X_validation_bow = bow_vectorizer.transform(X_validation)

# convert sparse matrices returned by CountVectorizer into dense matrices before converting them into PyTorch tensors
X_training_bow_tensor = torch.FloatTensor(X_training_bow.toarray())
X_validation_bow_tensor = torch.FloatTensor(X_validation_bow.toarray())
y_training_tensor = torch.LongTensor(y_training_mapped)
y_validation_tensor = torch.LongTensor(y_validation_mapped.to_numpy())

num_features = X_training_bow.shape[1]
num_classes = len(np.unique(y_training))


class SimpleNeuralNet(nn.Module):
    def __init__(self):
        super(SimpleNeuralNet, self).__init__()
        self.layer1 = nn.Linear(num_features, 512)  # Adjust input features to match vectorizer's max_features
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(512, 256)
        self.output_layer = nn.Linear(256, num_classes)  # Output size matches the number of classes

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.output_layer(x)
        return x


torch.manual_seed(6) # Set seed to some fixed value

epochs = 1000

nn_model = SimpleNeuralNet()
# the optimizer controls the learning rate
optimiser = torch.optim.SGD(nn_model.parameters(), lr=1e-2, momentum=0)
loss_fn = nn.CrossEntropyLoss()

print('Epoch', 'Loss', '\n-----', '----', sep='\t')
for i in range(1, epochs + 1):
    # reset gradients to 0
    optimiser.zero_grad()
    # get predictions
    y_pred = nn_model(X_training_bow_tensor)

    # print(y_pred.shape)
    # print(y_training_tensor.shape)
    # compute loss
    loss = loss_fn(y_pred, y_training_tensor)
    # backpropagate
    loss.backward()
    # update the model weights
    optimiser.step()

    # Print every 1000 epochs
    if i % 100 == 0:
        print (f"{i:5d}", loss.item(), sep='\t')


with torch.no_grad():  # No gradient computation for evaluation
    y_prediction_logits = nn_model(X_validation_bow_tensor)
    y_prediction_classes = torch.argmax(y_prediction_logits, dim=1)  # Convert logits to class labels

# Convert tensors to numpy arrays for sklearn functions
y_validation_numpy = y_validation_tensor.numpy() - 1
y_prediction_numpy = y_prediction_classes.numpy() - 1

print(y_validation_numpy)
print(y_prediction_numpy)

print(classification_report(y_validation_numpy, y_prediction_numpy))
print(f1_score(y_validation_numpy, y_prediction_numpy, average='macro'))


# Evaluate the model
#print(classification_report(y_validation.numpy(), y_prediction.numpy()))

#f1_score(y_validation, y_prediction, average='macro')


Epoch	Loss	
-----	----
  100	0.9656950831413269
  200	0.9035910367965698
  300	0.8778517842292786
  400	0.8671694993972778
  500	0.8624724745750427
  600	0.8601602911949158
  700	0.8588528633117676
  800	0.8580092191696167
  900	0.8574013710021973
 1000	0.856920599937439
[ 1 -1 -1 ...  0 -1 -1]
[-1 -1 -1 ... -1 -1 -1]
              precision    recall  f1-score   support

          -1       0.65      1.00      0.79      2926
           0       0.00      0.00      0.00       502
           1       0.00      0.00      0.00      1073

    accuracy                           0.65      4501
   macro avg       0.22      0.33      0.26      4501
weighted avg       0.42      0.65      0.51      4501

0.26264530317310714


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# get the prediction for the test set
test = pd.read_csv('test.csv')
X_test = test['Text']

X_test_bow = bow_vectorizer.transform(X_test)
X_test_bow_tensor = torch.FloatTensor(X_test_bow.toarray())

with torch.no_grad():
    y_testing_logits = nn_model(X_test_bow_tensor)
    y_testing_classes = torch.argmax(y_testing_logits, dim=1) 

# Convert tensors to numpy arrays for sklearn functions
y_test_numpy = y_testing_classes.numpy() - 1

test['Verdict'] = pd.Series(y_test_numpy)
test.drop(columns=['Text'], inplace=True)
test.to_csv('A0233573E_Simple_Neural_Network.csv', index=False)