In [3]:
import gensim.downloader as api
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob
from torch.utils.data import DataLoader, TensorDataset

nlp = spacy.load("en_core_web_sm")
minmax_scaler = MinMaxScaler()
word_vectors = api.load("word2vec-google-news-300")

Text Preprocessing

In [4]:
# Define a function for preprocessing
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    # text = ' '.join(tokens)
    # but better to lemmatize
    #text = [token.lemma_ for token in nlp(text)] #spacy
    text = tokens
    return text

Sentiment analysis with Textblob + Frequency of each POS tag in a sentence with SpaCy + Sentence length + Ratio of unique words to length of sentence with SpaCy

In [5]:
def sentiment_analysis_textblob(X_training):
    
    list_of_features = []

    for list_of_tokens in X_training:

        # Join tokens back into a single string
        sentence = ' '.join(list_of_tokens)

        # Analyze sentiment with TextBlob
        blob = TextBlob(sentence)
        sentiment_polarity = blob.sentiment.polarity
        sentiment_subjectivity = blob.sentiment.subjectivity

        # Combine sentence length, POS tag counts, and sentiment features into a single feature array for this sentence
        features = [sentiment_polarity, sentiment_subjectivity]
        list_of_features.append(features)

    features_array = np.array(list_of_features)

    minmax_scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = minmax_scaler.fit_transform(features_array)

    return features_scaled

In [6]:
def feature_engineering(X_training):

    pos_tags_of_interest = ['NOUN', 'VERB', 'ADJ', 'ADV']  # nouns, verbs, adjectives, adverbs
    list_of_features = []

    # Convert list of tokens back into sentences
    sentences = [' '.join(list_of_tokens) for list_of_tokens in X_training]

    for doc in nlp.pipe(sentences):
    # for doc in nlp.pipe(sentences, disable=["ner", "parser"]):  # only tagger component is needed
        # Count the occurrences of each POS tag of interest
        pos_counts = {tag: 0 for tag in pos_tags_of_interest}
        for token in doc:
            if token.pos_ in pos_tags_of_interest:
                pos_counts[token.pos_] += 1 

        
        # Calculate unique words vs sentence length ratio
        unique_words_ratio = len(set(token.text for token in doc)) / len(doc) if doc else 0

        # Calculate sentence length
        sentence_length = len(doc)

        # Combine sentence length and POS tag counts into a single feature array for this sentence
        features = [unique_words_ratio] + [sentence_length] + [pos_counts[tag] for tag in pos_tags_of_interest]
        list_of_features.append(features)

    # Convert the list of features into a numpy array
    features_array = np.array(list_of_features)

    # Scale the features using MinMaxScaler
    minmax_scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = minmax_scaler.fit_transform(features_array)

    return features_scaled

Main neural network code -- Original Dataset

In [18]:
# def sentence_to_vec(sentence, model):
#     vecs = []
#     for word in sentence.split():
#         if word in model.key_to_index:
#             vecs.append(model[word])
#     if len(vecs) == 0:
#         return np.zeros(model.vector_size)
#     vecs = np.array(vecs)
#     return vecs.mean(axis=0)


def list_of_tokens_to_vec(list_of_tokens, model):
    vecs = []
    for word in list_of_tokens:
        if word in model.key_to_index:
            vecs.append(model[word])
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    vecs = np.array(vecs)
    return vecs.mean(axis=0)


# load the data
train = pd.read_csv('D:/!Education/CS4248/Project/fulltrain.csv')
X_train = train['Text']
y_train = train['Label']

preprocessed_train = [preprocess_text(text) for text in X_train]
X_train_preprocessed = preprocessed_train
print("Preprocessing complete")

X_training, X_validation, y_training, y_validation = train_test_split(X_train_preprocessed, y_train, test_size=0.2, random_state=42)
print("Splitting done")

# Map class labels to start from 0 to fit NN model
y_training_mapped = y_training - 1
y_validation_mapped = y_validation - 1
print("Mapped class labels to start from 0 instead of 1")

# bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# X_training_bow = bow_vectorizer.fit_transform(X_training)
# X_validation_bow = bow_vectorizer.transform(X_validation)

X_training_with_sentiment = sentiment_analysis_textblob(X_training)
X_validation_with_sentiment = sentiment_analysis_textblob(X_validation)
print("Sentiment analysis completed")

X_training_engineered = feature_engineering(X_training)
X_validation_engineered = feature_engineering(X_validation)
print("Feature engineering completed")

X_training_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_training])
X_validation_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_validation])
print("Converted sentences to vectors")

print(X_training_w2v.shape)  # Should be (39083, num_features_w2v)
print(X_training_engineered.shape)  # Should be (39083, num_features_engineered)
print(X_training_with_sentiment.shape)  # Should be (39083, num_features_sentiment)

X_training_combined = np.hstack([X_training_w2v, X_training_engineered, X_training_with_sentiment])
X_validation_combined = np.hstack([X_validation_w2v, X_validation_engineered, X_validation_with_sentiment])
print("Integrated feature engineering into the dataset")

# convert sparse matrices returned by CountVectorizer into dense matrices before converting them into PyTorch tensors
# X_training_bow_tensor = torch.FloatTensor(X_training_bow.toarray())
# X_training_w2v_tensor = torch.FloatTensor(X_training_w2v)
X_training_w2v_tensor = torch.FloatTensor(X_training_combined)

# X_validation_bow_tensor = torch.FloatTensor(X_validation_bow.toarray())
# X_validation_w2v_tensor = torch.FloatTensor(X_validation_w2v)
X_validation_w2v_tensor = torch.FloatTensor(X_validation_combined)

y_training_tensor = torch.LongTensor(y_training_mapped)
# y_training_tensor = torch.LongTensor(y_training)
y_validation_tensor = torch.LongTensor(y_validation_mapped.to_numpy())
# y_validation_tensor = torch.LongTensor(y_validation.to_numpy())
print("Converted vectors to tensors")

# num_features = X_training_bow.shape[1]
embedding_size = word_vectors.vector_size + 8  # This should be 300 for Google News Word2Vec + 8 additional features
hidden_size = 512
num_classes = len(np.unique(y_training_mapped))  # The number of unique classes in target variable
# num_classes = len(np.unique(y_training))  # The number of unique classes in target variable
dropout_rate = 0.2

print("Number of unique classes:", num_classes)


class SimpleNeuralNet(nn.Module):
    def __init__(self, embedding_size, hidden_size, num_classes, dropout_rate):
        super(SimpleNeuralNet, self).__init__()
        self.layer1 = nn.Linear(embedding_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)  # halving the size
        self.layer3 = nn.Linear(hidden_size // 2, hidden_size // 4)
        self.output_layer = nn.Linear(hidden_size // 4, num_classes) # Output size matches the number of classes
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        return x


torch.manual_seed(6) # Set seed to some fixed value
epochs = 150
# nn_model = SimpleNeuralNet()
nn_model = SimpleNeuralNet(embedding_size, hidden_size, num_classes, dropout_rate)
# the optimiser controls the learning rate
# optimiser = torch.optim.SGD(nn_model.parameters(), lr=1e-2, momentum=0)

# optimiser with weight decay for L2 regularization
# The weight_decay parameter is used to specify the L2 penalty
optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3, weight_decay=1e-5) 

# Adam without L2 regularisation
# optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3)

'''
The L2 regularization penalty works by adding a term to the loss that penalizes large weights, which helps prevent overfitting. 
This term is the sum of the squares of all the weights multiplied by the weight_decay value. During backpropagation, 
this has the effect of shrinking the weights slightly on every update, hence the term "weight decay."

Be cautious when combining L2 regularization with the Adam optimizer, as Adam already includes a form of regularization 
through its moving average of squared gradients (which is similar to RMSprop). Excessive regularization might lead to underfitting, 
so it’s important to find a balance that works well with your data and model architecture.
'''

loss_fn = nn.CrossEntropyLoss()

print('Epoch', 'Loss', '\n-----', '----', sep='\t')
for i in range(1, epochs + 1):
    # reset gradients to 0
    optimiser.zero_grad()
    # get predictions
    y_pred = nn_model(X_training_w2v_tensor)

    # print(y_pred.shape)
    # print(y_training_tensor.shape)
    # compute loss
    loss = loss_fn(y_pred, y_training_tensor)
    # backpropagate
    loss.backward()
    # update the model weights
    optimiser.step()
    # Print every 1000 epochs
    if i % 10 == 0:
        print (f"{i:5d}", loss.item(), sep='\t')


with torch.no_grad():  # No gradient computation for evaluation
    y_prediction_logits = nn_model(X_validation_w2v_tensor)
    y_prediction_classes = torch.argmax(y_prediction_logits, dim=1)  # Convert logits to class labels

# Convert tensors to numpy arrays for sklearn functions
y_validation_numpy = y_validation_tensor.numpy()
y_prediction_numpy = y_prediction_classes.numpy()

print("Validation data evaluated")

# print(y_validation_numpy)
# print(y_prediction_numpy)

print(classification_report(y_validation_numpy, y_prediction_numpy))
print(f1_score(y_validation_numpy, y_prediction_numpy, average='macro'))

# Evaluate the model
#print(classification_report(y_validation.numpy(), y_prediction.numpy()))

#f1_score(y_validation, y_prediction, average='macro')


Preprocessing complete
Splitting done
Mapped class labels to start from 0 instead of 1
Sentiment analysis completed
Feature engineering completed
Converted sentences to vectors
(39083, 300)
(39083, 6)
(39083, 2)
Integrated feature engineering into the dataset
Converted vectors to tensors
Number of unique classes: 4
Epoch	Loss	
-----	----
   10	1.3080294132232666
   20	1.1031100749969482
   30	0.8303283452987671
   40	0.6381742358207703
   50	0.5139918327331543
   60	0.42740750312805176
   70	0.3725055158138275
   80	0.33087286353111267
   90	0.29380762577056885
  100	0.26402580738067627
  110	0.24643246829509735
  120	0.2280084639787674
  130	0.21397952735424042
  140	0.20103852450847626
  150	0.19210505485534668
Validation data evaluated
              precision    recall  f1-score   support

           0       0.95      0.72      0.82      2793
           1       0.94      0.80      0.87      1371
           2       0.75      0.98      0.85      3587
           3       0.86      0.76 

Testing the model with balancedtest.csv

In [2]:
# get the prediction for the test set
test = pd.read_csv('D:/!Education/CS4248/Project/4248_project_Dissect_LUN/dataset/balancedtest.csv')
X_test = test['Text']
y_test = test['Label']


X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_test_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_test_preprocessed])
X_test_with_sentiment = sentiment_analysis_textblob(X_test_preprocessed)
X_test_engineered = feature_engineering(X_test_preprocessed)

print(X_test_w2v.shape)
print(X_test_engineered.shape)
print(X_test_with_sentiment.shape)

X_test_combined = np.hstack([X_test_w2v, X_test_engineered, X_test_with_sentiment])
X_test_tensor = torch.FloatTensor(X_test_combined)

with torch.no_grad():
    # y_testing_logits = nn_model(X_test_bow_tensor)
    y_testing_logits = nn_model(X_test_tensor)
    y_testing_classes = torch.argmax(y_testing_logits, dim=1) 

# Convert tensors to numpy arrays for sklearn functions
# Convert 'y_test' to integers, then to a NumPy array
y_test_true_labels_numpy = y_test.astype(int).to_numpy() # true labels are 1, 2, 3, 4
y_test_prediction_numpy = y_testing_classes.numpy() + 1 # predicted labels are 0, 1, 2, 3 -- need to add 1

print(classification_report(y_test_true_labels_numpy, y_test_prediction_numpy))
print(f1_score(y_test_true_labels_numpy, y_test_prediction_numpy, average='macro'))

# test['Verdict'] = pd.Series(y_test_numpy)
# test.drop(columns=['Text'], inplace=True)
# test.to_csv('Project simple neural network.csv', index=False)

# Create a new DataFrame with the necessary information
results_df = pd.DataFrame({
    'Text': X_test,  # Assuming 'Text' is the column containing the text data
    'Original Label': y_test,  # Assuming 'Label' contains the original labels
    'Predicted Label': y_test_prediction_numpy
})

# Display the DataFrame
print(results_df.head())

NameError: name 'pd' is not defined

Training on the augmented dataset

In [30]:
def list_of_tokens_to_vec(list_of_tokens, model):
    vecs = []
    for word in list_of_tokens:
        if word in model.key_to_index:
            vecs.append(model[word])
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    vecs = np.array(vecs)
    return vecs.mean(axis=0)


# load the data
train = pd.read_csv('D:/!Education/CS4248/Project/merged_final_df_with_topics_new.csv')
X_train = train['text']
y_train = train['label']

preprocessed_train = [preprocess_text(text) for text in X_train]
X_train_preprocessed = preprocessed_train
print("Preprocessing complete")

X_training, X_validation, y_training, y_validation = train_test_split(X_train_preprocessed, y_train, test_size=0.2, random_state=42)
print("Splitting done")

# Map class labels to start from 0 to fit NN model
y_training_mapped = y_training - 1
y_validation_mapped = y_validation - 1
print("Mapped class labels to start from 0 instead of 1")

X_training_with_sentiment = sentiment_analysis_textblob(X_training)
X_validation_with_sentiment = sentiment_analysis_textblob(X_validation)
print("Sentiment analysis completed")

X_training_engineered = feature_engineering(X_training)
X_validation_engineered = feature_engineering(X_validation)
print("Feature engineering completed")

X_training_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_training])
X_validation_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_validation])
print("Converted sentences to vectors")

print(X_training_w2v.shape)  # Should be (39083, num_features_w2v)
print(X_training_engineered.shape)  # Should be (39083, num_features_engineered)
print(X_training_with_sentiment.shape)  # Should be (39083, num_features_sentiment)

X_training_combined = np.hstack([X_training_w2v, X_training_engineered, X_training_with_sentiment])
X_validation_combined = np.hstack([X_validation_w2v, X_validation_engineered, X_validation_with_sentiment])
print("Integrated feature engineering into the dataset")

# convert sparse matrices returned by CountVectorizer into dense matrices before converting them into PyTorch tensors
X_training_w2v_tensor = torch.FloatTensor(X_training_combined)

X_validation_w2v_tensor = torch.FloatTensor(X_validation_combined)

y_training_tensor = torch.LongTensor(y_training_mapped)
y_validation_tensor = torch.LongTensor(y_validation_mapped.to_numpy())
print("Converted vectors to tensors")

# num_features = X_training_bow.shape[1]
embedding_size = word_vectors.vector_size + 8  # This should be 300 for Google News Word2Vec + 8 additional features
hidden_size = 512
num_classes = len(np.unique(y_training_mapped))  # The number of unique classes in target variable
dropout_rate = 0.2

print("Number of unique classes:", num_classes)


class SimpleNeuralNet(nn.Module):
    def __init__(self, embedding_size, hidden_size, num_classes, dropout_rate):
        super(SimpleNeuralNet, self).__init__()
        self.layer1 = nn.Linear(embedding_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)  # halving the size
        self.layer3 = nn.Linear(hidden_size // 2, hidden_size // 4)
        self.output_layer = nn.Linear(hidden_size // 4, num_classes) # Output size matches the number of classes
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        return x


torch.manual_seed(6) # Set seed to some fixed value
epochs = 150
# nn_model = SimpleNeuralNet()
nn_model = SimpleNeuralNet(embedding_size, hidden_size, num_classes, dropout_rate)
# the optimiser controls the learning rate

# optimiser with weight decay for L2 regularization
# The weight_decay parameter is used to specify the L2 penalty
optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3, weight_decay=1e-5) 

# Adam without L2 regularisation
# optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3)

loss_fn = nn.CrossEntropyLoss()

print('Epoch', 'Loss', '\n-----', '----', sep='\t')
for i in range(1, epochs + 1):
    # reset gradients to 0
    optimiser.zero_grad()
    # get predictions
    y_pred = nn_model(X_training_w2v_tensor)

    # print(y_pred.shape)
    # print(y_training_tensor.shape)
    # compute loss
    loss = loss_fn(y_pred, y_training_tensor)
    # backpropagate
    loss.backward()
    # update the model weights
    optimiser.step()
    # Print every 1000 epochs
    if i % 10 == 0:
        print (f"{i:5d}", loss.item(), sep='\t')


with torch.no_grad():  # No gradient computation for evaluation
    y_prediction_logits = nn_model(X_validation_w2v_tensor)
    y_prediction_classes = torch.argmax(y_prediction_logits, dim=1)  # Convert logits to class labels

# Convert tensors to numpy arrays for sklearn functions
y_validation_numpy = y_validation_tensor.numpy()
y_prediction_numpy = y_prediction_classes.numpy()

print("Validation data evaluated")

# print(y_validation_numpy)
# print(y_prediction_numpy)

print(classification_report(y_validation_numpy, y_prediction_numpy))
print(f1_score(y_validation_numpy, y_prediction_numpy, average='macro'))

Preprocessing complete
Splitting done
Mapped class labels to start from 0 instead of 1
Sentiment analysis completed
Feature engineering completed
Converted sentences to vectors
(47836, 300)
(47836, 6)
(47836, 2)
Integrated feature engineering into the dataset
Converted vectors to tensors
Number of unique classes: 4
Epoch	Loss	
-----	----
   10	1.3157390356063843
   20	0.9693562388420105
   30	0.6574960947036743
   40	0.5233080983161926
   50	0.43827539682388306
   60	0.3805628716945648
   70	0.3389802575111389
   80	0.31142109632492065
   90	0.28969696164131165
  100	0.27078738808631897
  110	0.2552439570426941
  120	0.24159997701644897
  130	0.23066191375255585
  140	0.21697601675987244
  150	0.20749744772911072
Validation data evaluated
              precision    recall  f1-score   support

           0       0.94      0.74      0.83      2801
           1       0.94      0.86      0.90      2793
           2       0.75      0.97      0.85      3508
           3       0.86      0.80 

Testing

In [32]:
# get the prediction for the test set
test = pd.read_csv('D:/!Education/CS4248/Project/4248_project_Dissect_LUN/dataset/test_final_with_topics_new.csv')
X_test = test['text']
y_test = test['label']


X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_test_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_test_preprocessed])
X_test_with_sentiment = sentiment_analysis_textblob(X_test_preprocessed)
X_test_engineered = feature_engineering(X_test_preprocessed)

print(X_test_w2v.shape)
print(X_test_engineered.shape)
print(X_test_with_sentiment.shape)

X_test_combined = np.hstack([X_test_w2v, X_test_engineered, X_test_with_sentiment])
X_test_tensor = torch.FloatTensor(X_test_combined)

with torch.no_grad():
    # y_testing_logits = nn_model(X_test_bow_tensor)
    y_testing_logits = nn_model(X_test_tensor)
    y_testing_classes = torch.argmax(y_testing_logits, dim=1) 

# Convert tensors to numpy arrays for sklearn functions
# Convert 'y_test' to integers, then to a NumPy array
y_test_true_labels_numpy = y_test.astype(int).to_numpy() # true labels are 1, 2, 3, 4
y_test_prediction_numpy = y_testing_classes.numpy() + 1 # predicted labels are 0, 1, 2, 3 -- need to add 1

print(classification_report(y_test_true_labels_numpy, y_test_prediction_numpy))
print(f1_score(y_test_true_labels_numpy, y_test_prediction_numpy, average='macro'))

# test['Verdict'] = pd.Series(y_test_numpy)
# test.drop(columns=['Text'], inplace=True)
# test.to_csv('Project simple neural network.csv', index=False)

# Create a new DataFrame with the necessary information
results_df = pd.DataFrame({
    'Text': X_test,  # Assuming 'Text' is the column containing the text data
    'Original Label': y_test,  # Assuming 'Label' contains the original labels
    'Predicted Label': y_test_prediction_numpy
})

# Display the DataFrame
print(results_df.head())

(2996, 300)
(2996, 6)
(2996, 2)
              precision    recall  f1-score   support

           0       0.91      0.24      0.38       750
           1       0.45      0.02      0.03       747
           2       0.30      1.00      0.46       750
           3       0.68      0.23      0.35       749

    accuracy                           0.37      2996
   macro avg       0.59      0.37      0.31      2996
weighted avg       0.59      0.37      0.31      2996

0.3059691595428497


Trying a simpler neural network with two hidden layers

In [None]:
def list_of_tokens_to_vec(list_of_tokens, model):
    vecs = []
    for word in list_of_tokens:
        if word in model.key_to_index:
            vecs.append(model[word])
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    vecs = np.array(vecs)
    return vecs.mean(axis=0)

# load the data
train = pd.read_csv('D:/!Education/CS4248/Project/fulltrain.csv')
X_train = train['Text']
y_train = train['Label']

preprocessed_train = [preprocess_text(text) for text in X_train]
X_train_preprocessed = preprocessed_train
print("Preprocessing complete")

X_training, X_validation, y_training, y_validation = train_test_split(X_train_preprocessed, y_train, test_size=0.2, random_state=42)
print("Splitting done")

# Map class labels to start from 0 to fit NN model
y_training_mapped = y_training - 1
y_validation_mapped = y_validation - 1
print("Mapped class labels to start from 0 instead of 1")

X_training_with_sentiment = sentiment_analysis_textblob(X_training)
X_validation_with_sentiment = sentiment_analysis_textblob(X_validation)
print("Sentiment analysis completed")

X_training_engineered = feature_engineering(X_training)
X_validation_engineered = feature_engineering(X_validation)
print("Feature engineering completed")

X_training_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_training])
X_validation_w2v = np.array([list_of_tokens_to_vec(list_of_tokens, word_vectors) for list_of_tokens in X_validation])
print("Converted sentences to vectors")

print(X_training_w2v.shape)  # Should be (39083, num_features_w2v)
print(X_training_engineered.shape)  # Should be (39083, num_features_engineered)
print(X_training_with_sentiment.shape)  # Should be (39083, num_features_sentiment)

X_training_combined = np.hstack([X_training_w2v, X_training_engineered, X_training_with_sentiment])
X_validation_combined = np.hstack([X_validation_w2v, X_validation_engineered, X_validation_with_sentiment])
print("Integrated feature engineering into the dataset")

# convert sparse matrices returned by CountVectorizer into dense matrices before converting them into PyTorch tensors
X_training_w2v_tensor = torch.FloatTensor(X_training_combined)

X_validation_w2v_tensor = torch.FloatTensor(X_validation_combined)

y_training_tensor = torch.LongTensor(y_training_mapped)
y_validation_tensor = torch.LongTensor(y_validation_mapped.to_numpy())
print("Converted vectors to tensors")

# num_features = X_training_bow.shape[1]
embedding_size = word_vectors.vector_size + 8  # This should be 300 for Google News Word2Vec + 8 additional features
hidden_size = 512
num_classes = len(np.unique(y_training_mapped))  # The number of unique classes in target variable
dropout_rate = 0.2

print("Number of unique classes:", num_classes)


class SimpleNeuralNet(nn.Module):
    def __init__(self, embedding_size, hidden_size, num_classes, dropout_rate):
        super(SimpleNeuralNet, self).__init__()
        self.layer1 = nn.Linear(embedding_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)  # halving the size
        self.output_layer = nn.Linear(hidden_size // 2, num_classes) # Output size matches the number of classes
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        return x


torch.manual_seed(6) # Set seed to some fixed value
epochs = 150
# nn_model = SimpleNeuralNet()
nn_model = SimpleNeuralNet(embedding_size, hidden_size, num_classes, dropout_rate)
# the optimiser controls the learning rate

# optimiser with weight decay for L2 regularization
# The weight_decay parameter is used to specify the L2 penalty
optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3, weight_decay=1e-5) 

# Adam without L2 regularisation
# optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3)

loss_fn = nn.CrossEntropyLoss()

print('Epoch', 'Loss', '\n-----', '----', sep='\t')
for i in range(1, epochs + 1):
    # reset gradients to 0
    optimiser.zero_grad()
    # get predictions
    y_pred = nn_model(X_training_w2v_tensor)

    # print(y_pred.shape)
    # print(y_training_tensor.shape)
    # compute loss
    loss = loss_fn(y_pred, y_training_tensor)
    # backpropagate
    loss.backward()
    # update the model weights
    optimiser.step()
    # Print every 1000 epochs
    if i % 10 == 0:
        print (f"{i:5d}", loss.item(), sep='\t')


with torch.no_grad():  # No gradient computation for evaluation
    y_prediction_logits = nn_model(X_validation_w2v_tensor)
    y_prediction_classes = torch.argmax(y_prediction_logits, dim=1)  # Convert logits to class labels

# Convert tensors to numpy arrays for sklearn functions
y_validation_numpy = y_validation_tensor.numpy()
y_prediction_numpy = y_prediction_classes.numpy()

print("Validation data evaluated")

# print(y_validation_numpy)
# print(y_prediction_numpy)

print(classification_report(y_validation_numpy, y_prediction_numpy))
print(f1_score(y_validation_numpy, y_prediction_numpy, average='macro'))