In [18]:
import gensim.downloader as api
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob
from torch.utils.data import DataLoader, TensorDataset

nlp = spacy.load("en_core_web_sm")
minmax_scaler = MinMaxScaler()
word_vectors = api.load("word2vec-google-news-300")

Sentiment analysis with Textblob + Frequency of each POS tag in a sentence with SpaCy + Sentence length + Ratio of unique words to length of sentence with SpaCy

In [31]:
def sentiment_analysis_textblob(X_training):
    
    list_of_features = []

    for sentence in X_training:

        # Analyze sentiment with TextBlob
        blob = TextBlob(sentence)
        sentiment_polarity = blob.sentiment.polarity
        sentiment_subjectivity = blob.sentiment.subjectivity

        # Combine sentence length, POS tag counts, and sentiment features into a single feature array for this sentence
        features = [sentiment_polarity, sentiment_subjectivity]
        list_of_features.append(features)

    features_array = np.array(list_of_features)

    minmax_scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = minmax_scaler.fit_transform(features_array)

    return features_scaled

In [38]:
def feature_engineering(X_training):

    pos_tags_of_interest = ['NOUN', 'VERB', 'ADJ', 'ADV']  # nouns, verbs, adjectives, adverbs
    list_of_features = []

    for doc in nlp.pipe(X_training):
        # Count the occurrences of each POS tag of interest
        pos_counts = {tag: 0 for tag in pos_tags_of_interest}
        for token in doc:
            if token.pos_ in pos_tags_of_interest:
                pos_counts[token.pos_] += 1

        
        # Calculate unique words vs sentence length ratio
        unique_words_ratio = len(set(token.text for token in doc)) / len(doc) if doc else 0

        # Calculate sentence length
        sentence_length = len(doc)

        # Combine sentence length and POS tag counts into a single feature array for this sentence
        features = [unique_words_ratio] + [sentence_length] + [pos_counts[tag] for tag in pos_tags_of_interest]
        list_of_features.append(features)

    # Convert the list of features into a numpy array
    features_array = np.array(list_of_features)

    # Scale the features using MinMaxScaler
    minmax_scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = minmax_scaler.fit_transform(features_array)

    return features_scaled

In [47]:
def sentence_to_vec(sentence, model):
    vecs = []
    for word in sentence.split():
        if word in model.key_to_index:
            vecs.append(model[word])
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    vecs = np.array(vecs)
    return vecs.mean(axis=0)


# load the data
train = pd.read_csv('train.csv')
X_train = train['Text']
y_train = train['Verdict']

X_training, X_validation, y_training, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print("Splitting done")

# Map class labels to stert from 0 to fit NN model
y_training_mapped = y_training.map({-1: 0, 0: 1, 1: 2})
y_validation_mapped = y_validation.map({-1: 0, 0: 1, 1: 2})

# bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# X_training_bow = bow_vectorizer.fit_transform(X_training)
# X_validation_bow = bow_vectorizer.transform(X_validation)

X_training_with_sentiment = sentiment_analysis_textblob(X_training)
X_validation_with_sentiment = sentiment_analysis_textblob(X_validation)

X_training_engineered = feature_engineering(X_training)
X_validation_engineered = feature_engineering(X_validation)
print("Feature engineering completed")

X_training_w2v = np.array([sentence_to_vec(sentence, word_vectors) for sentence in X_training])
X_validation_w2v = np.array([sentence_to_vec(sentence, word_vectors) for sentence in X_validation])
print("Converted sentences to vectors")

print(X_training_w2v.shape)  # Should be (18000, num_features_w2v)
print(X_training_engineered.shape)  # Should be (18000, num_features_engineered)
print(X_training_with_sentiment.shape)  # Should be (18000, num_features_sentiment)

X_training_combined = np.hstack([X_training_w2v, X_training_engineered, X_training_with_sentiment])
X_validation_combined = np.hstack([X_validation_w2v, X_validation_engineered, X_validation_with_sentiment])
print("Integrated feature engineering into the dataset")

# convert sparse matrices returned by CountVectorizer into dense matrices before converting them into PyTorch tensors
# X_training_bow_tensor = torch.FloatTensor(X_training_bow.toarray())
# X_training_w2v_tensor = torch.FloatTensor(X_training_w2v)
X_training_w2v_tensor = torch.FloatTensor(X_training_combined)

# X_validation_bow_tensor = torch.FloatTensor(X_validation_bow.toarray())
# X_validation_w2v_tensor = torch.FloatTensor(X_validation_w2v)
X_validation_w2v_tensor = torch.FloatTensor(X_validation_combined)

y_training_tensor = torch.LongTensor(y_training_mapped)
y_validation_tensor = torch.LongTensor(y_validation_mapped.to_numpy())
print("Converted vectors to tensors")

# num_features = X_training_bow.shape[1]
embedding_size = word_vectors.vector_size + 8  # This should be 300 for Google News Word2Vec + 8 additional features
hidden_size = 512
num_classes = len(np.unique(y_training_mapped))  # The number of unique classes in your target variable
dropout_rate = 0.2



class SimpleNeuralNet(nn.Module):
    def __init__(self, embedding_size, hidden_size, num_classes, dropout_rate):
        super(SimpleNeuralNet, self).__init__()
        self.layer1 = nn.Linear(embedding_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)  # halving the size
        self.layer3 = nn.Linear(hidden_size // 2, hidden_size // 4)
        self.output_layer = nn.Linear(hidden_size // 4, num_classes) # Output size matches the number of classes
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        return x


torch.manual_seed(6) # Set seed to some fixed value
epochs = 150
# nn_model = SimpleNeuralNet()
nn_model = SimpleNeuralNet(embedding_size, hidden_size, num_classes, dropout_rate)
# the optimiser controls the learning rate
# optimiser = torch.optim.SGD(nn_model.parameters(), lr=1e-2, momentum=0)

# optimiser with weight decay for L2 regularization
# The weight_decay parameter is used to specify the L2 penalty
optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3, weight_decay=1e-5) 

# Adam without L2 regularisation
# optimiser = torch.optim.Adam(nn_model.parameters(), lr=1e-3) 

'''
The L2 regularization penalty works by adding a term to the loss that penalizes large weights, which helps prevent overfitting. 
This term is the sum of the squares of all the weights multiplied by the weight_decay value. During backpropagation, 
this has the effect of shrinking the weights slightly on every update, hence the term "weight decay."

Be cautious when combining L2 regularization with the Adam optimizer, as Adam already includes a form of regularization 
through its moving average of squared gradients (which is similar to RMSprop). Excessive regularization might lead to underfitting, 
so itâ€™s important to find a balance that works well with your data and model architecture.
'''

loss_fn = nn.CrossEntropyLoss()

print('Epoch', 'Loss', '\n-----', '----', sep='\t')
for i in range(1, epochs + 1):
    # reset gradients to 0
    optimiser.zero_grad()
    # get predictions
    y_pred = nn_model(X_training_w2v_tensor)

    # print(y_pred.shape)
    # print(y_training_tensor.shape)
    # compute loss
    loss = loss_fn(y_pred, y_training_tensor)
    # backpropagate
    loss.backward()
    # update the model weights
    optimiser.step()
    # Print every 1000 epochs
    if i % 10 == 0:
        print (f"{i:5d}", loss.item(), sep='\t')


with torch.no_grad():  # No gradient computation for evaluation
    y_prediction_logits = nn_model(X_validation_w2v_tensor)
    y_prediction_classes = torch.argmax(y_prediction_logits, dim=1)  # Convert logits to class labels

# Convert tensors to numpy arrays for sklearn functions
y_validation_numpy = y_validation_tensor.numpy() - 1
y_prediction_numpy = y_prediction_classes.numpy() - 1

print("Validation data evaluated")

# print(y_validation_numpy)
# print(y_prediction_numpy)

print(classification_report(y_validation_numpy, y_prediction_numpy))
print(f1_score(y_validation_numpy, y_prediction_numpy, average='macro'))


# Evaluate the model
#print(classification_report(y_validation.numpy(), y_prediction.numpy()))

#f1_score(y_validation, y_prediction, average='macro')


Splitting done
Feature engineering completed
Converted sentences to vectors
(18000, 300)
(18000, 6)
(18000, 2)
Integrated feature engineering into the dataset
Converted vectors to tensors
Epoch	Loss	
-----	----
   10	0.8464913964271545
   20	0.8250763416290283
   30	0.7739527821540833
   40	0.7273467183113098
   50	0.6923595666885376
   60	0.6534910798072815
   70	0.6211737990379333
   80	0.5962772965431213
   90	0.5782447457313538
  100	0.5616274476051331
  110	0.5434944033622742
  120	0.5298254489898682
  130	0.5128721594810486
  140	0.49789106845855713
  150	0.48027804493904114
Validation data evaluated
              precision    recall  f1-score   support

          -1       0.83      0.89      0.86      2926
           0       0.59      0.29      0.39       502
           1       0.65      0.68      0.66      1073

    accuracy                           0.77      4501
   macro avg       0.69      0.62      0.64      4501
weighted avg       0.76      0.77      0.76      4501

0.636

In [25]:
# get the prediction for the test set
test = pd.read_csv('test.csv')
X_test = test['Text']

# X_test_bow = bow_vectorizer.transform(X_test)
X_test_to_vec = np.array([sentence_to_vec(sentence, word_vectors) for sentence in X_test])
# X_test_bow_tensor = torch.FloatTensor(X_test_bow.toarray())
X_test_w2v_tensor = torch.FloatTensor(X_test_to_vec)


with torch.no_grad():
    # y_testing_logits = nn_model(X_test_bow_tensor)
    y_testing_logits = nn_model(X_test_w2v_tensor)
    y_testing_classes = torch.argmax(y_testing_logits, dim=1) 

# Convert tensors to numpy arrays for sklearn functions
y_test_numpy = y_testing_classes.numpy() - 1

test['Verdict'] = pd.Series(y_test_numpy)
test.drop(columns=['Text'], inplace=True)
test.to_csv('Project simple neural network.csv', index=False)