In [1]:
import torch

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from bs4 import BeautifulSoup
import warnings 
warnings.filterwarnings("ignore")
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
import gensim.downloader as api

from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn 
import torch.nn.functional as f 
from torch.utils.data import Dataset, DataLoader
from gensim.downloader import load


## 1) Dataset Generation (5 points)
We will use the Amazon reviews dataset used in HW1. Load the dataset
and build a balanced dataset of 250K reviews along with their ratings (50K
instances per each rating score) through random selection. Create ternary
labels using the ratings. We assume that ratings more than 3 denote positive
1
sentiment (class 1) and rating less than 3 denote negative sentiment (class
2). Reviews with rating 3 are considered to have neutral sentiment (class 3).
You can store your dataset after generation and reuse it to reduce the computational load. For your experiments consider a 80%/20% training/testing
split.

In [2]:
df = pd.read_csv("data/amazon_reviews_us_Office_Products_v1_00.tsv", sep='\t', on_bad_lines='skip')#, usecols=['review_body','star_rating']) #lineterminator='\r'
df.drop(df.columns[0], axis=1, inplace=True)
df = df[['review_body', 'star_rating']]
df.columns

Index(['review_body', 'star_rating'], dtype='object')

### Preprocess/Cleaning take 15 minutes

In [3]:
contraction_mapping = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}


pattern_contractions = re.compile('(%s)' % '|'.join(contraction_mapping.keys()))
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords', 'punkt')
stop_words = set(stopwords.words('english'))


def expand_contractions(text, contraction_map=contraction_mapping):
    return pattern_contractions.sub(lambda occurrence: contraction_map[occurrence.group(0)], text)


def rem_stopwords(review,stp):
    words = review.split()
    filtered_words = [word for word in words if word not in stp]
    filtered_sentence = ' '.join(filtered_words)
    return filtered_sentence


def lemmazation(review):
    words = review.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_review = ' '.join(lemmatized_words)
    return lemmatized_review


def clean_preproc_reviews(reviews, stp):
    ### CLEANING
    reviews = reviews.str.lower()
    # reviews = reviews.apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
    # reviews = reviews.replace(r'http\S+', '', regex=True)
    # reviews = reviews.replace("[^a-zA-Z]", " ", regex=True)
    # reviews = reviews.replace('\s+', ' ', regex=True).str.strip()
    # reviews = reviews.apply(lambda x: expand_contractions(x))

    # ### PREPROCESSING
    # reviews = reviews.apply(lambda x : rem_stopwords(x, stp))
    # reviews = reviews.apply(lemmazation)

    return reviews

# Clean the reviews
df['review_body'] =df['review_body'].astype(str)
df.dropna(subset=['review_body'], inplace=True)
df['review_body'] = clean_preproc_reviews(df['review_body'], stop_words)
df.dropna(subset=['review_body'], inplace=True)


[nltk_data] Downloading package stopwords to punkt...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df['label'] = df['star_rating'].apply(lambda x: 0 if x in [4, 5] else (1 if x in [1, 2] else 2))

star_ratings = [5, 4, 3, 2, 1]
samples = [ df[df['star_rating'] == rating].sample(n = 50000, random_state = 42) for rating in star_ratings]
merged_dataset = pd.concat(samples)

In [5]:
print("Sample review and label after creating merged dataset:")
print(merged_dataset[['review_body', 'star_rating', 'label']].sample(1))


Sample review and label after creating merged dataset:
                                               review_body star_rating  label
1333654  i had this for about eight months when it fail...           2      1


## 2) Word Embedding (30 points)
In this part the of the assignment, you will learn how to generate two sets
of Word2Vec features for the dataset you generated. You can use Gensim
library for this purpose. A helpful tutorial is available in the following link:
https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.
html


### (a) (10 points)
Load the pretrained “word2vec-google-news-300” Word2Vec model and learn
how to extract word embeddings for your dataset. Try to check semantic
similarities of the generated vectors using two examples of your own, e.g.,
King − M an + W oman = Queen or excellent ∼ outstanding.


In [6]:
# # wv['buger'] - wv['fries'] + wv['fish'] ?= wv['chips']
# test_relationship = wv['burger'] - wv['fries'] + wv['fish']
# print(test_relationship, wv['chips']) 


In [7]:
pretrained_model = api.load('word2vec-google-news-300')

result = pretrained_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0][0])

similarity = pretrained_model.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)


King - Man + Woman = queen
Similarity between 'excellent' and 'outstanding': 0.5567486


In [8]:
# Function to find the most similar word
def most_similar_vector(vector):
    return pretrained_model.similar_by_vector(vector, topn=1)[0][0]

# Vector arithmetic: "King - Man + Woman"
result_vector = pretrained_model['king'] - pretrained_model['man'] + pretrained_model['woman']
analogy_result = most_similar_vector(result_vector)
print(f"King - Man + Woman = {analogy_result}")

# Function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Compute similarity between 'excellent' and 'outstanding'
similarity_score = cosine_similarity(pretrained_model['excellent'], pretrained_model['outstanding'])
print(f"Similarity between 'excellent' and 'outstanding': {similarity_score}")


King - Man + Woman = king
Similarity between 'excellent' and 'outstanding': 0.556748628616333


### (b) (20 points)
Train a Word2Vec model using your own dataset. Set the embedding size
to be 300 and the window size to be 11. You can also consider a minimum
word count of 10. Check the semantic similarities for the same two examples
in part (a). What do you conclude from comparing vectors generated by
yourself and the pretrained model? Which of the Word2Vec models seems
to encode semantic similarities between words better?

In [9]:
class MyCorpus:
    def __init__(self, df, col):
        self.df = df
        self.col = col

    def __iter__(self):
        for line in self.df[self.col]:
            yield utils.simple_preprocess(line)


sentences = MyCorpus(merged_dataset, 'review_body')
my_model = gensim.models.Word2Vec(sentences=sentences, vector_size=300, window=11, min_count=10, workers=4)

In [10]:
### NEW Dataframe

word_vectors = my_model.wv

result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("King - Man + Woman =", result[0][0])
similarity = word_vectors.similarity('excellent', 'outstanding')
print("Similarity between 'excellent' and 'outstanding':", similarity)
print( my_model.wv)

King - Man + Woman = jumbo
Similarity between 'excellent' and 'outstanding': 0.8390566
KeyedVectors<vector_size=300, 16476 keys>


In [12]:
def document_vector(word2vec_model, doc):
    # Filter out words in the document that are not in the word2vec model's vocabulary.
    doc = [word for word in doc if word in word2vec_model.key_to_index]

    # If the document contains no words present in the model's vocabulary, return a zero vector.
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
        
    # Otherwise, return the average of the word vectors for the words in the document.
    return np.mean(word2vec_model[doc], axis=0)

merged_dataset['processed_text'] = merged_dataset['review_body'].apply(gensim.utils.simple_preprocess)
merged_dataset['pretrained_vector'] = merged_dataset['processed_text'].apply(lambda doc: document_vector(pretrained_model, doc))
merged_dataset['doc_vector'] = merged_dataset['processed_text'].apply(lambda doc: document_vector(my_model.wv, doc))

Y = merged_dataset['label']
X = merged_dataset['doc_vector']
X_pre = merged_dataset['pretrained_vector']


## 3) Simple models (20 points)
Using the Word2Vec features that you can generate using the two models
you prepared in the Word Embedding section, train a perceptron and an
SVM model similar to HW1 for class 1 and class 2 (binary models). For this
purpose, you can just use the average Word2Vec vectors for each review as
the input feature (x = 1N PNi=1 Wi for a review with N words). To improve 2
your performance, use the data cleaning and preprocessing steps of HW1
to include only important words from each review when you compute the
average x = 1 N PN i=1 Wi.
Report your accuracy values on the testing split for
these models for each feature type along with values you reported in your
HW1 submission, i.e., for each of perceptron and SVM, you need to report
three accuracy values for “word2vec-google-news-300”, your own Word2Vec,
and TF-IDF features.
What do you conclude from comparing performances for the models
trained using the three different feature types (TF-IDF, pretrained Word2Vec,
your trained Word2Vec)?


In [13]:
#### PERCEPTRON 
### USE AVERAGE WORD VECTOR

def evaulate(y_label, y_predicted):
    accuracy = accuracy_score(y_label, y_predicted)
    precision = precision_score(y_label, y_predicted, average='weighted')
    recall = recall_score(y_label, y_predicted, average='weighted')
    f1 = f1_score(y_label, y_predicted, average='weighted')
    return accuracy, precision, recall,f1

# def evaluate(y_label, y_predicted):
#     precision = precision_score(y_label, y_predicted, average=None)
#     cm = confusion_matrix(y_label, y_predicted)
#     class_accuracy = cm.diagonal() / cm.sum(axis=1)
#     return class_accuracy, precision

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = np.vstack(X_train), np.vstack(X_test), np.vstack(y_train), np.vstack(y_test)

clf = Perceptron(tol=1e-5, random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))

Training: Accuracy: 0.6006, Precision: 0.6505, Recall: 0.6006, F1-Score: 0.6172
 Testing: Accuracy: 0.5998, Precision: 0.6509, Recall: 0.5998, F1-Score: 0.6167


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_pre, Y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = np.vstack(X_train), np.vstack(X_test), np.vstack(y_train), np.vstack(y_test)

clf = Perceptron(tol=1e-5, random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))

Training: Accuracy: 0.5898, Precision: 0.6162, Recall: 0.5898, F1-Score: 0.5344
 Testing: Accuracy: 0.5861, Precision: 0.6103, Recall: 0.5861, F1-Score: 0.5309


## 4) Feedforward Neural Networks (25 points)
Using the features that you can generate using the models you prepared in
the Word “Embedding section”, train a feedforward multilayer perceptron
network for sentiment analysis classification. Consider a network with two
hidden layers, each with 50 and 10 nodes, respectively. You can use cross
entropy loss and your own choice for other hyperparamters, e.g., nonlinearity,
number of epochs, etc. Part of getting good results is to select good values
for these hyperparamters.
You can also refer to the following tutorial to familiarize yourself:
https://www.kaggle.com/mishra1993/pytorch-multi-layer-perceptron-mnist
Although the above tutorial is for image data but the concept of training
an MLP is very similar to what we want to do.

### (a) (10 points)
To generate the input features, use the average Word2Vec vectors similar to
the “Simple models” section and train the neural network. Train a network
for binary classification using class 1 and class 2 and also a ternary model for
the three classes. Report accuracy values on the testing split for your MLP
model for each of the binary and ternary classification cases.


## TERNARY

In [15]:
class Net(nn.Module): 
    def __init__(self, n_classes):
        super(Net, self).__init__()
        n_dim = 300
        hidden_1 = 50
        hidden_2 = 10
        self.n_classes = n_classes

        self.fc1 = nn.Linear(n_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, n_classes)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = f.gelu(self.fc1(x))
        x = self.dropout(x)
        x = f.gelu(self.fc2(x))
        x = self.dropout(x)
        x = f.softmax(self.fc3(x))
        return x 


ternary_model = Net(n_classes=3)
binary_model = Net(n_classes=2)
print(ternary_model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.4, inplace=False)
)


In [16]:

print("Features and labels before creating datasets:")
for i in range(5):  
    vector_norm = torch.norm(X_train_tensor[i], p=2)  # L2 norm 
    #vector_summary = ", ".join(f"{x:.2f}" for x in X_train_tensor[i][:3]) + "..."  # Summarize vector 
    print(f"Sample {i+1}: Norm = {vector_norm:.2f}, Label: {y_train_tensor[i]}")

Features and labels before creating datasets:


NameError: name 'X_train_tensor' is not defined

In [17]:
class TextDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

## TERNARY MODEL

In [18]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TextDataset(X_train_tensor, y_train_tensor)
test_dataset = TextDataset(X_test_tensor, y_test_tensor)

batch_size = 64 
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ternary_model.parameters(), lr=0.01)
valid_loss_min = np.Inf

n_epochs = 50 

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0
    correct_train = 0
    correct_valid = 0
    
    ternary_model.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = ternary_model(data)
        target = target.squeeze()
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.size(0)
        _, predicted = torch.max(output.data, 1)
        correct_train += (predicted == target).sum().item()

    ternary_model.eval()
    with torch.no_grad():
        for data, target in test_loader:
            output = ternary_model(data)
            target = target.squeeze()
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)
            _, predicted = torch.max(output.data, 1)
            correct_valid += (predicted == target).sum().item()

    train_loss = train_loss / len(train_loader.dataset)
    valid_loss = valid_loss / len(test_loader.dataset)
    
    train_accuracy = correct_train / len(train_loader.dataset)
    valid_accuracy = correct_valid / len(test_loader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f} \tTraining Accuracy: {:.2f}% \tValidation Loss: {:.6f} \tValidation Accuracy: {:.2f}%'.format(
        epoch+1, 
        train_loss,
        train_accuracy * 100,
        valid_loss,
        valid_accuracy * 100
    ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(ternary_model.state_dict(), 'ternary_model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 0.890880 	Training Accuracy: 64.87% 	Validation Loss: 0.872804 	Validation Accuracy: 66.46%
Validation loss decreased (inf --> 0.872804).  Saving model ...


KeyboardInterrupt: 

## BINARY MODEL

In [None]:
merged_dataset['binary_label'] = merged_dataset['label'].apply(lambda x: 1 if x == 2 else (0 if x == 0 else 1))
merged_dataset['binary_label'].sample(100)

X_binary = merged_dataset['doc_vector'].values
Y_binary = merged_dataset['binary_label'].values
print(X_binary.shape, type(X_binary) )

x_bin_train, x_bin_test, y_bin_train, y_bin_test = train_test_split(X_binary, Y_binary, test_size=0.2)

print(x_bin_train.shape, type(x_bin_train))

x_bin_train = np.array(x_bin_train.tolist(), dtype=np.float32)  # Convert to float32 numpy array if not already
y_bin_train = np.array(y_bin_train)

x_bin_test = np.array(x_bin_test.tolist(), dtype=np.float32)  # Convert to float32 numpy array if not already
y_bin_test = np.array(y_bin_test)

# Now convert to torch tensors
x_bin_train = torch.tensor(x_bin_train, dtype=torch.float32)
y_bin_train = torch.tensor(y_bin_train, dtype=torch.long)

x_bin_test = torch.tensor(x_bin_test, dtype=torch.float32)
y_bin_test = torch.tensor(y_bin_test, dtype=torch.long)


train_dataset_bin = TextDataset(x_bin_train, y_bin_train)
test_dataset_bin = TextDataset(x_bin_test, y_bin_test)

batch_size = 64 
train_loader_bin = DataLoader(train_dataset_bin, batch_size=batch_size, shuffle=True)
test_loader_bin = DataLoader(test_dataset_bin, batch_size=batch_size, shuffle=False)


criterion_bin = nn.CrossEntropyLoss()
optimizer_bin = torch.optim.Adam(binary_model.parameters(), lr=0.01)
valid_loss_min = np.Inf

n_epochs = 50 

# TRAIN LOOP

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0
    correct_train = 0
    correct_valid = 0
    
    binary_model.train()
    for data, target in train_loader_bin:
        optimizer_bin.zero_grad()
        output = binary_model(data)
        target = target.squeeze()
        loss = criterion_bin(output, target)
        loss.backward()
        optimizer_bin.step()
        train_loss += loss.item() * data.size(0)
        _, predicted = torch.max(output.data, 1)
        correct_train += (predicted == target).sum().item()

    binary_model.eval()
    with torch.no_grad():
        for data, target in test_loader_bin:
            output = binary_model(data)
            target = target.squeeze()
            loss = criterion_bin(output, target)
            valid_loss += loss.item() * data.size(0)
            _, predicted = torch.max(output.data, 1)
            correct_valid += (predicted == target).sum().item()

    train_loss = train_loss / len(train_loader.dataset)
    valid_loss = valid_loss / len(test_loader.dataset)
    
    train_accuracy = correct_train / len(train_loader.dataset)
    valid_accuracy = correct_valid / len(test_loader.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f} \tTraining Accuracy: {:.2f}% \tValidation Loss: {:.6f} \tValidation Accuracy: {:.2f}%'.format(
        epoch+1, 
        train_loss,
        train_accuracy * 100,
        valid_loss,
        valid_accuracy * 100
    ))

    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(binary_model.state_dict(), 'binary_model.pt')
        valid_loss_min = valid_loss

(b) (15 points)
To generate the input features, concatenate the first 10 Word2Vec vectors
for each review as the input feature (x = [WT
1, ..., WT [10]) and train the neural 3
network. Report the accuracy value on the testing split for your MLP model
for each of the binary and ternary classification cases.
What do you conclude by comparing accuracy values you obtain with
those obtained in the “’Simple Models” section (note you can compare the
accuracy values for binary classification).

In [None]:
df['label'] = df['star_rating'].apply(lambda x: 0 if x in [4, 5] else (1 if x in [1, 2] else 2))
merged_dataset['label'] = merged_dataset['label'].apply(lambda x: 1 if x is 2)

## 5) Convolutional Neural Networks (20 points)
Using the vectors you prepared in the “Word Embedding” section, train a
convolutional neural network (CNN) for sentiment analysis classification.
Train a simple CNN for sentiment analysis. You can consider an two-layer
CNN with the output channel sizes of 50 and 10. To feed your data into the
CNN, limit the maximum review length to 50 by truncating longer reviews
and padding shorter reviews with a null value (0). You can use cross entropy
loss and your own choice for other hyperparamters, e.g., nonlinearity, number
of epochs, etc. Train the CNN network for binary classification using class 1
and class 2 and also a ternary model for the three classes. Report accuracy
values on the testing split for your CNN model.

In [19]:
# Convert the 'label' column to a binary format
merged_dataset['binary_label'] = merged_dataset['label'].apply(lambda x: 1 if x == 2 else (0 if x == 0 else 1))
merged_dataset['binary_label'].sample(100)

# Extracting features and labels for binary classification
X_binary = merged_dataset['doc_vector'].values
Y_binary = merged_dataset['binary_label'].values
print(X_binary.shape, type(X_binary))

# Splitting the dataset into training and testing sets for binary classification
x_bin_train, x_bin_test, y_bin_train, y_bin_test = train_test_split(X_binary, Y_binary, test_size=0.2)
print(x_bin_train.shape, type(x_bin_train))

# Converting lists to numpy arrays of type float32
x_bin_train = np.array(x_bin_train.tolist(), dtype=np.float32)
y_bin_train = np.array(y_bin_train)

x_bin_test = np.array(x_bin_test.tolist(), dtype=np.float32)
y_bin_test = np.array(y_bin_test)

# Converting numpy arrays to PyTorch tensors
x_bin_train = torch.tensor(x_bin_train, dtype=torch.float32)
y_bin_train = torch.tensor(y_bin_train, dtype=torch.long)

x_bin_test = torch.tensor(x_bin_test, dtype=torch.float32)
y_bin_test = torch.tensor(y_bin_test, dtype=torch.long)

# Creating datasets and dataloaders for binary classification
train_dataset_bin = TextDataset(x_bin_train, y_bin_train)
test_dataset_bin = TextDataset(x_bin_test, y_bin_test)

batch_size_bin = 64
train_loader_bin = DataLoader(train_dataset_bin, batch_size=batch_size_bin, shuffle=True)
test_loader_bin = DataLoader(test_dataset_bin, batch_size=batch_size_bin, shuffle=False)

# Setting up the loss function and optimizer for binary classification
criterion_bin = nn.CrossEntropyLoss()
optimizer_bin = torch.optim.Adam(binary_model.parameters(), lr=0.01)
valid_loss_min_bin = np.Inf

n_epochs_bin = 5

# Training loop for binary classification
for epoch in range(n_epochs_bin):
    train_loss_bin = 0.0
    valid_loss_bin = 0.0
    correct_train_bin = 0
    correct_valid_bin = 0
    
    binary_model.train()
    for data, target in train_loader_bin:
        optimizer_bin.zero_grad()
        output = binary_model(data)
        target = target.squeeze()
        loss = criterion_bin(output, target)
        loss.backward()
        optimizer_bin.step()
        train_loss_bin += loss.item() * data.size(0)
        _, predicted = torch.max(output.data, 1)
        correct_train_bin += (predicted == target).sum().item()

    binary_model.eval()
    with torch.no_grad():
        for data, target in test_loader_bin:
            output = binary_model(data)
            target = target.squeeze()
            loss = criterion_bin(output, target)
            valid_loss_bin += loss.item() * data.size(0)
            _, predicted = torch.max(output.data, 1)
            correct_valid_bin += (predicted == target).sum().item()

    train_loss_bin = train_loss_bin / len(train_loader_bin.dataset)
    valid_loss_bin = valid_loss_bin / len(test_loader_bin.dataset)
    
    train_accuracy_bin = correct_train_bin / len(train_loader_bin.dataset)
    valid_accuracy_bin = correct_valid_bin / len(test_loader_bin.dataset)

    print(f'Epoch: {epoch+1} \tTraining Loss: {train_loss_bin:.6f} \tTraining Accuracy: {train_accuracy_bin * 100:.2f}% \tValidation Loss: {valid_loss_bin:.6f} \tValidation Accuracy: {valid_accuracy_bin * 100:.2f}%')

    if valid_loss_bin <= valid_loss_min_bin:
        print(f'Validation loss decreased ({valid_loss_min_bin:.6f} --> {valid_loss_bin:.6f}).  Saving model ...')
        torch.save(binary_model.state_dict(), 'binary_model.pt')
        valid_loss_min_bin = valid_loss_bin


(250000,) <class 'numpy.ndarray'>
(200000,) <class 'numpy.ndarray'>
Epoch: 1 	Training Loss: 0.512096 	Training Accuracy: 78.83% 	Validation Loss: 0.493364 	Validation Accuracy: 80.53%
Validation loss decreased (inf --> 0.493364).  Saving model ...
Epoch: 2 	Training Loss: 0.503597 	Training Accuracy: 79.71% 	Validation Loss: 0.490508 	Validation Accuracy: 80.76%
Validation loss decreased (0.493364 --> 0.490508).  Saving model ...
Epoch: 3 	Training Loss: 0.499485 	Training Accuracy: 80.18% 	Validation Loss: 0.494315 	Validation Accuracy: 80.74%
Epoch: 4 	Training Loss: 0.502700 	Training Accuracy: 79.89% 	Validation Loss: 0.486292 	Validation Accuracy: 81.54%
Validation loss decreased (0.490508 --> 0.486292).  Saving model ...
Epoch: 5 	Training Loss: 0.501991 	Training Accuracy: 79.77% 	Validation Loss: 0.488278 	Validation Accuracy: 81.60%


In [20]:
# Assuming X_train, X_test, y_train, y_test are already defined for the ternary classification task

X_train_ternary_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_ternary_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_ternary_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_ternary_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset_ternary = TextDataset(X_train_ternary_tensor, y_train_ternary_tensor)
test_dataset_ternary = TextDataset(X_test_ternary_tensor, y_test_ternary_tensor)

batch_size_ternary = 64 
train_loader_ternary = DataLoader(train_dataset_ternary, batch_size=batch_size_ternary, shuffle=True)
test_loader_ternary = DataLoader(test_dataset_ternary, batch_size=batch_size_ternary, shuffle=False)

criterion_ternary = nn.CrossEntropyLoss()
optimizer_ternary = torch.optim.Adam(ternary_model.parameters(), lr=0.01)
valid_loss_min_ternary = np.Inf

n_epochs_ternary = 50 

for epoch in range(n_epochs_ternary):
    train_loss_ternary = 0.0
    valid_loss_ternary = 0.0
    correct_train_ternary = 0
    correct_valid_ternary = 0
    
    ternary_model.train()
    for data, target in train_loader_ternary:
        optimizer_ternary.zero_grad()
        output = ternary_model(data)
        target = target.squeeze()
        loss = criterion_ternary(output, target)
        loss.backward()
        optimizer_ternary.step()
        train_loss_ternary += loss.item() * data.size(0)
        _, predicted = torch.max(output.data, 1)
        correct_train_ternary += (predicted == target).sum().item()

    ternary_model.eval()
    with torch.no_grad():
        for data, target in test_loader_ternary:
            output = ternary_model(data)
            target = target.squeeze()
            loss = criterion_ternary(output, target)
            valid_loss_ternary += loss.item() * data.size(0)
            _, predicted = torch.max(output.data, 1)
            correct_valid_ternary += (predicted == target).sum().item()

    train_loss_ternary = train_loss_ternary / len(train_loader_ternary.dataset)
    valid_loss_ternary = valid_loss_ternary / len(test_loader_ternary.dataset)
    
    train_accuracy_ternary = correct_train_ternary / len(train_loader_ternary.dataset)
    valid_accuracy_ternary = correct_valid_ternary / len(test_loader_ternary.dataset)

    print('Epoch: {} \tTraining Loss: {:.6f} \tTraining Accuracy: {:.2f}% \tValidation Loss: {:.6f} \tValidation Accuracy: {:.2f}%'.format(
        epoch+1, 
        train_loss_ternary,
        train_accuracy_ternary * 100,
        valid_loss_ternary,
        valid_accuracy_ternary * 100
    ))

    if valid_loss_ternary <= valid_loss_min_ternary:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min_ternary,
        valid_loss_ternary))
        torch.save(ternary_model.state_dict(), 'ternary_model.pt')
        valid_loss_min_ternary = valid_loss_ternary


Epoch: 1 	Training Loss: 0.877268 	Training Accuracy: 66.05% 	Validation Loss: 0.861012 	Validation Accuracy: 67.76%
Validation loss decreased (inf --> 0.861012).  Saving model ...
Epoch: 2 	Training Loss: 0.875181 	Training Accuracy: 66.32% 	Validation Loss: 0.864552 	Validation Accuracy: 67.54%
Epoch: 3 	Training Loss: 0.871953 	Training Accuracy: 66.57% 	Validation Loss: 0.862998 	Validation Accuracy: 67.82%
Epoch: 4 	Training Loss: 0.871334 	Training Accuracy: 66.75% 	Validation Loss: 0.858151 	Validation Accuracy: 67.94%
Validation loss decreased (0.861012 --> 0.858151).  Saving model ...
Epoch: 5 	Training Loss: 0.869552 	Training Accuracy: 66.95% 	Validation Loss: 0.853986 	Validation Accuracy: 68.42%
Validation loss decreased (0.858151 --> 0.853986).  Saving model ...
Epoch: 6 	Training Loss: 0.868843 	Training Accuracy: 67.02% 	Validation Loss: 0.862916 	Validation Accuracy: 67.73%
Epoch: 7 	Training Loss: 0.868962 	Training Accuracy: 66.98% 	Validation Loss: 0.867549 	Validat