#### Requirements:
Python 3.9, I have used torch.device("mps") which is cuda equivalent for MacOS

# 1. Dataset Generation

In [1]:
import warnings
import pandas as pd
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")
pd.options.mode.chained_assignment = None
import re
import nltk
from bs4 import BeautifulSoup
import numpy as np
import gensim.downloader as api
from gensim.models import Word2Vec
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Loading the dataset
dataset = pd.read_csv('data.tsv', sep='\t', on_bad_lines='skip')

  dataset = pd.read_csv('data.tsv', sep='\t', on_bad_lines='skip')


In [3]:
#Segregating the dataset into 5 different datasets based on the star_rating
rating1=dataset.loc[dataset['star_rating'].isin([1])].sample(n=50000, random_state=36)
rating2=dataset.loc[dataset['star_rating'].isin([2])].sample(n=50000, random_state=36)
rating3=dataset.loc[dataset['star_rating'].isin([3])].sample(n=50000, random_state=36)
rating4=dataset.loc[dataset['star_rating'].isin([4])].sample(n=50000, random_state=36)
rating5=dataset.loc[dataset['star_rating'].isin([5])].sample(n=50000, random_state=36)
finaldf = pd.concat([rating1, rating2, rating3, rating4, rating5])


In [4]:
#Cleaning Data
df = finaldf[['star_rating', 'review_headline', 'review_body']]
df.rename(columns={'star_rating': 'ratings'}, inplace=True)
df['review_headline'] = df['review_headline'].apply(str)
df['review_body'] = df['review_body'].apply(str)
df['reviews'] = df[['review_headline', 'review_body']].agg(' '.join, axis=1)
df = df.drop('review_headline', axis=1)
df = df.drop('review_body', axis=1)

In [5]:
#Separating Classes
df['class'] = df['ratings'].apply(lambda x: 1 if x > 3 else 2 if x < 3 else 3)

In [6]:
# Convert to lower case
df['reviews'] = df['reviews'].str.lower()
# Remove HTML tags
df['reviews'] = df['reviews'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
# Remove URLs
df['reviews'] = df['reviews'].apply(lambda x: re.sub(r'http\S+', '', x))
# Remove non-alphabetical characters
df['reviews'] = df['reviews'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# Remove extra spaces
df['reviews'] = df['reviews'].apply(lambda x: ' '.join(x.split()))

# Performing contractions
contractions = {
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "i will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "i'd": "i would",
    "won't": "will not",
    "can't": "cannot",
    "I'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "that's": "that is",
    "we're": "we are",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "they're": "they are",
    "haven't": "have not",
    "hasn't": "has not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
}

df['reviews'] = df['reviews'].replace(contractions, regex=True)

#Removing Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['reviews'] = df['reviews'].apply(lambda text: ' '.join([word for word in str(text).split() if word.lower() not in stop_words]))

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['reviews'] = df['reviews'].apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]))

  df['reviews'] = df['reviews'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
[nltk_data] Downloading package stopwords to /Users/dev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Word Embedding

(a)

In [7]:
print("Pretrained: ")
# Load the pretrained Word2Vec model
word2vec_model = api.load("word2vec-google-news-300")

# Example 1: Finding the analogy "King - Man + Woman = Queen"
analogy_result = word2vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("Analogy Result:", analogy_result)

# Example 2: Finding semantic similarity between "excellent" and "outstanding"
similarity_score = word2vec_model.similarity('excellent', 'outstanding') * 100

print(f"Similarity Score between 'excellent' and 'outstanding': {similarity_score}%")

Pretrained: 
Analogy Result: [('queen', 0.7118192911148071)]
Similarity Score between 'excellent' and 'outstanding': 55.6748628616333%


(b)

In [8]:
sentences = [str(sentence).split() for sentence in df['reviews'].values]
model_own = Word2Vec(sentences, vector_size=300, window=11, min_count=10)
#Saving Model
model_own.save("word2vec_model_own")

In [9]:
#Loading Model
model_own = Word2Vec.load("word2vec_model_own")

In [10]:
print("Own Model: ")
analogy_result = model_own.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(f"Analogy: king - man + woman = {analogy_result[0][0]}")

similarity_score = model_own.wv.similarity('excellent', 'outstanding') * 100
print(f"Similarity between 'excellent' and 'outstanding': {similarity_score}%")

Own Model: 
Analogy: king - man + woman = population
Similarity between 'excellent' and 'outstanding': 71.83629274368286%


### We can see that the Analogy is more accurate on the pretrained model, whereas the similarity score between excellent and outstanding is more in our custom trained model

# 3. Simple models

In [11]:
df1=df.copy(deep=True)  

df1 = df.loc[:, ["class", "reviews"]]
class1 = df1[df1['class'] == 1]  
class2 = df1[df1['class'] == 2]  
df1 = pd.concat([class1, class2], ignore_index=True)

Perceptron & SVM for TF-IDF

In [12]:
# Split the dataset into training and testing sets with an 80%/20% split  
X_train, X_test, y_train, y_test = train_test_split(df1['reviews'], df1['class'], test_size=0.2, random_state=36)

In [13]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
#Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

#Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#Now, X_train_tfidf and X_test_tfidf contain the TF-IDF features for training and testing sets

X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [14]:
perceptron_model = Perceptron(random_state=42)
perceptron_model.fit(X_train_tfidf, y_train)

# Predictions on training set
train_predictions = perceptron_model.predict(X_train_tfidf)

# Predictions on testing set
test_predictions = perceptron_model.predict(X_test_tfidf)

# Calculate metrics on the testing set
test_accuracy = accuracy_score(y_test, test_predictions) * 100

# Print the metrics
print(f'TF-IDF: Perceptron Accuracy: {test_accuracy}%')


TF-IDF: Perceptron Accuracy: 83.87%


In [15]:
# Train an SVM model
svm_model = LinearSVC(max_iter=5000)
svm_model.fit(X_train_tfidf, y_train)

# Predictions on training set
train_predictions = svm_model.predict(X_train_tfidf)

# Predictions on testing set
test_predictions = svm_model.predict(X_test_tfidf)

# Calculate metrics on the training set
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate metrics on the testing set
test_accuracy = accuracy_score(y_test, test_predictions) * 100

# Print the metrics
print(f'TF-IDF - SVM Accuracy: {test_accuracy}%')



TF-IDF - SVM Accuracy: 88.1375%


Perceptron & SVM for custom Word2Vec

In [16]:
def average_word2vec(review, wv1):
    if isinstance(review, str):  
        important_words = [word for word in review.split() if word in wv1]
        if important_words:
            return np.mean([wv1[word] for word in important_words], axis=0)
        else:
            return np.zeros(wv1.vector_size)
    else:
        return np.zeros(wv1.vector_size)

In [17]:
X = np.array([average_word2vec(reviews, model_own.wv) for reviews in df1['reviews']])
y = df1['class'].values 
y = np.array(y) - 1 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31)

In [18]:
perceptron_model = Perceptron()
perceptron_model.fit(X_train, y_train)

# Evaluate models
perceptron_accuracy = perceptron_model.score(X_test, y_test) * 100

print(f'Custom - Perceptron Accuracy: {perceptron_accuracy}%')

Custom - Perceptron Accuracy: 82.3675%


In [19]:
# Train a linear SVM model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

svm_accuracy = svm_model.score(X_test, y_test) * 100

print(f'Custom - SVM Accuracy: {svm_accuracy}%')



Custom - SVM Accuracy: 88.1375%




Simple models for pretrained word2vec

In [20]:
X1 = np.array([average_word2vec(reviews, word2vec_model) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1 

X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=31)

# Train a perceptron model
perceptron_model = Perceptron()
perceptron_model.fit(X1_train, y_train)

# Evaluate the perceptron model
perceptron_accuracy = perceptron_model.score(X1_test, y_test) * 100

print(f'Pretrained - Perceptron Accuracy: {perceptron_accuracy}%')

Pretrained - Perceptron Accuracy: 71.5225%


In [21]:
# Train a linear SVM model
svm_model = LinearSVC(max_iter=5000)
svm_model.fit(X1_train, y_train)

svm_accuracy = svm_model.score(X1_test, y_test)*100

print(f'Linear SVM Accuracy (word2vec-google-news-300): {svm_accuracy}%')



Linear SVM Accuracy (word2vec-google-news-300): 84.34%


### We can see that TF-IDF SVM accuracy is the highest amongst all other

# 4. Feedforward Neural Networks using PyTorch

In [22]:
class MLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, 10)
        self.fc3 = nn.Linear(10, num_classes)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

(a)

Custom Binary

In [23]:
X1 = np.array([average_word2vec(reviews, model_own.wv) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1 

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the model
input_size = X1.shape[1]
model = MLP(input_size, num_classes=2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    print(f'Custom word2vec Binary - Testing Accuracy: {accuracy}%')



Custom word2vec Binary - Testing Accuracy: 89.585%


Pretrained binary

In [24]:
X1 = np.array([average_word2vec(reviews, word2vec_model) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1 

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the model
input_size = X1.shape[1]
model = MLP(input_size, num_classes=2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    print(f'Pretrained word2vec Binary - Testing Accuracy: {accuracy}%')

Pretrained word2vec Binary - Testing Accuracy: 87.3975%


Custom Ternary

In [25]:
#Train feedforward neural network for all three classes
X2 = np.array([average_word2vec(reviews, model_own.wv) for reviews in df['reviews']])
y2 = df['class'].values
y2 = np.array(y2) - 1

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=31)


X_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y2_train, dtype=torch.long)
X_test_tensor = torch.tensor(X2_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y2_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model initialization
input_size = X2.shape[1]
model = MLP(input_size, num_classes=3)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
#    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs.data, 1)
    total = y_test_tensor.size(0)
    correct = (predicted == y_test_tensor).sum().item()
    accuracy = 100 * correct / total
    print(f'Custom word2vec Ternary - Testing Accuracy: {accuracy} %')


Custom word2vec Ternary - Testing Accuracy: 76.13 %


Pretrained ternary

In [26]:
#Train feedforward neural network for all three classes
X2 = np.array([average_word2vec(reviews, word2vec_model) for reviews in df['reviews']])
y2 = df['class'].values
y2 = np.array(y2) - 1

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=31)

X_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y2_train, dtype=torch.long)
X_test_tensor = torch.tensor(X2_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y2_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model initialization
input_size = X2.shape[1] 
model = MLP(input_size, num_classes=3)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
#    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs.data, 1)
    total = y_test_tensor.size(0)
    correct = (predicted == y_test_tensor).sum().item()
    accuracy = 100 * correct / total
    print(f'Pretrained word2vec Ternary - Testing Accuracy: {accuracy} %')


Pretrained word2vec Ternary - Testing Accuracy: 73.402 %


(b)

In [27]:
def average_word2vec_concat10(review, wv):
    words = review.split()[:10]
    vectors = [wv[word] for word in words if word in wv]

    if len(vectors) < 10:
        vectors += [np.zeros(wv.vector_size)] * (10 - len(vectors))
    return np.concatenate(vectors)

Custom Concat Binary

In [28]:
X1 = np.array([average_word2vec_concat10(reviews, model_own.wv) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1 

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the model
input_size = X1.shape[1]
model = MLP(input_size, num_classes=2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    
    print(f'Custom Concat word2vec Binary - Testing Accuracy: {accuracy} %')


Custom Concat word2vec Binary - Testing Accuracy: 84.2 %


Pretrained Concat binary

In [29]:
X1 = np.array([average_word2vec_concat10(reviews, word2vec_model) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1 

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the model
input_size = X1.shape[1]
model = MLP(input_size, num_classes=2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    print(f'Pretrained Concat word2vec Binary - Testing Accuracy: {accuracy} %')

Pretrained Concat word2vec Binary - Testing Accuracy: 83.05 %


Custom Concat Ternary

In [30]:
X2 = np.array([average_word2vec_concat10(reviews, model_own.wv) for reviews in df['reviews']])
y2 = df['class'].values
y2 = np.array(y2) - 1

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y2_train, dtype=torch.long)
X_test_tensor = torch.tensor(X2_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y2_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model initialization
input_size = X2.shape[1] 
model = MLP(input_size, num_classes=3)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
   # print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs.data, 1)
    total = y_test_tensor.size(0)
    correct = (predicted == y_test_tensor).sum().item()
    accuracy = 100 * correct / total
    print(f'Custom Concat word2vec Ternary - Testing Accuracy: {accuracy} %')


Custom Concat word2vec Ternary - Testing Accuracy: 70.766 %


Pretrained concat Ternary

In [31]:
X2 = np.array([average_word2vec_concat10(reviews, word2vec_model) for reviews in df['reviews']])
y2 = df['class'].values
y2 = np.array(y2) - 1

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X2_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y2_train, dtype=torch.long)
X_test_tensor = torch.tensor(X2_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y2_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model initialization
input_size = X2.shape[1] 
model = MLP(input_size, num_classes=3)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluation
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs.data, 1)
    total = y_test_tensor.size(0)
    correct = (predicted == y_test_tensor).sum().item()
    accuracy = 100 * correct / total
    print(f'Accuracy of the ternary model on the test set: {accuracy} %')


Accuracy of the ternary model on the test set: 69.174 %


### We can see that Custom word2vec Binary's Testing Accuracy is more than any other simple model

# 5. CNN

In [13]:
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 50, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(50, 10, kernel_size=3, padding=1)
        self.fc = nn.Linear(30000, num_classes)
        self.softmax = nn.Softmax(dim=1) if num_classes > 2 else nn.Sigmoid()

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        x = self.softmax(x)
        return x

In [14]:
def word2vec50(review, wv):
    length = 50
    words = str(review)[:length].split()[:10]
    vectors = [wv[word] for word in words if word in wv]
    if len(vectors) < 10:
        vectors += [np.zeros(wv.vector_size)] * (10 - len(vectors))
    return np.concatenate(vectors)

In [15]:
def train_and_test_model(model, criterion, optimizer, train_loader, test_loader, num_epochs):
    device = torch.device("mps")
    model = model.to(device)
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        
        #print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(train_loader)))

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            inputs = inputs.to(device).float()
            labels = labels.to(device).float()
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    #print(f'{(100 * correct / total)} %')
    return (100 * correct / total)

Pretrained Binary CNN

In [35]:
#%pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

In [36]:
x = np.array([word2vec50(reviews, word2vec_model) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=df1['class'], test_size=0.2, random_state=42)

In [37]:
X_train_tensor = torch.tensor(x_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(x_test, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [38]:
binary_model = CNN(2)
criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(binary_model.parameters(), lr=0.0001)

In [39]:
print(f'Pretrained Binary CNN - Test Accuracy: {train_and_test_model(binary_model, criterion, binary_optimizer, train_loader, test_loader, 5)}')

Epoch 1 loss: 0.554
Epoch 2 loss: 0.488
Epoch 3 loss: 0.477
Epoch 4 loss: 0.471
Epoch 5 loss: 0.467
Pretrained Binary CNN - Test Accuracy: 83.2125


Custom Binary CNN

In [16]:
x = np.array([word2vec50(reviews, model_own.wv) for reviews in df1['reviews']])
y = df1['class'].values
y = np.array(y) - 1
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=df1['class'], test_size=0.2, random_state=42)

In [17]:
X_train_tensor = torch.tensor(x_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(x_test, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [18]:
binary_model = CNN(2)
criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(binary_model.parameters(), lr=0.0001)

In [19]:
print(f'Custom Binary CNN - Test Accuracy: {train_and_test_model(binary_model, criterion, binary_optimizer, train_loader, test_loader, 5)}')

Custom Binary CNN - Test Accuracy: 85.5675


Pretrained ternary CNN

In [20]:
x = np.array([word2vec50(reviews, word2vec_model) for reviews in df['reviews']])
y = df['class'].values
y = np.array(y) - 1

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=df['class'], test_size=0.2, random_state=42)

In [21]:
X_train_tensor = torch.tensor(x_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(x_test, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [22]:
ternary_model = CNN(3)
criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(binary_model.parameters(), lr=0.0001)

In [23]:
print(f'Pretrained Ternary CNN - Test Accuracy: {train_and_test_model(binary_model, criterion, binary_optimizer, train_loader, test_loader, 5)}')

Pretrained Ternary CNN - Test Accuracy: 67.28


Custom Ternary CNN

In [25]:
x = np.array([word2vec50(reviews, model_own.wv) for reviews in df['reviews']])
y = df['class'].values
y = np.array(y) - 1
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=df['class'], test_size=0.2, random_state=42)

In [26]:
X_train_tensor = torch.tensor(x_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(x_test, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=True)

In [27]:
ternary_model = CNN(3)
criterion = nn.CrossEntropyLoss()
binary_optimizer = optim.Adam(binary_model.parameters(), lr=0.0001)

In [28]:
print(f'Custom Ternary CNN - Test Accuracy: {train_and_test_model(binary_model, criterion, binary_optimizer, train_loader, test_loader, 5)}')

Custom Ternary CNN - Test Accuracy: 68.248


### We can see that Custom modal's Binary CNN architecture had the highest accuracy of 85.5675% amongst the other models.