# Data Generation

In [1]:
import pandas as pd
import gzip


file_path = './amazon_reviews_us_Office_Products_v1_00.tsv.gz'


df = pd.read_csv(gzip.open(file_path), sep='\t', usecols=['review_body', 'star_rating'])

df = df[pd.to_numeric(df['star_rating'], errors='coerce').notna()]

df['star_rating'] = df['star_rating'].astype(int)
df = df.dropna()

rating_dfs = []
for i in range(1, 6):
    rating_df = df[df['star_rating'] == i].sample(n=50000, random_state=42)
    rating_dfs.append(rating_df)

dataset = pd.concat(rating_dfs, ignore_index=True)

def categorize_rating(rating):
    if rating > 3:
        return 1
    elif rating < 3:
        return 2
    else:  
        return 3
    

dataset['class'] = dataset['star_rating'].apply(categorize_rating)

dataset.to_csv('data.csv', index=False)

In [1]:
import pandas as pd

df = pd.read_csv('data.csv')

# Word Embeddings

## Google Word2Vec

In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [4]:
# Perform the vector arithmetic indirectly using `most_similar`
result = wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

print(result)


[('queen', 0.7118192911148071)]


In [None]:
w1='excellent'
w2='outstanding'
print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'excellent'	'outstanding'	0.56


## Custom Trained Model Word2Vec

### Preprocessing the data

In [3]:
df

Unnamed: 0,star_rating,review_body,class
0,1,The photo is deceiving - makes it look like a ...,2
1,1,Worst labels ever! I purchased these labels to...,2
2,1,This product broke in a very short time. It a...,2
3,1,The printer head is malfunctioning since the i...,2
4,1,When this item shipped to me I was very excite...,2
...,...,...,...
249995,5,Produces great prints.,1
249996,5,perfect for my high school student to use in h...,1
249997,5,The product was Excellent! !,1
249998,5,Arrived fast and works great--good buy!,1


In [2]:
import re
from bs4 import BeautifulSoup
df['review_body'] = df['review_body'].str.lower()

df['review_body'] = df['review_body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
df['review_body'] = df['review_body'].apply(lambda x: re.sub(r'http\S+', ' ', x))
df['review_body'] = df['review_body'].str.replace('[^a-zA-Z\s]', ' ', regex=True)
df['review_body'] = df['review_body'].str.replace('\s+', ' ', regex=True)


import contractions
df['review_body'] = df['review_body'].apply(contractions.fix)



import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def tokenize_text(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

def process_text(text):
    # Tokenization
    tokens = tokenize_text(text)
    # Remove stop words
    tokens_no_stopwords = remove_stopwords(tokens)
    # Lemmatization
    lemmatized_tokens = lemmatize_text(tokens_no_stopwords)

    return lemmatized_tokens


tokenized_data = df['review_body'].apply(process_text)

  df['review_body'] = df['review_body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
  df['review_body'] = df['review_body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


In [3]:
tokenized_data

0         [photo, deceiving, make, look, like, set, pen,...
1         [worst, label, ever, purchased, label, try, re...
2         [product, broke, short, time, also, poor, job,...
3         [printer, head, malfunctioning, since, install...
4         [item, shipped, excited, outside, great, quali...
                                ...                        
249995                              [produce, great, print]
249996    [perfect, high, school, student, use, math, cl...
249997                                 [product, excellent]
249998              [arrived, fast, work, great, good, buy]
249999    [glad, bought, headset, hear, better, ever, un...
Name: review_body, Length: 250000, dtype: object

In [5]:
type(tokenized_data)

pandas.core.series.Series

### Training Word2Vec

In [37]:
from gensim.models import Word2Vec

model = Word2Vec(tokenized_data, vector_size=300, window=11, min_count=10)


model.save("word2vec.model")




In [4]:
from gensim.models import Word2Vec
# Load model
model = Word2Vec.load("word2vec.model")

In [5]:
similarity = model.wv.similarity('happy', 'impressed')
print(similarity)

0.5609798


In [41]:
w1='excellent'
w2='outstanding'
print('%r\t%r\t%.2f' % (w1, w2, model.wv.similarity(w1, w2)))

'excellent'	'outstanding'	0.78


In [42]:
word_pairs = [
    ('smartphone', 'camera'), ('laptop', 'charger'), ('headphone', 'bluetooth'),
    ('novel', 'author'), ('fiction', 'character'),
]


def compare_similarities(model1, model2, word_pairs):
    
    results = []

    for word1, word2 in word_pairs:
        similarity_model1 = model1.similarity(word1, word2)
        similarity_model2 = model2.similarity(word1, word2)
        results.append({
            'Word Pair': f'{word1}, {word2}',
            'Pretrained Model Similarity': round(similarity_model1, 2),
            'Amazon Review Model Similarity': round(similarity_model2, 2)
        })
    
   
    results_df = pd.DataFrame(results)

   
    print(results_df)


compare_similarities(wv, model.wv, word_pairs)


              Word Pair  Pretrained Model Similarity  \
0    smartphone, camera                         0.32   
1       laptop, charger                         0.47   
2  headphone, bluetooth                         0.49   
3         novel, author                         0.46   
4    fiction, character                         0.25   

   Amazon Review Model Similarity  
0                            0.41  
1                            0.25  
2                            0.45  
3                            0.58  
4                            0.31  


# Simple Models

## Perceptron and SVM with Custom Word2Vec

In [43]:
import numpy as np

def average_word2vec(reviews, word2vec_model, vector_size):
    features = []

    for review in reviews:
        
        valid_words = [word for word in review if word in word2vec_model.wv.key_to_index]

        if not valid_words:
           
            features.append(np.zeros(vector_size))
            continue

        
        word_vectors = np.array([word2vec_model.wv[word] for word in valid_words])
        avg_vector = word_vectors.mean(axis=0)
        features.append(avg_vector)

    return np.array(features)


avg_features = average_word2vec(tokenized_data, model, vector_size=300)


In [44]:

df_filtered = df[df['class'] != 3]

filtered_indices = df_filtered.index.to_numpy()

avg_features_filtered = avg_features[filtered_indices]


print("Filtered DataFrame shape:", df_filtered.shape)
print("Filtered avg_features shape:", avg_features_filtered.shape)

Filtered DataFrame shape: (200000, 3)
Filtered avg_features shape: (200000, 300)


In [45]:
X=avg_features_filtered
y=df_filtered['class']

In [46]:
X.shape, y.shape

((200000, 300), (200000,))

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

perceptron_model = Perceptron()

perceptron_model.fit(X_train, y_train)

y_train_pred = perceptron_model.predict(X_train)
y_test_pred = perceptron_model.predict(X_test)


accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)


print("\nTraining Metrics For Perceptron:")
print(f"Accuracy: {accuracy_train}")
print(f"Precision: {precision_train}")
print(f"Recall: {recall_train}")
print(f"F1-Score: {f1_train}")
print("Testing Metrics For Perceptron:")
print(f"Accuracy: {accuracy_test}")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1-Score: {f1_test}")


Training Metrics For Perceptron:
Accuracy: 0.80206875
Precision: 0.7983036618188103
Recall: 0.8083332291575512
F1-Score: 0.8032871402749222
Testing Metrics For Perceptron:
Accuracy: 0.803375
Precision: 0.7995657751899734
Recall: 0.8099165292147749
F1-Score: 0.8047078687954708


In [49]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()

svm_model.fit(X_train, y_train)

y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Evaluate the model on testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print the results
print("\nTraining Metrics For SVM:")
print(f"Accuracy: {accuracy_train}")
print(f"Precision: {precision_train}")
print(f"Recall: {recall_train}")
print(f"F1-Score: {f1_train}")
print("Testing Metrics For SVM:")
print(f"Accuracy: {accuracy_test}")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1-Score: {f1_test}")





Training Metrics For SVM:
Accuracy: 0.84153125
Precision: 0.8524577473874339
Recall: 0.8259972747615416
F1-Score: 0.8390189393217906
Testing Metrics For SVM:
Accuracy: 0.84235
Precision: 0.8529184483025088
Recall: 0.8275103713700205
F1-Score: 0.8400223248262214


## Perceptron and SVM with Google Word2Vec

In [50]:
def average_word2vec_google(reviews,vector_size):
    features = []

    for review in reviews:
        
        valid_words = [word for word in review if word in wv.key_to_index]

        if not valid_words:
           
            features.append(np.zeros(vector_size))
            continue

        
        word_vectors = np.array([wv[word] for word in valid_words])
        avg_vector = word_vectors.mean(axis=0)
        features.append(avg_vector)

    return np.array(features)


avg_features_pretrained = average_word2vec_google(tokenized_data,vector_size=300)

In [51]:
# Get the indices of the remaining rows after filtering
filtered_indices = df_filtered.index.to_numpy()

# Now, use these indices to filter the avg_features array
avg_features_filtered_pretrained = avg_features_pretrained[filtered_indices]

# Checking the dimensions to ensure they match
print("Filtered DataFrame shape:", df_filtered.shape)
print("Filtered avg_features shape:", avg_features_filtered_pretrained.shape)

Filtered DataFrame shape: (200000, 3)
Filtered avg_features shape: (200000, 300)


In [52]:
X=avg_features_filtered_pretrained
y=df_filtered['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [54]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

perceptron_model = Perceptron()

perceptron_model.fit(X_train, y_train)

y_train_pred = perceptron_model.predict(X_train)
y_test_pred = perceptron_model.predict(X_test)


accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)


print("\nTraining Metrics For Perceptron:")
print(f"Accuracy: {accuracy_train}")
print(f"Precision: {precision_train}")
print(f"Recall: {recall_train}")
print(f"F1-Score: {f1_train}")
print("Testing Metrics For Perceptron:")
print(f"Accuracy: {accuracy_test}")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1-Score: {f1_test}")


Training Metrics For Perceptron:
Accuracy: 0.7436875
Precision: 0.8928074807037343
Recall: 0.5538234595527108
F1-Score: 0.6835989939358403
Testing Metrics For Perceptron:
Accuracy: 0.7438
Precision: 0.8917074737095609
Recall: 0.5552056780126956
F1-Score: 0.6843272548053229


In [55]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()

svm_model.fit(X_train, y_train)

y_train_pred = svm_model.predict(X_train)
y_test_pred = svm_model.predict(X_test)

accuracy_train = accuracy_score(y_train, y_train_pred)
precision_train = precision_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)
f1_train = f1_score(y_train, y_train_pred)

# Evaluate the model on testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print the results
print("\nTraining Metrics For SVM:")
print(f"Accuracy: {accuracy_train}")
print(f"Precision: {precision_train}")
print(f"Recall: {recall_train}")
print(f"F1-Score: {f1_train}")
print("Testing Metrics For SVM:")
print(f"Accuracy: {accuracy_test}")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1-Score: {f1_test}")




Training Metrics For SVM:
Accuracy: 0.81735625
Precision: 0.8353191376941773
Recall: 0.7905316715212581
F1-Score: 0.8123085223222029
Testing Metrics For SVM:
Accuracy: 0.816875
Precision: 0.8335963804713805
Recall: 0.7919728095166692
F1-Score: 0.8122516980648469


# Feedforward Neural Networks

## Feedforward Neural Networks Average Values Custom Word2Vec Binary

In [57]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [58]:
X=avg_features_filtered
y=df_filtered['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:

from sklearn.model_selection import train_test_split
from torch.optim import lr_scheduler

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(300, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 2)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model, criterion, optimizer, train_loader, val_loader, epochs=100, patience=3)



Epoch: 1, Validation Loss: 0.0054, Accuracy: 85.55%
Epoch: 2, Validation Loss: 0.0052, Accuracy: 85.98%
Epoch: 3, Validation Loss: 0.0052, Accuracy: 85.82%
Epoch: 4, Validation Loss: 0.0051, Accuracy: 86.12%
Epoch: 5, Validation Loss: 0.0051, Accuracy: 86.17%
Epoch: 6, Validation Loss: 0.0051, Accuracy: 86.22%
Epoch: 7, Validation Loss: 0.0051, Accuracy: 86.22%
Epoch: 8, Validation Loss: 0.0050, Accuracy: 86.59%
Epoch: 9, Validation Loss: 0.0050, Accuracy: 86.60%
Epoch: 10, Validation Loss: 0.0050, Accuracy: 86.58%
Epoch: 11, Validation Loss: 0.0050, Accuracy: 86.53%
Epoch: 12, Validation Loss: 0.0050, Accuracy: 86.57%
Epoch: 13, Validation Loss: 0.0050, Accuracy: 86.56%
Epoch: 14, Validation Loss: 0.0050, Accuracy: 86.54%
Early stopping triggered. Training stopped.


In [62]:
# Evaluate the model on training data
y_train_pred = mlp_model(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred)
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred)
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred)

# Evaluate the model on testing data
y_test_pred = mlp_model(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred)
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred)
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred)

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.8796
Precision: 0.8803
Recall: 0.8787
F1-Score: 0.8795

Testing Metrics For MLP:
Accuracy: 0.8654
Precision: 0.8664
Recall: 0.8640
F1-Score: 0.8652


## Feedforward Neural Networks Average Values Google Word2Vec Binary

In [63]:
X=avg_features_filtered_pretrained
y=df_filtered['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
X.shape

(200000, 300)

In [65]:
from sklearn.model_selection import train_test_split
from torch.optim import lr_scheduler

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(300, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 2)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model, criterion, optimizer, train_loader, val_loader, epochs=100, patience=3)



Epoch: 1, Validation Loss: 0.0061, Accuracy: 82.96%
Epoch: 2, Validation Loss: 0.0059, Accuracy: 83.64%
Epoch: 3, Validation Loss: 0.0058, Accuracy: 83.66%
Epoch: 4, Validation Loss: 0.0058, Accuracy: 83.50%
Epoch: 5, Validation Loss: 0.0057, Accuracy: 84.24%
Epoch: 6, Validation Loss: 0.0056, Accuracy: 84.35%
Epoch: 7, Validation Loss: 0.0057, Accuracy: 84.30%
Epoch: 8, Validation Loss: 0.0056, Accuracy: 84.14%
Epoch: 9, Validation Loss: 0.0055, Accuracy: 84.54%
Epoch: 10, Validation Loss: 0.0055, Accuracy: 84.57%
Epoch: 11, Validation Loss: 0.0055, Accuracy: 84.64%
Epoch: 12, Validation Loss: 0.0055, Accuracy: 84.43%
Epoch: 13, Validation Loss: 0.0055, Accuracy: 84.53%
Epoch: 14, Validation Loss: 0.0055, Accuracy: 84.55%
Epoch: 15, Validation Loss: 0.0055, Accuracy: 84.54%
Epoch: 16, Validation Loss: 0.0055, Accuracy: 84.54%
Epoch: 17, Validation Loss: 0.0055, Accuracy: 84.53%
Early stopping triggered. Training stopped.


In [66]:
# Evaluate the model on training data
y_train_pred = mlp_model(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred)
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred)
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred)

# Evaluate the model on testing data
y_test_pred = mlp_model(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred)
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred)
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred)

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.8624
Precision: 0.8586
Recall: 0.8676
F1-Score: 0.8631

Testing Metrics For MLP:
Accuracy: 0.8453
Precision: 0.8416
Recall: 0.8505
F1-Score: 0.8460


## Feedforward Neural Networks Average Values Custom Word2Vec Ternary

In [69]:
X=avg_features
y=df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
X.shape, y.shape

((250000, 300), (250000,))

In [72]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP_2(nn.Module):
    def __init__(self):
        super(MLP_2, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(300, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 3)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model_2 = MLP_2()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model_2.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model_2, criterion, optimizer, train_loader, val_loader, epochs=100, patience=10)



Epoch: 1, Validation Loss: 0.0114, Accuracy: 68.62%
Epoch: 2, Validation Loss: 0.0111, Accuracy: 69.70%
Epoch: 3, Validation Loss: 0.0110, Accuracy: 69.88%
Epoch: 4, Validation Loss: 0.0111, Accuracy: 69.75%
Epoch: 5, Validation Loss: 0.0110, Accuracy: 69.82%
Epoch: 6, Validation Loss: 0.0109, Accuracy: 70.02%
Epoch: 7, Validation Loss: 0.0110, Accuracy: 70.02%
Epoch: 8, Validation Loss: 0.0109, Accuracy: 70.06%
Epoch: 9, Validation Loss: 0.0109, Accuracy: 70.16%
Epoch: 10, Validation Loss: 0.0109, Accuracy: 70.12%
Epoch: 11, Validation Loss: 0.0109, Accuracy: 70.30%
Epoch: 12, Validation Loss: 0.0109, Accuracy: 70.40%
Epoch: 13, Validation Loss: 0.0110, Accuracy: 70.00%
Epoch: 14, Validation Loss: 0.0109, Accuracy: 70.03%
Epoch: 15, Validation Loss: 0.0110, Accuracy: 69.65%
Epoch: 16, Validation Loss: 0.0109, Accuracy: 70.18%
Epoch: 17, Validation Loss: 0.0109, Accuracy: 69.96%
Epoch: 18, Validation Loss: 0.0109, Accuracy: 70.24%
Epoch: 19, Validation Loss: 0.0108, Accuracy: 70.39%
Ep

In [75]:
# Evaluate the model on training data
y_train_pred = mlp_model_2(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred, average='weighted')

# Evaluate the model on testing data
y_test_pred = mlp_model_2(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred, average='weighted')

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.7287
Precision: 0.7059
Recall: 0.7287
F1-Score: 0.7091

Testing Metrics For MLP:
Accuracy: 0.7041
Precision: 0.6781
Recall: 0.7041
F1-Score: 0.6834


## Feedforward Neural Networks Average Values Google Word2Vec Ternary

In [None]:
X=avg_features_pretrained
y=df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP_2(nn.Module):
    def __init__(self):
        super(MLP_2, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(300, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 3)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model_2 = MLP_2()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model_2.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model_2, criterion, optimizer, train_loader, val_loader, epochs=100, patience=10)



Epoch: 1, Validation Loss: 0.0113, Accuracy: 69.02%
Epoch: 2, Validation Loss: 0.0111, Accuracy: 69.71%
Epoch: 3, Validation Loss: 0.0111, Accuracy: 69.80%
Epoch: 4, Validation Loss: 0.0111, Accuracy: 69.51%
Epoch: 5, Validation Loss: 0.0110, Accuracy: 69.90%
Epoch: 6, Validation Loss: 0.0109, Accuracy: 69.96%
Epoch: 7, Validation Loss: 0.0109, Accuracy: 70.13%
Epoch: 8, Validation Loss: 0.0109, Accuracy: 70.10%
Epoch: 9, Validation Loss: 0.0109, Accuracy: 70.01%
Epoch: 10, Validation Loss: 0.0109, Accuracy: 70.29%
Epoch: 11, Validation Loss: 0.0109, Accuracy: 70.06%
Epoch: 12, Validation Loss: 0.0109, Accuracy: 70.03%
Epoch: 13, Validation Loss: 0.0109, Accuracy: 70.26%
Epoch: 14, Validation Loss: 0.0109, Accuracy: 70.15%
Epoch: 15, Validation Loss: 0.0109, Accuracy: 70.01%
Epoch: 16, Validation Loss: 0.0109, Accuracy: 70.07%
Epoch: 17, Validation Loss: 0.0108, Accuracy: 70.21%
Epoch: 18, Validation Loss: 0.0108, Accuracy: 70.23%
Epoch: 19, Validation Loss: 0.0109, Accuracy: 70.19%
Ep

In [77]:
# Evaluate the model on training data
y_train_pred = mlp_model_2(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred, average='weighted')

# Evaluate the model on testing data
y_test_pred = mlp_model_2(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred, average='weighted')

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.7270
Precision: 0.7048
Recall: 0.7270
F1-Score: 0.7074

Testing Metrics For MLP:
Accuracy: 0.7015
Precision: 0.6753
Recall: 0.7015
F1-Score: 0.6804


## Feedforward Neural Networks Concatenated Values Custom Word2Vec Binary

In [78]:
def concatenated_word2vec(reviews, word2vec_model, vector_size, concat_size=10):
    features = []

    for review in reviews:
        valid_words = [word for word in review if word in word2vec_model.wv.key_to_index]

        if len(valid_words) >= concat_size:
            # Take the embeddings of the first 'concat_size' valid words
            word_vectors = np.array([word2vec_model.wv[word] for word in valid_words[:concat_size]])
            concat_vector = word_vectors.flatten()
        else:
            # If there aren't enough valid words, pad the rest with zeros
            word_vectors = np.array([word2vec_model.wv[word] for word in valid_words] +
                                    [np.zeros(vector_size) for _ in range(concat_size - len(valid_words))])
            concat_vector = word_vectors.flatten()

        features.append(concat_vector)

    return np.array(features)

# Assuming 'tokenized_data' is your list of tokenized reviews and 'model' is your trained Word2Vec model
concat_features = concatenated_word2vec(tokenized_data, model, vector_size=300)


In [79]:
concat_features.shape

(250000, 3000)

In [81]:
# Get the indices of the remaining rows after filtering
filtered_indices = df_filtered.index.to_numpy()

# Now, use these indices to filter the avg_features array
concat_features_filtered = concat_features[filtered_indices]

# Checking the dimensions to ensure they match
print("Filtered DataFrame shape:", df_filtered.shape)
print("Filtered concat_features shape:", concat_features_filtered.shape)

Filtered DataFrame shape: (200000, 3)
Filtered concat_features shape: (200000, 3000)


In [82]:
X=concat_features_filtered
y=df_filtered['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
X.shape,y.shape

((200000, 3000), (200000,))

In [84]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(3000, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 2)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model, criterion, optimizer, train_loader, val_loader, epochs=100, patience=3)



Epoch: 1, Validation Loss: 0.0069, Accuracy: 79.24%
Epoch: 2, Validation Loss: 0.0068, Accuracy: 79.69%
Epoch: 3, Validation Loss: 0.0070, Accuracy: 79.29%
Epoch: 4, Validation Loss: 0.0071, Accuracy: 79.18%
Epoch: 5, Validation Loss: 0.0076, Accuracy: 79.30%
Early stopping triggered. Training stopped.


In [85]:
# Evaluate the model on training data
y_train_pred = mlp_model(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred)
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred)
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred)

# Evaluate the model on testing data
y_test_pred = mlp_model(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred)
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred)
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred)

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.8921
Precision: 0.8866
Recall: 0.8992
F1-Score: 0.8929

Testing Metrics For MLP:
Accuracy: 0.7930
Precision: 0.7876
Recall: 0.8022
F1-Score: 0.7948


## Feedforward Neural Networks Concatenated Values Google Word2Vec Binary

In [86]:
def concatenated_word2vec_google(reviews,vector_size, concat_size=10):
    features = []

    for review in reviews:
        valid_words = [word for word in review if word in wv.key_to_index]

        if len(valid_words) >= concat_size:
            # Take the embeddings of the first 'concat_size' valid words
            word_vectors = np.array([wv[word] for word in valid_words[:concat_size]])
            concat_vector = word_vectors.flatten()
        else:
            # If there aren't enough valid words, pad the rest with zeros
            word_vectors = np.array([wv[word] for word in valid_words] +
                                    [np.zeros(vector_size) for _ in range(concat_size - len(valid_words))])
            concat_vector = word_vectors.flatten()

        features.append(concat_vector)

    return np.array(features)

# Assuming 'tokenized_data' is your list of tokenized reviews and 'model' is your trained Word2Vec model
concat_features_pretrained = concatenated_word2vec_google(tokenized_data,vector_size=300)


In [87]:
concat_features_pretrained.shape

(250000, 3000)

In [88]:
# Get the indices of the remaining rows after filtering
filtered_indices = df_filtered.index.to_numpy()

# Now, use these indices to filter the avg_features array
concat_features_filtered_pretrained = concat_features_pretrained[filtered_indices]

# Checking the dimensions to ensure they match
print("Filtered DataFrame shape:", df_filtered.shape)
print("Filtered concat_features shape:", concat_features_filtered_pretrained.shape)

Filtered DataFrame shape: (200000, 3)
Filtered concat_features shape: (200000, 3000)


In [89]:
X=concat_features_filtered_pretrained
y=df_filtered['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
X.shape,y.shape

((200000, 3000), (200000,))

In [91]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(3000, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 2)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model, criterion, optimizer, train_loader, val_loader, epochs=100, patience=3)



Epoch: 1, Validation Loss: 0.0072, Accuracy: 77.88%
Epoch: 2, Validation Loss: 0.0071, Accuracy: 78.58%
Epoch: 3, Validation Loss: 0.0072, Accuracy: 78.42%
Epoch: 4, Validation Loss: 0.0075, Accuracy: 77.86%
Epoch: 5, Validation Loss: 0.0081, Accuracy: 77.86%
Early stopping triggered. Training stopped.


In [92]:
# Evaluate the model on training data
y_train_pred = mlp_model(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred)
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred)
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred)

# Evaluate the model on testing data
y_test_pred = mlp_model(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred)
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred)
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred)

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.8966
Precision: 0.8920
Recall: 0.9025
F1-Score: 0.8972

Testing Metrics For MLP:
Accuracy: 0.7786
Precision: 0.7738
Recall: 0.7872
F1-Score: 0.7804


## Feedforward Neural Networks Concatenated Values Custom Word2Vec Ternary

In [93]:
X=concat_features
y=df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [94]:
X.shape,y.shape

((250000, 3000), (250000,))

In [95]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(3000, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 3)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model, criterion, optimizer, train_loader, val_loader, epochs=100, patience=10)



Epoch: 1, Validation Loss: 0.0128, Accuracy: 63.80%
Epoch: 2, Validation Loss: 0.0127, Accuracy: 64.32%
Epoch: 3, Validation Loss: 0.0128, Accuracy: 63.93%
Epoch: 4, Validation Loss: 0.0130, Accuracy: 63.47%
Epoch: 5, Validation Loss: 0.0132, Accuracy: 63.21%
Epoch: 6, Validation Loss: 0.0136, Accuracy: 62.46%
Epoch: 7, Validation Loss: 0.0141, Accuracy: 62.21%
Epoch: 8, Validation Loss: 0.0146, Accuracy: 61.88%
Epoch: 9, Validation Loss: 0.0157, Accuracy: 61.53%
Epoch: 10, Validation Loss: 0.0164, Accuracy: 61.32%
Epoch: 11, Validation Loss: 0.0168, Accuracy: 61.00%
Epoch: 12, Validation Loss: 0.0173, Accuracy: 60.98%
Early stopping triggered. Training stopped.


In [96]:
# Evaluate the model on training data
y_train_pred = mlp_model(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred, average='weighted')

# Evaluate the model on testing data
y_test_pred = mlp_model(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred, average='weighted')

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.8026
Precision: 0.7943
Recall: 0.8026
F1-Score: 0.7926

Testing Metrics For MLP:
Accuracy: 0.6098
Precision: 0.5892
Recall: 0.6098
F1-Score: 0.5961


## Feedforward Neural Networks Concatenated Values Google Word2Vec Ternary

In [97]:
X=concat_features_pretrained
y=df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [98]:
X.shape,y.shape

((250000, 3000), (250000,))

In [99]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_test, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test.values, dtype=torch.long)
y_train_tensor = y_train_tensor - 1
y_val_tensor = y_val_tensor - 1
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(3000, 50),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Linear(10, 3)
        )

    def forward(self, x):
        return self.layers(x)

# Initialize the model, loss function, and optimizer
mlp_model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)

def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, patience=3):
    # Initialize early stopping variables
    best_val_loss = float('inf')
    epochs_no_improve = 0
    early_stop = False
    
    # Scheduler for learning rate decay
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience // 2, factor=0.1, verbose=True)

    for epoch in range(epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad() # clears old gradients,
            output = model(data)
            loss = criterion(output, target) # Calculate the pred-actual Loss
            loss.backward() # Back proprogation (Calculating the gradient)
            optimizer.step() # Weight updates
        
        # Validation phase
        model.eval() # Switches the into evaluation mode
        val_loss = 0
        correct = 0
        with torch.no_grad(): # Ensures gradient is not calculated(Saves memory and computation)
            for data, target in val_loader:
                output = model(data) ## Output in the form of probabitilities
                val_loss += criterion(output, target).item() 
                pred = output.argmax(dim=1, keepdim=True) # Probabitlity with the max value is the output
                correct += pred.eq(target.view_as(pred)).sum().item() # counts the number of correct predictions

        val_loss /= len(val_loader.dataset)
        accuracy = 100. * correct / len(val_loader.dataset)
        print(f'Epoch: {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')
        
        # Early stopping logic
        if val_loss < best_val_loss: # if the  current loss is less the best loss
            best_val_loss = val_loss
            epochs_no_improve = 0 
        else: # no improvement in loss
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping triggered. Training stopped.')
                early_stop = True
                break
        
        # Learning rate scheduler step
        scheduler.step(val_loss)
        
        if early_stop:
            print("Stopped early at epoch:", epoch+1)
            break


train_model(mlp_model, criterion, optimizer, train_loader, val_loader, epochs=100, patience=10)



Epoch: 1, Validation Loss: 0.0131, Accuracy: 62.76%
Epoch: 2, Validation Loss: 0.0130, Accuracy: 63.27%
Epoch: 3, Validation Loss: 0.0131, Accuracy: 63.06%
Epoch: 4, Validation Loss: 0.0132, Accuracy: 62.97%
Epoch: 5, Validation Loss: 0.0136, Accuracy: 62.24%
Epoch: 6, Validation Loss: 0.0141, Accuracy: 61.62%
Epoch: 7, Validation Loss: 0.0146, Accuracy: 61.20%
Epoch: 8, Validation Loss: 0.0154, Accuracy: 60.87%
Epoch: 9, Validation Loss: 0.0163, Accuracy: 60.69%
Epoch: 10, Validation Loss: 0.0169, Accuracy: 60.24%
Epoch: 11, Validation Loss: 0.0174, Accuracy: 60.02%
Epoch: 12, Validation Loss: 0.0178, Accuracy: 59.85%
Early stopping triggered. Training stopped.


In [100]:
# Evaluate the model on training data
y_train_pred = mlp_model(X_train_tensor).argmax(dim=1).numpy()
accuracy_train = accuracy_score(y_train_tensor.numpy(), y_train_pred)
precision_train = precision_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
recall_train = recall_score(y_train_tensor.numpy(), y_train_pred, average='weighted')
f1_train = f1_score(y_train_tensor.numpy(), y_train_pred, average='weighted')

# Evaluate the model on testing data
y_test_pred = mlp_model(X_val_tensor).argmax(dim=1).numpy()
accuracy_test = accuracy_score(y_val_tensor.numpy(), y_test_pred)
precision_test = precision_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
recall_test = recall_score(y_val_tensor.numpy(), y_test_pred, average='weighted')
f1_test = f1_score(y_val_tensor.numpy(), y_test_pred, average='weighted')

# Print the results
print("Training Metrics For MLP:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics For MLP:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics For MLP:
Accuracy: 0.8237
Precision: 0.8166
Recall: 0.8237
F1-Score: 0.8156

Testing Metrics For MLP:
Accuracy: 0.5985
Precision: 0.5797
Recall: 0.5985
F1-Score: 0.5863


# Convolutional Neural Network

## Convolutional Neural Network Padded Value Custom Word2Vec Binary

In [6]:
import numpy as np

def padded_word2vec(reviews, word2vec_model, vector_size, pad_size=50):
    features = []

    for review in reviews:
        valid_words = [word for word in review if word in word2vec_model.wv.key_to_index]
        review_features = np.zeros((vector_size, pad_size), dtype=np.float32)

        for i, word in enumerate(valid_words[:pad_size]):
            review_features[:, i] = word2vec_model.wv[word]

        features.append(review_features)

    return np.array(features)

# Assuming 'tokenized_data' is your list of tokenized reviews and 'model' is your trained Word2Vec model
padded_features = padded_word2vec(tokenized_data, model, vector_size=300)

In [7]:
padded_features.shape

(250000, 300, 50)

In [8]:
# Filter out rows from the dataframe where 'class' is not equal to 3
df = df[df['class'] != 3]

# Get the indices of the remaining rows after filtering
filtered_indices = df.index.to_numpy()

# Now, use these indices to filter the avg_features array
padded_features = padded_features[filtered_indices]



In [9]:
# Checking the dimensions to ensure they match
print("Filtered DataFrame shape:", df.shape)
print("Filtered padded_features shape:", padded_features.shape)

Filtered DataFrame shape: (200000, 3)
Filtered padded_features shape: (200000, 300, 50)


In [10]:
X=padded_features
y=df['class']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train.shape,y_train.shape

((160000, 300, 50), (160000,))

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=300, out_channels=50, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=10, kernel_size=5, padding=2)
        self.fc = None  # Will be initialized after the first forward pass

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        
        # Check if the fc layer has been initialized, if not, do it dynamically
        if self.fc is None:
            # Calculate the correct input feature size
            n_size = x.view(x.size(0), -1).size(1)
            self.fc = nn.Linear(n_size, 2).to(x.device)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleCNN(num_classes=2)

In [13]:
# Print the model architecture
print(model)

SimpleCNN(
  (conv1): Conv1d(300, 50, kernel_size=(5,), stride=(1,), padding=(2,))
  (conv2): Conv1d(50, 10, kernel_size=(5,), stride=(1,), padding=(2,))
)


In [16]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)


y_train_tensor = y_train_tensor - 1
y_test_tensor = y_test_tensor - 1

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [19]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Function to calculate accuracy
def calculate_accuracy(y_pred, y_true):
    _, predicted = torch.max(y_pred.data, 1)
    correct = (predicted == y_true).sum().item()
    return correct / y_true.size(0)

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10, patience=3):
    best_val_acc = 0.0  # Track the best validation accuracy
    patience_counter = 0  # Counter for how many epochs without improvement

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += calculate_accuracy(outputs, labels)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = running_corrects / len(train_loader)

        # Validation phase
        model.eval()
        val_running_corrects = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                val_running_corrects += calculate_accuracy(outputs, labels)

        val_epoch_acc = val_running_corrects / len(test_loader)
        print(f'Epoch {epoch+1}/{epochs} : Training loss: {epoch_loss:.4f} | Training Accuracy: {epoch_acc:.4f} | Val Accuracy: {val_epoch_acc:.4f}')

        # Early Stopping Check
        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            patience_counter = 0  # Reset patience
        else:
            patience_counter += 1  # Increment patience

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=100, patience=5)

Epoch 1/100 : Training loss: 0.3755 | Training Accuracy: 0.8373 | Val Accuracy: 0.8532
Epoch 2/100 : Training loss: 0.3260 | Training Accuracy: 0.8627 | Val Accuracy: 0.8575
Epoch 3/100 : Training loss: 0.2996 | Training Accuracy: 0.8741 | Val Accuracy: 0.8591
Epoch 4/100 : Training loss: 0.2748 | Training Accuracy: 0.8866 | Val Accuracy: 0.8601
Epoch 5/100 : Training loss: 0.2554 | Training Accuracy: 0.8949 | Val Accuracy: 0.8521
Epoch 6/100 : Training loss: 0.2358 | Training Accuracy: 0.9040 | Val Accuracy: 0.8555
Epoch 7/100 : Training loss: 0.2182 | Training Accuracy: 0.9120 | Val Accuracy: 0.8534
Epoch 8/100 : Training loss: 0.2035 | Training Accuracy: 0.9179 | Val Accuracy: 0.8478
Epoch 9/100 : Training loss: 0.1892 | Training Accuracy: 0.9242 | Val Accuracy: 0.8485
Early stopping triggered


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    return all_preds, all_labels

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1


y_train_pred, y_train_true = evaluate_model(model, train_loader)
y_test_pred, y_test_true = evaluate_model(model, test_loader)

accuracy_train, precision_train, recall_train, f1_train = calculate_metrics(y_train_true, y_train_pred)
accuracy_test, precision_test, recall_test, f1_test = calculate_metrics(y_test_true, y_test_pred)

# Print the results
print("Training Metrics:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")


Training Metrics:
Accuracy: 0.9358
Precision: 0.9372
Recall: 0.9358
F1-Score: 0.9358

Testing Metrics:
Accuracy: 0.8485
Precision: 0.8504
Recall: 0.8485
F1-Score: 0.8483


## Convolutional Neural Network Padded Value Google Word2Vec Binary

In [4]:
import numpy as np

def padded_word2vec_google(reviews,vector_size, pad_size=50):
    features = []

    for review in reviews:
        valid_words = [word for word in review if word in wv.key_to_index]
        review_features = np.zeros((vector_size, pad_size), dtype=np.float32)

        for i, word in enumerate(valid_words[:pad_size]):
            review_features[:, i] = wv[word]

        features.append(review_features)

    return np.array(features)

# Assuming 'tokenized_data' is your list of tokenized reviews and 'model' is your trained Word2Vec model
padded_features_pretrained = padded_word2vec_google(tokenized_data,vector_size=300)

In [9]:
padded_features_pretrained.shape

(250000, 300, 50)

In [5]:
# Filter out rows from the dataframe where 'class' is not equal to 3
df = df[df['class'] != 3]

# Get the indices of the remaining rows after filtering
filtered_indices = df.index.to_numpy()

# Now, use these indices to filter the avg_features array
padded_features_pretrained = padded_features_pretrained[filtered_indices]

In [6]:
# Checking the dimensions to ensure they match
print("Filtered DataFrame shape:", df.shape)
print("Filtered padded_features shape:", padded_features_pretrained.shape)

Filtered DataFrame shape: (200000, 3)
Filtered padded_features shape: (200000, 300, 50)


In [7]:
X=padded_features_pretrained
y=df['class']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=300, out_channels=50, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=10, kernel_size=5, padding=2)
        self.fc = None 

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        
        
        if self.fc is None:
            # Calculate the correct input feature size
            n_size = x.view(x.size(0), -1).size(1)
            self.fc = nn.Linear(n_size, 2).to(x.device)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleCNN(num_classes=2)

In [9]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)


y_train_tensor = y_train_tensor - 1
y_test_tensor = y_test_tensor - 1

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [10]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Function to calculate accuracy
def calculate_accuracy(y_pred, y_true):
    _, predicted = torch.max(y_pred.data, 1)
    correct = (predicted == y_true).sum().item()
    return correct / y_true.size(0)

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10, patience=3):
    best_val_acc = 0.0  
    patience_counter = 0  

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += calculate_accuracy(outputs, labels)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = running_corrects / len(train_loader)

        # Validation phase
        model.eval()
        val_running_corrects = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                val_running_corrects += calculate_accuracy(outputs, labels)

        val_epoch_acc = val_running_corrects / len(test_loader)
        print(f'Epoch {epoch+1}/{epochs} : Training loss: {epoch_loss:.4f} | Training Accuracy: {epoch_acc:.4f} | Val Accuracy: {val_epoch_acc:.4f}')

        # Early Stopping Check
        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            patience_counter = 0  # Reset patience
        else:
            patience_counter += 1  # Increment patience

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=100, patience=3)

Epoch 1/100 : Training loss: 0.4123 | Training Accuracy: 0.8165 | Val Accuracy: 0.8439
Epoch 2/100 : Training loss: 0.3417 | Training Accuracy: 0.8554 | Val Accuracy: 0.8474
Epoch 3/100 : Training loss: 0.3083 | Training Accuracy: 0.8719 | Val Accuracy: 0.8593
Epoch 4/100 : Training loss: 0.2805 | Training Accuracy: 0.8851 | Val Accuracy: 0.8601
Epoch 5/100 : Training loss: 0.2568 | Training Accuracy: 0.8959 | Val Accuracy: 0.8595
Epoch 6/100 : Training loss: 0.2348 | Training Accuracy: 0.9062 | Val Accuracy: 0.8583
Epoch 7/100 : Training loss: 0.2163 | Training Accuracy: 0.9155 | Val Accuracy: 0.8548
Early stopping triggered


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    return all_preds, all_labels

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1


y_train_pred, y_train_true = evaluate_model(model, train_loader)
y_test_pred, y_test_true = evaluate_model(model, test_loader)

accuracy_train, precision_train, recall_train, f1_train = calculate_metrics(y_train_true, y_train_pred)
accuracy_test, precision_test, recall_test, f1_test = calculate_metrics(y_test_true, y_test_pred)

# Print the results
print("Training Metrics:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics:
Accuracy: 0.9362
Precision: 0.9362
Recall: 0.9362
F1-Score: 0.9362

Testing Metrics:
Accuracy: 0.8548
Precision: 0.8548
Recall: 0.8548
F1-Score: 0.8547


## Convolutional Neural Network Padded Value Custom Word2Vec Ternary

In [4]:
from gensim.models import Word2Vec
# Load model
model = Word2Vec.load("word2vec.model")

In [5]:
import numpy as np

def padded_word2vec(reviews, word2vec_model, vector_size, pad_size=50):
    features = []

    for review in reviews:
        valid_words = [word for word in review if word in word2vec_model.wv.key_to_index]
        review_features = np.zeros((vector_size, pad_size), dtype=np.float32)

        for i, word in enumerate(valid_words[:pad_size]):
            review_features[:, i] = word2vec_model.wv[word]

        features.append(review_features)

    return np.array(features)

padded_features = padded_word2vec(tokenized_data, model, vector_size=300)

In [6]:

print("Filtered DataFrame shape:", df.shape)
print("Filtered padded_features shape:", padded_features.shape)

Filtered DataFrame shape: (250000, 3)
Filtered padded_features shape: (250000, 300, 50)


In [7]:
X=padded_features
y=df['class']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
del padded_features
del df
del tokenized_data
del model
del X
del y

In [9]:
X_train.shape,y_train.shape

((200000, 300, 50), (200000,))

In [10]:
X_test.shape,y_test.shape

((50000, 300, 50), (50000,))

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=300, out_channels=50, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=10, kernel_size=5, padding=2)
        self.fc = None  # Will be initialized after the first forward pass

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        
        
        if self.fc is None:
           
            n_size = x.view(x.size(0), -1).size(1)
            self.fc = nn.Linear(n_size, 3).to(x.device)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleCNN(num_classes=3)

In [12]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

y_train_tensor = y_train_tensor - 1
y_test_tensor = y_test_tensor - 1

In [13]:
del X_train
del X_test
del y_train
del y_test

In [14]:

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [15]:
del X_train_tensor
del X_test_tensor
del y_train_tensor
del y_test_tensor

In [17]:
del train_dataset
del test_dataset

In [19]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Function to calculate accuracy
def calculate_accuracy(y_pred, y_true):
    _, predicted = torch.max(y_pred.data, 1)
    correct = (predicted == y_true).sum().item()
    return correct / y_true.size(0)

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10, patience=3):
    best_val_acc = 0.0  # Track the best validation accuracy
    patience_counter = 0  # Counter for how many epochs without improvement

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += calculate_accuracy(outputs, labels)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = running_corrects / len(train_loader)

        # Validation phase
        model.eval()
        val_running_corrects = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                val_running_corrects += calculate_accuracy(outputs, labels)

        val_epoch_acc = val_running_corrects / len(test_loader)
        print(f'Epoch {epoch+1}/{epochs} : Training loss: {epoch_loss:.4f} | Training Accuracy: {epoch_acc:.4f} | Val Accuracy: {val_epoch_acc:.4f}')

        # Early Stopping Check
        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            patience_counter = 0  # Reset patience
        else:
            patience_counter += 1  # Increment patience

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=100, patience=5)

Epoch 1/100 : Training loss: 0.8054 | Training Accuracy: 0.6555 | Val Accuracy: 0.6721
Epoch 2/100 : Training loss: 0.7431 | Training Accuracy: 0.6866 | Val Accuracy: 0.6807
Epoch 3/100 : Training loss: 0.7154 | Training Accuracy: 0.6985 | Val Accuracy: 0.6834
Epoch 4/100 : Training loss: 0.6923 | Training Accuracy: 0.7100 | Val Accuracy: 0.6824
Epoch 5/100 : Training loss: 0.6722 | Training Accuracy: 0.7187 | Val Accuracy: 0.6824
Epoch 6/100 : Training loss: 0.6556 | Training Accuracy: 0.7259 | Val Accuracy: 0.6762
Epoch 7/100 : Training loss: 0.6400 | Training Accuracy: 0.7335 | Val Accuracy: 0.6728
Epoch 8/100 : Training loss: 0.6260 | Training Accuracy: 0.7408 | Val Accuracy: 0.6743
Early stopping triggered


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    return all_preds, all_labels

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1


y_train_pred, y_train_true = evaluate_model(model, train_loader)
y_test_pred, y_test_true = evaluate_model(model, test_loader)

accuracy_train, precision_train, recall_train, f1_train = calculate_metrics(y_train_true, y_train_pred)
accuracy_test, precision_test, recall_test, f1_test = calculate_metrics(y_test_true, y_test_pred)

# Print the results
print("Training Metrics:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")


Training Metrics:
Accuracy: 0.7621
Precision: 0.7478
Recall: 0.7621
F1-Score: 0.7469

Testing Metrics:
Accuracy: 0.6744
Precision: 0.6478
Recall: 0.6744
F1-Score: 0.6557


## Convolutional Neural Network Padded Value Google Word2Vec Ternary

In [4]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [5]:
import numpy as np

def padded_word2vec_google(reviews,vector_size, pad_size=50):
    features = []

    for review in reviews:
        valid_words = [word for word in review if word in wv.key_to_index]
        review_features = np.zeros((vector_size, pad_size), dtype=np.float32)

        for i, word in enumerate(valid_words[:pad_size]):
            review_features[:, i] = wv[word]

        features.append(review_features)

    return np.array(features)


padded_features_pretrained = padded_word2vec_google(tokenized_data,vector_size=300)

In [6]:

print("Filtered DataFrame shape:", df.shape)
print("Filtered padded_features shape:", padded_features_pretrained.shape)

Filtered DataFrame shape: (250000, 3)
Filtered padded_features shape: (250000, 300, 50)


In [7]:
X=padded_features_pretrained
y=df['class']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
del padded_features_pretrained
del df
del tokenized_data
del wv
del X
del y

In [11]:
X_train.shape,y_train.shape

((200000, 300, 50), (200000,))

In [12]:
X_test.shape,y_test.shape

((50000, 300, 50), (50000,))

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=300, out_channels=50, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=10, kernel_size=5, padding=2)
        self.fc = None  # Will be initialized after the first forward pass

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2, stride=2)
        
        
        if self.fc is None:
            # Calculate the correct input feature size
            n_size = x.view(x.size(0), -1).size(1)
            self.fc = nn.Linear(n_size, 3).to(x.device)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = self.fc(x)
        return x

# Initialize the model
model = SimpleCNN(num_classes=3)

In [14]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

y_train_tensor = y_train_tensor - 1
y_test_tensor = y_test_tensor - 1

In [15]:
del X_train
del X_test
del y_train
del y_test

In [16]:


# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [17]:
del X_train_tensor
del X_test_tensor
del y_train_tensor
del y_test_tensor
del train_dataset
del test_dataset

In [18]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Function to calculate accuracy
def calculate_accuracy(y_pred, y_true):
    _, predicted = torch.max(y_pred.data, 1)
    correct = (predicted == y_true).sum().item()
    return correct / y_true.size(0)

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=10, patience=3):
    best_val_acc = 0.0  # Track the best validation accuracy
    patience_counter = 0  # Counter for how many epochs without improvement

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += calculate_accuracy(outputs, labels)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = running_corrects / len(train_loader)

        # Validation phase
        model.eval()
        val_running_corrects = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                val_running_corrects += calculate_accuracy(outputs, labels)

        val_epoch_acc = val_running_corrects / len(test_loader)
        print(f'Epoch {epoch+1}/{epochs} : Training loss: {epoch_loss:.4f} | Training Accuracy: {epoch_acc:.4f} | Val Accuracy: {val_epoch_acc:.4f}')

        # Early Stopping Check
        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            patience_counter = 0  # Reset patience
        else:
            patience_counter += 1  # Increment patience

        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Train the model
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=100, patience=3)

Epoch 1/100 : Training loss: 0.8213 | Training Accuracy: 0.6449 | Val Accuracy: 0.6752
Epoch 2/100 : Training loss: 0.7363 | Training Accuracy: 0.6887 | Val Accuracy: 0.6865
Epoch 3/100 : Training loss: 0.6987 | Training Accuracy: 0.7053 | Val Accuracy: 0.6843
Epoch 4/100 : Training loss: 0.6710 | Training Accuracy: 0.7185 | Val Accuracy: 0.6891
Epoch 5/100 : Training loss: 0.6478 | Training Accuracy: 0.7309 | Val Accuracy: 0.6878
Epoch 6/100 : Training loss: 0.6270 | Training Accuracy: 0.7395 | Val Accuracy: 0.6888
Epoch 7/100 : Training loss: 0.6089 | Training Accuracy: 0.7481 | Val Accuracy: 0.6804
Early stopping triggered


In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    return all_preds, all_labels

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return accuracy, precision, recall, f1


y_train_pred, y_train_true = evaluate_model(model, train_loader)
y_test_pred, y_test_true = evaluate_model(model, test_loader)

accuracy_train, precision_train, recall_train, f1_train = calculate_metrics(y_train_true, y_train_pred)
accuracy_test, precision_test, recall_test, f1_test = calculate_metrics(y_test_true, y_test_pred)

# Print the results
print("Training Metrics:")
print(f"Accuracy: {accuracy_train:.4f}")
print(f"Precision: {precision_train:.4f}")
print(f"Recall: {recall_train:.4f}")
print(f"F1-Score: {f1_train:.4f}")
print("\nTesting Metrics:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-Score: {f1_test:.4f}")

Training Metrics:
Accuracy: 0.7704
Precision: 0.7598
Recall: 0.7704
F1-Score: 0.7622

Testing Metrics:
Accuracy: 0.6803
Precision: 0.6635
Recall: 0.6803
F1-Score: 0.6697
