# Machine Learning Models

Now that we tried non-machine learning models, let's try some machine learning models.

In [2]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B/glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'

glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [3]:
import pandas as pd

# Load the data
df = pd.read_csv('data/Books_rating_stemmed.csv')
df.head()

Unnamed: 0,score,stemmed_summary_text
0,5.0,best edition classic always recommended yale e...
1,5.0,great book required reading 16 yr old son book...
2,4.0,book consultant plain spoken finished book tak...
3,1.0,outrageously bad wow one ridiculous story ever...
4,4.0,cunning determination crew mutinied threatens ...


In [4]:
# load glove model from file
from gensim.models import KeyedVectors

glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [5]:
from tqdm import tqdm
import numpy as np

# Assuming df is a DataFrame with columns 'rating' and 'review'
review_term_matrix = []

for score, stemmed_summary_text in tqdm(df[['score', 'stemmed_summary_text']].itertuples(index=False), total=len(df)):
    review_matrix = np.zeros(glove_model.vector_size)

    num_words = 0

    for word in stemmed_summary_text:
        if word in glove_model:
            review_matrix += glove_model[word]
            num_words += 1

    if num_words > 0:
        review_matrix /= num_words

    review_array = [review_matrix, score]
    review_term_matrix.append(review_array)

review_term_matrix = np.array(review_term_matrix)

print(review_term_matrix.shape)

# save the review term matrix
np.save('data/review_term_matrix.npy', review_term_matrix)

 50%|█████     | 503993/999999 [08:45<08:36, 959.63it/s] 


KeyboardInterrupt: 

In [6]:
from torch.utils.data import Dataset
import numpy as np

class ReviewDataset(Dataset):
    def __init__(self, review_term_matrix):
        self.review_term_matrix = review_term_matrix

    def __len__(self):
        return len(self.review_term_matrix)

    def __getitem__(self, idx):
        return self.review_term_matrix[idx]

review_term_matrix = np.load('data/review_term_matrix.npy', allow_pickle=True)
dataset = ReviewDataset(review_term_matrix)

In [7]:
from torch.utils.data import random_split, DataLoader
import torch

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

def custom_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    return [torch.FloatTensor(data), torch.LongTensor(target)]

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate)

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network architecture
class ReviewNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ReviewNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Set the input size, hidden size, and output size based on the GloVe vector size and the number of classes (5 scores)
input_size = glove_model.vector_size
hidden_size = 512  # You can adjust this based on your needs
output_size = 5  # Number of classes (scores)

# Move the model and data to M1 GPU
is_gpu = torch.backends.mps.is_available()

if is_gpu:
    device = torch.device("mps")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

# Instantiate the model, loss function, and optimizer
model = ReviewNet(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10

for epoch in range(epochs):
    model.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        data = torch.FloatTensor(data)  # Convert data to torch.FloatTensor
        output = model(data)
        target = torch.LongTensor(target) - 1  # Adjust target values to start from 0
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set, get loss, accuracy, mse, and rmse
    model.eval()
    correct = 0
    total = 0
    mse = 0
    rmse = 0

    with torch.no_grad():
        for data, target in test_loader:
            data = torch.FloatTensor(data)
            output = model(data)
            target = torch.LongTensor(target) - 1
            loss = criterion(output, target)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)

            correct += (predicted == target).sum().item()
            mse += loss.item() * target.size(0)
            rmse += (loss.item() ** 0.5) * target.size(0)

    accuracy = correct / total
    mse /= total
    rmse /= total

    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}, MSE: {:.4f}, RMSE: {:.4f}'.format(epoch + 1, loss.item(), accuracy, mse, rmse))

# Save the trained model
torch.save(model.state_dict(), 'review_net.pth')

GPU is available


  return [torch.FloatTensor(data), torch.LongTensor(target)]
  return [torch.FloatTensor(data), torch.LongTensor(target)]


Epoch: 1, Loss: 1.0463, Accuracy: 0.6024, MSE: 1.1450, RMSE: 1.0687
Epoch: 2, Loss: 1.0554, Accuracy: 0.6024, MSE: 1.1406, RMSE: 1.0667
Epoch: 3, Loss: 1.0951, Accuracy: 0.6024, MSE: 1.1412, RMSE: 1.0668
Epoch: 4, Loss: 1.0715, Accuracy: 0.6024, MSE: 1.1405, RMSE: 1.0669
Epoch: 5, Loss: 1.2419, Accuracy: 0.6024, MSE: 1.1363, RMSE: 1.0648
Epoch: 6, Loss: 1.0351, Accuracy: 0.6024, MSE: 1.1355, RMSE: 1.0645
Epoch: 7, Loss: 1.3200, Accuracy: 0.6024, MSE: 1.1358, RMSE: 1.0645
Epoch: 8, Loss: 0.9218, Accuracy: 0.6024, MSE: 1.1364, RMSE: 1.0646
Epoch: 9, Loss: 1.1079, Accuracy: 0.6023, MSE: 1.1355, RMSE: 1.0645
Epoch: 10, Loss: 1.2421, Accuracy: 0.6024, MSE: 1.1340, RMSE: 1.0636
