In [None]:
import copy
import os
import pickle
import string
import zipfile

import numpy as np
import scipy.sparse as sp
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import tensorflow as tf
import tensorflow_hub as hub

# Universal Sentence Encoder (v4) loaded from TF Hub; used to embed review text.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Tokenizer applied by the text-cleaning step.
tokenize = simple_preprocess
# Tokens to discard: NLTK English stop words, single punctuation characters,
# and a few extra markup/possessive fragments.
sym = [char for char in string.punctuation]
stop_words = set(stopwords.words("english")).union({"br"}, sym, {"/><br", "'s"})

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
batch_size = 256
num_factors = 16

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Which dataset to use, and where its archive / cached artefacts live.
dataset_name = "AmazonLuxuryBeauty"
# When True, later cells load previously saved splits and embeddings instead of
# regenerating them.
splits_ready = False
data_path = f'/content/drive/My Drive/Colab Data/M.Tech. Project/datasets/Amazon/{dataset_name}/{dataset_name}.zip'
save_path = f"./drive/My Drive/Colab Data/M.Tech. Project/saved splits/Amazon/{dataset_name}/"

# Unpack the dataset archive into the local data directory.
with zipfile.ZipFile(data_path, mode='r') as archive:
    archive.extractall(path=f"/content/data/{dataset_name}/")

In [None]:
# Read the line-delimited JSON reviews and keep only the four fields used below,
# renamed to the notebook's own column names.
columns = ['userId', 'itemId', 'review', 'rating']
df = pd.read_json(f"/content/data/{dataset_name}/{dataset_name}.json", lines=True)
df = df.loc[:, ['reviewerID', 'asin', 'reviewText', 'overall']]
df.columns = columns
# Distinct user / item counts, taken before any rows are filtered out.
num_users = len(pd.unique(df['userId']))
num_items = len(pd.unique(df['itemId']))

In [None]:
def process_text(text):
    """Tokenize ``text`` and return the surviving tokens joined by single spaces.

    A token survives when it is not in the module-level ``stop_words`` set and
    is longer than two characters. Returns an empty string when nothing survives.
    """
    kept = []
    for word in tokenize(text):
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(word)
    return " ".join(kept)

def generate_splits(df, random_state=None):
    """Split ``df`` into train / validation / test frames.

    20% of the rows are held out first and that hold-out is then halved, giving
    roughly an 80% / 10% / 10% partition.

    Args:
        df: DataFrame to partition.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls so the partition can be reproduced. ``None`` (the default)
            keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=random_state)
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=random_state)
    return (train_df, val_df, test_df)


def get_embeddings(reviews, load=True, chunk_size=256):
    """Return sentence embeddings for ``reviews``, one row per review.

    Args:
        reviews: Object exposing ``.tolist()`` that yields the review strings
            (only used when ``load`` is False).
        load: When True, read the embeddings previously pickled under
            ``save_path``; when False, compute them with the module-level
            ``embed`` encoder and overwrite that pickle.
        chunk_size: Number of reviews sent to the encoder per call. Must be a
            positive integer.

    Returns:
        The cached object when ``load`` is True; otherwise a 2-D numpy array
        with one row per review (an empty 1-D array when there are no reviews).
    """
    cache_file = save_path + 'true_sentence_embeddings.pkl'
    if load:
        # NOTE(review): pickle.load can execute arbitrary code; only load cache
        # files that this notebook produced itself.
        with open(cache_file, 'rb') as f:
            return pickle.load(f)

    texts = reviews.tolist()
    if texts:
        # Encode in chunks instead of one call per review, and concatenate on
        # the batch axis so a single review still yields a 2-D array (a plain
        # squeeze() would drop the batch dimension in that case).
        chunks = [
            embed(texts[start:start + chunk_size]).numpy()
            for start in range(0, len(texts), chunk_size)
        ]
        true_embeddings = np.concatenate(chunks, axis=0)
    else:
        true_embeddings = np.array([])

    with open(cache_file, 'wb') as f:
        pickle.dump(true_embeddings, f, pickle.HIGHEST_PROTOCOL)
    return true_embeddings


def create_dataset(df, mode="Test", embeddings=None):
    """Convert a ratings frame into a dict of per-row samples keyed by index label.

    Columns are read by position: column 0 is the user id, column 1 the item
    id and column 3 the rating.

    Args:
        df: DataFrame with at least four columns in the order described above.
        mode: ``"Train"`` adds the review embedding to every sample; any other
            value produces samples without it.
        embeddings: Indexable collection of review embeddings, looked up with
            each row's index label. Defaults to the module-level
            ``true_embeddings`` when ``mode`` is ``"Train"``.
            NOTE(review): the lookup assumes index labels are valid positions
            into ``embeddings`` (e.g. after ``reset_index(drop=True)``).

    Returns:
        Dict mapping index label to ``[user, item, embedding, rating]`` in
        ``"Train"`` mode, or ``[user, item, rating]`` otherwise.
    """
    with_embedding = mode == "Train"
    if with_embedding and embeddings is None:
        embeddings = true_embeddings

    # Explicit positional access: integer keys on a string-labelled Series
    # (``row[0]``) are deprecated in recent pandas, and iterating columns is
    # much cheaper than building a Series per row with iterrows().
    users = df.iloc[:, 0]
    items = df.iloc[:, 1]
    ratings = df.iloc[:, 3]

    user_item_ratings = {}
    for idx, user, item, rating in zip(df.index, users, items, ratings):
        if with_embedding:
            user_item_ratings[idx] = [int(user), int(item), embeddings[idx], rating]
        else:
            user_item_ratings[idx] = [int(user), int(item), rating]
    return user_item_ratings

In [None]:
if not splits_ready:
    # Drop rows without a review; .copy() makes the column assignment below
    # write to an independent frame rather than a slice of the raw one.
    df = df[~df['review'].isna()].copy()
    df['review'] = df['review'].apply(process_text)
    # process_text always returns a string, so a review that lost every token
    # is "" rather than NaN; an isna() check cannot remove it.
    df = df[df['review'] != ""]
    # Shuffle the rows.
    df = df.sample(frac=1)

    # Replace the raw user / item identifiers with integer category codes.
    df['userId'] = df['userId'].astype('category').cat.codes
    df['itemId'] = df['itemId'].astype('category').cat.codes

    train_df, val_df, test_df = generate_splits(df)
    print(f"Train size: {len(train_df)} | Val size: {len(val_df)} | Test size: {len(test_df)}")

    # Make sure the target directory exists before writing the cached splits.
    os.makedirs(save_path, exist_ok=True)
    df.to_csv(save_path + 'df.csv', index=False)
    train_df.to_csv(save_path + 'train_df.csv', index=False)
    val_df.to_csv(save_path + 'val_df.csv', index=False)
    test_df.to_csv(save_path + 'test_df.csv', index=False)
else:
    # Reuse the splits written by an earlier run.
    df = pd.read_csv(save_path + 'df.csv')
    train_df = pd.read_csv(save_path + 'train_df.csv')
    val_df = pd.read_csv(save_path + 'val_df.csv')
    test_df = pd.read_csv(save_path + 'test_df.csv')

Train size: 27412 | Val size: 3426 | Test size: 3427


In [None]:
# Reload every split from disk, discard rows whose review is missing, and
# renumber the remaining rows from zero.
df = pd.read_csv(save_path + 'df.csv').dropna(subset=['review']).reset_index(drop=True)
train_df = pd.read_csv(save_path + 'train_df.csv').dropna(subset=['review']).reset_index(drop=True)
val_df = pd.read_csv(save_path + 'val_df.csv').dropna(subset=['review']).reset_index(drop=True)
test_df = pd.read_csv(save_path + 'test_df.csv').dropna(subset=['review']).reset_index(drop=True)

# Sentence embeddings of the training reviews (loaded from cache when the
# splits were already prepared).
true_embeddings = get_embeddings(train_df["review"], load=splits_ready)
print(f"Train size: {len(train_df)} | Val size: {len(val_df)} | Test size: {len(test_df)}")

# Only the training samples carry a review embedding.
train_set = create_dataset(train_df, mode="Train")
val_set = create_dataset(val_df, mode="Val")
test_set = create_dataset(test_df, mode="Test")

# Shuffle only the training data; keep evaluation order fixed.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

Train size: 27386 | Val size: 3422 | Test size: 3427


In [None]:
def evaluate(model, data_loader):
    """Compute MSE and MAE of the model's rating predictions over a whole loader.

    Errors are accumulated across every batch and divided by the total number
    of ratings, so the result covers the full dataset rather than only the
    final batch.

    Args:
        model: ``nn.Module`` called as ``model(users, items)`` whose first
            returned element holds the predicted ratings.
        data_loader: Iterable yielding ``(users, items, ratings)`` tensor
            batches.

    Returns:
        Tuple ``(mse, mae)`` of Python floats.

    Raises:
        ValueError: If ``data_loader`` yields no ratings.
    """
    model.eval()
    # Send batches to wherever the model's weights live; fall back to the
    # notebook-level ``device`` for a parameter-free model.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else device

    squared_error = 0.0
    absolute_error = 0.0
    num_ratings = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for users, items, ratings in data_loader:
            users = users.to(target_device)
            items = items.to(target_device)
            ratings = ratings.to(target_device)
            scores = model(users, items)[0]
            ratings = ratings.float().view(scores.size())
            errors = scores - ratings
            squared_error += errors.pow(2).sum().item()
            absolute_error += errors.abs().sum().item()
            num_ratings += ratings.numel()

    if num_ratings == 0:
        raise ValueError("evaluate() received no ratings to score")
    return squared_error / num_ratings, absolute_error / num_ratings
    

def train_one_epoch(model, data_loader, loss_function, optimizer, epoch,
                    decay=5, val_data_loader=None):
    """Run one optimisation pass over ``data_loader`` and report validation metrics.

    The per-batch objective is ``loss_function(preds[0], ratings)`` plus
    ``decay * loss_function(preds[1], reviews)``, where ``preds`` is the list
    returned by ``model(users, items, reviews)``.

    Args:
        model: Module called as ``model(users, items, reviews)``.
        data_loader: Iterable yielding ``(users, items, reviews, ratings)``
            tensor batches.
        loss_function: Callable applied to both the rating term and the
            review term.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the printed report.
        decay: Weight applied to the review term (default 5).
        val_data_loader: Loader passed to ``evaluate`` after the pass;
            defaults to the notebook-level ``val_loader``.

    Returns:
        Tuple ``(epoch_loss, epoch_loss1, epoch_loss2, val_mse, val_mae)``
        where the first three are means of the per-batch total, rating and
        weighted review losses.
    """
    if val_data_loader is None:
        val_data_loader = val_loader

    model.train()
    epoch_loss = []
    epoch_loss1 = []
    epoch_loss2 = []

    for users, items, reviews, ratings in data_loader:
        users, items = users.to(device), items.to(device)
        reviews, ratings = reviews.to(device), ratings.to(device)
        preds = model(users, items, reviews)
        ratings = ratings.float().view(preds[0].size())
        loss1 = loss_function(preds[0], ratings)
        loss2 = decay * loss_function(preds[1], reviews)
        loss = loss1 + loss2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())
        epoch_loss1.append(loss1.item())
        epoch_loss2.append(loss2.item())

    epoch_loss = np.mean(epoch_loss)
    epoch_loss1 = np.mean(epoch_loss1)
    epoch_loss2 = np.mean(epoch_loss2)
    val_mse, val_mae = evaluate(model, val_data_loader)
    print(f"Epoch: {epoch} | Train MSE: {epoch_loss}")
    print(f"\t Model MSE: {epoch_loss1} | Regularizer MSE: {epoch_loss2} ")
    print(f"\t Val MSE: {val_mse} | Val MAE: {val_mae} ")
    return epoch_loss, epoch_loss1, epoch_loss2, val_mse, val_mae

In [None]:
# Latent factor size handed to the model constructors in this cell; repeats the
# value already assigned in the first cell.
num_factors = 16

class ReviewRegularizer(nn.Module):
    """Dropout -> Linear -> Sigmoid head applied to an interaction vector.

    Args:
        num_factors: Accepted for interface compatibility; not used.
        num_layers: Accepted for interface compatibility; not used.
        input_size: Width of the incoming interaction vector (default 128).
        output_size: Width of the produced vector (default 512).

    NOTE(review): the Sigmoid bounds every output to (0, 1); confirm the
    regression targets this head is trained against fall in that range.
    """

    def __init__(self, num_factors, num_layers=1, input_size=128, output_size=512):
        super(ReviewRegularizer, self).__init__()
        self.model = nn.Sequential(
            nn.Dropout(p=0.7),
            nn.Linear(input_size, output_size),
            nn.Sigmoid()
        )

    def forward(self, interaction):
        """Map ``interaction`` (last dim ``input_size``) to ``output_size`` values."""
        return self.model(interaction)


class NCF(nn.Module):
    """Rating predictor built from user / item embeddings and an MLP tower.

    User and item embeddings are concatenated, passed through dropout and a
    stack of Dropout -> Linear -> ReLU stages that halve the width each time,
    and reduced to one score per pair by a final linear layer. When a review
    tensor is supplied, the concatenated embeddings are also fed to
    ``review_regularizer`` and its output is returned alongside the scores.
    """

    def __init__(self, review_regularizer, num_users, num_items, num_factors=16, num_layers=4):
        super().__init__()

        # Each embedding is half as wide as the first MLP stage's input.
        half_width = num_factors * (2 ** (num_layers - 1))
        self.embed_user_MLP = nn.Embedding(num_users, half_width)
        self.embed_item_MLP = nn.Embedding(num_items, half_width)
        self.dropout = nn.Dropout(p=0.5)

        # Build the tower, halving the width at every stage until it reaches
        # ``num_factors``.
        stages = []
        width = 2 * half_width
        for _ in range(num_layers):
            stages += [nn.Dropout(p=0.4), nn.Linear(width, width // 2), nn.ReLU()]
            width //= 2
        self.MLP_forward = nn.Sequential(*stages)
        self.predict = nn.Linear(num_factors, 1)

        self.review_regularizer = review_regularizer

    def forward(self, user, item, review=None):
        """Return ``[scores]``, or ``[scores, regularizer_output]`` when ``review`` is given."""
        interaction = torch.cat(
            (self.embed_user_MLP(user), self.embed_item_MLP(item)), dim=-1
        )
        hidden = self.MLP_forward(self.dropout(interaction))
        preds = [self.predict(hidden).view(-1)]
        if review is None:
            return preds
        # The regularizer sees the raw (pre-dropout) concatenated embeddings.
        preds.append(self.review_regularizer(interaction))
        return preds


# Build the regularizer head and the rating model on the selected device.
review_regularizer = ReviewRegularizer(num_factors=num_factors).to(device)
model = NCF(review_regularizer, num_users, num_items, num_factors=num_factors, num_layers=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
loss_function = nn.MSELoss()

# Training length and per-epoch metric histories.
num_epochs = 80
losses = []
losses1 = []
losses2 = []
val_mses = []
val_maes = []
# Start from +inf so the first epoch always counts as an improvement and a
# checkpoint is written no matter how large the initial validation error is.
best_val_mse = float("inf")
best_model = None

In [None]:
# Train for num_epochs, checkpointing whenever the validation MSE improves and
# recording every epoch's metrics.
for epoch in range(1, num_epochs + 1):
    epoch_loss, epoch_loss1, epoch_loss2, val_mse, val_mae = train_one_epoch(
        model=model,
        data_loader=train_loader,
        loss_function=loss_function,
        optimizer=optimizer,
        epoch=epoch
    )
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        # Snapshot the weights: binding ``model`` itself would only alias the
        # live object, which keeps changing in later epochs.
        best_model = copy.deepcopy(model)
        torch.save(model.state_dict(), "./model.pth")
        print("Saving model...")
    losses.append(epoch_loss)
    losses1.append(epoch_loss1)
    losses2.append(epoch_loss2)
    val_mses.append(val_mse)
    val_maes.append(val_mae)

Epoch: 1 | Train MSE: 8.225450836609458
	 Model MSE: 6.975387682424527 | Regularizer MSE: 1.2500631096207093 
	 Val MSE: 4.945785045623779 | Val MAE: 2.0940771102905273 
Saving model...
Epoch: 2 | Train MSE: 3.4267347193209923
	 Model MSE: 2.4478197632548966 | Regularizer MSE: 0.9789149460391463 
	 Val MSE: 4.141963958740234 | Val MAE: 1.9140702486038208 
Saving model...
Epoch: 3 | Train MSE: 3.0895324889744553
	 Model MSE: 2.2390713836545144 | Regularizer MSE: 0.8504611008635191 
	 Val MSE: 3.7900052070617676 | Val MAE: 1.8158149719238281 
Saving model...
Epoch: 4 | Train MSE: 2.8413227228360753
	 Model MSE: 2.0929287237541696 | Regularizer MSE: 0.7483939879408507 
	 Val MSE: 3.093989849090576 | Val MAE: 1.62710440158844 
Saving model...
Epoch: 5 | Train MSE: 2.614825914953357
	 Model MSE: 1.9537305486536471 | Regularizer MSE: 0.6610953713131842 
	 Val MSE: 2.810434341430664 | Val MAE: 1.5310927629470825 
Saving model...
Epoch: 6 | Train MSE: 2.419850898680286
	 Model MSE: 1.830698177

In [None]:
# Rebuild the architecture with its own regularizer head so that loading the
# checkpoint does not overwrite the weights of the model still held from training.
test_model = NCF(
    ReviewRegularizer(num_factors=num_factors),
    num_users,
    num_items,
    num_factors=num_factors,
    num_layers=3,
).to(device)
# map_location lets a checkpoint saved on one device be restored on another.
test_model.load_state_dict(torch.load("./model.pth", map_location=device))
# Report on the held-out test split (not the validation split used for
# checkpoint selection).
test_mse, test_mae = evaluate(data_loader=test_loader, model=test_model)
print("test_mse", test_mse)
print("test_mae", test_mae)

test_mse 0.5585541725158691
test_mae 0.5497888326644897


In [None]:
# Dump each recorded metric history under its heading.
for heading, history in (
    ("Train MSE 1:", losses1),
    ("Train MSE 2:", losses2),
    ("Train MSE Overall:", losses),
    ("Val MSE:", val_mses),
    ("Val MAE:", val_maes),
):
    print(heading)
    print(history)

Train MSE 1:
[6.975387682424527, 2.4478197632548966, 2.2390713836545144, 2.0929287237541696, 1.9537305486536471, 1.830698177079174, 1.6802593937544066, 1.5380160507754745, 1.4044613258860936, 1.3048175083142575, 1.252107774542871, 1.2220517926126997, 1.178480645206487, 1.1532885393249654, 1.130443045469088, 1.1176087811728503, 1.1029907682231654, 1.0985352179714452, 1.0881191820741813, 1.0822742391969555, 1.0817130319425994, 1.0750847169171984, 1.0714717278970736, 1.0625552585191815, 1.0566946000696342, 1.0413551999029713, 1.0277580451742512, 0.9920190143808026, 0.954094694039532, 0.9024516984681102, 0.8611442697382419, 0.8427348610396698, 0.8059221355714531, 0.784281529555811, 0.7658335839476541, 0.745366521527834, 0.7258392043202837, 0.7153672503533764, 0.6987694248417827, 0.6891251011429546, 0.6710373413897006, 0.6663967811058615, 0.6503953318172526, 0.637832385914348, 0.6350061077380849, 0.6206849381745418, 0.613675621625419, 0.6031197111740291, 0.5965623312464384, 0.59010180747397