In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import random
import itertools
import os
import warnings
warnings.filterwarnings('ignore')
import copy

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn import decomposition
from sklearn.decomposition import PCA


import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

def fix_random(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    The same `seed` is applied to NumPy, Python's `random` module and
    PyTorch (CPU and CUDA), and cuDNN is switched to deterministic mode.

    Args:
        seed: integer seed shared by all generators.
    """
    # cuDNN: no benchmark-based kernel selection, deterministic kernels only (slower).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)


fix_random(42)


In [2]:
# Load the dataset from the notebook's working directory.
df = pd.read_csv('dataset.csv')

# PyTorch device: CUDA if present, otherwise Apple MPS, otherwise CPU.
# The conditional chain short-circuits, so MPS is only probed when CUDA is absent.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

print('Using device:', device)

Using device: cuda


In [3]:
# Columns that `normalize` rescales with `transform`.
columns_to_transform = ['year', 'rating_count']


def transform(X):
    """Scale `X` to unit L2 norm.

    The norm is `np.linalg.norm(X, ord=2)`: the Euclidean norm for 1-D input
    (the way it is used on DataFrame columns here). Note that for 2-D input
    NumPy interprets `ord=2` as the spectral norm (largest singular value).

    Args:
        X: array-like of numbers (e.g. a pandas Series or NumPy array).

    Returns:
        `X / norm`, a new object of the type produced by dividing `X` by a
        scalar. An all-zero (or empty) input is returned as zeros instead of
        NaN/inf.
    """
    X_norm2 = np.linalg.norm(X, ord=2)
    if X_norm2 == 0:
        # Zero vector: dividing by the norm would give NaN (silently, since
        # warnings are suppressed in this notebook). Divide by 1 instead so
        # the result stays all-zero and the return type is unchanged.
        X_norm2 = 1.0
    return X / X_norm2

def normalize(df, type):
    """Rescale every column listed in `columns_to_transform` with `transform`.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing the columns named in `columns_to_transform`.
        type: label printed before processing; it does not select the
            normalisation that is applied.

    Returns:
        The same `df` object with the listed columns replaced.
    """
    print(type)
    for name in columns_to_transform:
        rescaled = transform(df[name])
        df[name] = rescaled
    return df


df = normalize(df, 'L2_normalization')

L2_normalization


In [4]:
# Separate the regression target ('rating') from the predictors.
Y = df['rating']
X = df.drop(columns='rating')

# Hold out 20% of the rows for testing, then 10% of the remainder for validation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

# Convert every split from pandas objects to NumPy arrays.
X_train, X_val, X_test = (features.to_numpy() for features in (X_train, X_val, X_test))
Y_train, Y_val, Y_test = (targets.to_numpy() for targets in (Y_train, Y_val, Y_test))

In [5]:
# Fit PCA on the training split only (n_components=0.95), then project all
# three splits with that same fitted transform.
pca = PCA(n_components=0.95).fit(X_train)
X_train, X_val, X_test = (pca.transform(split) for split in (X_train, X_val, X_test))

In [6]:
# Report the size of each split and the number of features after PCA.
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of validation samples: {X_val.shape[0]}')
print(f'Number of testing samples: {X_test.shape[0]}')
print(f'\nNumber of features: {X_train.shape[1]}')


def _full_batch_loader(features, targets):
    """Wrap two arrays in a DataLoader that yields the whole set as one unshuffled batch."""
    dataset = torch.utils.data.TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.float32),
    )
    return torch.utils.data.DataLoader(dataset, batch_size=targets.shape[0], shuffle=False)


val_loader = _full_batch_loader(X_val, Y_val)
test_loader = _full_batch_loader(X_test, Y_test)

Number of training samples: 9934
Number of validation samples: 1104
Number of testing samples: 2760

Number of features: 543


In [7]:
#write a function to get Deep Learning model with torch.nn
def get_model(input_size, hidden_size, output_size,dropout_prob=0, depth=1):
    """Build a fully connected feed-forward network.

    Layout:
        Linear(input_size, hidden_size) -> ReLU
        -> depth x [Linear(hidden_size, hidden_size) -> ReLU -> Dropout(dropout_prob)]
        -> Linear(hidden_size, output_size)

    The previous version placed the output layer in the middle of the stack
    (before the hidden blocks), which made the forward pass fail with a shape
    mismatch whenever output_size != hidden_size, and its last layer ignored
    `output_size` (it was hard-coded to 1).

    Args:
        input_size: number of input features.
        hidden_size: width of every hidden layer.
        output_size: number of outputs produced by the final layer.
        dropout_prob: dropout probability applied after each hidden block.
        depth: number of hidden blocks added after the input layer.

    Returns:
        A `torch.nn.Sequential` model (not moved to any device).
    """
    layers = [nn.Linear(input_size, hidden_size), nn.ReLU()]
    for _ in range(depth):
        layers.extend([
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
        ])
    # The output projection must come last so the hidden blocks see hidden_size features.
    layers.append(nn.Linear(hidden_size, output_size))
    return nn.Sequential(*layers)


In [8]:
# Hyperparameters
# Search space: one list of candidate values per tuned hyperparameter.
hidden_sizes =  [256, 512, 1024]
nums_epochs = [200]
depth = [3, 4, 5]
batch_sizes = [8, 16, 32]
learning_rate = [0.01, 0.001]
step_size_lr_decay = [10, 20]
momentum = [0.9]
# Fixed (not searched) settings.
dropout_prob = 0.2
patience = 10

# itertools.product returns a one-shot iterator: once a cell has looped over it,
# re-running that cell would silently iterate over nothing. Materialise the grid
# as a list so it can be traversed repeatedly, and count it directly.
# Each entry is (hidden_size, depth, num_epochs, batch_size, learning_rate,
# step_size_lr_decay, momentum).
hyperparameters = list(itertools.product(hidden_sizes, depth, nums_epochs, batch_sizes, learning_rate, step_size_lr_decay, momentum))
n_comb = len(hyperparameters)
print (f'Number of hyperparameter combinations: {n_comb}')

Number of hyperparameter combinations: 108


In [None]:
#train the model given by the function get_model() with hyperparameters defined above
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, writer, device, patience, num_epochs):
    """Train `model` with early stopping and restore its best validation weights.

    Each epoch runs one optimisation pass over `train_loader`, evaluates on
    `val_loader` via `test_model`, logs both losses to `writer`, steps the
    learning-rate scheduler and updates the early-stopping counter.

    Args:
        model: network to optimise. Batches are moved to `device` here; the
            model itself is not, so it must already be on `device`.
        train_loader: sized iterable of (X, Y) training batches.
        val_loader: sized iterable of (X, Y) validation batches.
        criterion: loss callable, invoked as criterion(model(X), Y.unsqueeze(1)).
        optimizer: optimiser updating `model`'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch. A
            `ReduceLROnPlateau` receives the validation loss; any other
            scheduler is stepped without arguments.
        writer: object exposing add_scalar(tag, value, step)
            (presumably a TensorBoard SummaryWriter).
        device: device the batches are moved to.
        patience: number of consecutive epochs without validation improvement
            after which training stops.
        num_epochs: maximum number of epochs.

    Returns:
        (model, best_epoch, best_loss): the model with the best validation
        weights loaded, the zero-based epoch at which they were observed, and
        the corresponding validation loss.
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    # Infinity (rather than an arbitrary large number) guarantees the first
    # finite validation loss is always recorded as the best so far.
    best_loss = float('inf')
    best_epoch = 0
    early_stop_counter = 0
    start = time.time()

    # Only ReduceLROnPlateau.step() takes the monitored metric. Passing the
    # loss to a step-based scheduler would be interpreted as an epoch index
    # and break its decay schedule.
    metric_driven_scheduler = isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)

    for epoch in range(num_epochs):
        model.train()
        start_epoch = time.time()
        train_loss = 0
        for X, Y in train_loader:
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            Y_hat = model(X)
            # unsqueeze(1) adds a trailing dimension to the targets before the loss.
            loss = criterion(Y_hat, Y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Average of the per-batch losses.
        train_loss /= len(train_loader)
        writer.add_scalar('Loss/train', train_loss, epoch)
        model.eval()
        val_loss = test_model(model, val_loader, criterion, device)
        writer.add_scalar('Loss/val', val_loss, epoch)
        if metric_driven_scheduler:
            scheduler.step(val_loss)
        else:
            scheduler.step()
        if val_loss < best_loss:
            # New best: snapshot the weights and reset the patience counter.
            best_loss = val_loss
            best_epoch = epoch
            best_model_wts = copy.deepcopy(model.state_dict())
            early_stop_counter = 0
        else:
            early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break
        
        # '\r' keeps the progress report on a single, continuously updated line.
        print('Epoch [{}/{}] - {:.2f} seconds - val_loss: {:.6f} - patience: {}'.format(epoch+1,
              num_epochs, time.time() - start_epoch, val_loss, early_stop_counter), end='\r')

    print('\nTraining ended after {:.2f} seconds - Best val_loss: {:.6f}'.format(time.time() - start, best_loss))

    # Return the best checkpoint, not the weights of the last epoch.
    model.load_state_dict(best_model_wts)
    return model, best_epoch, best_loss

#write a function to evaluate the model
def test_model(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0
    for X, Y in test_loader:
        X = X.to(device)
        Y = Y.to(device)
        Y_hat = model(X)
        loss = criterion(Y_hat, Y.unsqueeze(1))
        test_loss += loss.item()
    test_loss /= len(test_loader)
    return test_loss

