In [9]:
import ssl
# SECURITY NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so HTTPS requests that rely on the default context in this
# kernel no longer validate server certificates. Presumably a workaround for
# certificate errors when downloading the data files — confirm it is still
# needed, and prefer fixing the local CA bundle over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

from math import pi
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import seaborn as sns
sns.set_theme(style="white", context="talk")

import iqplot
import bokeh.io
from bokeh.layouts import column, gridplot
from bokeh.models import ColorBar, ColorMapper, LinearColorMapper, Ticker
bokeh.io.output_notebook()

import sgd_utils as sgd

In [10]:
def load_data(
    f_data,
    f_movies,
    base_url="https://raw.githubusercontent.com/lakigigar/Caltech-CS155-2021/main/projects/project2/data/",
):
    """Load the user ratings and movie metadata tables (FZL modified).

    Parameters
    ----------
    f_data : str
        Name of the tab-separated ratings file. It is read without a header
        row and its columns are named USER, MOVIE, RATING.
    f_movies : str
        Name of the tab-separated, latin-1 encoded movie file. It is read
        without a header row and its columns are named MOVIE_ID, TITLE and
        one column per genre.
    base_url : str, optional
        URL prefix or local directory that contains both files. Defaults to
        the course repository used by this notebook. A trailing "/" is
        optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, movies)``. Rows of ``movies`` whose TITLE is exactly
        "unknown" get the title "MOVIE_ID: <id>" instead.
    """
    genre_columns = [
        "UNKNOWN",
        "ACTION",
        "ADVENTURE",
        "ANIMATION",
        "CHILDREN",
        "COMEDY",
        "CRIME",
        "DOCUMENTARY",
        "DRAMA",
        "FANTASY",
        "FILM-NOIR",
        "HORROR",
        "MUSICAL",
        "MYSTERY",
        "ROMANCE",
        "SCI-FI",
        "THRILLER",
        "WAR",
        "WESTERN",
    ]

    # Normalise the prefix once so both files are resolved the same way,
    # whether or not the caller included the trailing slash.
    root = base_url.rstrip("/")

    data = pd.read_csv(
        root + "/" + f_data,
        sep="\t",
        header=None,
        names=["USER", "MOVIE", "RATING"],
    )
    movies = pd.read_csv(
        root + "/" + f_movies,
        encoding="latin-1",
        sep="\t",
        header=None,
        names=["MOVIE_ID", "TITLE"] + genre_columns,
    )

    # Give placeholder titles a label that still identifies the movie.
    is_unknown = movies["TITLE"] == "unknown"
    movies.loc[is_unknown, "TITLE"] = "MOVIE_ID: " + movies.loc[
        is_unknown, "MOVIE_ID"
    ].astype("str")

    return data, movies

In [11]:
# Fetch the ratings table and the movie metadata table.
data, movies = load_data("data.txt", "movies.txt")

## Matrix Factorization visualizations
### Method 1: HW5 solution

In [15]:
# Sequential 90/10 split of data.txt (rows are not shuffled). The boundary is
# derived from the row count so the two parts never overlap and never drop
# rows, whatever the size of `data`; a 100000-row table yields 90000 training
# rows and 10000 test rows.
n_train = (len(data) * 9) // 10

Y_train_df = data.iloc[:n_train]
Y_train = Y_train_df.to_numpy() # to be changed later to take train.txt and test.txt, for now I just split data.txt

Y_test_df = data.iloc[n_train:]
Y_test = Y_test_df.to_numpy()

In [18]:
# Size the factorization by the largest user / movie id found in either split.
M = np.maximum(Y_train[:, 0].max(), Y_test[:, 0].max()).astype(int)  # users
N = np.maximum(Y_train[:, 1].max(), Y_test[:, 1].max()).astype(int)  # movies
print("Factorizing with ", M, " users, ", N, " movies.")

# Hyperparameters handed to sgd.train_model.
Ks = 20     # presumably the number of latent factors (U prints as m x 20) — confirm
reg = 0.0   # regularization strength
eta = 0.03  # learning rate
E_in, E_out = [], []

# Use to compute Ein and Eout
U, V, err_training = sgd.train_model(M, N, Ks, eta, reg, Y_train)
print('Err training', err_training)
print('Err testing', sgd.get_err(U, V, Y_test))
print('dim U [m x k]', U.shape)
print('dim V [k x n]', V.shape)


Factorizing with  943  users,  1682  movies.
Epoch 1, E_in (regularized MSE): 0.527156677352
Epoch 2, E_in (regularized MSE): 0.40205063537061975
Epoch 3, E_in (regularized MSE): 0.3571268833806738
Epoch 4, E_in (regularized MSE): 0.33233207737225656
Epoch 5, E_in (regularized MSE): 0.31100694674253787
Epoch 6, E_in (regularized MSE): 0.2995225167684944
Epoch 7, E_in (regularized MSE): 0.28553281585476087
Epoch 8, E_in (regularized MSE): 0.27881018606212976
Epoch 9, E_in (regularized MSE): 0.2709466244109805
Epoch 10, E_in (regularized MSE): 0.2662403635549109
Epoch 11, E_in (regularized MSE): 0.2604885132437713
Epoch 12, E_in (regularized MSE): 0.25842116333586573
Epoch 13, E_in (regularized MSE): 0.25895229025770805
Err training 0.25895229025770805
Err testing 0.6658982425144363
dim U [m x k] (943, 20)
dim V [k x n] (20, 1682)


In [28]:
## SVD

# Thin SVD of V: V = A @ diag(sigma) @ B, singular values in descending order.
A, sigma, B = np.linalg.svd(V, full_matrices=False)

# The first two left singular vectors form the 2-D projection basis.
A_two_cols = A[:, :2]
print(A_two_cols.shape)

# Project both factor matrices onto that basis. The training cell prints U as
# (m x k) and V as (k x n), so U is transposed to (k x m) before projecting.
U_proj = A_two_cols.T @ U.T
V_proj = A_two_cols.T @ V

print(U_proj.shape)
print(V_proj.shape)

(20, 2)
(2, 943)
(2, 1682)


In [None]:
# Quick look at the projected movie factors and the first rows of the ratings table.
print(V_proj, data.head(), sep="\n")

### Method 2: Incorporate a and b for each user and movie to model global tendencies of the various movies and users

### Method 3: off-the-shelf implementation 

In [None]:
from surprise import SVD, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import sklearn

In [None]:
# Wrap the train / test DataFrames as Surprise datasets.
# NOTE(review): rating_scale is (0, 5) here — confirm it matches the actual
# range of the RATING column.
reader = Reader(rating_scale=(0, 5))
dataset_train, dataset_test = (
    Dataset.load_from_df(frame, reader) for frame in (Y_train_df, Y_test_df)
)
# NOTE(review): dataset_test is built but not used anywhere in this cell.

algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, dataset_train, cv=5, measures=['RMSE', 'MAE'], verbose=True)

# Refit on the complete training set.
# NOTE(review): Surprise's fit() returns the algorithm object itself, so
# pred_trainset refers to the fitted model rather than to predictions — confirm
# this is what the following cell intends to show.
trainset = dataset_train.build_full_trainset()
pred_trainset = algo.fit(trainset)


In [None]:
# Show the object that algo.fit(trainset) returned in the previous cell.
print(pred_trainset)