# Create Train and Test Set for SVD

In [1]:
import pickle
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import random
from surprise import Reader, Dataset
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import GridSearchCV
import xgboost as xbg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Analyze Averages from Numpy

In [5]:
# for each loop to append numpy array
# get movie and customer id from sparse file
smovie = np.array([])
for i in tqdm(range(0, 6)):
    smovie = np.append(smovie, np.load("numpy" + str(i) + ".npy"))

100%|██████████| 6/6 [08:49<00:00, 88.29s/it] 


In [None]:
split = int(smovie.shape[0]*0.8)
movie_train = smovie.iloc[:split]
movie_test = smovie.iloc[split + 1:]

## Run SVD on Subset

In [2]:
cwd = os.getcwd()
movie = pd.read_csv(cwd + "/data/final.csv")

Split movies df into 1.5 million total data points, approximately 1.5% of the total data. Randomize the data so training and testing sets aren't split by movie.

In [3]:
movie_train, movie_test = sklearn.model_selection.train_test_split(movie.iloc[:1500000], train_size = 0.8)
movie_test

Unnamed: 0,MovieID,CustomerID,Rating,Date
812452,191,137594,3,2004-06-28
279291,58,1009895,5,2005-02-16
958197,216,1335524,3,2004-06-14
814873,191,2373724,4,2004-04-16
695591,187,2564680,2,2003-05-29
...,...,...,...,...
267290,58,1068104,4,2004-11-03
169355,30,2441830,4,2004-07-11
1412944,299,956510,4,2004-04-25
495457,143,1143230,3,2000-06-05


In [4]:
movie.shape

(100480507, 4)

Get columns for user, movie, and rating to build trainset. Create train and testing sets to input to the model.

In [5]:
reader = Reader(rating_scale=(1,5))
movieInput = pd.DataFrame()
movieInput['CustomerID'] = movie_train['CustomerID']
movieInput['MovieID'] = movie_train['MovieID']
movieInput['Rating'] = movie_train['Rating']

train_data = Dataset.load_from_df(movieInput, reader)
trainset = train_data.build_full_trainset()

In [6]:
testset = list(zip(movie_test["CustomerID"].values, movie_test["MovieID"].values, movie_test["Rating"].values))

In [7]:
error_table = pd.DataFrame(columns = ["Model", "Train_RMSE", "Test_RMSE"])

## Utility Functions

In [8]:
def make_table(model_name, rmse_train, rmse_test):
    global error_table
    error_table = error_table.append(pd.DataFrame([[model_name, rmse_train, rmse_test]], columns = ["Model", "Train_RMSE", "Test_RMSE"]))
    error_table.reset_index(drop = True, inplace = True)

In [9]:
def run_surprise(algo, trainset, testset, model_name):
    start = datetime.now()
    algo.fit(trainset)
    
    pred_train = algo.test(trainset.build_testset())
    
    trainActual = np.array([p.r_ui for p in pred_train])
    trainPred = np.array([p.est for p in pred_train]) 
    trainRMSE = np.sqrt(mean_squared_error(trainActual, trainPred))
    
    print("Train Data RMSE: {}".format(trainRMSE))
    print("\n")
    
    train = {"RMSE": trainRMSE, "Prediction": trainPred}
    
    pred_test = algo.test(testset)
    testActual = np.array([p.r_ui for p in pred_test])
    testPred = np.array([p.est for p in pred_test])
    testRMSE = np.sqrt(mean_squared_error(testActual, testPred))
    
    print("Test Data RMSE: {}".format(testRMSE))
    print("\n")
    
    test = {"RMSE": testRMSE, "Prediction": testPred}
    
    print("Time Taken = " + str(datetime.now() - start))
    
    make_table(model_name, trainRMSE, testRMSE)
    
    return train, test

## Running SVD

In [10]:
params = { 'n_factors': [5, 10, 15, 20, 25, 30, 35, 40, 50]}
grid = GridSearchCV(SVD, params, measures=['rmse'], cv=3, refit=True)
grid.fit(train_data)
print(grid.best_score['rmse'])

0.9770881459156584


In [116]:
train_data.df
grid.predict(train_data.df[0:1])

TypeError: predict() missing 1 required positional argument: 'iid'

In [None]:
#Build a small testset that I know results to


In [11]:
algo = SVD(n_factors = grid.best_params['rmse']['n_factors'], biased=True, verbose=True)
train_result, test_result = run_surprise(algo, trainset, testset, "SVD")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Train Data RMSE: 0.8840100239486314


Test Data RMSE: 0.9699048687100014


Time Taken = 0:00:37.228808


In [12]:
test_result['Prediction']

array([4.53723186, 4.20900739, 3.34187682, ..., 3.87511602, 4.21399651,
       4.1902662 ])

In [26]:
algo.predict(6, 97)

Prediction(uid=6, iid=97, r_ui=None, est=3.818978007100799, details={'was_impossible': False})

In [100]:
test_result['Prediction'].shape

(300000,)

In [134]:
trainset.build_anti_testset()

[(591844, 240, 3.6378525),
 (591844, 199, 3.6378525),
 (591844, 270, 3.6378525),
 (591844, 8, 3.6378525),
 (591844, 175, 3.6378525),
 (591844, 312, 3.6378525),
 (591844, 77, 3.6378525),
 (591844, 241, 3.6378525),
 (591844, 290, 3.6378525),
 (591844, 143, 3.6378525),
 (591844, 269, 3.6378525),
 (591844, 295, 3.6378525),
 (591844, 118, 3.6378525),
 (591844, 108, 3.6378525),
 (591844, 57, 3.6378525),
 (591844, 28, 3.6378525),
 (591844, 45, 3.6378525),
 (591844, 252, 3.6378525),
 (591844, 298, 3.6378525),
 (591844, 47, 3.6378525),
 (591844, 181, 3.6378525),
 (591844, 187, 3.6378525),
 (591844, 33, 3.6378525),
 (591844, 261, 3.6378525),
 (591844, 171, 3.6378525),
 (591844, 102, 3.6378525),
 (591844, 285, 3.6378525),
 (591844, 71, 3.6378525),
 (591844, 3, 3.6378525),
 (591844, 167, 3.6378525),
 (591844, 213, 3.6378525),
 (591844, 160, 3.6378525),
 (591844, 83, 3.6378525),
 (591844, 16, 3.6378525),
 (591844, 81, 3.6378525),
 (591844, 138, 3.6378525),
 (591844, 125, 3.6378525),
 (591844, 223, 

In [27]:
# run_surprise with anti_test_set passed in as test_set and the list of test_result["Predictions"] 
# will be our predictions
grid.best_params['rmse']['n_factors']

5