# Module 03

## Session 12 Recommendation System

# Collaborative Filtering

Using the `u.data` ratings dataset:
* build an SVD model and an ALS model
* evaluate both models on a hold-out validation split
* evaluate both models using cross-validation
* optimize SVD with hyperparameter tuning and compare the results (before vs. after)
* generate the prediction results

# Library

In [1]:
import pandas as pd

import seaborn as sns

from surprise import Reader
from surprise import Dataset

from surprise import SVD
from surprise import BaselineOnly

from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV

# Data

In [5]:
# u.data is tab-separated; the column names are supplied here explicitly.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'datasets/u.data',
    names=column_names,
    sep='\t',
)

In [6]:
# Preview the first rows to check that the four columns were parsed as expected.
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [7]:
# User x item rating matrix: one row per user, one column per item,
# NaN wherever a user has not rated an item.
cross_user = df.pivot_table(
    index='user_id',
    columns='item_id',
    values='rating',
)
cross_user.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Wrap the ratings in a Surprise Dataset. Only the user, item and rating
# columns are passed on (the timestamp is dropped); ratings range from 1 to 5.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    df[['user_id', 'item_id', 'rating']],
    reader,
)

In [9]:
# Inspect the frame held by the Surprise Dataset: it carries only the three columns passed in.
data.df.head()

Unnamed: 0,user_id,item_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


# Modeling: Validate

In [10]:
# Hold out 25% of the ratings for validation; the fixed seed keeps the split repeatable.
trainset, testset = train_test_split(data, test_size=0.25, random_state=101)

## SVD

In [11]:
# SVD with default hyperparameters: train on the training split, then
# predict every held-out rating. fit() returns the estimator, so the two
# steps can be chained.
algo = SVD()
predictions = algo.fit(trainset).test(testset)

In [12]:
# RMSE of the SVD predictions on the held-out split (lower is better).
accuracy.rmse(predictions)

RMSE: 0.9375


0.9374850954428063

## ALS (baseline model with ALS-estimated biases)

In [13]:
# Baseline estimator whose biases are fitted with ALS, trained and
# evaluated on the same split as the SVD model above.
bsl_options = dict(method='als')
algo = BaselineOnly(bsl_options=bsl_options)

algo.fit(trainset)
predictions = algo.test(testset)

Estimating biases using als...


In [14]:
# RMSE of the ALS baseline predictions on the held-out split (lower is better).
accuracy.rmse(predictions)

RMSE: 0.9433


0.943287541263747

# Modeling: Cross Validation

## SVD

In [17]:
# 5-fold cross-validation of default SVD on the full dataset, reporting
# RMSE and MAE per fold.
algo = SVD()

cv_svd = cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9415  0.9319  0.9319  0.9357  0.9379  0.9358  0.0037  
MAE (testset)     0.7417  0.7359  0.7352  0.7353  0.7405  0.7377  0.0028  
Fit time          2.37    2.38    2.38    2.37    2.39    2.38    0.01    
Test time         0.09    0.05    0.09    0.05    0.09    0.07    0.02    


In [18]:
# Raw cross-validation result: per-fold test metrics plus fit and test times.
cv_svd

{'test_rmse': array([0.94153415, 0.93191065, 0.93185913, 0.93567576, 0.93791664]),
 'test_mae': array([0.74173762, 0.73589677, 0.73516157, 0.73532018, 0.74054047]),
 'fit_time': (2.3738818168640137,
  2.3836867809295654,
  2.375641107559204,
  2.3733022212982178,
  2.3938539028167725),
 'test_time': (0.08555912971496582,
  0.04776787757873535,
  0.08695816993713379,
  0.0482938289642334,
  0.09264993667602539)}

In [19]:
# Average test RMSE across the cross-validation folds.
cv_svd['test_rmse'].mean()

0.935779265576034

## ALS (baseline model with ALS-estimated biases)

In [20]:
# 5-fold cross-validation of the ALS baseline on the full dataset.
# NOTE(review): the result is stored under the name cv_svd, which overwrites
# the SVD cross-validation results computed earlier; a dedicated name
# (e.g. cv_als) would be clearer, but later cells read cv_svd.
bsl_options = dict(method='als')
algo = BaselineOnly(bsl_options=bsl_options)

cv_svd = cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9427  0.9428  0.9438  0.9465  0.9441  0.9440  0.0013  
MAE (testset)     0.7463  0.7477  0.7485  0.7503  0.7488  0.7483  0.0013  
Fit time          0.06    0.07    0.07    0.06    0.07    0.07    0.00    
Test time         0.03    0.03    0.07    0.03    0.08    0.05    0.02    


In [21]:
# Average test RMSE across the folds.
# NOTE(review): despite its name, cv_svd holds the ALS baseline results here,
# because the preceding cell reassigned it.
cv_svd['test_rmse'].mean()

0.9439932190249876

# Hyperparameter Tuning

In [23]:
# Search space for SVD: 3 epoch counts x 2 learning rates x 3 regularisation
# strengths = 18 combinations, each scored with 3-fold cross-validation.
param_grid = dict(
    n_epochs=[5, 10, 20],
    lr_all=[0.002, 0.005],
    reg_all=[0.02, 0.04, 0.2],
)

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['RMSE', 'MAE'],
    cv=3,
)

In [24]:
# Run the grid search over every parameter combination on the full dataset.
gs.fit(data)

In [25]:
# A notebook cell renders only its last expression, so a bare
# gs.best_score[...] line above it would be evaluated and silently discarded.
# Print the score explicitly; the best parameters remain the cell's output.
print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
gs.best_params['rmse']

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.04}

In [26]:
# A notebook cell renders only its last expression, so a bare
# gs.best_score[...] line above it would be evaluated and silently discarded.
# Print the score explicitly; the best parameters remain the cell's output.
print(f"Best MAE: {gs.best_score['mae']:.4f}")
gs.best_params['mae']

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.04}

# Comparison

In [27]:
# "Before" side of the comparison: SVD with default hyperparameters,
# scored with 5-fold cross-validation.
algo = SVD()

cv_svd = cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9448  0.9302  0.9361  0.9261  0.9463  0.9367  0.0079  
MAE (testset)     0.7424  0.7349  0.7386  0.7303  0.7457  0.7384  0.0054  
Fit time          2.40    2.42    2.39    2.41    2.45    2.41    0.02    
Test time         0.09    0.05    0.09    0.05    0.09    0.07    0.02    


In [28]:
# "After" side of the comparison: SVD with the hyperparameters the grid
# search selected for RMSE. They are read from `gs` instead of being copied
# by hand, so this cell cannot drift out of sync if the search is re-run or
# the grid is changed.
algo = SVD(**gs.best_params['rmse'])

cv_svd = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9290  0.9387  0.9365  0.9267  0.9259  0.9313  0.0052  
MAE (testset)     0.7320  0.7402  0.7407  0.7312  0.7326  0.7353  0.0042  
Fit time          2.39    2.39    2.39    2.39    2.39    2.39    0.00    
Test time         0.05    0.05    0.09    0.05    0.09    0.06    0.02    


# Prediction Result

In [30]:
# Build every (user, item) pair to score: 3 users x 4 items = 12 rows,
# ordered user by user.
# The records are collected first and the frame is created once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
df_test = pd.DataFrame(
    [
        {'user_id': user_id, 'item_id': item_id}
        for user_id in [0, 11, 212]
        for item_id in [647, 665, 565, 677]
    ],
    columns=['user_id', 'item_id'],
)

In [31]:
# Show the (user, item) pairs that will be scored.
df_test

Unnamed: 0,user_id,item_id
0,0,647
1,0,665
2,0,565
3,0,677
4,11,647
5,11,665
6,11,565
7,11,677
8,212,647
9,212,665


In [32]:
# Train the model used for predictions with the hyperparameters the grid
# search selected for RMSE. They are read from `gs` instead of being copied
# by hand, so they stay in sync with the search results.
# The model is fitted on the training split only.
algo = SVD(**gs.best_params['rmse'])
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14e9d0370>

In [35]:
# Estimate a rating for every (user, item) pair, in df_test row order.
# algo.predict returns a Prediction namedtuple; its estimated rating is read
# through the named field `.est` rather than the positional index 3.
y = [
    algo.predict(user_id, item_id).est
    for user_id, item_id in zip(df_test['user_id'], df_test['item_id'])
]

In [37]:
# Attach the predicted ratings as a new column; y was built in df_test row order.
df_test['rating'] = y

In [38]:
# Final result: each (user, item) pair with its predicted rating.
df_test

Unnamed: 0,user_id,item_id,rating
0,0,647,4.284895
1,0,665,2.719833
2,0,565,2.961825
3,0,677,3.37774
4,11,647,4.118253
5,11,665,2.942384
6,11,565,2.827956
7,11,677,3.38294
8,212,647,3.839375
9,212,665,2.758438
