In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from surprise import SVD,Dataset, Reader, SVDpp, NMF
from surprise.model_selection import cross_validate, GridSearchCV

In [2]:
traindata = pd.read_csv('archive/interactions_train.csv')
traindata.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [3]:
reader = Reader(rating_scale=(traindata.rating.min(), traindata.rating.max()))
data = Dataset.load_from_df(traindata[['user_id', 'recipe_id', 'rating']], reader)

In [6]:
# svd = SVD(n_epochs=10)
# results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
# print("Average MAE: ", np.average(results["test_mae"]))
# print("Average RMSE: ", np.average(results["test_rmse"]))

In [36]:
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20], "lr_all": [0.002, 0.005], "reg_all": [0.02, 0.4, 0.6]
}

svd_results = {SVD: {}, SVDpp: {}}
for method in [SVD, SVDpp]:
    print(method)
    gridSearch = GridSearchCV(method, param_grid, measures=['rmse', 'mae'], cv=10)
    gridSearch.fit(data)
 
    print(gridSearch.best_score['rmse'])
    print(gridSearch.best_params['rmse'])

    svd_results[method]["best_score"] = gridSearch.best_score['rmse']
    svd_results[method]["best_params"] = gridSearch.best_params['rmse']



<class 'surprise.prediction_algorithms.matrix_factorization.SVD'>


In [None]:
NMF_grid = {'n_factors': [20, 50, 100], 'n_epochs': [5, 10, 20]}
gridSearch = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=10)
gridSearch.fit(data)

print(gridSearch.best_score['rmse'])
print(gridSearch.best_params['rmse'])
svd_results["NMF"] = {}
svd_results["NMF"]["best_score"] = gridSearch.best_score['rmse']
svd_results["NMF"]["best_params"] = gridSearch.best_params['rmse']

TypeError: __init__() got an unexpected keyword argument 'lr_all'

In [22]:
gridSearch.best_estimator

{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fad7ffacfa0>,
 'mae': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fad7ffacf10>}

In [11]:
best_factor = gridSearch.best_params['rmse']['n_factors']
best_epoch = gridSearch.best_params['rmse']['n_epochs']
best_lr = gridSearch.best_params['rmse']['lr_all']
best_reg = gridSearch.best_params['rmse']['reg_all']
 
 
svd = SVD(n_factors=best_factor, n_epochs=best_epoch, lr_all=best_lr, reg_all=best_reg)
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fad7ffa9520>

In [12]:
testdata = pd.read_csv('archive/interactions_test.csv')
testdata.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,8937,44551,2005-12-23,4.0,2,173538
1,56680,126118,2006-10-07,4.0,16,177847
2,349752,219596,2008-04-12,0.0,26,89896
3,628951,82783,2007-11-13,2.0,45,172637
4,92816,435013,2013-07-31,3.0,52,177935


In [13]:
svd.predict('56680','126118')

Prediction(uid='56680', iid='126118', r_ui=None, est=4.574089892559891, details={'was_impossible': False})

In [14]:
testset = Dataset.load_from_df(testdata[['user_id', 'recipe_id', 'rating']], reader)
testset = testset.build_full_trainset().build_testset()
predictions = svd.test(testset)
predictions = [(p.est, p.details['was_impossible']) for p in predictions]

In [15]:
## Calculating metrics (accuracy, TP, FP, TN, FN, precision, recall, F1, MSE, MAE)
def metrics(pred, label):
    TP = [a and b for (a,b) in zip(pred,label)]
    TN = [not a and not b for (a,b) in zip(pred,label)]
    FP = [a and not b for (a,b) in zip(pred,label)]
    FN = [not a and b for (a,b) in zip(pred,label)]

    TP = sum(TP)
    TN = sum(TN)
    FP = sum(FP)
    FN = sum(FN)
    BER = 0.5 * (FP / (TN + FP) + FN / (FN + TP))
    accuracy = (TP +TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2 * (precision * recall) / (precision + recall)
    MSE = sum([(a - b) ** 2 for (a,b) in zip(pred,label)]) / len(pred)
    MAE = sum([abs(a - b) for (a,b) in zip(pred,label)]) / len(pred)

    outDict = {'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN, 'BER': BER, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'F1': F1, 'MSE': MSE, 'MAE': MAE}
    return outDict


In [16]:
metrics([p[0] for p in predictions], list(testdata.rating))

{'TP': 52474.0,
 'TN': 0,
 'FP': 687,
 'FN': 0,
 'BER': 0.5,
 'accuracy': 0.9870769925321194,
 'precision': 0.9870769925321194,
 'recall': 1.0,
 'F1': 0.9934964737066313,
 'MSE': 1.784481870638255,
 'MAE': 0.841176008162039}