In [16]:
import pandas as pd
import pickle
import random
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader, GridSearch, accuracy, dump
from surprise import evaluate, print_perf
from sklearn.model_selection import train_test_split
import time

In [2]:
data = pd.read_csv('./ml-100k/data.csv')
df = pd.DataFrame(data)
df.drop('timestamp', axis=1, inplace=True)
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
raw_ratings = dataset.raw_ratings
random.shuffle(raw_ratings)
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]
dataset.raw_ratings = A_raw_ratings
dataset.split(n_folds=5)

In [3]:
res_tune = {}
latent_factors = [5,10,15,20,25,30]
regularizations = [0.5,0.2,0.1,0.05,0.02,0.01,0.005]
for regularization in regularizations:
    print("regularization : ", regularization)
    print("start time : ", int(time.time()))
    param_grid = {'n_factors': latent_factors, 'n_epochs': [100], 'reg_pu': [regularization], 'reg_qi': [regularization]}
    grid_search = GridSearch(pa.matrix_factorization.NMF, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
    grid_search.evaluate(dataset)
    print("end time : ", int(time.time()))
    res_tune[regularization] = grid_search.cv_results
pickle.dump(res_tune, open("nmf_result1","wb"))

regularization :  0.5
start time :  1509637446
[{'n_factors': 5, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}, {'n_factors': 10, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}, {'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}, {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}, {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}, {'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}]
------------
Parameters combination 1 of 6
params:  {'n_factors': 5, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}
------------
Mean MAE : 0.8860
Mean RMSE: 1.0727
Mean FCP : 0.6889
------------
------------
Parameters combination 2 of 6
params:  {'n_factors': 10, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}
------------
Mean MAE : 0.8860
Mean RMSE: 1.0727
Mean FCP : 0.6889
------------
------------
Parameters combination 3 of 6
params:  {'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.5, 'reg_qi': 0.5}
------------
Mean MAE : 0.8860
Mean RMSE: 1.072

------------
Mean MAE : 1.4039
Mean RMSE: 1.6115
Mean FCP : 0.6752
------------
------------
Parameters combination 2 of 6
params:  {'n_factors': 10, 'n_epochs': 100, 'reg_pu': 0.01, 'reg_qi': 0.01}
------------
Mean MAE : 0.9021
Mean RMSE: 1.1140
Mean FCP : 0.6585
------------
------------
Parameters combination 3 of 6
params:  {'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.01, 'reg_qi': 0.01}
------------
Mean MAE : 0.8228
Mean RMSE: 1.0722
Mean FCP : 0.6541
------------
------------
Parameters combination 4 of 6
params:  {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.01, 'reg_qi': 0.01}
------------
Mean MAE : 0.9105
Mean RMSE: 1.1996
Mean FCP : 0.6340
------------
------------
Parameters combination 5 of 6
params:  {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.01, 'reg_qi': 0.01}
------------
Mean MAE : 1.0093
Mean RMSE: 1.3203
Mean FCP : 0.5927
------------
------------
Parameters combination 6 of 6
params:  {'n_factors': 30, 'n_epochs': 100, 'reg_pu': 0.01, 'reg_qi': 0.01}
-----

vary the number of factors and regularization for the user and item vectors
the best solutions are obtained for number of factors close to 20 and with regularization of 0.1

In [7]:
for reg in res_tune:
    c = 0
    print("regularization : ", reg)
    s = ""
    for i in res_tune[reg]['RMSE']:
        s+=str(latent_factors[c])+"||"+str(i)+"   "
        c+=1
    print(s)

regularization :  0.5
5||1.07271064851   10||1.07270884096   15||1.07270931582   20||1.07271214409   25||1.07271186057   30||1.07271345796   
regularization :  0.2
5||0.964481732192   10||0.960037586959   15||0.956822702207   20||0.956545037179   25||0.956869048284   30||0.956324455172   
regularization :  0.1
5||0.955844858224   10||0.952624763697   15||0.951253673568   20||0.950019273807   25||0.945821494002   30||0.945462901898   
regularization :  0.05
5||0.985925740406   10||0.98539991551   15||0.985536284856   20||0.984885433828   25||0.9815093813   30||0.978900764633   
regularization :  0.02
5||1.2208449747   10||1.04369924497   15||1.03583439752   20||1.07286084228   25||1.10262809749   30||1.1324891287   
regularization :  0.01
5||1.61152718343   10||1.11401453898   15||1.07218293294   20||1.19959027659   25||1.32032438757   30||1.40348216765   
regularization :  0.005
5||1.91400284988   10||1.19015833846   15||1.09552698668   20||1.31976364152   25||1.50313258803   30||1.596

In [10]:
res_tune = {}
latent_factors = [15,20,25]
regularizations = [0.2,0.1,0.05,0.02]
learn_bias = [0.02,0.01,0.005,0.002,0.001]
for regularization in regularizations:
    res_tune[regularization] = {}
    for lb in learn_bias:
        print("regularization : ", regularization, " bias : ", lb)
        print("start time : ", int(time.time()))
        param_grid = {'n_factors': latent_factors, 'n_epochs': [100], 'reg_pu': [regularization], 'reg_qi': [regularization], 
                      'lr_bu': [lb], 'lr_bi': [lb], 'reg_bu': [regularization], 'reg_bi': [regularization]}
        grid_search = GridSearch(pa.matrix_factorization.NMF, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
        grid_search.evaluate(dataset)
        print("end time : ", int(time.time()))
        res_tune[regularization][lb] = grid_search.cv_results
pickle.dump(res_tune, open("nmf_result2","wb"))

regularization :  0.2  bias :  0.02
start time :  1509640330
[{'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.2, 'reg_qi': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_bi': 0.2}, {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.2, 'reg_qi': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_bi': 0.2}, {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.2, 'reg_qi': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_bi': 0.2}]
------------
Parameters combination 1 of 3
params:  {'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.2, 'reg_qi': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_bi': 0.2}
------------
Mean MAE : 0.7700
Mean RMSE: 0.9580
Mean FCP : 0.7019
------------
------------
Parameters combination 2 of 3
params:  {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.2, 'reg_qi': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_bi': 0.2}
------------
Mean MAE : 0.7691
Mean RMSE: 0.9568
Mean FCP : 0.7024
------------
------------
Parameters combination 3 of 3
par

------------
Mean MAE : 0.7531
Mean RMSE: 0.9503
Mean FCP : 0.7026
------------
------------
Parameters combination 3 of 3
params:  {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.1, 'reg_qi': 0.1, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_bu': 0.1, 'reg_bi': 0.1}
------------
Mean MAE : 0.7525
Mean RMSE: 0.9488
Mean FCP : 0.7018
------------
end time :  1509641074
regularization :  0.1  bias :  0.005
start time :  1509641074
[{'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.1, 'reg_qi': 0.1, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.1, 'reg_bi': 0.1}, {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.1, 'reg_qi': 0.1, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.1, 'reg_bi': 0.1}, {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.1, 'reg_qi': 0.1, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.1, 'reg_bi': 0.1}]
------------
Parameters combination 1 of 3
params:  {'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.1, 'reg_qi': 0.1, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.1, 'reg_bi': 0.1}
------------
M

------------
Mean MAE : 0.7706
Mean RMSE: 0.9846
Mean FCP : 0.6816
------------
------------
Parameters combination 2 of 3
params:  {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.05, 'reg_qi': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_bi': 0.05}
------------
Mean MAE : 0.7698
Mean RMSE: 0.9843
Mean FCP : 0.6827
------------
------------
Parameters combination 3 of 3
params:  {'n_factors': 25, 'n_epochs': 100, 'reg_pu': 0.05, 'reg_qi': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_bi': 0.05}
------------
Mean MAE : 0.7712
Mean RMSE: 0.9857
Mean FCP : 0.6789
------------
end time :  1509641815
regularization :  0.05  bias :  0.001
start time :  1509641815
[{'n_factors': 15, 'n_epochs': 100, 'reg_pu': 0.05, 'reg_qi': 0.05, 'lr_bu': 0.001, 'lr_bi': 0.001, 'reg_bu': 0.05, 'reg_bi': 0.05}, {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.05, 'reg_qi': 0.05, 'lr_bu': 0.001, 'lr_bi': 0.001, 'reg_bu': 0.05, 'reg_bi': 0.05}, {'n_factors': 25, 'n_epochs': 100, 'reg_pu'

for a biased version vary the number of factors and regularization close in a range close to optimal solution and vary the learning rate for the bias

In [11]:
for reg in res_tune:
    for bias in res_tune[reg]:
        c = 0
        print("regularization : ", reg, " bias : ", bias)
        s = ""
        for i in res_tune[reg][bias]['RMSE']:
            s+=str(latent_factors[c])+"||"+str(i)+"   "
            c+=1
        print(s)

regularization :  0.2  bias :  0.02
15||0.957955886225   20||0.956801639138   25||0.956965052735   
regularization :  0.2  bias :  0.01
15||0.957088244361   20||0.957191720697   25||0.956629829629   
regularization :  0.2  bias :  0.005
15||0.957868404408   20||0.956749298616   25||0.956374413264   
regularization :  0.2  bias :  0.002
15||0.956646400892   20||0.957080635227   25||0.955929301294   
regularization :  0.2  bias :  0.001
15||0.95726915137   20||0.95660908118   25||0.956713608124   
regularization :  0.1  bias :  0.02
15||0.952305422938   20||0.950052743508   25||0.948310202591   
regularization :  0.1  bias :  0.01
15||0.952300623538   20||0.950300946018   25||0.948769506117   
regularization :  0.1  bias :  0.005
15||0.951659542136   20||0.95090190507   25||0.947646116169   
regularization :  0.1  bias :  0.002
15||0.952707252012   20||0.951264994958   25||0.947228744207   
regularization :  0.1  bias :  0.001
15||0.951696110829   20||0.949736989011   25||0.947766130909 

the best solution is obtained for the model with the parameters
number of factors = 20
regularization = 0.1
and learning rate for bias = 0.001
not to learn the model with these parameters

In [17]:
trainset = dataset.build_full_trainset()
algo = pa.matrix_factorization.NMF(n_factors=20, n_epochs=100,reg_pu=0.1, reg_qi=0.1, reg_bu=0.1, reg_bi=0.1, lr_bu=0.001, lr_bi=0.001)
algo.train(trainset)
testset = dataset.construct_testset(B_raw_ratings)
predictions = algo.test(testset)
print('Unbiased accuracy on B,', accuracy.rmse(predictions))
accuracy.rmse(predictions)
accuracy.mae(predictions)
accuracy.fcp(predictions)
dump.dump('./nmf_algo',predictions,algo)

RMSE: 0.9384
Unbiased accuracy on B, 0.938448606268
RMSE: 0.9384
MAE:  0.7443
FCP:  0.7140
