In [6]:
import pandas as pd
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader, GridSearch, accuracy
from surprise import evaluate, print_perf
import time,random
from collections import defaultdict


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.

    Args:
        predictions: iterable of 5-tuples
            ``(uid, iid, true_rating, estimated_rating, details)``; only the
            user id and the two ratings are used.
        k: number of highest-estimated items per user that count as the
            recommendation list.
        threshold: a rating ``>= threshold`` is treated as relevant (for the
            true rating) or recommended (for the estimated rating).

    Returns:
        A pair ``(precisions, recalls)`` of dicts keyed by user id. A user
        with no recommended item in the top k gets precision 1, and a user
        with no relevant item gets recall 1.
    '''
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items (over all of the user's ratings).
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        # Number of recommended items in top k.
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        # Number of relevant and recommended items in top k.
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # The counts are integers, so force true division: under Python 2
        # a plain `/` would floor every ratio to 0 or 1.
        # Precision@K: proportion of recommended items that are relevant.
        precisions[uid] = float(n_rel_and_rec_k) / n_rec_k if n_rec_k != 0 else 1
        # Recall@K: proportion of relevant items that are recommended.
        recalls[uid] = float(n_rel_and_rec_k) / n_rel if n_rel != 0 else 1
    return precisions, recalls

# Rating tables of increasing size, one CSV per scale point.
one_hundred_k_data = pd.read_csv('./ml-100k/data.csv')
two_hundred_fifty_data = pd.read_csv("ratingsNew_250k.csv")
five_hundred_data = pd.read_csv("ratingsNew_500k.csv")
seven_hundred_fifty_data = pd.read_csv("ratingsNew_750k.csv")
one_million_data = pd.read_csv("./movielens_1M/ratings.csv")

# `datasets` and `datasets_names` are parallel lists: entry i of one
# describes entry i of the other.
datasets = [one_hundred_k_data, two_hundred_fifty_data, five_hundred_data, seven_hundred_fifty_data, one_million_data]
datasets_names = ["100k", "250k", "500k", "750k", "1M"]

# The rating scale is the same for every table, so one Reader is shared.
reader = Reader(rating_scale=(1, 5))

# Only the user, item and rating columns are handed to surprise, so they are
# selected directly instead of copying each frame and dropping 'timestamp'.
surprise_datasets = [
    Dataset.load_from_df(each_dataset[['userId', 'movieId', 'rating']], reader)
    for each_dataset in datasets
]


# Hyper-parameter candidates for SVD; the grid search tries every combination.
param_grid = {
    'n_factors': [80, 100, 120, 140, 160],
    'n_epochs': [50],
    'reg_all': [0.1, 0.05, 0.02, 0.01],
    'lr_all': [0.005, 0.002],
}
grid_search = GridSearch(
    pa.matrix_factorization.SVD,
    param_grid=param_grid,
    measures=['MAE', 'RMSE', 'FCP'],
)

# One list per output column; each processed dataset appends one value to
# every list, and the dict is turned into a DataFrame at the end.
results_dict = {
    column: []
    for column in ("Dataset", "Average_Running_Time", "RMSE", "MAE",
                   "Precision", "Recall", "Running_Time")
}

for i in range(len(surprise_datasets)):
    
    dataset = surprise_datasets[i]
    raw_ratings = surprise_datasets[i].raw_ratings
    random.shuffle(raw_ratings)
    threshold = int(.8 * len(raw_ratings))
    A_raw_ratings = raw_ratings[:threshold]
    B_raw_ratings = raw_ratings[threshold:]
    dataset.raw_ratings = A_raw_ratings
    dataset.split(n_folds=5)
    
    start_time = int(time.time())
    grid_search.evaluate(dataset)
    end_time = int(time.time())
    time_taken = (end_time-start_time)/40.0
    
    best_score = grid_search.best_score["RMSE"]
    best_params = grid_search.best_params["RMSE"]
    print best_score,best_params
    
    start_time = int(time.time())
    trainset = dataset.build_full_trainset()
    algo = pa.matrix_factorization.SVD(n_factors=best_params['n_factors'], n_epochs=50, biased=True, 
                                       reg_all=best_params['reg_all'], lr_all=best_params['lr_all'])
    algo.train(trainset)
    end_time = int(time.time())
    running_time = end_time-start_time
    
    testset = dataset.construct_testset(B_raw_ratings)
    predictions = algo.test(testset)
    
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    precisions, recalls = precision_recall_at_k(predictions=predictions, k=10, threshold=3)
    precision_percent = (sum(prec for prec in precisions.values()) / float(len(precisions)))
    recall_percent = (sum(rec for rec in recalls.values()) / float(len(recalls)))
    print "Precision, Recall: ", precision_percent, recall_percent
    
    
    results_dict["Dataset"].append(datasets_names[i])
    results_dict["Average_Running_Time"].append(time_taken)
    results_dict["RMSE"].append(rmse)
    results_dict["MAE"].append(mae)
    results_dict["Precision"].append(precision_percent)
    results_dict["Recall"].append(recall_percent)
    results_dict["Running_Time"].append(running_time)
    print "Average RMSE: ", rmse
    print "Average MAE: ", mae
    print "Average Precision: ", precision_percent
    print "Average Recall: ", recall_percent
    print "Time Difference: ", running_time
    
results_df = pd.DataFrame(results_dict)
results_df.to_csv("scale_svd_results.csv")


[{'lr_all': 0.005, 'reg_all': 0.1, 'n_factors': 80, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.1, 'n_factors': 100, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.1, 'n_factors': 120, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.1, 'n_factors': 140, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.1, 'n_factors': 160, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.05, 'n_factors': 80, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.05, 'n_factors': 100, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.05, 'n_factors': 120, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.05, 'n_factors': 140, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.05, 'n_factors': 160, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.02, 'n_factors': 80, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.02, 'n_factors': 100, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.02, 'n_factors': 120, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all': 0.02, 'n_factors': 140, 'n_epochs': 50}, {'lr_all': 0.005, 'reg_all'

Mean FCP : 0.6932
------------
------------
Parameters combination 28 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 120, 'n_epochs': 50}
------------
Mean MAE : 0.7436
Mean RMSE: 0.9396
Mean FCP : 0.6937
------------
------------
Parameters combination 29 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 140, 'n_epochs': 50}
------------
Mean MAE : 0.7443
Mean RMSE: 0.9409
Mean FCP : 0.6897
------------
------------
Parameters combination 30 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 160, 'n_epochs': 50}
------------
Mean MAE : 0.7431
Mean RMSE: 0.9388
Mean FCP : 0.6934
------------
------------
Parameters combination 31 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.02, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.7441
Mean RMSE: 0.9425
Mean FCP : 0.6862
------------
------------
Parameters combination 32 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.02, 'n_factors': 100, 'n_epochs': 50}
------------
Mean MAE : 0.7452
Mean

Mean FCP : 0.7135
------------
------------
Parameters combination 27 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 100, 'n_epochs': 50}
------------
Mean MAE : 0.7154
Mean RMSE: 0.9044
Mean FCP : 0.7149
------------
------------
Parameters combination 28 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 120, 'n_epochs': 50}
------------
Mean MAE : 0.7153
Mean RMSE: 0.9046
Mean FCP : 0.7143
------------
------------
Parameters combination 29 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 140, 'n_epochs': 50}
------------
Mean MAE : 0.7151
Mean RMSE: 0.9041
Mean FCP : 0.7157
------------
------------
Parameters combination 30 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 160, 'n_epochs': 50}
------------
Mean MAE : 0.7153
Mean RMSE: 0.9043
Mean FCP : 0.7157
------------
------------
Parameters combination 31 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.02, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.7171
Mean

Mean FCP : 0.7178
------------
------------
Parameters combination 26 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.7039
Mean RMSE: 0.8900
Mean FCP : 0.7244
------------
------------
Parameters combination 27 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 100, 'n_epochs': 50}
------------
Mean MAE : 0.7031
Mean RMSE: 0.8891
Mean FCP : 0.7254
------------
------------
Parameters combination 28 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 120, 'n_epochs': 50}
------------
Mean MAE : 0.7029
Mean RMSE: 0.8889
Mean FCP : 0.7245
------------
------------
Parameters combination 29 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 140, 'n_epochs': 50}
------------
Mean MAE : 0.7021
Mean RMSE: 0.8878
Mean FCP : 0.7265
------------
------------
Parameters combination 30 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 160, 'n_epochs': 50}
------------
Mean MAE : 0.7024
Mean

Mean MAE : 0.7121
Mean RMSE: 0.8970
Mean FCP : 0.7238
------------
------------
Parameters combination 26 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.6993
Mean RMSE: 0.8838
Mean FCP : 0.7329
------------
------------
Parameters combination 27 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 100, 'n_epochs': 50}
------------
Mean MAE : 0.6985
Mean RMSE: 0.8828
Mean FCP : 0.7342
------------
------------
Parameters combination 28 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 120, 'n_epochs': 50}
------------
Mean MAE : 0.6980
Mean RMSE: 0.8823
Mean FCP : 0.7347
------------
------------
Parameters combination 29 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 140, 'n_epochs': 50}
------------
Mean MAE : 0.6973
Mean RMSE: 0.8816
Mean FCP : 0.7349
------------
------------
Parameters combination 30 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 160, 'n_epochs': 50}

Mean FCP : 0.7286
------------
------------
Parameters combination 25 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.1, 'n_factors': 160, 'n_epochs': 50}
------------
Mean MAE : 0.7117
Mean RMSE: 0.8973
Mean FCP : 0.7294
------------
------------
Parameters combination 26 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.6973
Mean RMSE: 0.8826
Mean FCP : 0.7390
------------
------------
Parameters combination 27 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 100, 'n_epochs': 50}
------------
Mean MAE : 0.6961
Mean RMSE: 0.8811
Mean FCP : 0.7401
------------
------------
Parameters combination 28 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 120, 'n_epochs': 50}
------------
Mean MAE : 0.6955
Mean RMSE: 0.8803
Mean FCP : 0.7409
------------
------------
Parameters combination 29 of 40
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 140, 'n_epochs': 50}
------------
Mean MAE : 0.6954
Mean 

Run the SVD model on the 100K, 250K, 500K, 750K and 1M datasets and output the RMSE, MAE, precision, recall and running time for each dataset.