In [1]:
import pandas as pd
import random
import pickle
import os

from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import BaselineOnly
from surprise import GridSearch
from surprise import accuracy
from surprise import dump

In [2]:
# train.csv is read with header=None (no header row is consumed), so the
# three columns are labelled explicitly afterwards.
df = pd.read_csv('train.csv', header=None)
df.columns = ['userId', 'itemId', 'rating']

In [3]:
# Load the full dataset.
# Seed the RNG first so the shuffle below -- and therefore the A/B split --
# is identical on every Restart & Run All. Without a seed each run holds out
# a different 20% and the reported scores are not reproducible.
random.seed(0)

reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
raw_ratings = data.raw_ratings

# Shuffle in place so the split below is not tied to the file's row order.
random.shuffle(raw_ratings)

# A = first 80% of the shuffled ratings, B = remaining 20% (held out).
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

# From here on `data` only contains set A; it is split into 3 folds.
data.raw_ratings = A_raw_ratings
data.split(n_folds=3)

In [89]:
# Grid search over the ALS baseline options, measured by RMSE.
print('Grid Search...')
als_options = {
    'method': ['als'],
    'n_epochs': [20, 50, 100],
    'reg_u': [5, 15, 100],
    'reg_i': [5, 10, 100],
}
param_grid = {'bsl_options': als_options}
grid_search = GridSearch(BaselineOnly, param_grid, measures=['RMSE'], verbose=0)
grid_search.evaluate(data)

# Keep the estimator the search reports as best for RMSE.
algo = grid_search.best_estimator['RMSE']

Grid Search...
[{'bsl_options': {'n_epochs': 20, 'reg_i': 5, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 50, 'reg_i': 5, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 100, 'reg_i': 5, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 20, 'reg_i': 10, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 50, 'reg_i': 10, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 100, 'reg_i': 10, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 20, 'reg_i': 100, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 50, 'reg_i': 100, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 100, 'reg_i': 100, 'method': 'als', 'reg_u': 5}}, {'bsl_options': {'n_epochs': 20, 'reg_i': 5, 'method': 'als', 'reg_u': 15}}, {'bsl_options': {'n_epochs': 50, 'reg_i': 5, 'method': 'als', 'reg_u': 15}}, {'bsl_options': {'n_epochs': 100, 'reg_i': 5, 'method': 'als', 'reg_u': 15}}, {'bsl_options': {'n_epochs': 20, 'reg_i': 10, 'method': 

In [97]:
# Persist the grid-search results. The context manager closes the file even
# if pickling raises; the previous bare open() never closed the handle.
with open("baseline_result", "wb") as fh:
    pickle.dump(grid_search.cv_results, fh)

In [9]:
# Retrain on the whole of set A: `data.raw_ratings` was restricted to A
# earlier, so the full trainset built here is A without any fold held out.
trainset = data.build_full_trainset()
algo.train(trainset)

Estimating biases using als...


In [99]:
# Compute biased accuracy on A: the testset is built from the very trainset
# the algorithm was just fitted on, so this RMSE is an optimistic estimate.
predictions = algo.test(trainset.build_testset())
accuracy.rmse(predictions)

RMSE: 0.8470


0.84698115006857355

In [101]:
# Compute unbiased accuracy on B: these ratings were split off before the
# grid search and the retraining, so the algorithm has not seen them.
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8594


0.8593878321944346

In [104]:
# Dump the fitted algorithm together with its predictions on set B.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# dump file may be missing or incomplete -- re-run before relying on it.
file_name = os.path.expanduser('dump_BaselineAlgo')
dump.dump(file_name, predictions, algo=algo)

KeyboardInterrupt: 

In [8]:
# Reload the algorithm from the dump file. The first element of the loaded
# pair is discarded (presumably the dumped predictions -- confirm against
# the Surprise `dump.load` documentation).
_, algo = dump.load('dump_BaselineAlgo')

### Train on the final_train dataset

In [93]:
# final_train.csv is read with header=None, so the columns are labelled
# explicitly. This rebinds `df`, replacing the train.csv frame.
df = pd.read_csv('final_train.csv', header=None)
df.columns = ['userId', 'itemId', 'rating']

In [94]:
# Wrap the final_train frame in a Surprise Dataset, using the same
# 0.5-5 rating scale as before. Rebinds `reader` and `data`.
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)

In [95]:
# Build a trainset from every rating in `data` (no split this time).
# Rebinds `trainset`; the id-mapping cells further down read from it.
trainset = data.build_full_trainset()

In [96]:
# Refit the previously selected algorithm on the final_train trainset.
algo.train(trainset)

Estimating biases using als...


In [105]:
# Dump the algorithm fitted on final_train (no predictions are passed here).
file_name = os.path.expanduser('dump_finalBaselineAlgo')
dump.dump(file_name, algo=algo)

In [151]:
# Reload the final algorithm from its dump file; the first element of the
# loaded pair is discarded.
_, algo = dump.load('dump_finalBaselineAlgo')

### Predicted values for the training set

#### (a) Row-by-row prediction with `DataFrame.apply`

In [170]:
# Predict a rating for every (user, item) pair in df.
# Ids are read by column label and the estimate by field name (.est) rather
# than by position (x[0], x[1], [3]); positional lookups on a label-indexed
# row are deprecated in pandas. Iterating the two id columns directly also
# avoids building one Series per row, which made the apply(axis=1) version
# slow enough to be interrupted.
df['pred'] = [algo.predict(uid, iid).est
              for uid, iid in zip(df['userId'], df['itemId'])]

KeyboardInterrupt: 

#### (b) Batch prediction with `algo.test`

In [171]:
# Re-read final_train.csv from scratch so `df` holds only the three rating
# columns again (any 'pred' column added above is dropped by the rebind).
df = pd.read_csv('final_train.csv', header=None)
df.columns = ['userId', 'itemId', 'rating']

In [172]:
# Load the full dataset.
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)

In [173]:
# All ratings of final_train in the Dataset's raw form (not shuffled here).
raw_ratings = data.raw_ratings

In [174]:
# Turn the training ratings into testset form and predict all of them in
# one call; despite its name, `train_dataset` is what `algo.test` consumes.
train_dataset = data.construct_testset(raw_ratings)  
predictions = algo.test(train_dataset)

In [180]:
# RMSE of the predictions on final_train, i.e. on the data the model was fitted on.
accuracy.rmse(predictions)

RMSE: 0.8496


0.84957029131673034

In [181]:
# MAE of the same final_train predictions.
accuracy.mae(predictions)

MAE:  0.6521


0.65212994575808203

In [179]:
# Write one "uid,iid,estimate" line per prediction, with the estimate
# rounded to 3 decimals. The context manager guarantees the file is flushed
# and closed even if a write fails part-way through.
with open('prediction_final_train', 'w') as f:
    for pred in predictions:
        f.write(str(pred.uid)+','+str(pred.iid)+','+str(round(pred.est,3))+'\n')






### Map inner ids to raw ids for the user and item biases

In [97]:
# Grab the trainset's id lookup tables for items and users.
# These are underscore-prefixed (private) attributes, so they may change
# between Surprise versions. As the names indicate, they map raw id -> inner id.
raw2inner_id_items_dict = trainset._raw2inner_id_items
raw2inner_id_users_dict = trainset._raw2inner_id_users

In [98]:
# Invert the item lookup so that it goes inner id -> raw id.
# (The variable keeps its "raw2inner" name, but its keys are inner ids.)
new_raw2inner_id_items_dict = {
    inner_id: raw_id
    for raw_id, inner_id in raw2inner_id_items_dict.items()
}

In [99]:
# Invert the user lookup so that it goes inner id -> raw id.
# (The variable keeps its "raw2inner" name, but its keys are inner ids.)
new_raw2inner_id_users_dict = {
    inner_id: raw_id
    for raw_id, inner_id in raw2inner_id_users_dict.items()
}

In [100]:
def map_to_raw_id(l, d):
    """Pair each value in ``l`` with the raw id it belongs to.

    Parameters
    ----------
    l : sequence
        Values assumed to be indexed by inner id, i.e. position ``i`` holds
        the value for inner id ``i`` (this is how the bias arrays are used
        by the callers in this notebook).
    d : dict
        Mapping from inner id to raw id.

    Returns
    -------
    list of tuple
        ``(raw_id, value)`` pairs, in inner-id order.

    Raises
    ------
    KeyError
        If some position of ``l`` has no corresponding inner id in ``d``.
    """
    # Look the raw id up by inner id instead of zipping against d.values():
    # the zip only lined up when the dict happened to iterate in inner-id
    # order. Returning a real list (not a lazy zip) also keeps the callers'
    # in-place .sort() working on Python 3.
    return [(d[inner_id], value) for inner_id, value in enumerate(l)]

In [108]:
# Item biases paired with raw item ids, ordered by raw id, then pickled.
bi = algo.bi
# sorted() accepts any iterable, so this works whether map_to_raw_id returns
# a list or a lazy zip object; calling .sort() on a zip raises on Python 3.
rawid_bi = sorted(map_to_raw_id(bi, new_raw2inner_id_items_dict))
# The context manager closes the output file even if pickling fails.
with open('final_rawidItemBias', 'wb') as fh:
    pickle.dump(rawid_bi, fh)

In [109]:
# User biases paired with raw user ids, ordered by raw id, then pickled.
bu = algo.bu
# sorted() accepts any iterable, so this works whether map_to_raw_id returns
# a list or a lazy zip object; calling .sort() on a zip raises on Python 3.
rawid_bu = sorted(map_to_raw_id(bu, new_raw2inner_id_users_dict))
# The context manager closes the output file even if pickling fails.
with open('final_rawidUserBias', 'wb') as fh:
    pickle.dump(rawid_bu, fh)

### Predictions on the test dataset

In [141]:
# final_test.csv is read with header=None, so the columns are labelled
# explicitly. This rebinds `df` to the test ratings.
df = pd.read_csv('final_test.csv', header=None)
df.columns = ['userId', 'itemId', 'rating']

In [145]:
# Predict a rating for every (user, item) pair of the test frame.
# Ids are read by column label and the estimate by field name (.est) rather
# than by position (x[0], x[1], [3]); positional lookups on a label-indexed
# row are deprecated in pandas. Iterating the two id columns directly also
# avoids building one Series per row as df.apply(axis=1) does.
df['pred'] = [algo.predict(uid, iid).est
              for uid, iid in zip(df['userId'], df['itemId'])]

In [148]:
# Preview the first rows: the true rating next to the predicted one.
df.head()

Unnamed: 0,userId,itemId,rating,pred
0,51,1512,3.0,2.39035
1,51,1425,2.0,2.939926
2,51,2371,3.0,3.612307
3,51,1107,3.0,3.571033
4,51,2532,4.0,2.94219


In [149]:
# Save the test ratings together with their predictions.
# NOTE(review): no `index=` argument is given, so with pandas defaults the
# row index is written as an extra first column -- confirm that is wanted.
df.to_csv('test_prediction.csv')

### RMSE and MAE on the test dataset

In [158]:
# Load the full dataset.
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)

In [159]:
# All ratings of final_test in the Dataset's raw form.
raw_ratings = data.raw_ratings

In [161]:
# Build the testset from the final_test Dataset and predict it in one call.
# `testdata` was never defined anywhere in this notebook (NameError on a
# fresh kernel); the Dataset built from the test frame is `data`.
testset = data.construct_testset(raw_ratings)
predictions = algo.test(testset)

In [162]:
# RMSE of the predictions on final_test.
accuracy.rmse(predictions)

RMSE: 0.8588


0.85884422741116728

In [163]:
# MAE of the same final_test predictions.
accuracy.mae(predictions)

MAE:  0.6600


0.66000707442924811