### Import Libraries

* surprise package :  Simple Python RecommendatIon System Engine

In [43]:
import numpy as np
import gzip
import math
import operator
import random
import collections
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import train_test_split


    
# Recommender Systems Packages :
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
from surprise import accuracy
from surprise import GridSearch
from surprise import KNNBaseline
from surprise import SlopeOne
from surprise import BaselineOnly
from surprise import SlopeOne
from surprise import NMF

### Note: For model descriptions, optimization, and parameters, refer to readme.ipynb

## Read Data

In [6]:
# Function to lazily read a gzip-compressed file of Python-literal records
def readGz(f):
    """Yield one parsed record per non-blank line of the gzip file `f`.

    Each line is expected to hold a single Python literal (in this notebook,
    a dict describing one rating).

    Parameters
    ----------
    f : str
        Path to the gzip-compressed input file.

    Yields
    ------
    object
        The value obtained by parsing one line as a Python literal.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    # The context manager closes the handle when the generator is exhausted
    # or discarded; the previous version never closed it.
    with gzip.open(f) as handle:
        for line in handle:
            # gzip yields bytes under Python 3; literal_eval needs text.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            line = line.strip()
            if not line:
                continue
            # NOTE: this used to be eval(), which executes arbitrary code found
            # in the data file. literal_eval accepts the same literal records
            # but rejects anything that is not a plain literal.
            yield ast.literal_eval(line)

# Column-oriented store: one parallel list per field (item ID, user ID, rating)
ratings_dict = {'itemID': [], 'userID': [], 'rating': []}

# Stream the gzipped training records and append each field to its column.
# The source key 'businessID' is stored under the 'itemID' column.
for record in readGz("../Data/train.json.gz"):
    user_id = record['userID']
    business_id = record['businessID']
    stars = record['rating']
    ratings_dict['itemID'].append(business_id)
    ratings_dict['userID'].append(user_id)
    ratings_dict['rating'].append(stars)

# Materialize the collected columns as a DataFrame
df = pd.DataFrame(ratings_dict)


## Cross-Validation : BaselineOnly

### Stochastic Gradient Descent (SGD)
The parameters for SGD are :
* `reg` (or λ): The regularization parameter of the cost function that is optimized
* `learning_rate`: The learning rate of SGD
* `n_epochs`: The number of iterations of the SGD procedure

### Alternating Least Squares (ALS)
The parameters for ALS are :
* `reg_i`: The regularization parameter for items
* `reg_u`: The regularization parameter for users
* `n_epochs`: The number of iterations of the ALS procedure.

In [None]:
# Set parameter space for k-fold cross-validation

# n_epochs for Stochastic Gradient Descent and/or Alternating Least Squares
n_epochs_low = 20
n_epochs_high = 81
n_epochs_no_vals = 5
# Floor division keeps the step an integer. With true division the step is a
# float under Python 3 and range() raises TypeError.
n_epochs_step = (n_epochs_high - n_epochs_low) // n_epochs_no_vals

# Candidate epoch counts: 20, 32, ..., 80. The exclusive upper bound of 81
# lets 80 be included, so this yields 6 values rather than n_epochs_no_vals.
n_epochs = list(range(n_epochs_low, n_epochs_high, n_epochs_step))

# regularization parameter for user bias term, item bias term
reg_all_low = 0.01
reg_all_high = 0.5
reg_all_no_vals = 5
# Evenly spaced small values, followed by a handful of much larger ones
reg_all = np.append(
    np.linspace(reg_all_low, reg_all_high, reg_all_no_vals),
    [1, 5, 10, 50, 100],
)

# Wrap the ratings in surprise's Dataset container.
# Columns are passed in the order user, item, rating; ratings lie on a 0-5 scale.
rating_columns = ['userID', 'itemID', 'rating']
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[rating_columns], reader)

# Partition the dataset into 3 folds for 3-fold cross-validation
data.split(n_folds=3)

### Grid Search : Baseline : ALS

In [None]:
# Search space for the ALS baseline: every combination of epoch count and
# user/item regularization strength from the parameter-space cell
als_bsl_grid = {
    'method': ['als'],
    'n_epochs': n_epochs,
    'reg_u': reg_all,
    'reg_i': reg_all,
}
param_grid = {'bsl_options': als_bsl_grid}

# Evaluate each combination on `data`, recording both FCP and RMSE
grid_search = GridSearch(
    BaselineOnly,
    param_grid,
    measures=['FCP', 'rMSE'],
    verbose=False,
)
grid_search.evaluate(data)

In [10]:
# Best RMSE reached by the ALS baseline grid search
als_best_rmse = grid_search.best_score['RMSE']
print(als_best_rmse)

# Parameter combination that produced that RMSE
als_best_params = grid_search.best_params['RMSE']
print(als_best_params)

# Keep only the nested baseline options for the final model fit
als_baseline_param_estimate = grid_search.best_params['RMSE']['bsl_options']

0.767285474609
{'bsl_options': {'n_epochs': 44, 'reg_i': 5.0, 'method': 'als', 'reg_u': 5.0}}


### Grid Search : Baseline : SGD

In [None]:
# Search space for the SGD baseline: only the learning rate varies;
# regularization and epoch count are held at a single value each
sgd_bsl_grid = {
    'method': ['sgd'],
    'learning_rate': [.005, 0.001, 0.002, 0.003, 0.004],
    'reg': [0.01],
    'n_epochs': [80],
}
param_grid = {'bsl_options': sgd_bsl_grid}

# Evaluate each combination on `data`, recording both FCP and RMSE
grid_search1 = GridSearch(
    BaselineOnly,
    param_grid,
    measures=['FCP', 'rMSE'],
    verbose=True,
)
grid_search1.evaluate(data)

In [25]:
# best RMSE score of the SGD baseline grid search
print(grid_search1.best_score['RMSE'])

# combination of parameters that gave the best RMSE score
print(grid_search1.best_params['RMSE'])

# Store the best parameter estimates.
# These must come from grid_search1 (the SGD search). Reading them from
# grid_search would silently copy the ALS baseline's parameters instead.
sgd_baseline_param_estimate = grid_search1.best_params['RMSE']['bsl_options']

# Backward-compatible alias for the original "sdg" spelling of this name
sdg_baseline_param_estimate = sgd_baseline_param_estimate

0.768384257884
{'bsl_options': {'reg': 0.01, 'learning_rate': 0.002, 'method': 'sgd', 'n_epochs': 80}}


### Cross-Validation : Latent Factor Model  using SVD

The parameters for SVD are as follows :
* `n_factors` – The number of factors.
* `n_epochs` – The number of iterations of the SGD procedure.
* `lr_all` – The learning rate for all parameters.
* `reg_all` – The regularization term for all parameters.

In [None]:
# Search space for the SVD latent factor model: a single latent factor,
# two learning rates, and the shared epoch / regularization candidates
param_grid = {
    'n_epochs': n_epochs,
    'lr_all': [0.002, 0.005],
    'reg_all': [0.1],
    'reg_pu': reg_all,
    'reg_qi': reg_all,
    'n_factors': [1],
}

# Evaluate each combination on `data`, recording both RMSE and FCP
grid_search = GridSearch(
    SVD,
    param_grid,
    measures=['RMSE', 'FCP'],
    verbose=False,
)
grid_search.evaluate(data)

In [23]:
# best RMSE score
print(grid_search.best_score['RMSE'])

# combination of parameters that gave the best RMSE score
best_svd_params = grid_search.best_params['RMSE']
print(best_svd_params)

# Store the best parameters as a sequence in a fixed, explicit order.
# The model-fitting cell unpacks this positionally as
# (lr_all, reg_pu, n_epochs, reg_qi, reg_all, n_factors). Relying on
# dict.values() ordering could silently assign values to the wrong parameter.
SVD_param_estimate = [
    best_svd_params[name]
    for name in ('lr_all', 'reg_pu', 'n_epochs', 'reg_qi', 'reg_all', 'n_factors')
]

0.767938113581
{'lr_all': 0.005, 'reg_pu': 0.5, 'n_epochs': 36, 'reg_qi': 0.3775, 'reg_all': 0.1, 'n_factors': 1}


### Cross-Validation : KNNBaseline

The Parameters for KNNBaseline are as follows :

* `k` – The (max) number of neighbors to take into account for aggregation
* `sim_options` – Fixed for this task: `pearson_baseline` similarity
* `bsl_options` – Similar to baseline updates and parameters in previous section "BaselineOnly"

In [None]:
# Baseline-estimate options explored for KNNBaseline: both solvers, with the
# shared regularization and epoch candidates
knn_bsl_grid = {
    'method': ['als', 'sgd'],
    'reg': reg_all,
    'n_epochs': n_epochs,
}

# Similarity is held fixed: pearson_baseline with no shrinkage
knn_sim_grid = {
    'name': ['pearson_baseline'],
    'shrinkage': [0],
}

param_grid = {
    'bsl_options': knn_bsl_grid,
    'k': [5, 10, 15, 20, 25, 30],
    'sim_options': knn_sim_grid,
}

# Evaluate each combination on `data`, recording both FCP and RMSE
grid_search = GridSearch(
    KNNBaseline,
    param_grid,
    measures=['FCP', 'rMSE'],
    verbose=False,
)
grid_search.evaluate(data)

In [None]:
# Best RMSE reached by the KNNBaseline grid search
knn_best_rmse = grid_search.best_score['RMSE']
print(knn_best_rmse)

# Parameter combination that produced that RMSE; the whole mapping is kept
# because the final fit reads both 'bsl_options' and 'k' from it
KNNBaseline_param_estimate = grid_search.best_params['RMSE']
print(KNNBaseline_param_estimate)

### Cross-Validation : Non Negative Matrix Factorization (NMF)

* n_factors – The number of factors.
* n_epochs – The number of iterations of the SGD procedure.
* reg_pu – The regularization term for users λu
* reg_qi – The regularization term for items λi



In [None]:
# Candidate numbers of latent factors for NMF.
# `n_factors` was referenced here without ever being defined, so the cell
# raised NameError.
# NOTE(review): the values below are a placeholder choice (1 mirrors the SVD
# grid; 15 is presumably the library default for NMF) - confirm the intended
# search range.
n_factors = [1, 15]

# Search space for NMF: epochs, latent factors, and user/item regularization
param_grid = {
    'n_epochs': n_epochs,
    'n_factors': n_factors,
    'reg_pu': reg_all,
    'reg_qi': reg_all,
}

# Evaluate each combination on `data`, recording both FCP and RMSE
grid_search = GridSearch(NMF, param_grid,
                         measures=['FCP', 'rMSE'], verbose=False)
grid_search.evaluate(data)

In [None]:
# best RMSE score
print(grid_search.best_score['RMSE'])

# combination of parameters that gave the best RMSE score
best_nmf_params = grid_search.best_params['RMSE']
print(best_nmf_params)

# Store the best parameters as a sequence in a fixed, explicit order.
# The model-fitting cell unpacks this positionally as
# (n_epochs, n_factors, reg_pu, reg_qi). Relying on dict.values() ordering
# could silently assign values to the wrong parameter.
NMF_param_estimate = [
    best_nmf_params[name]
    for name in ('n_epochs', 'n_factors', 'reg_pu', 'reg_qi')
]

## Model fitting on entire train data

In [37]:
# Represent data in a form readable by surprise, then build a trainset that
# covers every rating (no folds held out)
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
trainset = data.build_full_trainset()

# Build each of the above algorithm models using the best parameters and train them

##################################
# SVD
##################################
# SVD_param_estimate is unpacked positionally, so the names below must follow
# the order in which the values were stored.
(best_lr, best_pu, best_n_epochs,
 best_qi, best_reg_all, best_n_factors) = SVD_param_estimate

# Keyword arguments must be bare identifiers. The previous quoted form
# ('lr_all' = ...) was a SyntaxError that stopped the whole cell from running.
algoSVD = SVD(lr_all=best_lr,
              reg_pu=best_pu,
              n_epochs=best_n_epochs,
              reg_qi=best_qi,
              reg_all=best_reg_all,
              n_factors=best_n_factors)
algoSVD.train(trainset)


##################################
# KNNBaseline
##################################
# Reuse the baseline options and neighbourhood size chosen by the grid search
bsl_options = KNNBaseline_param_estimate['bsl_options']
best_k = KNNBaseline_param_estimate['k']

# Similarity settings are the same fixed values used during the grid search
sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}

# Pass best_k; the previous code referenced the undefined name best_K and
# raised NameError.
algoKNNBaseline = KNNBaseline(k=best_k, bsl_options=bsl_options, sim_options=sim_options)
algoKNNBaseline.train(trainset)

##################################
# BaselineOnly : ALS
#################################

# Baseline options selected by the ALS grid search
bsl_options_als = als_baseline_param_estimate
algoBaselineALS = BaselineOnly(bsl_options=bsl_options_als)
algoBaselineALS.train(trainset)

##################################
# BaselineOnly : SGD
#################################

# The stored SGD estimate is bound to `sdg_baseline_param_estimate` (spelled
# that way where it is assigned). The previous reference to
# `sgd_baseline_param_estimate` raised NameError.
bsl_options_sgd = sdg_baseline_param_estimate
algoBaselineSGD = BaselineOnly(bsl_options=bsl_options_sgd)
algoBaselineSGD.train(trainset)

##################################
# SlopeOne
#################################

# SlopeOne is constructed with no arguments, so no grid search was run for it
algoSlopeOne = SlopeOne()
algoSlopeOne.train(trainset)

################################
# NMF
################################
# NMF_param_estimate is unpacked positionally, so the names below must follow
# the order in which the values were stored.
(best_n_epochs, best_n_factors, best_reg_pu, best_reg_qi) = NMF_param_estimate

# reg_qi takes best_reg_qi; the previous code referenced the misspelled,
# undefined name best_req_qi and raised NameError.
algoNMF = NMF(n_factors=best_n_factors,
              reg_qi=best_reg_qi,
              reg_pu=best_reg_pu,
              n_epochs=best_n_epochs)
algoNMF.train(trainset)



Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using sgd...


## Prediction on Testset

## Weighting Approach

An ensemble model which averages the predicted ratings $\hat{r}_{ui}$ from all 6 models built above. Instead of equal weights for all 6 models, the weights can also be learnt through grid search.

In [None]:
# Ensemble members paired with their weights. Every model is weighted by 1,
# which makes the ensemble a plain average. The weights can be treated as
# further parameters and estimated by cross-validation.
# Defined once here because they do not change between test pairs.
ensemble = [
    (algoSVD, 1),
    (algoBaselineALS, 1),
    (algoBaselineSGD, 1),
    (algoKNNBaseline, 1),
    (algoSlopeOne, 1),
    (algoNMF, 1),
]
total_weight = float(sum(weight for _, weight in ensemble))

# Context managers close both files even if a prediction raises part-way.
with open("../Data/pairs_Rating.txt") as pairs, \
        open("Submissions/predictions_Rating_als_sgd_svd_knn_slope_nmf.txt", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            # header: copied through unchanged
            predictions.write(l)
            continue
        if not l.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        u, i = l.strip().split('-')

        # Weighted sum of the rating estimates from all 6 models.
        # `.est` is the estimated rating carried by each prediction.
        weighted_sum = sum(weight * algo.predict(u, i).est for algo, weight in ensemble)
        r = weighted_sum / total_weight
        predictions.write(u + '-' + i + ',' + str(r) + '\n')