In [77]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
from pyspark.sql.functions import col
import math
import random
import itertools
import copy
from joblib import Parallel, delayed
import multiprocessing
import pickle
import scipy.optimize as sco

from cross_validator import *
from models.als import *
from models.means import *
from models.medians import *
from models.helpers_scipy import *
from models.MF_RR import *
from models.MF_SGD import *
from models.pyfm import *
from models.surprise_models import *
from helpers import *

%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Allow up to 100 columns when pandas renders a wide DataFrame.
pd.options.display.max_columns = 100
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the PySpark kernel -- confirm before running on a plain kernel.
sc.setCheckpointDir('./checkpoint/')

In [78]:
# Load the data set through load_csv (not defined in this notebook; presumably one of
# the star-imported project helpers) and preview the first rows.
df = load_csv()
df.head()

Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [79]:
# Registry of candidate models.
# Each entry maps a model name to the callable that produces its predictions
# ('function') and the keyword arguments forwarded to that callable ('params').
models = {
    # --- entries that take no keyword arguments: means / medians ---
    'global_mean': {'function': global_mean, 'params': {}},
    'global_median': {'function': global_median, 'params': {}},
    'user_mean': {'function': user_mean, 'params': {}},
    'user_median': {'function': user_median, 'params': {}},
    'movie_mean': {'function': movie_mean, 'params': {}},
    'movie_mean_rescaled': {'function': movie_mean_rescaled, 'params': {}},
    'movie_median': {'function': movie_median, 'params': {}},
    'movie_median_rescaled': {'function': movie_median_rescaled, 'params': {}},
    'movie_mean_deviation_user': {'function': movie_mean_deviation_user, 'params': {}},
    'movie_mean_deviation_user_rescaled': {'function': movie_mean_deviation_user_rescaled, 'params': {}},
    'movie_median_deviation_user': {'function': movie_median_deviation_user, 'params': {}},
    'movie_median_deviation_user_rescaled': {'function': movie_median_deviation_user_rescaled, 'params': {}},

    # --- ALS (needs the Spark context `sc`) ---
    'als': {
        'function': predictions_ALS,
        'params': {'spark_context': sc, 'rank': 8, 'lambda_': 0.081, 'iterations': 24, 'nonnegative': True},
    },
    'als_rescaled': {
        'function': predictions_ALS_rescaled,
        'params': {'spark_context': sc, 'rank': 8, 'lambda_': 0.081, 'iterations': 24, 'nonnegative': True},
    },

    # --- mf_rr / mf_sgd ---
    'mf_rr': {
        'function': mf_RR,
        'params': {'movie_features': 20, 'alpha': 19},
    },
    'mf_rr_rescaled': {
        'function': mf_RR_rescaled,
        'params': {'movie_features': 20, 'alpha': 19},
    },
    'mf_sgd': {
        'function': mf_SGD,
        'params': {'gamma': 0.004, 'n_features': 20, 'n_iter': 20, 'init_method': 'global_mean'},
    },
    'mf_sgd_rescaled': {
        'function': mf_SGD_rescaled,
        'params': {'gamma': 0.004, 'n_features': 20, 'n_iter': 20, 'init_method': 'global_mean'},
    },

    # --- pyfm ---
    'pyfm': {
        'function': pyfm,
        'params': {'num_factors': 20, 'num_iter': 200, 'init_lr': 0.001},
    },
    'pyfm_rescaled': {
        'function': pyfm_rescaled,
        'params': {'num_factors': 20, 'num_iter': 200, 'init_lr': 0.001},
    },

    # --- knn: item-based (user_based=False, k=60) and user-based (user_based=True, k=300) ---
    'knn_ib': {
        'function': knn,
        'params': {'k': 60, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}},
    },
    'knn_ib_rescaled': {
        'function': knn_rescaled,
        'params': {'k': 60, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}},
    },
    'knn_ub': {
        'function': knn,
        'params': {'k': 300, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}},
    },
    'knn_ub_rescaled': {
        'function': knn_rescaled,
        'params': {'k': 300, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}},
    },

    # --- svd / slope_one / baseline ---
    'svd': {
        'function': svd,
        'params': {'n_epochs': 30, 'lr_all': 0.001, 'reg_all': 0.001},
    },
    'svd_rescaled': {
        'function': svd_rescaled,
        'params': {'n_epochs': 30, 'lr_all': 0.001, 'reg_all': 0.001},
    },
    'slope_one': {'function': slope_one, 'params': {}},
    'slope_one_rescaled': {'function': slope_one_rescaled, 'params': {}},
    'baseline': {'function': baseline, 'params': {}},
    'baseline_rescaled': {'function': baseline_rescaled, 'params': {}},
}

# Optional pruning of the registry (kept disabled, as in the original cell):
#to_pop = ['mf_rr', 'global_median', 'movie_median_deviation_user',
#            'movie_median_deviation_user_rescaled', 'user_median',
#            'movie_mean_rescaled', 'mf_rr_rescaled', 'user_mean',
#            'mf_sgd', 'movie_median', 'movie_mean', 'movie_median_rescaled',
#            'slope_one_rescaled', 'baseline_rescaled', 'slope_one',
#            'movie_mean_deviation_user_rescaled', 'global_mean']
#for i in to_pop:
#    models.pop(i)

# Number of registered models (displayed as the cell output).
len(models)

30

In [91]:
def eval_(x, cv, models):
    """Objective function for the blending-weight optimiser.

    Pairs each candidate weight with a model name and asks the cross-validator
    to score the resulting blend.

    Parameters
    ----------
    x : sequence of float
        Candidate weights; ``x[i]`` is the weight given to the i-th name
        in ``models``.
    cv : object
        Must provide ``evaluate_blending(weights_by_name)``.
    models : iterable of str
        Model names, in the same order as the entries of ``x``. (Iterating a
        dict yields its keys, so a name-keyed dict works as well.)

    Returns
    -------
    Whatever ``cv.evaluate_blending`` returns for the assembled weights.

    Notes
    -----
    The previous version ignored ``models`` and read the notebook-level global
    ``model_names`` instead, so the result depended on hidden kernel state.
    """
    # Index x explicitly (rather than zip) so that a weight vector shorter than
    # the name list still fails loudly instead of being silently truncated.
    weights_by_name = {name: x[idx] for idx, name in enumerate(models)}
    return cv.evaluate_blending(weights_by_name)

def test_blending(cv, best_dict):
    cv.evaluation_all_models()
        
    print()
    rmse = cv.evaluate_blending(best_dict)
    print("Best blending: %s"%best_dict)
    print("RMSE best blending: %.5f"%rmse)

In [93]:
# Create the cross-validation helper used by all following cells.
cv = CrossValidator()
# One-off step, kept disabled: presumably (re)shuffles df into 5 folds and stores the
# indices that load_indices() reads back -- TODO confirm against cross_validator.py.
#cv.shuffle_indices_and_store(df, 5)
# The return values are bound to `a` / `b` but are not used in the cells shown here.
a = cv.load_indices()
b = cv.define_ground_truth(df)

In [88]:
# Models for which ceil / round / floor variations are requested.
variations = [
    'mf_rr',
    'slope_one_rescaled',
    'movie_mean_deviation_user',
    'mf_sgd_rescaled',
    'movie_mean_rescaled',
    'als_rescaled',
    'mf_rr_rescaled',
    'user_mean',
    'knn_ib_rescaled',
    'mf_sgd',
    'als',
    'pyfm_rescaled',
    'baseline',
    'movie_mean',
    'knn_ub',
    'svd_rescaled',
    'baseline_rescaled',
    'slope_one',
    'svd',
    'movie_mean_deviation_user_rescaled',
    'pyfm',
    'knn_ib',
    'knn_ub_rescaled',
]

# Same call order as before: ceil, then round, then floor.
var_ceil = cv.models_ceil(variations)
var_round = cv.models_round(variations)
var_floor = cv.models_floor(variations)

Ceil Variation for model mf_rr
Ceil Variation for model slope_one_rescaled
Ceil Variation for model movie_mean_deviation_user
Ceil Variation for model mf_sgd_rescaled
Ceil Variation for model movie_mean_rescaled
Ceil Variation for model als_rescaled
Ceil Variation for model mf_rr_rescaled
Ceil Variation for model user_mean
Ceil Variation for model knn_ib_rescaled
Ceil Variation for model mf_sgd
Ceil Variation for model als
Ceil Variation for model pyfm_rescaled
Ceil Variation for model baseline
Ceil Variation for model movie_mean
Ceil Variation for model knn_ub
Ceil Variation for model svd_rescaled
Ceil Variation for model baseline_rescaled
Ceil Variation for model slope_one
Ceil Variation for model svd
Ceil Variation for model movie_mean_deviation_user_rescaled
Ceil Variation for model pyfm
Ceil Variation for model knn_ib
Ceil Variation for model knn_ub_rescaled
Round Variation for model mf_rr
Round Variation for model slope_one_rescaled
Round Variation for model movie_mean_deviation_

In [98]:
# Names of the registered models only (iterating a dict yields its keys).
model_names = list(models)
len(model_names)

30

In [102]:
# Registered model names followed by the ceil, round and floor variation names,
# in that order.
model_names = list(itertools.chain(models, var_ceil, var_round, var_floor))
len(model_names)

99

In [61]:
# Run (or reuse, since override=False) the k-fold predictions of every registered model.
total = len(models)
for position, (mdl_name, mdl_cfg) in enumerate(models.items(), start=1):
    print("Predict and store for Model %s (%i/%i)"%(mdl_name, position, total))
    cv.k_fold_predictions_and_store(
        df,
        mdl_cfg['function'],
        mdl_name,
        override=False,
        **mdl_cfg['params']
    )

Predict and store for Model mf_rr (1/30)
Predict and store for Model slope_one_rescaled (2/30)
Predict and store for Model movie_mean_deviation_user (3/30)
Predict and store for Model knn_ib (4/30)
Predict and store for Model movie_median_deviation_user (5/30)
Predict and store for Model movie_median_deviation_user_rescaled (6/30)
Predict and store for Model mf_sgd_rescaled (7/30)
Predict and store for Model movie_median_rescaled (8/30)
Predict and store for Model movie_mean_rescaled (9/30)
Predict and store for Model global_mean (10/30)
Predict and store for Model mf_rr_rescaled (11/30)
Predict and store for Model user_mean (12/30)
Predict and store for Model knn_ib_rescaled (13/30)
Predict and store for Model mf_sgd (14/30)
Predict and store for Model als (15/30)
Predict and store for Model pyfm_rescaled (16/30)
Predict and store for Model baseline (17/30)
Predict and store for Model movie_median (18/30)
Predict and store for Model movie_mean (19/30)
Predict and store for Model user_

In [103]:
# Ask the cross-validator to load the predictions for every name in model_names.
# NOTE(review): the result is bound to `a` but not used in the cells shown here --
# presumably only to suppress the cell output; confirm.
a = cv.load_predictions(model_names)

In [104]:
# Start from uniform weights (1/n for each of the n names) and let SLSQP search
# for the weight vector that minimises eval_. No bounds or constraints are set.
n_weights = len(model_names)
x0 = (1/n_weights) * np.ones(n_weights)
res = sco.minimize(
    eval_,
    x0,
    args=(cv, model_names),
    method='SLSQP',
    options={'maxiter': 1000, 'disp': True},
)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.959536335331
            Iterations: 153
            Function evaluations: 15457
            Gradient evaluations: 153


In [105]:
# Optimised weight vector; entry i belongs to the i-th name in model_names.
res.x

array([  3.85128979e-02,   3.51114521e-01,   9.93604205e-01,
         1.09702320e-02,   9.43857913e-01,   9.46737735e-01,
         2.93620595e-01,  -9.31365627e-01,  -8.89941645e-01,
         1.48610139e+00,   4.60640904e-02,  -3.72358040e+00,
         2.54812688e-01,  -9.10853898e-01,   3.57351430e-01,
         3.23975217e-02,  -6.49204830e-01,  -9.33969630e-01,
        -8.84082550e-01,   5.24114406e-03,   2.01655740e-01,
        -1.96694890e-02,   1.54695770e+00,  -5.58355628e-01,
         1.44034690e-01,   7.01306575e-01,   9.87334079e-01,
         1.21662282e-01,   5.64814846e-02,   2.60001063e-01,
        -6.66018997e-02,   2.47621530e-01,   1.95726476e-01,
         1.97424487e-01,   1.90317604e-01,   1.93613991e-01,
         1.91538914e-01,   6.86803137e-03,   2.02512408e-01,
         1.81948019e-01,   1.95794447e-01,   1.92873981e-01,
        -5.32517197e-02,  -7.62636618e-02,  -2.51344175e-02,
         1.90055003e-01,   1.93180771e-01,  -5.19309049e-02,
         7.09684053e-02,

In [107]:
# Map every model name to its optimised weight (position i of res.x).
best_dict = {name: res.x[pos] for pos, name in enumerate(model_names)}

In [108]:
# Report the individual model scores and the score of the optimised blend.
test_blending(cv, best_dict)

RMSE for  slope_one_rescaled_floor  :  1.15161837872
RMSE for  mf_rr_round  :  1.06745574065
RMSE for  knn_ib_ceil  :  1.14748801189
RMSE for  knn_ib  :  0.990312758998
RMSE for  movie_median_deviation_user  :  1.0722079529
RMSE for  baseline_rescaled  :  1.00039580028
RMSE for  user_mean_round  :  1.11853771201
RMSE for  movie_mean_deviation_user_floor  :  1.13645873656
RMSE for  knn_ib_rescaled_ceil  :  1.15125142814
RMSE for  svd_floor  :  1.15605369052
RMSE for  mf_rr_ceil  :  1.11296041146
RMSE for  knn_ib_rescaled  :  0.99043827713
RMSE for  movie_mean_deviation_user_ceil  :  1.16440383778
RMSE for  als_rescaled_floor  :  1.18647554649
RMSE for  movie_mean_deviation_user_rescaled_ceil  :  1.20419331534
RMSE for  mf_sgd_ceil  :  1.15688643073
RMSE for  slope_one_rescaled_round  :  1.04216644865
RMSE for  movie_mean_rescaled_floor  :  1.15598296989
RMSE for  mf_sgd_rescaled_ceil  :  1.15621175699
RMSE for  movie_mean_deviation_user_rescaled  :  1.04494088282
RMSE for  mf_rr  :  1.0

In [109]:
# Sum of the blend weights. The minimize call above sets no constraints,
# so nothing forces this to equal 1.
sum(res.x)

0.25482250644021026