In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
from pyspark.sql.functions import col
import math
import random
import itertools
import copy
from joblib import Parallel, delayed
import multiprocessing
import pickle
import scipy.optimize as sco

from cross_validator import *
from models.als import *
from models.means import *
from models.medians import *
from models.helpers_scipy import *
from models.MF_RR import *
from models.MF_SGD import *
from models.pyfm import *
from models.surprise_models import *
from helpers import *

%matplotlib inline
%reload_ext autoreload
%autoreload 2

pd.options.display.max_columns = 100
sc.setCheckpointDir('./checkpoint/')

In [4]:
df = load_csv()
df.head()

Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [5]:
models = {
    'global_mean': {
        'function': global_mean,
        'params': {}    
    },
    'global_median': {
        'function': global_median,
        'params': {}    
    },     
    'user_mean': {
        'function': user_mean,
        'params': {}
    },
    'user_median': {
        'function': user_median,
        'params': {}
    },    
    'movie_mean': {
        'function': movie_mean,
        'params': {}
    }, 
    'movie_mean_rescaled': {
        'function': movie_mean_rescaled,
        'params': {}
    },  
    'movie_median': {
        'function': movie_median,
        'params': {}
    },    
    'movie_median_rescaled': {
        'function': movie_median_rescaled,
        'params': {}
    },
    'movie_mean_deviation_user': {
        'function': movie_mean_deviation_user,
        'params': {}
    },
    'movie_mean_deviation_user_rescaled': {
        'function': movie_mean_deviation_user_rescaled,
        'params': {}
    },    
    'movie_median_deviation_user': {
        'function': movie_median_deviation_user,
        'params': {}
    },
    'movie_median_deviation_user_rescaled': {
        'function': movie_median_deviation_user_rescaled,
        'params': {}
    },    
    'als': {
        'function': predictions_ALS,
        'params': {
            'spark_context': sc,
            'rank': 8,
            'lambda_': 0.081, 
            'iterations': 24, 
            'nonnegative': True
        }
    },
    'als_rescaled': {
        'function': predictions_ALS_rescaled,
        'params': {
            'spark_context': sc,
            'rank': 8,
            'lambda_': 0.081, 
            'iterations': 24, 
            'nonnegative': True
        }
    },    
    'mf_rr': {
        'function': mf_RR,
        'params': {
            'movie_features':20, 
            'alpha':19
        }  
    },
    'mf_rr_rescaled': {
        'function': mf_RR_rescaled,
        'params': {
            'movie_features':20, 
            'alpha':19
        }  
    },    
    'mf_sgd': {
        'function': mf_SGD,
        'params': {
            'gamma': 0.004,
            'n_features': 20,
            'n_iter': 20,
            'init_method': 'global_mean'
        }
    },  
    'mf_sgd_rescaled': {
        'function': mf_SGD_rescaled,
        'params': {
            'gamma': 0.004,
            'n_features': 20,
            'n_iter': 20,
            'init_method': 'global_mean'
        }
    },    
    'pyfm': {
        'function': pyfm,
        'params': {
            'num_factors': 20,
            'num_iter': 200,
            'init_lr': 0.001,
        }
    },
    'pyfm_rescaled': {
        'function': pyfm_rescaled,
        'params': {
            'num_factors': 20,
            'num_iter': 200,
            'init_lr': 0.001,
        }
    },
    'knn_ib': {
        'function': knn,
        'params': {
            'k': 60,
            'sim_options': {
                'name': 'pearson_baseline',
                'user_based': False
            }
        }
    },
    'knn_ib_rescaled': {
        'function': knn_rescaled,
        'params': {
            'k': 60,
            'sim_options': {
                'name': 'pearson_baseline',
                'user_based': False
            }
        }
    },
    'knn_ub': {
        'function': knn,
        'params': {
            'k': 300,
            'sim_options': {
                'name': 'pearson_baseline',
                'user_based': True
            }
        }
    },
    'knn_ub_rescaled': {
        'function': knn_rescaled,
        'params': {
            'k': 300,
            'sim_options': {
                'name': 'pearson_baseline',
                'user_based': True
            }
        }
    },    
    'svd': {
        'function': svd,
        'params': {
            'n_epochs': 30,
            'lr_all': 0.001,
            'reg_all': 0.001
        }
    },
    'svd_rescaled': {
        'function': svd_rescaled,
        'params': {
            'n_epochs': 30,
            'lr_all': 0.001,
            'reg_all': 0.001
        }
    },
    'slope_one': {
        'function': slope_one,
        'params': {}
    },
    'slope_one_rescaled': {
        'function': slope_one_rescaled,
        'params': {}
    },
    'baseline': {
        'function': baseline,
        'params': {}
    },
    'baseline_rescaled': {
        'function': baseline_rescaled,
        'params': {}
    }    
}

len(models)

30

In [6]:
def eval_(x, cv, models):
    dict_try = {}
    for idx, key in enumerate(models.keys()):
        dict_try[key] = x[idx] 
        
    return cv.evaluate_blending(dict_try)

def test_blending(cv, best_dict):
    cv.evaluation_all_models()
        
    print()
    rmse = cv.evaluate_blending(best_dict)
    print("Best blending: %s"%best_dict)
    print("RMSE best blending: %.5f"%rmse)

In [7]:
cv = CrossValidator()
#cv.shuffle_indices_and_store(df, 5)
a = cv.load_indices()
b = cv.define_ground_truth(df)

In [None]:
for idx, mdl in enumerate(models.keys()):
    if "pyfm" not in mdl:
        print("Predict and store for Model %s (%i/%i)"%(mdl, idx+1, len(models.keys())))
        cv.k_fold_predictions_and_store(df, models[mdl]['function'], mdl, override=False, **models[mdl]['params'])

Predict and store for Model mf_rr (1/30)
Predict and store for Model slope_one_rescaled (2/30)
Predict and store for Model movie_mean_deviation_user (3/30)
Predict and store for Model knn_ib (4/30)
Predict and store for Model movie_median_deviation_user (5/30)
Predict and store for Model movie_median_deviation_user_rescaled (6/30)
Predict and store for Model mf_sgd_rescaled (7/30)
Predict and store for Model movie_median_rescaled (8/30)
Predict and store for Model movie_mean_rescaled (9/30)
Predict and store for Model global_mean (10/30)
Predict and store for Model mf_rr_rescaled (11/30)
Predict and store for Model user_mean (12/30)
Predict and store for Model knn_ib_rescaled (13/30)
Predict and store for Model mf_sgd (14/30)
Predict and store for Model als (15/30)
Predict and store for Model baseline (17/30)
Predict and store for Model movie_median (18/30)
Predict and store for Model movie_mean (19/30)
Predict and store for Model user_median (20/30)
Predict and store for Model knn_ub 

In [19]:
model_names = list(models.keys())
a = cv.load_predictions(model_names)

In [28]:
x0 = 1/len(models)*np.ones(len(models))
bnds = ((0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None), (0, None))
res = sco.minimize(eval_, x0, method='SLSQP', args=(cv, models), options={'maxiter':1000, 'disp':True})

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.980657618105
            Iterations: 19
            Function evaluations: 535
            Gradient evaluations: 19


In [25]:
res.x

array([ 0.09875201,  1.88712089,  0.16996783,  0.91807928,  0.99628493,
        0.04140426, -0.41886259, -0.8516105 , -1.94169478,  0.1488136 ,
       -1.71508448,  0.14825008, -0.67447281,  0.40948983,  0.09875325,
        0.14881726, -0.92986879,  0.03919194, -0.00226493,  0.03621587,
        0.89441847,  0.0471551 ,  0.29754789, -0.09377009,  0.86223095,
        0.3478734 ])

In [29]:
best_dict = {}
for idx, key in enumerate(models.keys()):
    best_dict[key] = res.x[idx]

In [30]:
test_blending(cv, best_dict)

RMSE for  slope_one_rescaled  :  1.00032081157
RMSE for  movie_mean_deviation_user  :  0.996612661275
RMSE for  knn_ib  :  0.990312758998
RMSE for  movie_median_deviation_user  :  1.0722079529
RMSE for  movie_median_deviation_user_rescaled  :  1.06465213764
RMSE for  collab_filt  :  1.02774839578
RMSE for  slope_one  :  1.00010502221
RMSE for  user_median  :  1.15146416557
RMSE for  movie_mean_rescaled  :  1.00562209798
RMSE for  mf_sgd_rescaled  :  0.999931853884
RMSE for  user_mean  :  1.09516860129
RMSE for  knn_ib_rescaled  :  0.99043827713
RMSE for  mf_sgd  :  1.00080284886
RMSE for  als  :  0.9887407042
RMSE for  baseline_rescaled  :  1.00032081157
RMSE for  baseline  :  1.00010502221
RMSE for  movie_median  :  1.09968357705
RMSE for  movie_mean  :  1.03043151776
RMSE for  movie_median_rescaled  :  1.0226743167
RMSE for  svd_rescaled  :  0.998400266617
RMSE for  global_median  :  1.12811994776
RMSE for  collab_filt_rescaled  :  1.02746150517
RMSE for  svd  :  0.99835151007
RMSE f