In [66]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
from pyspark.sql.functions import col
import math
import random
import itertools
import copy
from joblib import Parallel, delayed
import multiprocessing
import pickle

from cross_validation import KFoldIndexes,CrossValidation
from cross_validation_blending import CrossValidationBlending
from models.als import predictions_ALS
from models.means import *
from models.medians import *
from models.helpers import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100
# NOTE(review): `sc` is never defined in this notebook -- presumably the SparkContext
# injected by the PySpark kernel; confirm before running on a plain Python kernel.
sc.setCheckpointDir('./checkpoint/')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Load the ratings table. `load_csv` is not defined in this notebook; presumably it
# comes from one of the `models.*` star imports -- TODO confirm.
train = load_csv()
# Preview the first rows (columns shown in the output: User, Movie, Rating).
train.head()

Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [74]:
# Model registries. Every entry maps a model name to a spec dict with three keys:
#   'compute_predictions' -> forwarded to blending.add_params_for_model(...)
#   'function'            -> the callable registered through blending.add_model(...)
#   'params'              -> the parameter dict forwarded alongside it

# Plain per-user / per-movie baselines (no parameters).
models = {
    name: {'compute_predictions': True, 'function': function, 'params': {}}
    for name, function in [
        ('user_mean', user_mean),
        ('movie_mean', movie_mean),
        ('user_median', user_median),
        ('movie_median', movie_median),
    ]
}

# Baselines named "<movie statistic>_deviation_user" (no parameters).
models2 = {
    name: {'compute_predictions': True, 'function': function, 'params': {}}
    for name, function in [
        ('movie_mean_deviation_user', movie_mean_deviation_user),
        ('movie_median_deviation_user', movie_median_deviation_user),
    ]
}

# ALS model with its hyper-parameters; `sc` is expected to exist already.
models3 = {
    'als': {
        'compute_predictions': True,
        'function': predictions_ALS,
        'params': {
            'spark_context': sc,
            'rank': 8,
            'lambda_': 0.081,
            'iterations': 24,
            'nonnegative': True
        }
    }
}

# Alternative kept for reference: models = dict(models, **models2)

# The registry used from here on holds only models2 + models3; the four plain
# baselines defined first are discarded by this rebinding.
# NOTE(review): the saved outputs further down list seven models, so they were
# presumably produced with a different `models` than the one built here -- confirm.
models = dict(models2)
models.update(models3)

In [62]:
def prepare_blending(data, k_folds, models):
    """Create a CrossValidationBlending and register every model on it.

    Args:
        data: forwarded unchanged as the first argument of CrossValidationBlending.
        k_folds: forwarded unchanged as the second argument of CrossValidationBlending.
        models: dict mapping a model name to a spec dict; only the spec's
            'function' entry is read here.

    Returns:
        The CrossValidationBlending instance after one add_model(function, name)
        call per entry of `models`.
    """
    cv_blending = CrossValidationBlending(data, k_folds)
    for name, spec in models.items():
        cv_blending.add_model(spec['function'], name)
    return cv_blending

def add_param_blending(blending, models):
    """Forward each model's parameters to the blending object.

    Args:
        blending: object exposing
            add_params_for_model(name, params, compute_predictions=...).
        models: dict mapping a model name to a spec dict; the 'params' and
            'compute_predictions' entries of each spec are read.

    Returns:
        The same `blending` object that was passed in.
    """
    for name, spec in models.items():
        blending.add_params_for_model(
            name,
            spec['params'],
            compute_predictions=spec['compute_predictions'],
        )
    return blending

def _weight_grid(n_parts, total):
    """Yield every tuple of `n_parts` non-negative integers that sums to `total`."""
    if n_parts == 1:
        yield (total,)
        return
    for head in range(total + 1):
        for tail in _weight_grid(n_parts - 1, total - head):
            yield (head,) + tail


def cross_validate(blending, models, resolution=10):
    """Grid-search the blending weights that give the lowest blending score.

    Every assignment of weights taken from {0, 1/resolution, ..., 1} that sums
    to exactly 1 is scored with `blending.evaluate_blending`, and the assignment
    with the smallest score is kept (the first one wins on ties).

    Candidates are enumerated as integer multiples of 1/resolution. The
    "sums to 1" constraint is therefore checked in exact integer arithmetic,
    so no candidate is lost to floating-point rounding and every step of the
    grid (including 0.2) is covered.

    Args:
        blending: object exposing evaluate_blending(weights_dict) -> score;
            the rest of this notebook reports that score as an RMSE.
        models: dict whose keys are the model names to weight.
        resolution: number of equal steps between 0 and 1 (default 10, i.e.
            weights move in increments of 0.1). Must be a positive integer.

    Returns:
        dict mapping each model name to its best weight, or {} when `models`
        is empty or no candidate produced a comparable score.

    Raises:
        ValueError: if `resolution` is smaller than 1.
    """
    if resolution < 1:
        raise ValueError("resolution must be a positive integer")

    keys = list(models.keys())
    if not keys:
        return {}

    candidates = list(_weight_grid(len(keys), resolution))
    total = len(candidates)

    best_dict = {}
    min_rmse = float('inf')
    reported = 0  # highest decile (1..10) already printed
    for done, units in enumerate(candidates, start=1):
        # A fresh dict per candidate, so the stored best is never mutated later.
        dict_try = {key: unit / resolution for key, unit in zip(keys, units)}

        result = blending.evaluate_blending(dict_try)
        if result < min_rmse:
            min_rmse = result
            best_dict = dict_try

        # Integer decile tracking: no modulo by zero on tiny grids, and the
        # final candidate always reports 100%.
        decile = done * 10 // total
        if decile > reported:
            reported = decile
            print("%i%% done!" % (decile * 10))

    return best_dict

def test_blending(blending, best_dict):
    dict_try = {}
    for key in models.keys():
        dict_try[key] = 0
        
    for key in models.keys():
        dict_test = dict_try.copy()
        dict_test[key] = 1
        rmse = blending.evaluate_blending(dict_test)
        print("RMSE for model %s: %.3f"%(key, rmse))
        
    print()
    rmse = blending.evaluate_blending(best_dict)
    print("Best blending: %s"%best_dict)
    print("RMSE best blending: %.3f"%rmse)
    
def prediction(train, blending, best_dict, output, sample_path='../data/sampleSubmission.csv'):
    """Predict ratings for the sample-submission rows and write them to `output`.

    The sample submission is read from `sample_path`. Each `Id` is split on '_'
    and the first character of each half is dropped to obtain the integer user
    and movie ids (the code assumes ids of the form '<c><user>_<c><movie>').
    The resulting User/Movie/Rating frame is handed to
    `blending.evaluate_blending_for_validation`, and its result replaces the
    `Prediction` column before the file is written.

    Args:
        train: training data, forwarded unchanged to the blending object.
        blending: object exposing
            evaluate_blending_for_validation(weights, train, test) -> predictions.
        best_dict: blending weights, forwarded unchanged.
        output: path of the CSV file to write (written without the index).
        sample_path: path of the sample submission CSV; it must contain the
            columns 'Id' and 'Prediction'. Defaults to the location the
            notebook has always used.

    Returns:
        None; the result is written to `output`.
    """
    submission = pd.read_csv(sample_path)

    ids = submission['Id']
    # Built as a separate frame so the submission is never polluted with helper
    # columns (the previous version aliased the two frames and had to drop
    # User/Movie/Rating again before saving).
    test_prep = pd.DataFrame(
        {
            'User': ids.apply(lambda x: int(x.split('_')[0][1:])),
            'Movie': ids.apply(lambda x: int(x.split('_')[1][1:])),
            'Rating': submission['Prediction'],
        },
        columns=['User', 'Movie', 'Rating'],
    )

    pred = blending.evaluate_blending_for_validation(best_dict, train, test_prep)

    # Explicit column assignment instead of attribute-style `frame.Rating = ...`,
    # which only hits a column when that column already exists.
    submission['Prediction'] = pred
    submission.to_csv(output, index=False)
    
    

In [63]:
# Build the blending helper over `train` with k_folds=10 and register every model in `models`.
blending = prepare_blending(train, 10, models)

In [65]:
# Attach each model's 'params' / 'compute_predictions' settings. The log below shows every
# model being applied repeatedly (presumably once per fold -- confirm).
blending = add_param_blending(blending, models)

[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[USER_MEAN] applying
[USER_MEAN] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[MOVIE_MEAN_DEVIATION_USER] applying
[MOVIE_MEAN_DEVIATION_USER] done
[MOVIE_MEAN_DEVIATION_USER] applying
[MOVIE_MEAN_DEVIATION_USER] done
[MOVIE_MEAN_DEVIATION_USER] applying
[MOVIE_MEAN_DEVIATION_USER] done
[MOVIE_MEAN_DEVIATION_USER] applying
[MOVIE_MEAN_DEVIATION_USER] done
[MOVIE_MEAN_DEVIATION_USER] applying
[MOVIE_MEAN_DEVIATION_USER] done
[MOVIE_MEA

In [71]:
# Search the weight grid and keep the weight dict with the lowest evaluate_blending score.
best_dict = cross_validate(blending, models)

10% done!
20% done!
30% done!
40% done!
50% done!
60% done!
70% done!
80% done!
90% done!


In [72]:
# Print the RMSE of each model on its own, then of the best blend found above.
test_blending(blending, best_dict)

RMSE for model user_mean: 1.095
RMSE for model als: 0.986
RMSE for model movie_mean_deviation_user: 0.997
RMSE for model movie_median: 1.099
RMSE for model movie_mean: 1.030
RMSE for model movie_median_deviation_user: 1.072
RMSE for model user_median: 1.150

Best blending: {'user_mean': 0, 'als': 0.8, 'movie_mean_deviation_user': 0.1, 'movie_median': 0, 'movie_mean': 0, 'movie_median_deviation_user': 0.1, 'user_median': 0}
RMSE best blending: 0.983


In [73]:
# Predict the sample-submission rows with the best weights and write them to pred_blending.csv.
prediction(train, blending, best_dict, 'pred_blending.csv')

[ALS] applying
[ALS] done
[MOVIE_MEAN_DEVIATION_USER] applying
[MOVIE_MEAN_DEVIATION_USER] done
[MOVIE_MEDIAN_DEVIATION_USER] applying
[MOVIE_MEDIAN_DEVIATION_USER] done
