# Example usage of cross-validation when blending (mixing) models
Note: this is a work in progress and is not well implemented yet.


In [1]:
# Display the SparkContext. `sc` is not created in this notebook, so it is presumably provided by the PySpark kernel.
sc

<pyspark.context.SparkContext at 0x7f5e903f3208>

# Import

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
import math

%matplotlib inline
%load_ext autoreload
%autoreload 2

pd.options.display.max_columns = 100


In [3]:
from pyspark.sql.functions import col

In [4]:
from cross_validation import KFoldIndexes,CrossValidation

In [8]:
from cross_validation_blending import CrossValidationBlending

In [5]:
from als import predictions_ALS
from means import user_mean,global_mean,movie_mean

In [6]:
import helpers

In [7]:
import random


# Dataframe creation

In [9]:
# Load the ratings through the project helper and preview the first rows.
# The preview below shows the columns User, Movie and Rating.
train = helpers.load_csv()
train.head()

Unnamed: 0,User,Movie,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


# Test with predictions_ALS function
Example of how to call a model function; every model function should be called in the same way.

In [None]:
# Randomly split the rows of `train` into a training part and a held-out part.
# The number of rows is read from `train` instead of being hard-coded, so the
# split stays valid if the input file changes.
n_rows = len(train)
n_train_rows = 1000000  # size of the random training sample

# random.sample raises ValueError if the dataset has fewer than n_train_rows rows.
train_index = sorted(random.sample(range(n_rows), n_train_rows))

# Every row that was not sampled for training goes to the held-out part.
# NOTE(review): these integers are later used as labels with .loc, which assumes
# `train` has the default 0..n-1 index - confirm.
test_index = sorted(set(range(n_rows)) - set(train_index))

In [None]:
# Select the rows of each part; .loc looks the sampled integers up as index labels.
training=train.loc[train_index]
testing=train.loc[test_index]

In [None]:
# Run the ALS model function on the split above and preview its predictions.
# Call pattern shared by the model functions: train data, test data, then model-specific parameters.
predictions_ALS(training,testing,sc,rank=8,lambda_=0.081, iterations=24, nonnegative=True).head()

# Cross Validation Blending class
To save computation, the predictions of each model are computed once (when its parameters are added) and then reused for every blend that is evaluated.

In [10]:
# Create the blending cross validator on the full dataset with 4 splits.
blending_test=CrossValidationBlending(train,4)

In [11]:
# Register the ALS model function twice, under two different names.
for als_name in ('als', 'als2'):
    blending_test.add_model(predictions_ALS, als_name)

In [None]:
# Register the global-mean model function under the name 'global_mean'.
blending_test.add_model(global_mean,'global_mean')

In [None]:
# Register the user-mean model function under the name 'user_mean'.
blending_test.add_model(user_mean,'user_mean')

In [None]:
# Register the movie-mean model function under the name 'movie_mean'.
blending_test.add_model(movie_mean,'movie_mean')

In [None]:
# Give both ALS models their parameters. Adding the parameters also triggers the
# (slow) computation of the cross validated predictions of each model.
for als_name in ('als', 'als2'):
    blending_test.add_params_for_model(als_name, {'spark_context': sc, 'rank': 4})

[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying
[ALS] done
[ALS] applying


In [None]:
# No extra parameters for 'user_mean'; the call still triggers the computation of its predictions.
blending_test.add_params_for_model('user_mean',{})

In [None]:
# No extra parameters for 'movie_mean'; the call still triggers the computation of its predictions.
blending_test.add_params_for_model('movie_mean',{})

In [None]:
# No extra parameters for 'global_mean'; the call still triggers the computation of its predictions.
blending_test.add_params_for_model('global_mean',{})

In [None]:
# Evaluate a blend whose only non-zero weight is on 'movie_mean' (weight 1, every other model 0).
blending_test.evaluate_blending({'als':0,'als2':0,'global_mean':0,'user_mean':0,'movie_mean':1})

In [None]:
# Evaluate a blend of the three mean models whose only non-zero weight is on 'global_mean'.
blending_test.evaluate_blending({'movie_mean':0,'user_mean':0,'global_mean':1})

In [None]:
class CrossValidationBlending:
    ''' Class to cross validate a blended model. The different submodels should have a predefined set of parameters.
    No cross validation is done on the submodel parameters.

    The predictions of every submodel are computed once per fold (when its parameters are added) and cached,
    so that many different blending weights can be evaluated without re-running the submodels.

    Usage (blend of 2 models ALS and movie mean):
    blending_test=CrossValidationBlending(full_dataset,k_splits)
    blending_test.add_model(predictions_ALS,'als')
    blending_test.add_model(movie_mean,'movie_mean')
    blending_test.add_params_for_model('als',{'spark_context':sc,'rank':4})
    blending_test.add_params_for_model('movie_mean',{})
    blending_test.evaluate_blending({'movie_mean':0.5,'als':0.5})
    '''

    def __init__(self,data,k):
        ''' Initialization function. It creates self.tests_list, the list of all the test dataframe

        @ params
            - data, the input dataframe
            - k, the number of splits in the cross validation
        '''

        # Per-instance state
        self.models={} # Dict to store all the model functions
        self.params={} # Dict to store the params with which running each model
        self.predictions={} # Dict to store the predictions for each model with the given parameters
        self.real_values=[] # List to store the real values for each chunk
        self.blended_predictions=[] # List with the blended predictions of the last evaluated blend, one per chunk

        # Initialize the parameters
        self.k=k

        # Always defined, so that an instance built with k<=1 fails later with a clear
        # error instead of an AttributeError on a missing attribute
        self.tests_list=[]

        # Initialize the k_fold_indexes
        k_fold_indexes=KFoldIndexes(k,data.shape[0])

        if k>1:
            self.tests_list=self.get_tests_database(data,k_fold_indexes)

    def add_model(self,function, name):
        ''' Function to add a model to the evaluation

        Requirements for 'function'
        - First 2 parameters train and test database
        - Other optional parameters to be specified using the function add_params_for_model
        - It returns the prediction dataset, with columns named User, Movie, Rating, sorted by Movie and then User.

        @ params
            - function, function that, given train, test and possibly other parameters returns the predictions computed on the test dataset.
            - name, string representing the model name
        '''
        self.models[name]=function

    def add_params_for_model(self,model_name,params_dict):
        ''' Function to add optional parameters to a model

        The function given as input in add_model() may require other parameters apart from train and test.

        WARNING - IT MAY BE SLOW - It computes the prediction for the model with these parameters

        @ params
            - model_name, string - the same passed in add_model
            - params_dict, dictionary containing the optional parameters to be passed to the model function
        '''
        if model_name not in self.models:
            print('Warning: Adding parameters for a non-existing model')
        self.params[model_name]=params_dict
        self.compute_predictions(model_name)

    def compute_predictions(self,model):
        ''' Function that computes the predictions for a given model

        DO NOT CALL IT DIRECTLY - It is automatically called when adding the model parameters.

        The predictions are stored in predictions[model].
        It is a list with k_splits elements, each one is a Pandas dataframe containing the prediction for the corresponding test dataframe.
        Element i is the prediction for the held-out chunk k-1-i (the held-out chunk goes from the last one to the first one).

        Nothing is done if the model was never added with add_model().

        @ params
            - model, string that represents the model for which computing the predictions

        @ raises
            - ValueError, if the object was built with less than 2 splits
        '''
        if model not in self.models:
            return
        if self.k<2:
            raise ValueError('At least 2 splits are required to compute cross validated predictions')

        function=self.models[model]
        arguments=self.params.get(model)
        if arguments is None:
            # Only reachable when called directly: fall back to no optional parameters
            print('Arguments not available for model',model)
            arguments={}

        real_values=[]
        predictions=[]
        for test_index in reversed(range(self.k)):
            # Train on every chunk but the held-out one, kept in their original order
            train=pd.concat([self.tests_list[i] for i in range(self.k) if i!=test_index])
            test=self.tests_list[test_index]

            real_values.append(test.Rating)
            predictions.append(function(train,test,**arguments))

        # Store the results only once every chunk succeeded: a model function that raises
        # must not leave a truncated self.real_values behind, it is shared by all the models
        self.real_values=real_values
        self.predictions[model]=predictions

    def evaluate_blending(self,blending_dict):
        ''' Evaluate a particular blended model

        The blended prediction is the weighted sum of the cached predictions of the submodels.

        @ params
            - blending_dict, dictionary in the form {'movie_mean':0,'user_mean':0,'global_mean':1}

        @ returns
            - the RMSE between the blended predictions and the real ratings, computed over all the chunks

        @ raises
            - ValueError, if blending_dict is empty or names a model without computed predictions
        '''
        if not blending_dict:
            raise ValueError('blending_dict must contain at least one model')

        missing=[name for name in blending_dict if name not in self.predictions]
        if missing:
            raise ValueError('Predictions not available for model(s): '+', '.join(str(name) for name in missing))

        self.blended_predictions=[]
        for i in range(self.k):
            prediction=None
            for model_name,weight in blending_dict.items():
                # Work on floats: accumulating a float term onto an integer first term
                # would otherwise fail (or truncate) because of the dtype of the first array
                weighted=weight*np.asarray(self.predictions[model_name][i].Rating,dtype=float)
                prediction=weighted if prediction is None else prediction+weighted

            self.blended_predictions.append(prediction)

        predictions_conc=np.concatenate(self.blended_predictions)
        real_values_conc=np.concatenate(self.real_values)
        rmse=np.sqrt(np.mean((predictions_conc-real_values_conc)**2))
        return rmse

    def delete_model(self,model_name):
        ''' Function to permanently delete a model and all its data

        A message is printed for each kind of data (function, params, predictions) that was not stored for the model.

        @ params
            - model_name, string - the same passed in add_model
        '''
        stores=(
            (self.models,'Model not saved'),
            (self.params,'Model params not presents'),
            (self.predictions,'Model predictions not presents'),
        )
        for store,message in stores:
            if model_name in store:
                del store[model_name]
            else:
                print(message)

    def get_tests_database(self,data,k_fold_indexes):
        '''Internal function to get the list of test pandas dataframe

        @ params
            - data, the input dataframe
            - k_fold_indexes, object whose attribute indexes is iterated; the element [1] of each item is
              used to select rows of data with .loc (presumably the test indexes of one split - TODO confirm)
        '''
        return [data.loc[fold[1]] for fold in k_fold_indexes.indexes]

# Usage of simple Cross Validation on a single model
Old part - it will be updated

# Models

In [None]:
class ALSModel:
    ''' Small wrapper around the pyspark.mllib ALS recommender with a fit / predict / evaluate interface. '''

    def __init__(self):
        ''' Nothing to set up: the attributes are created by fit() and predict(). '''
        pass

    def fit(self,data,**arg):
        ''' Train the ALS model and keep it in self.model

        @ params
            - data, the training data, passed as is to ALS.train
            - arg, keyword arguments forwarded unchanged to ALS.train
        '''
        trained=ALS.train(data, **arg)
        self.model = trained

    def predict(self,data):
        ''' Predict the ratings and keep them in self.predictions

        The predictions are stored keyed by the pair made of the first two fields of each prediction,
        with the third field as value.

        @ params
            - data, the rows to predict; only the first two fields of each row are used
        '''
        pairs=data.map(lambda row: (row[0], row[1]))
        scored=self.model.predictAll(pairs)
        self.predictions = scored.map(lambda r: ((r[0], r[1]), r[2]))

    def evaluate(self,data):
        ''' Compute the RMSE of the predictions stored by predict()

        @ params
            - data, rows whose first two fields form the key (cast to int) and whose third field is the real rating (cast to float)

        @ returns
            - the square root of the mean squared difference between real and predicted ratings, over the joined keys
        '''
        keyed_ratings=data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
        rates_and_preds=keyed_ratings.join(self.predictions)
        squared_errors=rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2)
        return math.sqrt(squared_errors.mean())

# Cross validation

In [None]:
# Cross validator for single models, built on the full dataset with the SparkContext.
# NOTE(review): 4 is presumably the number of splits; the meaning of the True flag is not visible here - confirm in cross_validation.py.
cv=CrossValidation(train,4,True,sc)

In [None]:
# Hyper-parameter grid explored by the grid search
ranks = [8]
lambdas = [step / 1000.0 for step in range(80, 91)]  # 0.080, 0.081, ..., 0.090
numIters = [24]
nbr_models = len(ranks) * len(lambdas) * len(numIters)

# Best configuration found so far (placeholders until a model is evaluated)
bestModel = None
bestValidationRmse = float("inf")
bestRank, bestLambda, bestNumIter = 0, -1.0, -1

In [None]:
# Display the current best lambda (-1.0 until the grid search below has run).
bestLambda

In [None]:
# Grid search: cross validate one ALS model per (rank, lambda, numIter) combination
# and remember the combination with the lowest mean validation RMSE.
i = 0
for i, (rank, lmbda, numIter) in enumerate(itertools.product(ranks, lambdas, numIters), start=1):
    print(rank,lmbda,numIter)
    try:
        validationRmse = cv.evaluate(ALSModel(),rank=rank,lambda_=lmbda, iterations=numIter, nonnegative=True)
        validationRmse = np.mean(validationRmse)
    except Exception as error:
        # Best effort: a failing combination must not stop the search, but the reason is reported.
        # Exception (not a bare except) so that interrupting the kernel still stops the loop.
        print("Model %i/%i failed! (%s: %s)" %(i, nbr_models, type(error).__name__, error))
        print("  Parameters: rank = %d, lambda = %.3f, and numIter = %d." % (rank, lmbda, numIter))
        continue

    print("Model %i/%i: RMSE (validation) = %f" %(i, nbr_models, validationRmse))
    # %.3f: the lambdas of the grid only differ at the third decimal
    print("  Trained with rank = %d, lambda = %.3f, and numIter = %d." % (rank, lmbda, numIter))
    print("")
    if validationRmse < bestValidationRmse:
        # NOTE: the fitted model itself is not kept (bestModel stays unchanged), only its parameters
        bestValidationRmse = validationRmse
        bestRank = rank
        bestLambda = lmbda
        bestNumIter = numIter

# Report the best configuration; the RMSE is the cross validation one, not a training set RMSE
if bestValidationRmse == float("inf"):
    print("No model could be evaluated.")
else:
    print("The best model was trained with rank = %d and lambda = %.3f, " % (bestRank, bestLambda) \
      + "and numIter = %d, and its cross validation RMSE is %f" % (bestNumIter, bestValidationRmse))

# # Evaluate the best model on the test set
# testRmse = computeRMSE(bestModel, test_for_predict_RDD, test_RDD)
# print("RMSE on the test set: %f"%(testRmse))


# Prepare data for submission

In [None]:
# Load the sample submission file (the rows to predict) and preview it.
test = pd.read_csv('../data/sampleSubmission.csv')
test.head()

In [None]:
# Prepare test for RDD
test_prep = test
test_prep['UserID'] = test_prep['Id'].apply(lambda x: int(x.split('_')[0][1:]))
test_prep['MovieID'] = test_prep['Id'].apply(lambda x: int(x.split('_')[1][1:]))
test_prep['Rating'] = test_prep['Prediction']
test_prep = test_prep.drop(['Prediction', 'Id'], axis=1)
test_prep.head()

In [None]:
# First, we transform it using sqlContect
test_sql = sqlContext.createDataFrame(test_prep)
test_rdd = test_sql.rdd
test_rdd.take(3)

In [None]:
# Train the final ALS model with fixed hyper-parameters.
# NOTE(review): train_rdd is not defined in the visible part of this notebook - it has to be created from `train` before running this cell.
bestModel=ALSModel()
bestModel.fit(train_rdd,rank=8,lambda_=0.081, iterations=24, nonnegative=True)

In [None]:
# Predict the submission rows; ALSModel.predict stores its result on the model as `predictions`,
# keyed by the first two fields of each prediction with the predicted rating as value.
bestModel.predict(test_rdd)
predictions=bestModel.predictions
# predictions = bestModel.predictAll(test_RDD_Kaggle).map(lambda r: ((r[0], r[1]), r[2]))

In [None]:
# Peek at the first 3 predictions.
predictions.take(3)

In [None]:
# Bring the predictions back to pandas (through a Spark DataFrame).
pred_df = predictions.toDF().toPandas()

In [None]:
# Unpack the nested key column '_1' into UserID / MovieID and rename the value column '_2' to Rating.
key_column = pred_df['_1']
pred_df['UserID'] = key_column.apply(lambda key: key['_1'])
pred_df['MovieID'] = key_column.apply(lambda key: key['_2'])
pred_df['Rating'] = pred_df['_2']
# Only the three named columns are kept.
pred_df = pred_df.drop(['_1', '_2'], axis=1)
pred_df.head()

In [None]:
# Order the predictions by movie, then by user.
pred_df = pred_df.sort_values(by=['MovieID', 'UserID'])
pred_df.head()

In [None]:
# Renumber the rows 0..n-1 in the sorted order, so that the index reflects the row position.
pred_df.index = range(len(pred_df))

In [None]:
# Copy the predicted ratings into the submission table; the assignment is aligned on the index.
# NOTE(review): this assumes that test is ordered by movie then user like pred_df, and that
# every row of test received a prediction - confirm.
test['Prediction'] = pred_df['Rating']


In [None]:
# Preview the submission table with the predictions filled in.
test.head()

In [None]:
# Remove the helper columns that were added to test while it was aliased as test_prep.
test = test.drop(['UserID', 'MovieID', 'Rating'], axis=1)


In [None]:
# Write the submission file, without the index column.
test.to_csv('pred_pyspark_als.csv', index=False)