# Cross validation of the ALS method using Spark
We refer to this tutorial: http://spark.apache.org/docs/latest/ml-tuning.html#example-model-selection-via-cross-validation

The tutorial was not clear: we used SKLearn to split the dataset

# Import

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
import math

%matplotlib inline
%load_ext autoreload
%autoreload 2

pd.options.display.max_columns = 100


In [2]:
from pyspark.sql.functions import col

In [3]:
from helpers_cross_validation import KFoldIndexes,CrossValidation

In [4]:
import random


# Models

In [5]:
class ALSModel:
    
    def __init__(self):
        pass
    
    def fit(self,data,**arg):
        self.model = ALS.train(data, **arg)
    
    def predict(self,data):
        data_for_preditions=data.map(lambda x: (x[0], x[1]))
        self.predictions = self.model.predictAll(data_for_preditions).map(lambda r: ((r[0], r[1]), r[2]))
    
    def evaluate(self,data):
        rates_and_preds = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(self.predictions)
        error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        return error

# Dataframe creation

In [6]:
train = pd.read_csv('../data/data_train.csv')


In [7]:
train['UserID'] = train['Id'].apply(lambda x: int(x.split('_')[0][1:]))
train['MovieID'] = train['Id'].apply(lambda x: int(x.split('_')[1][1:]))
train['Rating'] = train['Prediction']
train = train.drop(['Id', 'Prediction'], axis=1)

In [8]:
train.shape[0]

1176952

# Cross validation

In [9]:
cv=CrossValidation(train,4,True,sc)

In [24]:
ranks = [7,8,9]
lambdas = [0.08,0.085,0.09,0.095,0.1]
numIters = [20]
nbr_models = len(ranks)*len(lambdas)*len(numIters)

bestModel = None
bestValidationRmse = float("inf")
bestRank = 0
bestLambda = -1.0
bestNumIter = -1

In [25]:
i = 0
for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
    try:
        print(rank,lmbda,numIter)
        validationRmse = cv.evaluate(ALSModel(),rank=rank,lambda_=lmbda, iterations=numIter)
        validationRmse = np.mean(validationRmse)
        print("Model %i/%i: RMSE (validation) = %f" %(i+1, nbr_models, validationRmse))
        print("  Trained with rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter))
        print("")
        if (validationRmse < bestValidationRmse):
#             bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    except:
        print("Model %i/%i failed!" %(i+1, nbr_models))
        print("  Parameters: rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter))

    i += 1
    
# Evaluate the best model on the training set
print("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
  + "and numIter = %d, and its RMSE on the training set is %f" % (bestNumIter, bestValidationRmse))

# # Evaluate the best model on the test set
# testRmse = computeRMSE(bestModel, test_for_predict_RDD, test_RDD)
# print("RMSE on the test set: %f"%(testRmse))


7 0.08 20
Model 1/15: RMSE (validation) = 0.991639
  Trained with rank = 7, lambda = 0.1, and numIter = 20.

7 0.085 20
Model 2/15: RMSE (validation) = 0.991673
  Trained with rank = 7, lambda = 0.1, and numIter = 20.

7 0.09 20
Model 3/15: RMSE (validation) = 0.991241
  Trained with rank = 7, lambda = 0.1, and numIter = 20.

7 0.095 20
Model 4/15: RMSE (validation) = 0.991421
  Trained with rank = 7, lambda = 0.1, and numIter = 20.

7 0.1 20
Model 5/15: RMSE (validation) = 0.992471
  Trained with rank = 7, lambda = 0.1, and numIter = 20.

8 0.08 20
Model 6/15: RMSE (validation) = 0.991252
  Trained with rank = 8, lambda = 0.1, and numIter = 20.

8 0.085 20
Model 7/15: RMSE (validation) = 0.991237
  Trained with rank = 8, lambda = 0.1, and numIter = 20.

8 0.09 20
Model 8/15: RMSE (validation) = 0.991082
  Trained with rank = 8, lambda = 0.1, and numIter = 20.

8 0.095 20
Model 9/15: RMSE (validation) = 0.991452
  Trained with rank = 8, lambda = 0.1, and numIter = 20.

8 0.1 20
Model 1

# Others

In [None]:
tests=get_tests_database(train,k_fold_indexes)

In [None]:
tests_sql=get_sql_from_pd(tests)

In [None]:
type(tests[0])

In [None]:
sc.union(tests_sql)

In [None]:
tr=tests_sql[0].rdd.union(tests_sql[1].rdd).union(tests_sql[2].rdd)

In [None]:
ts=tests_sql[3].rdd

In [None]:
model = ALS.train(tr, 2, 10, 0.1)

In [None]:
validation_for_predict_RDD = ts.map(lambda x: (x[0], x[1]))

In [None]:
predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))

In [None]:
rates_and_preds = ts.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)

In [None]:
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

In [None]:
error

In [None]:
tests_sql

In [None]:
train_sql = sqlContext.createDataFrame(train)
train_rdd = train_sql.rdd
train_rdd.take(1)

In [None]:
type(train_sql.map(lambda x: (x[0],x[1],float(x[2]))))