# Cross validation of the ALS method using Spark
We refer to this tutorial: http://spark.apache.org/docs/latest/ml-tuning.html#example-model-selection-via-cross-validation

The tutorial was not clear, so we used scikit-learn (SKLearn) to split the dataset.

# Import

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
import math

%matplotlib inline
%load_ext autoreload
%autoreload 2

pd.options.display.max_columns = 100


In [2]:
from pyspark.sql.functions import col

In [3]:
from helpers_cross_validation import KFoldIndexes,CrossValidation

In [4]:
from als import get_predictions_ALS

In [5]:
import helpers

In [6]:
import random


# Models

In [7]:
class ALSModel:
    """Small fit / predict / evaluate wrapper around ``pyspark.mllib`` ALS.

    The three methods are meant to be called in that order: ``fit`` stores
    the trained model in ``self.model``, ``predict`` stores its output in
    ``self.predictions``, and ``evaluate`` compares those stored predictions
    with known ratings.
    """

    def __init__(self):
        pass

    def fit(self, data, **arg):
        """Train an ALS model on ``data`` and keep it in ``self.model``.

        Args:
            data: ratings passed straight to ``ALS.train`` (presumably an RDD
                of ``(user, movie, rating)`` records -- confirm with caller).
            **arg: keyword arguments forwarded unchanged to ``ALS.train``
                (e.g. ``rank``, ``lambda_``, ``iterations``, ``nonnegative``).
        """
        self.model = ALS.train(data, **arg)

    def predict(self, data):
        """Predict a rating for every (user, movie) pair found in ``data``.

        Only the first two fields of each record are used. The result is
        stored in ``self.predictions`` as ``((user, movie), rating)`` records
        and is also returned for convenience.

        Raises:
            AttributeError: if ``fit`` has not been called yet.
        """
        if not hasattr(self, "model"):
            raise AttributeError(
                "ALSModel has no trained model: call fit() before predict()"
            )
        user_movie_pairs = data.map(lambda x: (x[0], x[1]))
        self.predictions = self.model.predictAll(user_movie_pairs).map(
            lambda r: ((r[0], r[1]), r[2])
        )
        return self.predictions

    def evaluate(self, data):
        """Return the RMSE between the ratings in ``data`` and ``self.predictions``.

        Records are keyed by ``(int(user), int(movie))`` and joined with the
        stored predictions, so only pairs present on both sides contribute
        to the error.

        Raises:
            AttributeError: if ``predict`` has not been called yet.
        """
        if not hasattr(self, "predictions"):
            raise AttributeError(
                "ALSModel has no predictions: call predict() before evaluate()"
            )
        rates_and_preds = data.map(
            lambda r: ((int(r[0]), int(r[1])), float(r[2]))
        ).join(self.predictions)
        squared_errors = rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2)
        return math.sqrt(squared_errors.mean())

# Dataframe creation

In [8]:
# Load the ratings table through the project helper and preview its first rows.
train = helpers.load_csv()
train.head()

Unnamed: 0,UserID,MovieID,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [None]:
# First, convert the training table to an RDD using sqlContext, then peek at three records
train_rdd = helpers.to_rdd(sqlContext, train)
train_rdd.take(3)

In [9]:
# Number of rows in the training table
train.shape[0]

1176952

# Cross validation

In [None]:
# Build the cross-validation helper over the training table.
# NOTE(review): the positional arguments 4 and True are presumably the number of
# folds and a shuffle flag, and sc the SparkContext -- confirm against
# helpers_cross_validation.CrossValidation.
cv=CrossValidation(train,4,True,sc)

In [None]:
# Hyper-parameter grid: a single rank and iteration count, with the
# regularisation strength swept from 0.080 to 0.090 in steps of 0.001.
ranks = [8]
lambdas = [thousandths / 1000.0 for thousandths in range(80, 91)]
numIters = [24]
nbr_models = len(ranks) * len(lambdas) * len(numIters)

# Best-so-far bookkeeping, initialised to sentinel values.
bestModel, bestValidationRmse = None, float("inf")
bestRank, bestLambda, bestNumIter = 0, -1.0, -1

In [None]:
# Display the lambda currently recorded as best
bestLambda

In [None]:
# Grid search: score every (rank, lambda, numIter) combination by the mean of
# the values returned by cv.evaluate and remember the best combination.
i = 0
for i, (rank, lmbda, numIter) in enumerate(
    itertools.product(ranks, lambdas, numIters), start=1
):
    print(rank, lmbda, numIter)
    try:
        validationRmse = np.mean(
            cv.evaluate(
                ALSModel(),
                rank=rank,
                lambda_=lmbda,
                iterations=numIter,
                nonnegative=True,
            )
        )
    except Exception as exc:
        # Catch Exception rather than everything so that an interrupt still
        # stops the search, and report why this combination failed.
        print("Model %i/%i failed! (%s: %s)" % (i, nbr_models, type(exc).__name__, exc))
        print("  Parameters: rank = %d, lambda = %g, and numIter = %d." % (rank, lmbda, numIter))
        continue

    # %g instead of %.1f: the lambdas differ only in the third decimal, so
    # one decimal place would print all of them as 0.1.
    print("Model %i/%i: RMSE (validation) = %f" % (i, nbr_models, validationRmse))
    print("  Trained with rank = %d, lambda = %g, and numIter = %d." % (rank, lmbda, numIter))
    print("")
    if validationRmse < bestValidationRmse:
        # Only the score and parameters are recorded; no fitted model is
        # captured here, so bestModel is left untouched.
        bestValidationRmse = validationRmse
        bestRank = rank
        bestLambda = lmbda
        bestNumIter = numIter

# Report the best parameter combination and its cross-validated score
if bestValidationRmse == float("inf"):
    print("No model could be evaluated; there are no best parameters to report.")
else:
    print(
        (
            "The best model was trained with rank = %d and lambda = %g, "
            "and numIter = %d, and its cross-validated RMSE is %f"
        )
        % (bestRank, bestLambda, bestNumIter, bestValidationRmse)
    )

# # Evaluate the best model on the test set
# testRmse = computeRMSE(bestModel, test_for_predict_RDD, test_RDD)
# print("RMSE on the test set: %f"%(testRmse))


# Prepare data for submission

In [None]:
# Load the sample submission file and preview its first rows
test = pd.read_csv('../data/sampleSubmission.csv')
test.head()

In [None]:
# Prepare test for RDD
test_prep = test
test_prep['UserID'] = test_prep['Id'].apply(lambda x: int(x.split('_')[0][1:]))
test_prep['MovieID'] = test_prep['Id'].apply(lambda x: int(x.split('_')[1][1:]))
test_prep['Rating'] = test_prep['Prediction']
test_prep = test_prep.drop(['Prediction', 'Id'], axis=1)
test_prep.head()

In [None]:
# First, convert the prepared table to a Spark DataFrame using sqlContext,
# then take its underlying RDD and peek at three records
test_sql = sqlContext.createDataFrame(test_prep)
test_rdd = test_sql.rdd
test_rdd.take(3)

In [None]:
# Train a fresh model on the full training RDD.
# NOTE(review): the hyper-parameters are hard-coded here rather than read from
# bestRank / bestLambda / bestNumIter -- presumably the outcome of an earlier
# grid-search run; confirm before reusing.
bestModel=ALSModel()
bestModel.fit(train_rdd,rank=8,lambda_=0.081, iterations=24, nonnegative=True)

In [None]:
# predict() stores its result on the model, so read it back from the attribute
bestModel.predict(test_rdd)
predictions=bestModel.predictions
# Earlier direct-call variant, kept for reference:
# predictions = bestModel.predictAll(test_RDD_Kaggle).map(lambda r: ((r[0], r[1]), r[2]))

In [None]:
# Peek at the first three prediction records
predictions.take(3)

In [None]:
# Convert the prediction RDD to a Spark DataFrame, then to a pandas DataFrame
pred_df = predictions.toDF().toPandas()

In [None]:
# Column '_1' holds the nested key and '_2' the predicted value; unpack the
# key's two fields into UserID / MovieID, copy the value into Rating, then
# drop the raw columns.
pred_df['UserID'] = pred_df['_1'].apply(lambda x: x['_1'])
pred_df['MovieID'] = pred_df['_1'].apply(lambda x: x['_2'])
pred_df['Rating'] = pred_df['_2']
pred_df = pred_df.drop(['_1', '_2'], axis=1)
pred_df.head()

In [None]:
# Order the predictions by movie first, then by user
pred_df = pred_df.sort_values(by=['MovieID', 'UserID'])
pred_df.head()

In [None]:
# Renumber the rows 0..n-1 so the index follows the sorted order
pred_df.index = range(len(pred_df))

In [None]:
# Copy the predicted ratings into the submission table; pandas matches rows by
# index label here.
# NOTE(review): this assumes `test` lists the same (MovieID, UserID) pairs in
# the same sorted order as pred_df and that every pair received a prediction;
# otherwise values are misaligned or NaN -- confirm.
test['Prediction'] = pred_df['Rating']


In [None]:
# Preview the submission table
test.head()

In [None]:
# Remove the helper columns UserID / MovieID / Rating from the submission table
test = test.drop(['UserID', 'MovieID', 'Rating'], axis=1)


In [None]:
# Write the submission file without the index column
test.to_csv('pred_pyspark_als.csv', index=False)

# Test with get_predictions_ALS function

In [10]:
import random

In [11]:
# Draw 1,000,000 distinct row positions for the training split, in ascending
# order. The population size comes from the table itself instead of a
# hard-coded row count.
# NOTE: the sample is unseeded, so the split differs on every run.
train_index = sorted(random.sample(range(len(train)), 1000000))

In [12]:
# Every row position not drawn for training goes to the test split, in
# ascending order. The population size comes from the table itself instead of
# a hard-coded row count.
test_index = sorted(set(range(len(train))) - set(train_index))

In [13]:
# Select the sampled rows (by index label) as the training split
training=train.loc[train_index]

In [14]:
# Select the remaining rows (by index label) as the test split
testing=train.loc[test_index]

In [16]:
# Run the project's ALS helper on the split, with the same hyper-parameters
# used when fitting bestModel
get_predictions_ALS(training,testing,sc,rank=8,lambda_=0.081, iterations=24, nonnegative=True)

Unnamed: 0,User,Movie,Rating
0,72,1,3.332143
1,333,1,3.294733
2,743,1,3.666692
3,930,1,3.345466
4,981,1,3.361476
5,1107,1,3.119422
6,1407,1,2.994545
7,1435,1,3.148449
8,1549,1,2.984605
9,1802,1,2.918144
