In [1]:
# Test if PySpark is installed correctly. If it's the case, you should see:
# <pyspark.context.SparkContext at 0x7f0d85ae4240>
sc


<pyspark.context.SparkContext at 0x101710128>

In [2]:
# Import the fucking useful libraries =)
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import itertools
from pyspark.mllib.recommendation import ALS
import math

%matplotlib inline
%load_ext autoreload
%autoreload 2

# There's a lot of columns in the DF. 
# Therefore, we add this option so that we can see more columns
pd.options.display.max_columns = 100

# Load and prepare train data

First, we will load the train data. Then, we create a pandas DF so that it contains three columns:

                        UserID | MovieID | Rating

In [43]:
train = pd.read_csv('../data/data_rescaled.csv')
deviation = pd.read_csv('../data/deviations_per_users.csv')

In [10]:
train['UserID'] = train['Id'].apply(lambda x: int(x.split('_')[0][1:]))
train['MovieID'] = train['Id'].apply(lambda x: int(x.split('_')[1][1:]))
train['Rating'] = train['Prediction']
train = train.drop(['Id', 'Prediction'], axis=1)

In [11]:
train.head()

Unnamed: 0,UserID,MovieID,Rating
0,44,1,3.938425
1,61,1,2.59843
2,67,1,4.25148
3,72,1,2.970702
4,86,1,4.728805


In [12]:
train.shape

(1176952, 3)

In [13]:
deviation.head()

Unnamed: 0,1,0.21135267540047176
0,2,-0.060314
1,3,-0.30887
2,4,-0.032126
3,5,0.152949
4,6,0.534738


Now, we need to transform the pandas DataFrame into a RDD (Resilient Distributed Dataset) for Spark.

In [14]:
# First, we transform it using sqlContect
train_sql = sqlContext.createDataFrame(train)
train_rdd = train_sql.rdd
train_rdd.take(3)

[Row(UserID=44, MovieID=1, Rating=3.938424798067519),
 Row(UserID=61, MovieID=1, Rating=2.598429933295181),
 Row(UserID=67, MovieID=1, Rating=4.251480424178771)]

# Collaborative Filtering

For this Recommender System, we will use a collaborative filtering recommender system. Indeed, we don't have any information about the movies. Therefore, we will use the ratings of other users to guess the rating of a user. 

The Spark MLib provides a Collaborative Filtering implementation by using Alternating Least Squares. We will need the following parameters:

- `numBlocks`: Number of blocks used to parallelize computation (-1 for auto-configure)
- `rank`: Number of latent factors in the model
- `iterations`: Number of iterations
- `lambda`: Regularization parameter in ALS
- `implicitPrefs`: Specify whether to use the explicit feedback ALS variant or one adapted for implicit feedback data
- `alpha`: Parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations


In [15]:
# Split into train, validation and test datasets
training_RDD, validation_RDD, test_RDD = train_rdd.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

## Training phase!

In [16]:
def computeRMSE(model, data, prediction):
    predictions = model.predictAll(data).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = prediction.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    return error

In [17]:
ranks = [2, 4, 6, 8, 10, 12]
lambdas = [0.1, 0.5, 1.0, 5.0, 10.0]
numIters = [5, 10, 20]
nbr_models = len(ranks)*len(lambdas)*len(numIters)

bestModel = None
bestValidationRmse = float("inf")
bestRank = 0
bestLambda = -1.0
bestNumIter = -1

In [18]:
i = 0
for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
    try:
        model = ALS.train(training_RDD, rank, numIter, lmbda)
        validationRmse = computeRMSE(model, validation_for_predict_RDD, validation_RDD)
        print("Model %i/%i: RMSE (validation) = %f" %(i+1, nbr_models, validationRmse))
        print("  Trained with rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter))
        print("")
        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    except:
        print("Model %i/%i failed!" %(i+1, nbr_models))
        print("  Parameters: rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter))

    i += 1
    
# Evaluate the best model on the training set
print("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
  + "and numIter = %d, and its RMSE on the training set is %f" % (bestNumIter, bestValidationRmse))

# Evaluate the best model on the test set
testRmse = computeRMSE(bestModel, test_for_predict_RDD, test_RDD)
print("RMSE on the test set: %f"%(testRmse))

Model 1/90: RMSE (validation) = 1.011860
  Trained with rank = 2, lambda = 0.1, and numIter = 5.

Model 2/90: RMSE (validation) = 1.000885
  Trained with rank = 2, lambda = 0.1, and numIter = 10.

Model 3/90: RMSE (validation) = 1.002129
  Trained with rank = 2, lambda = 0.1, and numIter = 20.

Model 4/90: RMSE (validation) = 1.143835
  Trained with rank = 2, lambda = 0.5, and numIter = 5.

Model 5/90: RMSE (validation) = 1.118615
  Trained with rank = 2, lambda = 0.5, and numIter = 10.

Model 6/90: RMSE (validation) = 1.118058
  Trained with rank = 2, lambda = 0.5, and numIter = 20.

Model 7/90: RMSE (validation) = 1.405118
  Trained with rank = 2, lambda = 1.0, and numIter = 5.

Model 8/90: RMSE (validation) = 1.411702
  Trained with rank = 2, lambda = 1.0, and numIter = 10.

Model 9/90: RMSE (validation) = 1.411707
  Trained with rank = 2, lambda = 1.0, and numIter = 20.

Model 10/90: RMSE (validation) = 3.978086
  Trained with rank = 2, lambda = 5.0, and numIter = 5.

Model 11/90: 

Now, that we have the best rank, best lambda and best number of iterations, we can train on the whole train data.

In [19]:
perfect_model = ALS.train(train_rdd, bestRank, bestNumIter, bestLambda)

# Load and prepare the test data

In [20]:
test = pd.read_csv('../data/sampleSubmission.csv')
test.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,3
2,r156_c1,3
3,r160_c1,3
4,r248_c1,3


In [21]:
# Prepare test for RDD
test_prep = test
test_prep['UserID'] = test_prep['Id'].apply(lambda x: int(x.split('_')[0][1:]))
test_prep['MovieID'] = test_prep['Id'].apply(lambda x: int(x.split('_')[1][1:]))
test_prep['Rating'] = test_prep['Prediction']
test_prep = test_prep.drop(['Prediction', 'Id'], axis=1)
test_prep.head()

Unnamed: 0,UserID,MovieID,Rating
0,37,1,3
1,73,1,3
2,156,1,3
3,160,1,3
4,248,1,3


In [23]:
# First, we transform it using sqlContect
test_sql = sqlContext.createDataFrame(test_prep)
test_rdd = test_sql.rdd
test_rdd.take(3)

[Row(UserID=37, MovieID=1, Rating=3),
 Row(UserID=73, MovieID=1, Rating=3),
 Row(UserID=156, MovieID=1, Rating=3)]

In [24]:
test_RDD_Kaggle = test_rdd.map(lambda x: (x[0], x[1]))
predictions = perfect_model.predictAll(test_RDD_Kaggle).map(lambda r: ((r[0], r[1]), r[2]))

In [25]:
predictions.take(3)

[((4904, 864), 3.8075502006192443),
 ((4904, 608), 4.460712476861956),
 ((4904, 72), 3.2903132054032556)]

In [26]:
pred_df = predictions.toDF().toPandas()

In [27]:
pred_df.head()

Unnamed: 0,_1,_2
0,"(4904, 864)",3.80755
1,"(4904, 608)",4.460712
2,"(4904, 72)",3.290313
3,"(4904, 48)",3.544272
4,"(4904, 193)",3.811244


In [29]:
pred_df['UserID'] = pred_df['_1'].apply(lambda x: x['_1'])
pred_df['MovieID'] = pred_df['_1'].apply(lambda x: x['_2'])
pred_df['Rating'] = pred_df['_2']
pred_df = pred_df.drop(['_1', '_2'], axis=1)
pred_df.head()


Unnamed: 0,UserID,MovieID,Rating
0,4904,864,3.80755
1,4904,608,4.460712
2,4904,72,3.290313
3,4904,48,3.544272
4,4904,193,3.811244


In [30]:
pred_df = pred_df.sort_values(by=['MovieID', 'UserID'])
pred_df.head()

Unnamed: 0,UserID,MovieID,Rating
840969,37,1,3.252095
238375,73,1,3.245019
615219,156,1,3.284145
2964,160,1,3.358135
123927,248,1,3.249512


In [38]:
pred_df.index = range(len(pred_df))

In [85]:
deviation.set_index('UserID').head()

Unnamed: 0_level_0,dev
UserID,Unnamed: 1_level_1
1,0.211353
2,-0.060314
3,-0.30887
4,-0.032126
5,0.152949


In [88]:
deviation = deviation.set_index('UserID')

In [89]:
# ADD THE SUBSTRACTED DEVIATION

def rescale(row):
    return row['Rating'] + deviation.loc[row['UserID']]

pred_df_rescaled = pred_df.apply(rescale, axis = 1)

pred_df_rescaled.head()

Unnamed: 0,dev
0,3.177701
1,2.952893
2,3.682235
3,3.218317
4,3.34553


In [91]:
pred_df['Rating_rescaled']  = pred_df_rescaled

In [92]:
pred_df.head()

Unnamed: 0,UserID,MovieID,Rating,Rating_rescaled
0,37,1,3.252095,3.177701
1,73,1,3.245019,2.952893
2,156,1,3.284145,3.682235
3,160,1,3.358135,3.218317
4,248,1,3.249512,3.34553


In [93]:
test['Prediction'] = pred_df['Rating_rescaled']

In [94]:
test.head()

Unnamed: 0,Id,Prediction,UserID,MovieID,Rating
0,r37_c1,3.177701,37,1,3
1,r73_c1,2.952893,73,1,3
2,r156_c1,3.682235,156,1,3
3,r160_c1,3.218317,160,1,3
4,r248_c1,3.34553,248,1,3


In [95]:
test = test.drop(['UserID', 'MovieID', 'Rating'], axis=1)

In [97]:
test.to_csv('../preds/pred_pyspark_als_rescaled.csv', index=False)