In [1]:
# Test if PySpark is installed correctly: evaluating `sc` should display a
# SparkContext, e.g. <pyspark.context.SparkContext at 0x7f0d85ae4240>
# (the memory address will differ from run to run).
sc


<pyspark.context.SparkContext at 0x7f4804a79278>

In [2]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

# The DataFrames have many columns, so raise pandas' display limit
# to show up to 100 of them.
pd.options.display.max_columns = 100

# Load and prepare train data

First, we will load the train data. Then, we create a pandas DF so that it contains three columns:

                        UserID | MovieID | Rating

In [3]:
# Read the raw training ratings; the parsing step that follows reads its
# 'Id' and 'Prediction' columns.
train = pd.read_csv('../data/data_train.csv')

In [4]:
# Ids look like "r<user>_c<movie>" (e.g. "r44_c1"): split once on '_' and
# strip the one-letter prefix of each half to recover the integer ids.
train_id_parts = train['Id'].str.split('_')
train['UserID'] = train_id_parts.apply(lambda parts: int(parts[0][1:]))
train['MovieID'] = train_id_parts.apply(lambda parts: int(parts[1][1:]))
train['Rating'] = train['Prediction']

# Keep only the three columns used downstream: UserID | MovieID | Rating.
raw_columns = ['Id', 'Prediction']
train = train.drop(raw_columns, axis=1)

In [5]:
# Preview the first rows to check the parsed UserID / MovieID / Rating columns.
train.head()

Unnamed: 0,UserID,MovieID,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [6]:
# (number of ratings, number of columns) of the prepared training frame.
train.shape

(1176952, 3)

Now, we need to transform the pandas DataFrame into an RDD (Resilient Distributed Dataset) for Spark.

In [7]:
# Convert the pandas DataFrame to a Spark DataFrame with `sqlContext`, then
# take its underlying RDD of Row objects. `take(3)` displays a small sample.
train_sql = sqlContext.createDataFrame(train)
train_rdd = train_sql.rdd
train_rdd.take(3)

[Row(UserID=44, MovieID=1, Rating=4),
 Row(UserID=61, MovieID=1, Rating=3),
 Row(UserID=67, MovieID=1, Rating=4)]

# Collaborative Filtering

For this recommender system we use collaborative filtering. Since we have no information about the movies themselves, we rely on the ratings given by other users to predict how a user would rate a movie.

Spark MLlib provides a collaborative filtering implementation based on Alternating Least Squares (ALS). It exposes the following parameters:

- `numBlocks`: Number of blocks used to parallelize computation (-1 for auto-configure)
- `rank`: Number of latent factors in the model
- `iterations`: Number of iterations
- `lambda`: Regularization parameter in ALS (passed as `lambda_` in PySpark)
- `implicitPrefs`: Specify whether to use the explicit feedback ALS variant or one adapted for implicit feedback data
- `alpha`: Parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations


In [8]:
# Split the ratings into training / validation / test sets with weights 6:2:2.
# The fixed seed keeps the split reproducible.
training_RDD, validation_RDD, test_RDD = train_rdd.randomSplit([6, 2, 2], seed=0)


def _user_movie_pair(row):
    """Return only the (UserID, MovieID) pair of a rating row, dropping the rating."""
    user_id, movie_id = row[0], row[1]
    return (user_id, movie_id)


# predictAll expects (user, movie) pairs, so strip the known rating first.
validation_for_predict_RDD = validation_RDD.map(_user_movie_pair)
test_for_predict_RDD = test_RDD.map(_user_movie_pair)

In [9]:
from pyspark.mllib.recommendation import ALS
import math

## Training phase!

In [10]:
# Hyper-parameters shared by every ALS run in this notebook.
seed = 5
iterations = 10
regularization_parameter = 0.1
tolerance = 0.02  # NOTE(review): not referenced by the cells shown here

# Candidate numbers of latent factors (1..12), with one validation-RMSE slot
# per candidate; `err` is the write position into `errors`.
ranks = list(range(1, 13))
errors = np.zeros(len(ranks))
err = 0

In [11]:
# Grid-search the ALS rank: train one model per candidate rank on the training
# split and keep the rank with the lowest RMSE on the validation split.
min_error = float('inf')
best_rank = -1
best_iteration = -1  # never updated below: the iteration count is fixed

# Re-create the result array and index it by loop position so that this cell
# can be re-run on its own; relying on a cursor initialised in another cell
# would write past the end of `errors` on a second run.
errors = np.zeros(len(ranks))
for idx, rank in enumerate(ranks):
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    # Key both predictions and true ratings by (user, movie) so they can be joined.
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    # Each joined value is (true rating, predicted rating).
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[idx] = error
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print('The best model was trained with rank %s' % best_rank)

For rank 1 the RMSE is 1.007540182227325
For rank 2 the RMSE is 1.0020322689548113
For rank 3 the RMSE is 1.003751201330986
For rank 4 the RMSE is 1.0020197163484321
For rank 5 the RMSE is 1.010166527878165
For rank 6 the RMSE is 1.004745523051763
For rank 7 the RMSE is 1.0057312848324946
For rank 8 the RMSE is 1.009959161118551
For rank 9 the RMSE is 1.0083529450803814
For rank 10 the RMSE is 1.0077796457720793
For rank 11 the RMSE is 1.0094903233570671
For rank 12 the RMSE is 1.0092073692291144
The best model was trained with rank 4


In [12]:
# Retrain with the selected rank and measure the RMSE on the held-out test split.
model = ALS.train(
    training_RDD,
    best_rank,
    seed=seed,
    iterations=iterations,
    lambda_=regularization_parameter,
)

# Key predictions and true ratings by (user, movie) so they can be joined.
predictions = model.predictAll(test_for_predict_RDD).map(
    lambda pred: ((pred[0], pred[1]), pred[2]))
true_ratings = test_RDD.map(
    lambda row: ((int(row[0]), int(row[1])), float(row[2])))
rates_and_preds = true_ratings.join(predictions)

# Each joined value is (true rating, predicted rating).
squared_errors = rates_and_preds.map(lambda kv: (kv[1][0] - kv[1][1])**2)
error = math.sqrt(squared_errors.mean())

print('For testing data the RMSE is %s' % (error))

For testing data the RMSE is 0.9987459788155417


Now that we have the best rank, we can train on the whole training data.

In [13]:
# Final model: same hyper-parameters as the search, fitted on all ratings
# (training + validation + test splits together).
perfect_model = ALS.train(
    train_rdd,
    best_rank,
    iterations=iterations,
    lambda_=regularization_parameter,
    seed=seed,
)

# Load and prepare the test data

In [65]:
# Load the sample submission: its 'Id' column lists the (user, movie) pairs to
# predict. The 'Prediction' values are presumably placeholders; they are
# overwritten further down.
test = pd.read_csv('../data/sampleSubmission.csv')
test.head()

Unnamed: 0,Id,Prediction
0,r37_c1,3
1,r73_c1,3
2,r156_c1,3
3,r160_c1,3
4,r248_c1,3


In [66]:
# Prepare test for RDD
# NOTE: `test_prep` starts out as an alias of `test` (no copy), so the three
# columns added here also appear on `test`. Only the final `drop` creates a
# separate frame.
test_prep = test
test_id_parts = test_prep['Id'].str.split('_')
test_prep['UserID'] = test_id_parts.apply(lambda parts: int(parts[0][1:]))
test_prep['MovieID'] = test_id_parts.apply(lambda parts: int(parts[1][1:]))
test_prep['Rating'] = test_prep['Prediction']

submission_only_columns = ['Prediction', 'Id']
test_prep = test_prep.drop(submission_only_columns, axis=1)
test_prep.head()

Unnamed: 0,UserID,MovieID,Rating
0,37,1,3
1,73,1,3
2,156,1,3
3,160,1,3
4,248,1,3


In [67]:
# Convert the prepared pandas frame to a Spark DataFrame with `sqlContext`,
# then take its underlying RDD of Row objects. `take(3)` displays a small sample.
test_sql = sqlContext.createDataFrame(test_prep)
test_rdd = test_sql.rdd
test_rdd.take(3)

[Row(UserID=37, MovieID=1, Rating=3),
 Row(UserID=73, MovieID=1, Rating=3),
 Row(UserID=156, MovieID=1, Rating=3)]

In [68]:
# Keep only the (UserID, MovieID) pairs for prediction. Use a dedicated name:
# `test_RDD` already holds the held-out split produced by randomSplit, and
# rebinding it here would break any re-run of the test-RMSE evaluation.
submission_for_predict_RDD = test_rdd.map(lambda x: (x[0], x[1]))

# Key each prediction by (user, movie) with the predicted rating as the value.
predictions = perfect_model.predictAll(submission_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))

In [69]:
# Peek at a few ((user, movie), predicted rating) records.
predictions.take(3)

[((4904, 864), 3.5538599783011353),
 ((4904, 608), 4.09080031686581),
 ((4904, 72), 3.1578939972129794)]

In [70]:
# Bring the predictions into pandas. The ((user, movie), rating) pairs become
# two columns: `_1` (the nested key with fields `_1`/`_2`) and `_2` (the rating).
pred_df = predictions.toDF().toPandas()

In [71]:
# Inspect the raw `_1` / `_2` layout before flattening it.
pred_df.head()

Unnamed: 0,_1,_2
0,"(4904, 864)",3.55386
1,"(4904, 608)",4.0908
2,"(4904, 72)",3.157894
3,"(4904, 48)",3.300087
4,"(4904, 193)",3.590811


In [72]:
# Flatten the nested key column `_1` into UserID / MovieID and expose `_2`
# as Rating, then discard the raw Spark-named columns.
user_movie_keys = pred_df['_1']
pred_df['UserID'] = user_movie_keys.apply(lambda key: key['_1'])
pred_df['MovieID'] = user_movie_keys.apply(lambda key: key['_2'])
pred_df['Rating'] = pred_df['_2']

spark_columns = ['_1', '_2']
pred_df = pred_df.drop(spark_columns, axis=1)
pred_df.head()


Unnamed: 0,UserID,MovieID,Rating
0,4904,864,3.55386
1,4904,608,4.0908
2,4904,72,3.157894
3,4904,48,3.300087
4,4904,193,3.590811


In [73]:
# Order the predictions by movie first, then by user. The sample submission
# appears to use this order; verify against the full file.
submission_order = ['MovieID', 'UserID']
pred_df = pred_df.sort_values(by=submission_order)
pred_df.head()

Unnamed: 0,UserID,MovieID,Rating
840969,37,1,3.219237
238375,73,1,2.949798
615219,156,1,3.623151
2964,160,1,3.210012
123927,248,1,3.342383


In [74]:
# Replace the shuffled index left over from sorting with a fresh 0..n-1 index.
pred_df = pred_df.reset_index(drop=True)

In [75]:
# Attach each prediction to its submission row by (UserID, MovieID) instead of
# by position. Positional assignment is only correct if the sorted predictions
# line up one-to-one with the submission rows; a single missing prediction
# would silently shift every later one.
# `test` still carries the UserID / MovieID columns added while building `test_prep`.
join_keys = ['UserID', 'MovieID']
aligned = test[join_keys].merge(
    pred_df[join_keys + ['Rating']].drop_duplicates(join_keys),
    on=join_keys,
    how='left',  # a left merge keeps the submission's row order
)
predicted = aligned['Rating']

# predictAll returns no row for users or movies unknown to the model; fall back
# to the mean prediction rather than writing NaN into the submission.
n_missing = int(predicted.isnull().sum())
if n_missing:
    print('%d submission rows had no prediction; using the mean predicted rating' % n_missing)
    predicted = predicted.fillna(pred_df['Rating'].mean())

test['Prediction'] = predicted.values

In [76]:
# Check that 'Prediction' now holds the model's ratings.
test.head()

Unnamed: 0,Id,Prediction,UserID,MovieID,Rating
0,r37_c1,3.219237,37,1,3
1,r73_c1,2.949798,73,1,3
2,r156_c1,3.623151,156,1,3
3,r160_c1,3.210012,160,1,3
4,r248_c1,3.342383,248,1,3


In [77]:
# Remove the helper columns added during preparation so that only the
# submission columns remain.
helper_columns = ['UserID', 'MovieID', 'Rating']
test = test.drop(helper_columns, axis=1)

In [80]:
# Write the submission file to the working directory, without the pandas index column.
test.to_csv('pred_pyspark_als.csv', index=False)