In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import random
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession 
from pyspark.ml  import Pipeline     
from pyspark.sql import SQLContext  
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

In [11]:
# Remote CSV file holding the movie ratings data.
url = 'https://raw.githubusercontent.com/alextanhongpin/machine-learning-with-pyspark/master/07_recommender_system/movie_ratings_df.csv'
# Download it into a pandas DataFrame, decoding the text explicitly as ISO-8859-1.
csv = pd.read_csv(
    filepath_or_buffer=url,
    encoding="ISO-8859-1",
)

In [12]:
# Preview the first three rows of the downloaded ratings.
csv.head(n=3)

Unnamed: 0,userId,title,rating
0,196,Kolya (1996),3
1,63,Kolya (1996),3
2,226,Kolya (1996),5


In [18]:
# Save the ratings to a local CSV file; index=False keeps the pandas row index out of the file.
# NOTE(review): absolute, machine-specific path — confirm it exists before running elsewhere.
csv.to_csv(
    path_or_buf=r"C:\Users\gonza\Repositorios\TestGit\movie_ratings_df.csv",
    index=False,
)

### Movie Recommendation with PySpark

In [19]:
# Start (or reuse) a Spark session named 'recomendation' on the local[5] master.
spark = (
    SparkSession.builder
    .appName('recomendation')
    .master("local[5]")
    .getOrCreate()
)

In [20]:
# Local copy of the ratings CSV.
# NOTE(review): absolute, machine-specific path — it must match wherever the file was actually saved.
url = r"C:\Users\gonza\Repositorios\TestGit\movie_ratings_df.csv"

# Treat the first line as the header and let Spark infer the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(url)
)
# Preview the first three rows as a pandas DataFrame.
df.limit(3).toPandas()

Unnamed: 0,userId,title,rating
0,196,Kolya (1996),3
1,63,Kolya (1996),3
2,226,Kolya (1996),5


In [21]:
# Print the column names and inferred types of the Spark DataFrame.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)



As we can see, the `title` column is stored as a string. To work with PySpark's MLlib library, we need to convert it to numeric values.

In [22]:
from pyspark.ml.feature import StringIndexer,IndexToString

# Encode every movie title as a numeric index stored in a new 'title_new' column.
stringIndexer = StringIndexer(
    inputCol='title',
    outputCol='title_new',
)
# fit() learns the title -> index mapping from df ...
model = stringIndexer.fit(df)
# ... and transform() returns df with the 'title_new' column appended.
indexed = model.transform(df)
# Preview the first five indexed rows.
indexed.limit(5).toPandas()

Unnamed: 0,userId,title,rating,title_new
0,196,Kolya (1996),3,287.0
1,63,Kolya (1996),3,287.0
2,226,Kolya (1996),5,287.0
3,154,Kolya (1996),3,287.0
4,306,Kolya (1996),5,287.0


We use the Alternating Least Squares (ALS) algorithm from the PySpark ML library for recommendation.

In [23]:
from pyspark.ml.recommendation import ALS

# One fixed seed shared by the split and by ALS, so that re-running the
# notebook gives the same train/test rows and the same fitted model instead
# of different predictions on every run.
SEED = 42

# Hold out roughly 25% of the indexed ratings for testing.
train, test = indexed.randomSplit([0.75, 0.25], seed=SEED)

# ALS collaborative filtering over (userId, title_new, rating).
rec = ALS(maxIter=10,
          regParam=0.01,
          userCol='userId',
          itemCol='title_new',
          ratingCol='rating',
          nonnegative=True,          # constrain the learned factors to be non-negative
          coldStartStrategy='drop',  # drop rows whose prediction would be NaN
          seed=SEED)
# Fit the model on the training split only.
rec_model = rec.fit(train)
# Predict ratings for the held-out test split and preview five of them.
predicted_ratings = rec_model.transform(test)
predicted_ratings.limit(5).toPandas()

Unnamed: 0,userId,title,rating,title_new,prediction
0,588,Much Ado About Nothing (1993),5,148.0,4.959589
1,642,Much Ado About Nothing (1993),5,148.0,5.339576
2,606,Much Ado About Nothing (1993),5,148.0,4.872139
3,727,Much Ado About Nothing (1993),5,148.0,4.03444
4,360,Much Ado About Nothing (1993),4,148.0,4.146318


In [24]:
# Stop the Spark session now that the analysis is finished.
spark.stop()