In [0]:
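# WARNING: destructive - removes /FileStore/tables recursively, including the CSV files this notebook reads.
# Keep commented out unless a full reset of the uploaded tables is intended.
# dbutils.fs.rm("/FileStore/tables", True)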

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col,explode
# Obtain the Spark session for this notebook (an existing session is reused if present).
spark = (
    SparkSession.builder
    .appName("Collaborative filtering")
    .getOrCreate()
)

In [0]:
# Load the movie catalogue; the first CSV row is the header and column types are inferred.
moviesDF = (
    spark.read
    .options(header="True", inferSchema="True")
    .csv("/FileStore/tables/movies.csv")
)

# Load the user ratings with the same reader options.
ratingsDF = (
    spark.read
    .options(header="True", inferSchema="True")
    .csv("/FileStore/tables/ratings.csv")
)

In [0]:
# Preview the movies table (columns: movieId, title, genres).
display(moviesDF)

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [0]:
# Preview the raw ratings table (columns: userId, movieId, rating, timestamp).
display(ratingsDF)

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041
1,157,5.0,964984100


In [0]:
# Left join on movieId: every rating row is kept and gains the movie's title and genres.
ratings = ratingsDF.join(moviesDF, on="movieId", how="left")

In [0]:
# Preview the joined table (movieId, userId, rating, timestamp, title, genres).
display(ratings)

movieId,userId,rating,timestamp,title,genres
1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
6,1,4.0,964982224,Heat (1995),Action|Crime|Thriller
47,1,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
50,1,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
70,1,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
101,1,5.0,964980868,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
110,1,4.0,964982176,Braveheart (1995),Action|Drama|War
151,1,5.0,964984041,Rob Roy (1995),Action|Drama|Romance|War
157,1,5.0,964984100,Canadian Bacon (1995),Comedy|War


In [0]:
# Hold out roughly 20% of the ratings for testing.
# A fixed seed makes the split repeatable, so model metrics are comparable between runs;
# without it every execution trains and evaluates on different rows.
train, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Total number of rows in the joined ratings table.
ratings.count()

Out[8]: 100836

In [0]:
# Size of the training split, followed by a text preview of its first rows.
print(train.count())
train.show()

80500
+-------+------+------+----------+----------------+--------------------+
|movieId|userId|rating| timestamp|           title|              genres|
+-------+------+------+----------+----------------+--------------------+
|      1|     1|   4.0| 964982703|Toy Story (1995)|Adventure|Animati...|
|      1|     7|   4.5|1106635946|Toy Story (1995)|Adventure|Animati...|
|      1|    17|   4.5|1305696483|Toy Story (1995)|Adventure|Animati...|
|      1|    18|   3.5|1455209816|Toy Story (1995)|Adventure|Animati...|
|      1|    21|   3.5|1407618878|Toy Story (1995)|Adventure|Animati...|
|      1|    31|   5.0| 850466616|Toy Story (1995)|Adventure|Animati...|
|      1|    33|   3.0| 939647444|Toy Story (1995)|Adventure|Animati...|
|      1|    43|   5.0| 848993983|Toy Story (1995)|Adventure|Animati...|
|      1|    44|   3.0| 869251860|Toy Story (1995)|Adventure|Animati...|
|      1|    46|   5.0| 834787906|Toy Story (1995)|Adventure|Animati...|
|      1|    50|   3.0|1514238116|Toy Story (

In [0]:
# Size of the held-out test split, followed by a text preview of its first rows.
print(test.count())
test.show()

20336
+-------+------+------+----------+----------------+--------------------+
|movieId|userId|rating| timestamp|           title|              genres|
+-------+------+------+----------+----------------+--------------------+
|      1|     5|   4.0| 847434962|Toy Story (1995)|Adventure|Animati...|
|      1|    15|   2.5|1510577970|Toy Story (1995)|Adventure|Animati...|
|      1|    19|   4.0| 965705637|Toy Story (1995)|Adventure|Animati...|
|      1|    27|   3.0| 962685262|Toy Story (1995)|Adventure|Animati...|
|      1|    32|   3.0| 856736119|Toy Story (1995)|Adventure|Animati...|
|      1|    40|   5.0| 832058959|Toy Story (1995)|Adventure|Animati...|
|      1|    45|   4.0| 951170182|Toy Story (1995)|Adventure|Animati...|
|      1|    54|   3.0| 830247330|Toy Story (1995)|Adventure|Animati...|
|      1|    73|   4.5|1464196374|Toy Story (1995)|Adventure|Animati...|
|      1|    90|   3.0| 856353996|Toy Story (1995)|Adventure|Animati...|
|      1|    91|   4.0|1112713037|Toy Story (

In [0]:
# Explicit-feedback ALS over (userId, movieId, rating) with non-negative factors.
# coldStartStrategy="drop" discards prediction rows for users/movies that were not
# seen during fitting, so the evaluation metric is not poisoned by NaN predictions.
# An explicit seed pins the random factor initialisation so results are reproducible.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=42,
)

In [0]:
# Hyper-parameter grid: 4 ranks x 4 regularisation strengths = 16 candidate settings.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [.01, .05, .1, .15])
    .build()
)

In [0]:
# RMSE between the observed "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [0]:
# 5-fold cross-validation over the ALS parameter grid, scored with the RMSE evaluator.
# The seed fixes how rows are assigned to folds, so model selection is repeatable.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [0]:
# Run the cross-validated grid search on the training split (one fit per grid point per fold).
model = cv.fit(train)
# Keep the model from the best-scoring parameter combination.
best_model = model.bestModel
# Score the held-out test split. Because the ALS estimator was built with
# coldStartStrategy="drop", test rows without a usable prediction are omitted here,
# so the RMSE below is computed only over the rows that remain.
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

0.8629956557104574


In [0]:
# Show the test-set RMSE again (the value was computed in the model-fitting cell).
print(RMSE)

0.8629956557104574


In [0]:
# Top-5 movie recommendations for every user known to the best model.
recommendations = best_model.recommendForAllUsers(numItems=5)

In [0]:
# Shorter alias for the recommendations DataFrame, used by the reshaping cells that follow.
df = recommendations

In [0]:
# One row per user: userId plus a list of (movieId, rating) recommendation entries.
display(df)

userId,recommendations
1,"List(List(7842, 5.8989754), List(68945, 5.831123), List(3379, 5.831123), List(33649, 5.7802143), List(7748, 5.7333374))"
2,"List(List(68945, 4.8413825), List(3379, 4.8413825), List(33649, 4.827885), List(171495, 4.705534), List(72171, 4.67413))"
3,"List(List(6835, 4.8493104), List(5181, 4.8493104), List(5746, 4.8493104), List(70946, 4.824515), List(7991, 4.7278786))"
4,"List(List(1733, 4.6732383), List(1046, 4.671116), List(68945, 4.659825), List(3379, 4.659825), List(7748, 4.64642))"
5,"List(List(132333, 4.5597167), List(5490, 4.5597167), List(7748, 4.443245), List(68945, 4.3882036), List(3379, 4.3882036))"
6,"List(List(33649, 4.71267), List(67618, 4.7059646), List(3086, 4.69094), List(26528, 4.6063967), List(74282, 4.6063967))"
7,"List(List(132333, 4.611452), List(5490, 4.611452), List(7748, 4.5131803), List(68945, 4.507324), List(3379, 4.507324))"
8,"List(List(7842, 4.9308968), List(68945, 4.734235), List(3379, 4.734235), List(7748, 4.647522), List(33649, 4.6398883))"
9,"List(List(68945, 4.7807193), List(3379, 4.7807193), List(7748, 4.7562566), List(7842, 4.722553), List(132333, 4.7051187))"
10,"List(List(8869, 4.469881), List(71579, 4.4609823), List(113275, 4.3559155), List(3086, 4.276463), List(140110, 4.2218275))"


In [0]:
# Unnest the recommendation list: one output row per (user, recommended movie) entry,
# stored in the new "movieid_rating" column.
df2 = df.withColumn("movieid_rating", explode(col("recommendations")))

In [0]:
# After the explode: one row per recommended movie; the original list column is still present.
display(df2)

userId,recommendations,movieid_rating
1,"List(List(7842, 5.8989754), List(68945, 5.831123), List(3379, 5.831123), List(33649, 5.7802143), List(7748, 5.7333374))","List(7842, 5.8989754)"
1,"List(List(7842, 5.8989754), List(68945, 5.831123), List(3379, 5.831123), List(33649, 5.7802143), List(7748, 5.7333374))","List(68945, 5.831123)"
1,"List(List(7842, 5.8989754), List(68945, 5.831123), List(3379, 5.831123), List(33649, 5.7802143), List(7748, 5.7333374))","List(3379, 5.831123)"
1,"List(List(7842, 5.8989754), List(68945, 5.831123), List(3379, 5.831123), List(33649, 5.7802143), List(7748, 5.7333374))","List(33649, 5.7802143)"
1,"List(List(7842, 5.8989754), List(68945, 5.831123), List(3379, 5.831123), List(33649, 5.7802143), List(7748, 5.7333374))","List(7748, 5.7333374)"
2,"List(List(68945, 4.8413825), List(3379, 4.8413825), List(33649, 4.827885), List(171495, 4.705534), List(72171, 4.67413))","List(68945, 4.8413825)"
2,"List(List(68945, 4.8413825), List(3379, 4.8413825), List(33649, 4.827885), List(171495, 4.705534), List(72171, 4.67413))","List(3379, 4.8413825)"
2,"List(List(68945, 4.8413825), List(3379, 4.8413825), List(33649, 4.827885), List(171495, 4.705534), List(72171, 4.67413))","List(33649, 4.827885)"
2,"List(List(68945, 4.8413825), List(3379, 4.8413825), List(33649, 4.827885), List(171495, 4.705534), List(72171, 4.67413))","List(171495, 4.705534)"
2,"List(List(68945, 4.8413825), List(3379, 4.8413825), List(33649, 4.827885), List(171495, 4.705534), List(72171, 4.67413))","List(72171, 4.67413)"


In [0]:
# Flatten the exploded entries into plain userId / movieId / rating columns for display.
display(df2.select("userId", "movieid_rating.movieId", "movieid_rating.rating"))

userId,movieId,rating
1,7842,5.8989754
1,68945,5.831123
1,3379,5.831123
1,33649,5.7802143
1,7748,5.7333374
2,68945,4.8413825
2,3379,4.8413825
2,33649,4.827885
2,171495,4.705534
2,72171,4.67413
