In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = '5g'
spark = SparkSession.builder.appName("movie-recommendation")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

22/04/06 22:34:35 WARN Utils: Your hostname, devkhk-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.30.1.27 instead (on interface en0)
22/04/06 22:34:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/06 22:34:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/06 22:34:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/06 22:34:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
ratings_file = "/Users/devkhk/Documents/data-engineering-study/data/ml-25m/ratings.csv"
ratings_df = spark.read.csv(f"file:///{ratings_file}", inferSchema=True, header=True)

                                                                                

In [4]:
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [5]:
ratings_df = ratings_df.select(["userId", "movieId", "rating"])

In [6]:
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [7]:
ratings_df.select("rating").describe().show()



+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423508|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



                                                                                

In [8]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2])

In [9]:
from pyspark.ml.recommendation import ALS

In [10]:
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)

In [13]:
model = als.fit(train_df)

                                                                                

In [16]:
predictions = model.transform(test_df)

In [17]:
predictions.show()

                                                                                

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    31|   3175|   1.5| 2.4484122|
|    76|   1342|   3.5| 2.9879394|
|   159|  54190|   5.0| 3.9631314|
|   321| 175197|   0.5| 2.0642397|
|   497|   1580|   5.0| 3.0925753|
|   501|   1580|   5.0|  3.875893|
|   597|   1645|   5.0| 3.5025375|
|   597|   4519|   4.0| 3.5809584|
|   613|   1645|   4.0| 3.6777897|
|   626|   2366|   3.0|  3.169643|
|   626|   2866|   3.0| 3.3862936|
|   626|   3997|   2.0| 2.1135566|
|   737|   3175|   5.0| 4.0989394|
|   744|  44022|   3.5|  3.661557|
|   756|   1580|   4.0| 3.4517128|
|   772|   1645|   3.0| 3.0035124|
|   830|   1591|   2.0| 2.8905733|
|   833|   3175|   5.0| 3.3304594|
|   844|   2122|   2.0| 2.3614206|
|   847|   1645|   4.0|  2.870592|
+------+-------+------+----------+
only showing top 20 rows



In [18]:
predictions.select("rating", "prediction").describe().show()

[Stage 170:>                                                        (0 + 8) / 8]

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4996949|           4996949|
|   mean|3.5344151000940776|3.4054393919399337|
| stddev|1.0598187035807765|0.6388691750293307|
|    min|               0.5|        -1.9948835|
|    max|               5.0|         7.0163283|
+-------+------------------+------------------+



                                                                                

In [19]:
from pyspark.ml.evaluation import RegressionEvaluator

In [20]:
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse'
)

In [21]:
rmse = evaluator.evaluate(predictions)

                                                                                

In [22]:
print(rmse)

0.8130030750686954


In [24]:
model.recommendForAllUsers(3).show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{149484, 5.64775...|
|    27|[{203882, 5.69071...|
|    28|[{194434, 7.56227...|
|    31|[{203882, 3.98220...|
|    34|[{194434, 5.98069...|
|    44|[{149484, 7.58397...|
|    53|[{194332, 6.35287...|
|    65|[{149484, 6.80274...|
|    76|[{194434, 6.07916...|
|    78|[{149484, 6.62845...|
|    81|[{179707, 4.9064}...|
|    85|[{176597, 5.37159...|
|   101|[{203882, 5.01413...|
|   103|[{151989, 5.87781...|
|   108|[{149484, 5.98693...|
|   115|[{151989, 6.37320...|
|   126|[{149484, 6.14715...|
|   133|[{151989, 5.42379...|
|   137|[{151989, 5.62344...|
|   148|[{194434, 5.89000...|
+------+--------------------+
only showing top 20 rows





In [25]:
model.recommendForAllItems(3).show()



+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     12|[{87426, 5.324092...|
|     26|[{3195, 5.111856}...|
|     27|[{143282, 5.30294...|
|     28|[{105801, 5.60748...|
|     31|[{87426, 5.250770...|
|     34|[{71227, 5.432758...|
|     44|[{87426, 5.456319...|
|     53|[{27587, 5.231360...|
|     65|[{87426, 5.150985...|
|     76|[{87426, 5.467379...|
|     78|[{87426, 4.720285...|
|     81|[{156318, 4.80650...|
|     85|[{105801, 4.87715...|
|    101|[{3195, 4.8425584...|
|    103|[{87426, 5.340760...|
|    108|[{142811, 5.43198...|
|    115|[{143291, 5.48569...|
|    126|[{87426, 4.900862...|
|    133|[{33115, 5.723346...|
|    137|[{123177, 5.00105...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [26]:
from pyspark.sql.types import IntegerType

user_list = [65, 78, 81]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF('userId')

users_df.show()

[Stage 298:>                                                        (0 + 1) / 1]

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



                                                                                

In [28]:
model.recommendForUserSubset(users_df, 5).show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{149484, 6.80274...|
|    78|[{149484, 6.62845...|
|    81|[{179707, 4.9064}...|
+------+--------------------+



                                                                                

In [29]:
user_recs = model.recommendForUserSubset(users_df, 5)



In [40]:
user_recs.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{149484, 6.80274...|
|    78|[{149484, 6.62845...|
|    81|[{179707, 4.9064}...|
+------+--------------------+



                                                                                

In [30]:
movies_list = user_recs.collect()[0].recommendations

                                                                                

In [31]:
movies_list

[Row(movieId=149484, rating=6.802742958068848),
 Row(movieId=205277, rating=6.199429988861084),
 Row(movieId=169606, rating=5.929337501525879),
 Row(movieId=61913, rating=5.86498498916626),
 Row(movieId=171923, rating=5.7708516120910645)]

In [32]:
recs_df = spark.createDataFrame(movies_list)

In [34]:
recs_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
| 149484| 6.802742958068848|
| 205277| 6.199429988861084|
| 169606| 5.929337501525879|
|  61913|  5.86498498916626|
| 171923|5.7708516120910645|
+-------+------------------+



In [35]:
movies_file = "/Users/devkhk/Documents/data-engineering-study/data/ml-25m/movies.csv"
movies_df = spark.read.csv(f"file:///{movies_file}", inferSchema=True, header=True)

In [36]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [37]:
recs_df.createOrReplaceTempView('recommendations')
movies_df.createOrReplaceTempView('movies')

In [38]:
query = """
SELECT *
FROM
    movies JOIN recommendations
    ON movies.movieID = recommendations.movieID
ORDER BY
    rating desc
"""
recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+--------------------+-------+------------------+
|movieId|               title|              genres|movieId|            rating|
+-------+--------------------+--------------------+-------+------------------+
| 149484|All About My Wife...|      Comedy|Romance| 149484| 6.802742958068848|
| 205277|   Inside Out (1991)|Comedy|Drama|Romance| 205277| 6.199429988861084|
| 169606|Dara O'Briain Cro...|              Comedy| 169606| 5.929337501525879|
|  61913| Africa addio (1966)|         Documentary|  61913|  5.86498498916626|
| 171923|Design of Death (...|       Drama|Mystery| 171923|5.7708516120910645|
+-------+--------------------+--------------------+-------+------------------+



In [41]:
def get_recommendations(user_id, num_recs):
    users_df = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    user_recs_df = model.recommendForUserSubset(users_df, num_recs)
    
    recs_list = user_recs_df.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)
    recommended_movies = spark.sql(query)
    return recommended_movies

In [42]:
recs = get_recommendations(456, 10)

                                                                                

In [43]:
recs.toPandas()

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,149484,All About My Wife (2012),Comedy|Romance,149484,6.802743
1,205277,Inside Out (1991),Comedy|Drama|Romance,205277,6.19943
2,169606,Dara O'Briain Crowd Tickler (2015),Comedy,169606,5.929338
3,61913,Africa addio (1966),Documentary,61913,5.864985
4,171923,Design of Death (2012),Drama|Mystery,171923,5.770852


In [44]:
spark.stop()