In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=d514114a3b409448189ae428e7564a0bd92b7e6c053d8eb2be44d16d61cdb5ae
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [79]:
#Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [80]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("RecommendationSystem")
    .getOrCreate()
)

# Load Data

In [81]:
# Ratings file layout (tab-separated): userID, movieID, rating, unixTimestamp.
# All four columns are read as nullable integers.
myschema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("movieID", IntegerType(), True)
    .add("rating", IntegerType(), True)
    .add("unixTimestamp", IntegerType(), True)
)

# Local copy of the ratings file. To read from HDFS instead, point the path at
# "hdfs://localhost:9000/user/vvd09/data/u.data".
df = spark.read.csv("ml-100k/u.data", schema=myschema, sep="\t")

In [82]:
# Peek at the first rows to confirm the schema was applied as expected.
df.show(n=10)

+------+-------+------+-------------+
|userID|movieID|rating|unixTimestamp|
+------+-------+------+-------------+
|   196|    242|     3|    881250949|
|   186|    302|     3|    891717742|
|    22|    377|     1|    878887116|
|   244|     51|     2|    880606923|
|   166|    346|     1|    886397596|
|   298|    474|     4|    884182806|
|   115|    265|     2|    881171488|
|   253|    465|     5|    891628467|
|   305|    451|     3|    886324817|
|     6|     86|     3|    883603013|
+------+-------+------+-------------+
only showing top 10 rows



In [83]:
# Keep every column except the timestamp, which the ALS model does not use.
df = df.select([name for name in df.columns if name != "unixTimestamp"])

In [84]:
# Summary statistics (count, mean, stddev, min, max) for the remaining columns.
ratings_summary = df.describe()
ratings_summary.show()

+-------+------------------+------------------+------------------+
|summary|            userID|           movieID|            rating|
+-------+------------------+------------------+------------------+
|  count|            100000|            100000|            100000|
|   mean|         462.48475|         425.53013|           3.52986|
| stddev|266.61442012750905|330.79835632558473|1.1256735991443214|
|    min|                 1|                 1|                 1|
|    max|               943|              1682|                 5|
+-------+------------------+------------------+------------------+



In [85]:
# 80/20 train/test split; the fixed seed keeps the split stable between runs.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=27)

# ALS Model

In [86]:
#create ALS model
# ALS starts from randomly initialised factors, so an explicit seed is needed
# for the fitted model (and therefore the RMSE and the recommendations) to be
# reproducible across kernel restarts.
# coldStartStrategy="drop" is set on the estimator so every model it fits
# drops rows it cannot score (users/movies unseen in training) instead of
# emitting NaN predictions.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userID",
    itemCol="movieID",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=27,
)

In [87]:
# Fit the collaborative-filtering model on the training split.
model_col = als.fit(dataset=train)
# Drop rows that cannot be scored (users/movies absent from the training
# split) rather than returning NaN predictions for them.
model_col.setColdStartStrategy("drop")

ALSModel: uid=ALS_633780b38d71, rank=10

In [88]:
# Score the held-out split; adds a `prediction` column next to the true rating.
pred = model_col.transform(dataset=test)

In [95]:
# Compare a few predicted ratings with the actual ones.
pred.show(n=5)

+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|   148|      7|     5| 4.6743793|
|   148|     69|     5| 3.0447612|
|   148|    114|     5|   4.84867|
|   148|    135|     5| 5.1084986|
|   148|    151|     4| 5.3941426|
+------+-------+------+----------+
only showing top 5 rows



# Evaluate predictions

In [91]:
#evaluate predictions
# Named `evaluator` rather than `eval` so the built-in eval() is not shadowed
# for the rest of the kernel session.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(pred)
print("RMSE: ", rmse)

RMSE:  1.0779482552322297


In [92]:
# Top-5 movie recommendations for every user known to the model.
user_recs = model_col.recommendForAllUsers(numItems=5)

In [93]:
#display top 5 recommendations of UserID = 10
# Filter to the user first, then project. No .cache(): the row is fetched
# once, so caching would only pin memory that is never unpersisted.
# first() returns None when the user has no recommendations, so this cell
# shows an empty list instead of raising IndexError.
user_10_row = user_recs.where(user_recs.userID == 10).select("recommendations").first()
user_10_row.recommendations if user_10_row is not None else []

[Row(movieID=1643, rating=5.561593055725098),
 Row(movieID=853, rating=5.350731372833252),
 Row(movieID=1149, rating=5.23101806640625),
 Row(movieID=1066, rating=5.225638389587402),
 Row(movieID=1368, rating=5.202705383300781)]

# Recommend Movies with names

In [94]:
# Schema of ml-100k/u.item: five descriptive fields followed by one integer
# column per genre. The genre names are listed once and expanded below.
GENRE_COLUMNS = [
    "unknown", "action", "adventure", "animation", "children", "comedy",
    "crime", "documentary", "drama", "fantasy", "film_noir", "horror",
    "musical", "mystery", "romance", "sci_fi", "thriller", "war", "western",
]

movieSchema = StructType(
    [
        StructField("movie_id", IntegerType(), True),
        StructField("movie_title", StringType(), True),
        StructField("release_date", StringType(), True),
        StructField("video_release_date", StringType(), True),
        StructField("imdb_url", StringType(), True),
    ]
    + [StructField(genre, IntegerType(), True) for genre in GENRE_COLUMNS]
)

rec_userid = 10
TOP_N = 5  # how many liked movies / recommendations to list

# The MovieLens 100K u.item file is Latin-1 encoded; reading it with Spark's
# default UTF-8 garbles titles that contain accented characters.
movie_df = (
    spark.read.format("csv")
    .schema(movieSchema)
    .option("delimiter", "|")
    .option("encoding", "ISO-8859-1")
    .load("ml-100k/u.item")
)


def movie_titles(movie_ids):
    """Return the titles for `movie_ids`, in the same order as the input.

    All titles are fetched with a single Spark job instead of one job per
    movie. An id that has no row in `movie_df` yields a placeholder string
    rather than raising IndexError.
    """
    movie_ids = list(movie_ids)
    if not movie_ids:
        return []
    rows = (
        movie_df.where(movie_df.movie_id.isin(movie_ids))
        .select("movie_id", "movie_title")
        .collect()
    )
    title_by_id = {row.movie_id: row.movie_title for row in rows}
    return [title_by_id.get(movie_id, f"<unknown movie {movie_id}>") for movie_id in movie_ids]


# Movies the user rated 3 or higher, best-rated first.
user_movies = (
    df.where((df.userID == rec_userid) & (df.rating >= 3))
    .orderBy(df.rating.desc())
    .limit(TOP_N)
    .select("movieID")
)
print("Movies liked by user ID ", rec_userid)
for title in movie_titles(row["movieID"] for row in user_movies.collect()):
    print(title)

# Score only the requested user rather than computing recommendations for
# every user and discarding all but one row.
target_user = df.where(df.userID == rec_userid).select("userID").distinct()
movie_recs = model_col.recommendForUserSubset(target_user, TOP_N)
recommended_ids = [
    rec.movieID
    for row in movie_recs.collect()
    for rec in row.recommendations
]
print("\nPredictions of which movies user might like")
for title in movie_titles(recommended_ids):
    print(title)

Movies liked by user ID  10
Fargo (1996)
Laura (1944)
Sunset Blvd. (1950)
Secrets & Lies (1996)
Bonnie and Clyde (1967)

Predictions of which movies user might like
Angel Baby (1995)
Braindead (1992)
Walkabout (1971)
Balto (1995)
Mina Tannenbaum (1994)
