 **Preprocess and load data**

In [1]:
import os
# Find the latest version of spark 3.3 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.3.0'
spark_version = 'spark-3.3.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connectin0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [2 InRelease 14.2 kB/88.7 k                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [2 InRelease 48.9 kB/88.7 k                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [2 InRelease 51.8 kB/88.7 k                 

In [2]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-08-27 21:51:55--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.3’


2022-08-27 21:51:56 (6.24 MB/s) - ‘postgresql-42.2.16.jar.3’ saved [1002883/1002883]



In [3]:
# Start Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Netflix Movies").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://luiwarang-bucket.s3.amazonaws.com/Netflix_Dataset_Movie_Rating.csv"
spark.sparkContext.addFile(url)
user_data_df = spark.read.csv(SparkFiles.get("Netflix_Dataset_Movie_Rating.csv"), sep=",", header=True, inferSchema=True)


In [5]:
user_data_df.show()

+--------+----+---------+-------+------+
|movie_id|year|     name|user_id|rating|
+--------+----+---------+-------+------+
|       3|1997|Character| 712664|     5|
|       3|1997|Character|1331154|     4|
|       3|1997|Character|2632461|     3|
|       3|1997|Character|  44937|     5|
|       3|1997|Character| 656399|     4|
|       3|1997|Character| 439011|     1|
|       3|1997|Character|1644750|     3|
|       3|1997|Character|2031561|     4|
|       3|1997|Character| 616720|     4|
|       3|1997|Character|2467008|     4|
|       3|1997|Character| 701730|     2|
|       3|1997|Character|1614320|     4|
|       3|1997|Character| 115498|     3|
|       3|1997|Character| 931626|     2|
|       3|1997|Character| 699878|     4|
|       3|1997|Character|1694958|     3|
|       3|1997|Character|  66414|     5|
|       3|1997|Character|2519847|     5|
|       3|1997|Character| 948069|     3|
|       3|1997|Character|  67315|     4|
+--------+----+---------+-------+------+
only showing top

In [6]:
# Determine how many columns the dataframe has
user_data_df.columns

['movie_id', 'year', 'name', 'user_id', 'rating']

In [7]:
# List dataframe data types
user_data_df.dtypes

[('movie_id', 'int'),
 ('year', 'int'),
 ('name', 'string'),
 ('user_id', 'int'),
 ('rating', 'int')]

In [8]:
# Drop null columns
user_data_df =user_data_df.dropna()

In [9]:
# Separate the feature(x) from the target
from pyspark.sql.functions import col
X = user_data_df.drop(col("rating"))
y = user_data_df["rating"]

In [10]:
# Split into training and testing sets
train, test = user_data_df.randomSplit(weights=(0.8, 0.2))

**Building Recommendation Model using ALS**

In [11]:
# Build the recommendation model using ALS on the training data
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

als = ALS(maxIter=10, regParam=0.1, rank=8, nonnegative=True, coldStartStrategy="drop",\
          userCol='user_id', itemCol='movie_id', ratingCol='rating')
model = als.fit(train)


**Making predictions with ALS Model**

In [12]:
# importing appropriate library
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
predictions

DataFrame[movie_id: int, year: int, name: string, user_id: int, rating: int, prediction: float]

In [13]:
predictions.show()

+--------+----+--------------------+-------+------+----------+
|movie_id|year|                name|user_id|rating|prediction|
+--------+----+--------------------+-------+------+----------+
|       3|1997|           Character|1608536|     5|  4.003406|
|       3|1997|           Character|2190625|     1| 3.2696831|
|       3|1997|           Character|2427438|     4|  4.138896|
|       8|2004|What the #$*! Do ...|  94851|     2| 2.9840095|
|       8|2004|What the #$*! Do ...|1046377|     4| 3.2521875|
|       8|2004|What the #$*! Do ...|1323740|     4|  3.134016|
|       8|2004|What the #$*! Do ...|1497891|     4| 2.7242157|
|       8|2004|What the #$*! Do ...|1577862|     5| 4.0240355|
|       8|2004|What the #$*! Do ...|1713085|     2| 3.1967816|
|       8|2004|What the #$*! Do ...|1769353|     3| 3.0402372|
|       8|2004|What the #$*! Do ...|1896167|     2| 3.0210369|
|      16|1996|           Screamers|1628484|     2|  2.778148|
|      16|1996|           Screamers|1990657|     3|  2.


**Evaluate the predictions**

In [14]:
# using RMSE to evaluate the model
evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='rating')
rmse = evaluator.evaluate(predictions)
print('RMSE: %.4f' % rmse)

RMSE: 0.8583


**Making movie recommendations to users**

In [15]:
# Filter for user data history( in this case we will use user 7 as an example)
data_history = train.filter(train['user_id']==7)
data_history.show()

+--------+----+--------------------+-------+------+
|movie_id|year|                name|user_id|rating|
+--------+----+--------------------+-------+------+
|       8|2004|What the #$*! Do ...|      7|     5|
|      28|2002|     Lilo and Stitch|      7|     4|
|      30|2003|Something's Gotta...|      7|     5|
|      83|1983|            Silkwood|      7|     5|
|     175|1992|      Reservoir Dogs|      7|     5|
|     185|1985|Missing in Action...|      7|     4|
|     257|1973|     Charlotte's Web|      7|     5|
|     273|2004|                Taxi|      7|     4|
|     283|1996|If These Walls Co...|      7|     5|
|     285|1997|     The Devil's Own|      7|     5|
|     313|2000|      Pay It Forward|      7|     5|
|     329|1999|               Dogma|      7|     4|
|     348|1988|The Last Temptati...|      7|     4|
|     357|2003|House of Sand and...|      7|     5|
|     457|2004|   Kill Bill: Vol. 2|      7|     5|
|     468|2003|The Matrix: Revol...|      7|     5|
|     535|19

In [16]:
# Create a suggestion movie list for the user
movie_list_suggestion = test.filter(train['user_id']==7).select(['movie_id','name', 'user_id','year'])
movie_list_suggestion.show()

+--------+--------------------+-------+----+
|movie_id|                name|user_id|year|
+--------+--------------------+-------+----+
|     191|    X2: X-Men United|      7|2003|
|     299|Bridget Jones's D...|      7|2001|
|     501|Three Days of the...|      7|1975|
|     708|   The Perfect Storm|      7|2000|
|     758|          Mean Girls|      7|2004|
|     937|              Fallen|      7|1998|
|     940|            Hercules|      7|1997|
|    1046|        Uptown Girls|      7|2003|
|    1066| Superman: The Movie|      7|1978|
|    1102|        Training Day|      7|2001|
|    1220|         Man on Fire|      7|2004|
|    1289|Look Who's Talkin...|      7|1990|
|    1359|          Funny Lady|      7|1975|
|    1428|         The Recruit|      7|2003|
|    1571|         Poltergeist|      7|1982|
|    1659|      Grumpy Old Men|      7|1993|
|    1754|     Sixteen Candles|      7|1984|
|    1795|      MASH: Season 7|      7|1978|
|    1798|       Lethal Weapon|      7|1987|
|    1843|

In [17]:
# Create a list with the high predicting rating movies for the user 
final_recommendation_list = model.transform(movie_list_suggestion)
final_recommendation_list.orderBy('prediction', ascending=False).show()

+--------+--------------------+-------+----+----------+
|movie_id|                name|user_id|year|prediction|
+--------+--------------------+-------+----+----------+
|    2782|          Braveheart|      7|1995|  4.882445|
|    1795|      MASH: Season 7|      7|1978| 4.7661743|
|    2743|         The Pianist|      7|2002|  4.750317|
|    1220|         Man on Fire|      7|2004|  4.593657|
|    2128|                Rudy|      7|1993| 4.4514112|
|    3342|The Day of the Ja...|      7|1973| 4.4092593|
|     501|Three Days of the...|      7|1975| 4.4032283|
|    2209|      On Golden Pond|      7|1981|  4.398661|
|    1798|       Lethal Weapon|      7|1987| 4.3963065|
|    4356|   Road to Perdition|      7|2002|  4.332597|
|    1754|     Sixteen Candles|      7|1984| 4.2955976|
|    3684|          Goldfinger|      7|1964| 4.2461386|
|     191|    X2: X-Men United|      7|2003| 4.2168818|
|     937|              Fallen|      7|1998| 4.2075152|
|    1571|         Poltergeist|      7|1982| 4.2