In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf, explode, desc

# create or get SparkSession
spark = SparkSession.builder.appName("ContentBasedFiltering").getOrCreate()

# load the dataset
df = spark.read.csv("final_dataset.csv", 
                    header=True, 
                    inferSchema=True)

In [3]:
df.show()

+-------+--------------------+------+--------------------+------+--------+
|User_Id|          Movie_Name|Rating|               Genre|  Year|Movie_Id|
+-------+--------------------+------+--------------------+------+--------+
|      1|             Jumanji|   3.5|Adventure|Childre...|1995.0|       1|
|      1|City of Lost Chil...|   3.5|Adventure|Drama|F...|1995.0|       2|
|      1|Twelve Monkeys (a...|   3.5|Mystery|Sci-Fi|Th...|1995.0|       3|
|      1|Seven (a.k.a. Se7en)|   3.5|    Mystery|Thriller|1995.0|       4|
|      1| Usual Suspects, The|   3.5|Crime|Mystery|Thr...|1995.0|       5|
|      1|Rumble in the Bro...|   3.5|Action|Adventure|...|1995.0|       6|
|      1|             Rob Roy|   4.0|Action|Drama|Roma...|1995.0|       7|
|      1|              Clerks|   4.0|              Comedy|1994.0|       8|
|      1|Interview with th...|   4.0|        Drama|Horror|1994.0|       9|
|      1|Star Wars: Episod...|   4.0|Action|Adventure|...|1977.0|      10|
|      1|Léon: The Profes

In [None]:
from pyspark.sql.functions import split, explode

df_exploded = df.select(
    col("User_Id"), 
    col("Movie_Name"), 
    explode(split(col("Genre"), r"\|")).alias("Genre")
)

In [None]:
# explode genre into separate rows
df_exploded.show()

+-------+--------------------+---------+
|User_Id|          Movie_Name|    Genre|
+-------+--------------------+---------+
|      1|             Jumanji|Adventure|
|      1|             Jumanji| Children|
|      1|             Jumanji|  Fantasy|
|      1|City of Lost Chil...|Adventure|
|      1|City of Lost Chil...|    Drama|
|      1|City of Lost Chil...|  Fantasy|
|      1|City of Lost Chil...|  Mystery|
|      1|City of Lost Chil...|   Sci-Fi|
|      1|Twelve Monkeys (a...|  Mystery|
|      1|Twelve Monkeys (a...|   Sci-Fi|
|      1|Twelve Monkeys (a...| Thriller|
|      1|Seven (a.k.a. Se7en)|  Mystery|
|      1|Seven (a.k.a. Se7en)| Thriller|
|      1| Usual Suspects, The|    Crime|
|      1| Usual Suspects, The|  Mystery|
|      1| Usual Suspects, The| Thriller|
|      1|Rumble in the Bro...|   Action|
|      1|Rumble in the Bro...|Adventure|
|      1|Rumble in the Bro...|   Comedy|
|      1|Rumble in the Bro...|    Crime|
+-------+--------------------+---------+
only showing top

In [None]:
from pyspark.sql.functions import collect_list

# group by Movie_Name and collect Genres:
df_grouped = df_exploded.groupBy("Movie_Name").agg(collect_list("Genre").alias("Genres"))


In [7]:
df_grouped.show()

+--------------------+--------------------+
|          Movie_Name|              Genres|
+--------------------+--------------------+
|                 '71|[Action, Drama, T...|
|             'R Xmas|[Crime, Drama, Cr...|
|  'Til There Was You|[Drama, Drama, Ro...|
|...All the Marble...|[Action, Comedy, ...|
|10 Questions for ...|[Documentary, Doc...|
|      10 to Midnight|[Action, Action, ...|
|101 Dalmatians (O...|[Adventure, Adven...|
|   12 Days of Terror|[Drama, Horror, T...|
|     12 O'Clock Boys|       [Documentary]|
|12 Storeys (Shier...|     [Comedy, Drama]|
|               12:01|[Comedy, Romance,...|
|    13th Letter, The|[Film-Noir, Film-...|
|14 Blades (Jin yi...|[Action, Drama, A...|
|                  15|     [Action, Drama]|
|          16 to Life|[Comedy, Drama, C...|
|           18 Again!|[Comedy, Comedy, ...|
|            20 Dates|[Comedy, Comedy, ...|
|20,000 Years in S...|[Crime, Drama, Cr...|
|                2012|[Action, Action, ...|
|2019: After the F...|[Action, H

In [None]:
# apply CountVectorizer:
cv = CountVectorizer(inputCol="Genres", outputCol="rawFeatures") 
model = cv.fit(df_grouped)
featurizedData = model.transform(df_grouped)

In [9]:
featurizedData.show()

+--------------------+--------------------+--------------------+
|          Movie_Name|              Genres|         rawFeatures|
+--------------------+--------------------+--------------------+
|                 '71|[Action, Drama, T...|(20,[0,2,3,13],[7...|
|             'R Xmas|[Crime, Drama, Cr...|(20,[0,6],[6.0,6.0])|
|  'Til There Was You|[Drama, Drama, Ro...|(20,[0,5],[90.0,9...|
|...All the Marble...|[Action, Comedy, ...|(20,[0,1,2,5],[11...|
|10 Questions for ...|[Documentary, Doc...|     (20,[17],[2.0])|
|      10 to Midnight|[Action, Action, ...|(20,[2,3,4],[13.0...|
|101 Dalmatians (O...|[Adventure, Adven...|(20,[4,9,12],[105...|
|   12 Days of Terror|[Drama, Horror, T...|(20,[0,3,11],[2.0...|
|     12 O'Clock Boys|       [Documentary]|     (20,[17],[1.0])|
|12 Storeys (Shier...|     [Comedy, Drama]|(20,[0,1],[1.0,1.0])|
|               12:01|[Comedy, Romance,...|(20,[1,3,5,7],[8....|
|    13th Letter, The|[Film-Noir, Film-...|     (20,[18],[2.0])|
|14 Blades (Jin yi...|[Ac

In [10]:
IDF = IDF(inputCol="rawFeatures", outputCol="features")
IDFModel = IDF.fit(featurizedData)
rescaledData = IDFModel.transform(featurizedData)

In [11]:
rescaledData.show()

+--------------------+--------------------+--------------------+--------------------+
|          Movie_Name|              Genres|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+
|                 '71|[Action, Drama, T...|(20,[0,2,3,13],[7...|(20,[0,2,3,13],[4...|
|             'R Xmas|[Crime, Drama, Cr...|(20,[0,6],[6.0,6.0])|(20,[0,6],[4.0463...|
|  'Til There Was You|[Drama, Drama, Ro...|(20,[0,5],[90.0,9...|(20,[0,5],[60.694...|
|...All the Marble...|[Action, Comedy, ...|(20,[0,1,2,5],[11...|(20,[0,1,2,5],[7....|
|10 Questions for ...|[Documentary, Doc...|     (20,[17],[2.0])|(20,[17],[5.30415...|
|      10 to Midnight|[Action, Action, ...|(20,[2,3,4],[13.0...|(20,[2,3,4],[25.5...|
|101 Dalmatians (O...|[Adventure, Adven...|(20,[4,9,12],[105...|(20,[4,9,12],[250...|
|   12 Days of Terror|[Drama, Horror, T...|(20,[0,3,11],[2.0...|(20,[0,3,11],[1.3...|
|     12 O'Clock Boys|       [Documentary]|     (20,[1

In [14]:
def cosine_similarity(v1, v2):
    return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))

cosine_similarity_udf = udf(cosine_similarity)

# calculate similarity between movies
movie_pairs = rescaledData.alias("i").join(
    rescaledData.alias("j"), col("i.Movie_Name") < col("j.Movie_Name")
).select(
    col("i.Movie_Name").alias("Movie first"),
    col("j.Movie_Name").alias("Movie second"),
    cosine_similarity_udf(col("i.features"), col("j.features")).alias("Similarity")
)

# get recommendations based on user's watched movies
user_id = 1  
watched_movies = df.filter(col("User_Id") == user_id).select("Movie_Name").rdd.flatMap(lambda x: x).collect()

recommendations = movie_pairs.filter(
    (col("Movie first").isin(watched_movies)) | (col("Movie second").isin(watched_movies))
).sort(desc("Similarity"))

recommendations.show()

+--------------------+--------------------+--------------------+
|         Movie first|        Movie second|          Similarity|
+--------------------+--------------------+--------------------+
|           Dark City|    Enter the Dragon|9.987915802896691E-4|
|           Dark City|Run Lola Run (Lol...|9.987915802896691E-4|
|   Great Escape, The|                   M|9.982918999835958E-5|
|             Godsend|            Reckless|9.980122797856714E-4|
|              Misery|            Reckless|9.980122797856714E-4|
|Brotherhood of th...|               Mulan|9.979851710609634E-4|
|2001: A Space Ody...|      My Blue Heaven|9.974782088646892E-4|
|              Aliens|          Cinderella|9.966088346807617E-4|
|  Control (Kontroll)|   Wizard of Oz, The|  9.9579301313787E-5|
|Gentlemen of Fort...|   Wizard of Oz, The|9.957930131378698E-5|
|             Gumshoe|   Wizard of Oz, The|9.957930131378698E-5|
|         Murder Ahoy|   Wizard of Oz, The|9.957930131378698E-5|
|      Late Show, The|   