#Start Spark Context and Data Preprocessing

In [55]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz
!tar xf spark-3.4.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install -q pyspark

In [56]:
from pyspark.sql.types import IntegerType, FloatType, DoubleType
from pyspark.sql.functions import rand, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F

In [57]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.appName("YourAppName").getOrCreate()
#Load the cleaned dataset
def load_dfs():
    global movies, users, ratings, orig_movies
    movies = spark.read.csv("/content/drive/MyDrive/movieLens/cleaned_data/pivoted_movies_features.csv", encoding="latin1", header=True).cache()
    movies.createOrReplaceTempView("movies_info")

    users = spark.read.csv("/content/drive/MyDrive/movieLens/cleaned_data/pivoted_users_features.csv", encoding="latin1", header=True).cache()
    users.createOrReplaceTempView("users_info")

    ratings = spark.read.csv("/content/drive/MyDrive/movieLens/ratings.dat", sep="::", encoding="latin1")
    ratings = ratings.toDF("user_id", "movie_id", "rating", "time_stamp").cache()
    ratings.createOrReplaceTempView("ratings_info")

    orig_movies = spark.read.csv("/content/drive/MyDrive/movieLens/movies.dat", sep="::", encoding="latin1")
    orig_movies = orig_movies.toDF("movie_id", "Movie_Title", "Genre").cache()
    orig_movies.createOrReplaceTempView("orig_movies_info")

load_dfs()

In [58]:
#Remove Index column
movies = movies.drop("_c0")
users = users.drop("_c0")

In [81]:
movies.show()

+--------+----+-------+----------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+-----------------+--------------------+--------------------+---------------------+-----------------+----------------+----------------------+----------------+------------------+--------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popularity_per_Documentary|pop

#Start Clustering using User Features

In [60]:

# Convert categorical variables to numerical using one-hot encoding
for col in users.columns:
    if col in ["user_id", "gender", "region","academic/educator",  "artist", "clerical/ admin",  "college/grad student", "customer service","doctor/health care", "executive/managerial", "farmer",  "homemaker", "K-12 student", "lawyer", "programmer", "retired", "sales/marketing", "scientist", "self-employed","technician/engineer","tradesman/craftsman", "unemployed","writer" ]:
        users = users.withColumn(col, users[col].cast(IntegerType()))
    else:
        users = users.withColumn(col, users[col].cast(FloatType()))

In [61]:

silhouette = 0
seed = 1
# Vectorize the features
feature_columns = users.columns
feature_columns.remove("user_id")  # Exclude the user_id column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(users)

while (silhouette < 0.19):
      #Change seed
      seed += 1

      # Initialize KMeans model with 600 clusters
      kmeans = KMeans().setK(500).setSeed(seed)

      # Train the model
      model = kmeans.fit(data)

      # Make predictions
      predictions = model.transform(data)

      # Evaluate clustering by computing Silhouette score
      evaluator = ClusteringEvaluator()
      silhouette = evaluator.evaluate(predictions)

# Shows the result.
centers = model.clusterCenters()

#Set the user ID we want to make recommendations to

In [62]:
#User we want to recommend to
req_user_id = 148

#Most Similar Users to req_user_id

In [63]:

def print_other_users_in_cluster(user_id):
    # Find the predicted cluster for the given user_id
    user_cluster = predictions.filter(predictions["user_id"] == user_id).select("prediction").collect()[0]["prediction"]

    # Filter the predictions DataFrame to get all users belonging to the same cluster
    cluster_users = predictions.filter(predictions["prediction"] == user_cluster).select("user_id").collect()

    # Print all other user IDs in the cluster except for the given user_id
    print(f"Other user IDs in cluster {user_cluster} for user {user_id}:")

    user_list = []
    for row in cluster_users:
        if row["user_id"] != user_id:
            print(row["user_id"])
            user_list.append(row["user_id"])
    return user_list


users_sim = print_other_users_in_cluster(req_user_id)
print(users_sim)

Other user IDs in cluster 178 for user 148:
3475
3182
4867
[3475, 3182, 4867]


In [64]:
def get_high_rated_movies_for_user(user_id):
    # Filter the ratings DataFrame for the given user_id and high ratings
    high_rated_movies = ratings.filter('user_id == '+str(user_id)).filter("rating == 5").select("movie_id")

    # Collect the movie_id values as a list
    high_rated_movie_ids = high_rated_movies.rdd.map(lambda row: row[0]).collect()

    return high_rated_movie_ids
all_movies = []
for i in users_sim:
  high_rated_movies = get_high_rated_movies_for_user(i)
  all_movies += high_rated_movies
  print("High-rated movies for user", i, ":", high_rated_movies)

High-rated movies for user 3475 : ['2997', '6', '2067', '1265', '590', '593', '902', '908', '1299', '778', '1617', '953', '3424', '213', '1897', '428', '296', '2858', '441', '2871', '608', '1147', '1300', '1302', '1307', '497', '1183', '1186', '1193', '1358', '3100', '2302', '2321', '2324', '2329', '858', '3148', '2352', '1704', '110', '1584', '1735', '3198', '2398', '2542', '1758', '2712', '2571', '307', '1923', '318', '3543', '16', '17', '194', '25', '2762', '356', '2918', '508', '509', '47', '50', '58', '524', '1997', '527', '69', '1060', '1208', '1213', '2020', '2022', '1221', '2028', '1089', '1234', '1092', '1096', '1242', '1245', '1246']
High-rated movies for user 3182 : ['2051', '3793', '1258', '3949', '1263', '2076', '1274', '1276', '3020', '2088', '1288', '3033', '1298', '778', '924', '785', '2403', '2409', '3210', '2410', '2411', '2420', '2427', '1639', '3098', '3421', '1824', '223', '2640', '2641', '260', '1884', '3499', '428', '288', '431', '296', '2858', '441', '1125', '36

In [65]:
req_movie_ids = ratings.filter('user_id == '+str(req_user_id)).select("movie_id").rdd.map(lambda row: row[0]).collect()
all_movies = list(set(all_movies))
recommendations = [x for x in all_movies if x not in req_movie_ids]


In [87]:
# Step 1: Choose the top 30 movies with the highest average rating
top_30_movies = movies.orderBy("avg_rating", ascending = False).limit(30)

# Step 2: Join with orig_movies to get additional information
top_30_movies_with_info = top_30_movies.join(orig_movies, on="movie_id")

# Step 3: Select the relevant columns
recommendations = top_30_movies_with_info.select("movie_id","year", "watches", "Movie_Title", "Genre")

# Show the recommendations
recommendations.show()

+----+--------+-------+--------------------+--------------------+
|year|movie_id|watches|         Movie_Title|               Genre|
+----+--------+-------+--------------------+--------------------+
|1995|      50| 1783.0|Usual Suspects, T...|      Crime|Thriller|
|1994|      53|    8.0|     Lamerica (1994)|               Drama|
|1994|     318| 2227.0|Shawshank Redempt...|               Drama|
|1993|     439|    2.0|Dangerous Game (1...|               Drama|
|1993|     527| 2304.0|Schindler's List ...|           Drama|War|
|1962|     557|    2.0|   Mamma Roma (1962)|               Drama|
|1993|     578|    2.0|Hour of the Pig, ...|       Drama|Mystery|
|1995|     745|  657.0|Close Shave, A (1...|Animation|Comedy|...|
|1995|     787|    3.0|Gate of Heavenly ...|         Documentary|
|1972|     858| 2223.0|Godfather, The (1...|  Action|Crime|Drama|
|1950|     922|  470.0|Sunset Blvd. (a.k...|           Film-Noir|
|1995|     989|    1.0|Schlafes Bruder (...|               Drama|
|1993|    

#Movies from 2 of the most similar users

In [68]:


# Function to compute cosine similarity between two feature vectors
def cosine_similarity(vec1, vec2):
    dot_product = float(vec1.dot(vec2))
    norm_vec1 = float(vec1.norm(2))
    norm_vec2 = float(vec2.norm(2))
    return dot_product / (norm_vec1 * norm_vec2)

# Convert feature vectors to dense vectors
dense_vectors = predictions.rdd.map(lambda row: (row["user_id"], Vectors.dense(row["features"]))).collectAsMap()

# Compute similarity between req_user_id and all other users in the cluster
similarities = {}
req_user_features = dense_vectors[req_user_id]
for user_id, features in dense_vectors.items():
    if user_id != req_user_id:
        similarity = cosine_similarity(req_user_features, features)
        similarities[user_id] = similarity

# Sort the similarities dictionary by similarity values in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top 2 most similar users
top_similar_users = sorted_similarities[:2]

# Print the user IDs of the top 2 most similar users
print(f"Top 2 most similar users to user {req_user_id}:")
for user_id, similarity in top_similar_users:
    print(f"User ID: {user_id}, Similarity: {similarity}")

Top 2 most similar users to user 148:
User ID: 2453, Similarity: 0.9999966619455816
User ID: 1889, Similarity: 0.9999963552627231


In [69]:
all_movies = []
for user_id, similarity in top_similar_users:
  high_rated_movies = get_high_rated_movies_for_user(user_id)
  all_movies += high_rated_movies
  print("High-rated movies for user", user_id, ":", high_rated_movies)
all_movies = list(set(all_movies))
recommendations = [x for x in all_movies if x not in req_movie_ids]

High-rated movies for user 2453 : ['1249', '1250', '2996', '589', '1262', '1263', '1266', '590', '593', '3952', '912', '1459', '1608', '2268', '1610', '3082', '969', '3252', '3256', '3408', '2611', '3274', '204', '215', '230', '246', '2671', '1104', '3809', '457', '3811', '608', '474', '1183', '1198', '3897', '1357', '1358', '1372', '2324', '1385', '858', '866', '3147', '2357', '1704', '1719', '105', '1721', '1722', '110', '111', '1729', '3330', '2391', '2396', '2571', '1784', '318', '175', '1004', '2745', '1945', '1954', '1959', '349', '1960', '1961', '1962', '356', '2912', '2917', '508', '47', '1047', '370', '50', '527', '3753', '1213', '2028', '1226', '1089', '3916', '1233', '1092', '1094', '1096', '3783']
High-rated movies for user 1889 : ['6', '919', '2406', '3259', '223', '2804', '260', '3635', '296', '459', '1147', '493', '1172', '1196', '1197', '3897', '2161', '1517', '2324', '2501', '1704', '2371', '110', '3347', '1912', '2571', '163', '1784', '18', '1960', '2916', '2918', '10

In [70]:
len(recommendations)

64

#Movie Clustering

In [71]:
movies.show()

+--------+----+-------+------------------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+------------------+--------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|        avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popular

In [72]:
# Convert categorical variables to numerical using one-hot encoding
for col in movies.columns:
    if col in ["movie_id", "year"]:
            movies = movies.withColumn(col, movies[col].cast(IntegerType()))
    else:
            movies = movies.withColumn(col, movies[col].cast(FloatType()))
movies.show()

+--------+----+-------+----------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+-----------------+--------------------+--------------------+---------------------+-----------------+----------------+----------------------+----------------+------------------+--------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popularity_per_Documentary|pop

In [89]:

silhouette = 0
seed = 0

# Vectorize the features
feature_columns = movies.columns
feature_columns.remove("movie_id")  # Exclude the user_id column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(movies)

#Loop till silhouette score is reached
while (silhouette < 0.575):
      #Change seed
      seed += 1

      # Initialize KMeans model with 600 clusters
      kmeans = KMeans().setK(100).setSeed(seed)

      # Train the model
      model = kmeans.fit(data)

      # Make predictions
      predictions = model.transform(data)

      # Evaluate clustering by computing Silhouette score
      evaluator = ClusteringEvaluator()
      silhouette = evaluator.evaluate(predictions)

# Shows the result.
centers = model.clusterCenters()

0.5580307669868385
0.5493050133106248
0.5778642371869518
0.5470405071177543
0.5778164019855081
0.5520918173948272
0.5793899299138502
0.5593479964261859
0.5365441812333989
0.5515163419279284
0.5648181814686662
0.5718330468527906
0.5654696966196621
0.5681222621169175
0.5414409313393079
0.5660267545609082
0.53629520670358
0.521873130539177
0.5559056196317238
0.5730593554348274
0.5587481341890065
0.5344324013488653
0.5784175343676476


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [74]:

#Get all the high rated movies of the user
req_highly_rated_movies = ratings.filter((ratings["user_id"] == req_user_id) & (ratings["rating"] == 5))
#Join ratings with predictions to associate movies with clusters
joined_data = req_highly_rated_movies.join(predictions, on="movie_id")

#Count the number of movies rated 5 in each cluster
cluster_counts = joined_data.groupBy("prediction").agg(F.countDistinct("movie_id").alias("count_5_rating"))

#Find the cluster with the most movies rated 5
most_rated_cluster = cluster_counts.orderBy(F.desc("count_5_rating")).first()["prediction"]

# Print the cluster with the most movies rated 5
print("Cluster with the most movies rated 5:", most_rated_cluster)

Cluster with the most movies rated 5: 78


In [80]:
# Filter predictions for movies in the cluster with the most movies rated 5
req_cluster_movies = predictions.filter(predictions["prediction"] == most_rated_cluster)

# Join with orig_movies to get movie information
movies_in_cluster_except_rated = req_cluster_movies.join(orig_movies, on="movie_id")

# Filter out movies that req_user_id has rated
user_rated_movies = ratings.filter(ratings["user_id"] == req_user_id).select("movie_id").distinct()
movies_in_cluster_except_rated = movies_in_cluster_except_rated.join(user_rated_movies, on="movie_id", how="left_anti").select("movie_id", "year", "watches", "Movie_Title", "Genre")

# Show the remaining movies in the cluster
movies_in_cluster_except_rated.show()

+--------+----+-------+--------------------+--------------------+
|movie_id|year|watches|         Movie_Title|               Genre|
+--------+----+-------+--------------------+--------------------+
|      47|1995| 1137.0|Seven (Se7en) (1995)|      Crime|Thriller|
|     923|1941| 1116.0| Citizen Kane (1941)|               Drama|
|    1242|1989| 1112.0|        Glory (1989)|    Action|Drama|War|
|    1288|1984| 1118.0|This Is Spinal Ta...|Comedy|Drama|Musical|
|    1500|1997| 1136.0|Grosse Pointe Bla...|        Comedy|Crime|
|    1673|1997| 1128.0|Boogie Nights (1997)|               Drama|
|    3160|1999| 1113.0|     Magnolia (1999)|               Drama|
|    3253|1992| 1120.0|Wayne's World (1992)|              Comedy|
|    3671|1974| 1119.0|Blazing Saddles (...|      Comedy|Western|
+--------+----+-------+--------------------+--------------------+

