#Start Spark Context and Data Preprocessing

In [53]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz
!tar xf spark-3.4.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install -q pyspark

In [54]:
from pyspark.sql.types import IntegerType, FloatType, DoubleType
from pyspark.sql.functions import rand, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F

In [55]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.appName("YourAppName").getOrCreate()
#Load the cleaned dataset
def load_dfs():
    global movies, users, ratings, orig_movies
    movies = spark.read.csv("/content/drive/MyDrive/movieLens/cleaned_data/pivoted_movies_features.csv", encoding="latin1", header=True).cache()
    movies.createOrReplaceTempView("movies_info")

    users = spark.read.csv("/content/drive/MyDrive/movieLens/cleaned_data/pivoted_users_features.csv", encoding="latin1", header=True).cache()
    users.createOrReplaceTempView("users_info")

    ratings = spark.read.csv("/content/drive/MyDrive/movieLens/ratings.dat", sep="::", encoding="latin1")
    ratings = ratings.toDF("user_id", "movie_id", "rating", "time_stamp").cache()
    ratings.createOrReplaceTempView("ratings_info")

    orig_movies = spark.read.csv("/content/drive/MyDrive/movieLens/movies.dat", sep="::", encoding="latin1")
    orig_movies = orig_movies.toDF("movie_id", "Movie_Title", "Genre").cache()
    orig_movies.createOrReplaceTempView("orig_movies_info")

load_dfs()

In [56]:
#Remove Index column
movies = movies.drop("_c0")
users = users.drop("_c0")

In [57]:
movies.show()

+--------+----+-------+------------------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+------------------+--------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|        avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popular

#Start Clustering using User Features

In [73]:

# Convert categorical variables to numerical using one-hot encoding
for col in users.columns:
    if col in ["user_id", "gender", "region","academic/educator",  "artist", "clerical/ admin",  "college/grad student", "customer service","doctor/health care", "executive/managerial", "farmer",  "homemaker", "K-12 student", "lawyer", "programmer", "retired", "sales/marketing", "scientist", "self-employed","technician/engineer","tradesman/craftsman", "unemployed","writer" ]:
        users = users.withColumn(col, users[col].cast(IntegerType()))
    else:
        users = users.withColumn(col, users[col].cast(FloatType()))

In [74]:

silhouette = 0
cluster = 200
# Vectorize the features
feature_columns = users.columns
feature_columns.remove("user_id")  # Exclude the user_id column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(users)

while (silhouette < 0.23):

      # Initialize KMeans model with 600 clusters
      kmeans = KMeans().setK(cluster).setSeed(1)

      # Train the model
      model = kmeans.fit(data)

      # Make predictions
      predictions = model.transform(data)

      # Evaluate clustering by computing Silhouette score
      evaluator = ClusteringEvaluator()
      silhouette = evaluator.evaluate(predictions)

      print("The silhouette score:" +str(silhouette)+ " has a cluster size of "+str(cluster))
      #Change cluster size
      cluster += 50

# Shows the result.
centers = model.clusterCenters()

The silhouette score:0.23914413203029236 has a cluster size of 200


#Set the user ID we want to make recommendations to

In [75]:
#User we want to recommend to
req_user_id = 148

#Most Similar Users to req_user_id

In [76]:

def print_other_users_in_cluster(user_id):
    # Find the predicted cluster for the given user_id
    user_cluster = predictions.filter(predictions["user_id"] == user_id).select("prediction").collect()[0]["prediction"]

    # Filter the predictions DataFrame to get all users belonging to the same cluster
    cluster_users = predictions.filter(predictions["prediction"] == user_cluster).select("user_id").collect()

    # Print all other user IDs in the cluster except for the given user_id
    print(f"Other user IDs in cluster {user_cluster} for user {user_id}:")

    user_list = []
    for row in cluster_users:
        if row["user_id"] != user_id:
            print(row["user_id"])
            user_list.append(row["user_id"])
    return user_list


users_sim = print_other_users_in_cluster(req_user_id)
print(users_sim)

Other user IDs in cluster 12 for user 148:
2962
731
1889
855
1988
692
3285
877
3182
1671
411
2453
2124
2073
4186
4354
5306
3648
5015
3823
5536
5550
4802
[2962, 731, 1889, 855, 1988, 692, 3285, 877, 3182, 1671, 411, 2453, 2124, 2073, 4186, 4354, 5306, 3648, 5015, 3823, 5536, 5550, 4802]


In [77]:
def get_high_rated_movies_for_user(user_id):
    # Filter the ratings DataFrame for the given user_id and high ratings
    high_rated_movies = ratings.filter('user_id == '+str(user_id)).filter("rating == 5").select("movie_id")

    # Collect the movie_id values as a list
    high_rated_movie_ids = high_rated_movies.rdd.map(lambda row: row[0]).collect()

    return high_rated_movie_ids
all_movies = []
for i in users_sim:
  high_rated_movies = get_high_rated_movies_for_user(i)
  all_movies += high_rated_movies
  print("High-rated movies for user", i, ":", high_rated_movies)

High-rated movies for user 2962 : ['1248', '1264', '750', '904', '1296', '922', '923', '951', '3091', '3435', '3481', '1131', '2872', '3831', '3853', '1179', '3863', '3872', '2300', '858', '3160', '3535', '3548', '1949', '29', '2940', '1204', '1221', '1090']
High-rated movies for user 731 : ['1248', '1249', '719', '2052', '1256', '2997', '589', '1', '2', '2205', '3947', '2208', '2065', '1265', '1269', '1270', '2078', '1276', '1278', '3028', '2080', '1282', '1284', '2087', '1285', '905', '908', '909', '3046', '910', '911', '915', '3052', '2252', '927', '928', '784', '3060', '2405', '2406', '930', '2266', '931', '933', '936', '3072', '2417', '3076', '946', '947', '2424', '1625', '3088', '951', '952', '2289', '955', '3239', '2291', '3097', '1639', '3099', '2297', '969', '1648', '1654', '2609', '2463', '2469', '1673', '2624', '1682', '1689', '224', '3299', '236', '3451', '3462', '3466', '252', '1875', '262', '3622', '3624', '3481', '3489', '2692', '2858', '2863', '1126', '3814', '3671', '2

In [78]:
req_movie_ids = ratings.filter('user_id == '+str(req_user_id)).select("movie_id").rdd.map(lambda row: row[0]).collect()
all_movies = list(set(all_movies))
recommendations = [x for x in all_movies if x not in req_movie_ids]


#Movies from 2 of the most similar users

In [79]:


# Function to compute cosine similarity between two feature vectors
def cosine_similarity(vec1, vec2):
    dot_product = float(vec1.dot(vec2))
    norm_vec1 = float(vec1.norm(2))
    norm_vec2 = float(vec2.norm(2))
    return dot_product / (norm_vec1 * norm_vec2)

# Convert feature vectors to dense vectors
dense_vectors = predictions.rdd.map(lambda row: (row["user_id"], Vectors.dense(row["features"]))).collectAsMap()

# Compute similarity between req_user_id and all other users in the cluster
similarities = {}
req_user_features = dense_vectors[req_user_id]
for user_id, features in dense_vectors.items():
    if user_id != req_user_id:
        similarity = cosine_similarity(req_user_features, features)
        similarities[user_id] = similarity

# Sort the similarities dictionary by similarity values in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top 2 most similar users
top_similar_users = sorted_similarities[:2]

# Print the user IDs of the top 2 most similar users
print(f"Top 2 most similar users to user {req_user_id}:")
for user_id, similarity in top_similar_users:
    print(f"User ID: {user_id}, Similarity: {similarity}")

Top 2 most similar users to user 148:
User ID: 2453, Similarity: 0.9999966619455816
User ID: 1889, Similarity: 0.9999963552627231


In [80]:
all_movies = []
for user_id, similarity in top_similar_users:
  high_rated_movies = get_high_rated_movies_for_user(user_id)
  all_movies += high_rated_movies
  #print("High-rated movies for user", user_id, ":", high_rated_movies)
all_movies = list(set(all_movies))
recommendations = [x for x in all_movies if x not in req_movie_ids]
# Step 1: Choose the top 30 movies with the highest average rating
top_30_movies = movies.orderBy("watches", ascending = False).limit(30)

# Step 2: Join with orig_movies to get additional information
top_30_movies_with_info = top_30_movies.join(orig_movies, on="movie_id")

# Step 3: Select the relevant columns
recommendations = top_30_movies_with_info.select("Movie_Title", "watches", "Genre")

# Show the recommendations
recommendations.show()

+--------------------+-------+--------------------+
|         Movie_Title|watches|               Genre|
+--------------------+-------+--------------------+
|    Toy Story (1995)| 2077.0|Animation|Childre...|
|   Braveheart (1995)| 2443.0|    Action|Drama|War|
|Star Wars: Episod...| 2991.0|Action|Adventure|...|
| Pulp Fiction (1994)| 2171.0|         Crime|Drama|
|Shawshank Redempt...| 2227.0|               Drama|
| Forrest Gump (1994)| 2194.0|  Comedy|Romance|War|
|Jurassic Park (1993)| 2672.0|Action|Adventure|...|
|Schindler's List ...| 2304.0|           Drama|War|
|Terminator 2: Jud...| 2649.0|Action|Sci-Fi|Thr...|
|Silence of the La...| 2578.0|      Drama|Thriller|
|        Fargo (1996)| 2513.0|Crime|Drama|Thriller|
|Godfather, The (1...| 2223.0|  Action|Crime|Drama|
|E.T. the Extra-Te...| 2269.0|Children's|Drama|...|
|Star Wars: Episod...| 2990.0|Action|Adventure|...|
|Princess Bride, T...| 2318.0|Action|Adventure|...|
|Raiders of the Lo...| 2514.0|    Action|Adventure|
|Star Wars: 

In [66]:
recommendations.count()

30

#Movie Clustering

In [67]:
movies.show()

+--------+----+-------+------------------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+------------------+--------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|        avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popular

In [81]:
# Convert categorical variables to numerical using one-hot encoding
for col in movies.columns:
    if col in ["movie_id", "year"]:
            movies = movies.withColumn(col, movies[col].cast(IntegerType()))
    else:
            movies = movies.withColumn(col, movies[col].cast(FloatType()))
movies.show()

+--------+----+-------+----------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+-----------------+--------------------+--------------------+---------------------+-----------------+----------------+----------------------+----------------+------------------+--------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popularity_per_Documentary|pop

In [82]:

silhouette = 1
cluster = 50

# Vectorize the features
feature_columns = movies.columns
feature_columns.remove("movie_id")  # Exclude the user_id column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(movies)

#Loop till silhouette score is reached
while (silhouette > 0.56):

      # Initialize KMeans model with 600 clusters
      kmeans = KMeans().setK(cluster).setSeed(1)

      # Train the model
      model = kmeans.fit(data)

      # Make predictions
      predictions = model.transform(data)

      # Evaluate clustering by computing Silhouette score
      evaluator = ClusteringEvaluator()
      silhouette = evaluator.evaluate(predictions)

      print("The silhouette score:" +str(silhouette)+ " has a cluster size of "+str(cluster))
      #Change cluster size
      cluster += 10

# Shows the result.
centers = model.clusterCenters()

The silhouette score:0.6052103280918592 has a cluster size of 50
The silhouette score:0.6027924591139651 has a cluster size of 60
The silhouette score:0.5812454162971773 has a cluster size of 70
The silhouette score:0.5778376496690538 has a cluster size of 80
The silhouette score:0.5965794395027118 has a cluster size of 90
The silhouette score:0.5580307669868385 has a cluster size of 100


In [83]:

#Get all the high rated movies of the user
req_highly_rated_movies = ratings.filter((ratings["user_id"] == req_user_id) & (ratings["rating"] == 5))
#Join ratings with predictions to associate movies with clusters
joined_data = req_highly_rated_movies.join(predictions, on="movie_id")

#Count the number of movies rated 5 in each cluster
cluster_counts = joined_data.groupBy("prediction").agg(F.countDistinct("movie_id").alias("count_5_rating"))

#Find the cluster with the most movies rated 5
most_rated_cluster = cluster_counts.orderBy(F.desc("count_5_rating")).first()["prediction"]

# Print the cluster with the most movies rated 5
print("Cluster with the most movies rated 5:", most_rated_cluster)

Cluster with the most movies rated 5: 78


In [84]:
# Filter predictions for movies in the cluster with the most movies rated 5
req_cluster_movies = predictions.filter(predictions["prediction"] == most_rated_cluster)

# Join with orig_movies to get movie information
movies_in_cluster_except_rated = req_cluster_movies.join(orig_movies, on="movie_id")

# Filter out movies that req_user_id has rated
user_rated_movies = ratings.filter(ratings["user_id"] == req_user_id).select("movie_id").distinct()
movies_in_cluster_except_rated = movies_in_cluster_except_rated.join(user_rated_movies, on="movie_id", how="left_anti").select( "Movie_Title","watches", "Genre")

# Show the remaining movies in the cluster
movies_in_cluster_except_rated.show()

+--------------------+-------+--------------------+
|         Movie_Title|watches|               Genre|
+--------------------+-------+--------------------+
|Seven (Se7en) (1995)| 1137.0|      Crime|Thriller|
| Citizen Kane (1941)| 1116.0|               Drama|
|        Glory (1989)| 1112.0|    Action|Drama|War|
|This Is Spinal Ta...| 1118.0|Comedy|Drama|Musical|
|Grosse Pointe Bla...| 1136.0|        Comedy|Crime|
|Boogie Nights (1997)| 1128.0|               Drama|
|     Magnolia (1999)| 1113.0|               Drama|
|Wayne's World (1992)| 1120.0|              Comedy|
|Blazing Saddles (...| 1119.0|      Comedy|Western|
+--------------------+-------+--------------------+

