# Start Spark Context and Data Preprocessing

In [None]:
# Environment bootstrap: refresh the apt package index and install a headless Java 8 JDK.
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download and unpack a standalone Spark 3.4.2 (Hadoop 3 build) distribution.
# NOTE(review): nothing visible in this notebook references the unpacked directory or
# findspark; the pip-installed pyspark below is what the import cell uses — confirm
# whether the tarball and findspark are still needed.
!wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz
!tar xf spark-3.4.2-bin-hadoop3.tgz
# Python-side packages (versions are not pinned).
!pip install -q findspark
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql.types import IntegerType, FloatType, DoubleType
from pyspark.sql.functions import rand, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F

In [None]:
# Point both PySpark interpreter settings at the Python executable running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# getOrCreate() reuses an already-active session when there is one.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)
#Load the cleaned dataset
def load_dfs(data_dir="/content/drive/MyDrive/movieLens"):
    """Load the MovieLens inputs into module-level DataFrames and register temp views.

    Populates the globals ``movies``, ``users``, ``ratings`` and ``orig_movies``
    (each cached) and registers them as the SQL temp views ``movies_info``,
    ``users_info``, ``ratings_info`` and ``orig_movies_info``.

    Args:
        data_dir: Directory containing ``ratings.dat``, ``movies.dat`` and the
            ``cleaned_data/`` feature CSVs. Defaults to the location this
            notebook originally hard-coded, so ``load_dfs()`` behaves as before.

    Note:
        No schema is passed to the readers, so the loaded columns are strings
        (the movie IDs printed later in the notebook are quoted strings);
        numeric casts happen in later cells.
    """
    global movies, users, ratings, orig_movies

    def _read_feature_csv(filename):
        # Pre-computed feature tables: comma-separated with a header row.
        path = f"{data_dir}/cleaned_data/{filename}"
        return spark.read.csv(path, encoding="latin1", header=True).cache()

    def _read_dat(filename, column_names):
        # Raw MovieLens files: "::"-separated, no header, so columns are named here.
        path = f"{data_dir}/{filename}"
        return spark.read.csv(path, sep="::", encoding="latin1").toDF(*column_names).cache()

    movies = _read_feature_csv("pivoted_movies_features.csv")
    movies.createOrReplaceTempView("movies_info")

    users = _read_feature_csv("pivoted_users_features.csv")
    users.createOrReplaceTempView("users_info")

    ratings = _read_dat("ratings.dat", ("user_id", "movie_id", "rating", "time_stamp"))
    ratings.createOrReplaceTempView("ratings_info")

    orig_movies = _read_dat("movies.dat", ("movie_id", "Movie_Title", "Genre"))
    orig_movies.createOrReplaceTempView("orig_movies_info")


load_dfs()

In [None]:
# Remove the "_c0" column from both feature tables.
# NOTE(review): presumably "_c0" is an unnamed index column written when the CSVs
# were exported — confirm against the cleaning step that produced them.
movies = movies.drop("_c0")
users = users.drop("_c0")

In [None]:
# Preview the movie feature table.
movies.show()

+--------+----+-------+------------------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+------------------+--------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|        avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popular

# Start Clustering Using User Features

In [None]:

# Cast the user feature columns to numeric types. The columns listed below are
# cast to integers; every other column is cast to float.
# (This is a dtype cast only — no one-hot encoding is performed here.)
USER_INTEGER_COLUMNS = {
    "user_id", "gender", "region", "academic/educator", "artist", "clerical/ admin",
    "college/grad student", "customer service", "doctor/health care",
    "executive/managerial", "farmer", "homemaker", "K-12 student", "lawyer",
    "programmer", "retired", "sales/marketing", "scientist", "self-employed",
    "technician/engineer", "tradesman/craftsman", "unemployed", "writer",
}

# A single select() keeps the column order and avoids stacking one projection per
# column. The comprehension variable is deliberately not called `col`, so the
# `col` function imported from pyspark.sql.functions is not overwritten.
users = users.select([
    users[name]
    .cast(IntegerType() if name in USER_INTEGER_COLUMNS else FloatType())
    .alias(name)
    for name in users.columns
])

In [None]:

# Search settings for the user clustering.
USER_SILHOUETTE_TARGET = 0.185  # stop at the first k whose silhouette reaches this
USER_CLUSTER_STEP = 50          # how much k grows between attempts
USER_MAX_CLUSTERS = 2500        # upper bound so the search always terminates

silhouette = 0
cluster = 500  # first k to try

# Vectorize the features
feature_columns = users.columns
feature_columns.remove("user_id")  # Exclude the user_id column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(users)

# The evaluator does not depend on k, so build it once.
evaluator = ClusteringEvaluator()
best_run = None  # (silhouette, k, model, predictions) of the best attempt so far

while silhouette < USER_SILHOUETTE_TARGET and cluster <= USER_MAX_CLUSTERS:
    # Fit KMeans with the current number of clusters (fixed seed for repeatability)
    kmeans = KMeans().setK(cluster).setSeed(1)
    model = kmeans.fit(data)

    # Assign every user to a cluster
    predictions = model.transform(data)

    # Evaluate clustering by computing Silhouette score
    silhouette = evaluator.evaluate(predictions)

    # Report the k that was actually fitted, before moving on to the next one.
    print(f"k={cluster}: silhouette={silhouette}")

    if best_run is None or silhouette > best_run[0]:
        best_run = (silhouette, cluster, model, predictions)

    cluster += USER_CLUSTER_STEP

if silhouette < USER_SILHOUETTE_TARGET:
    # Target never reached: fall back to the best attempt instead of the last one.
    silhouette, best_k, model, predictions = best_run
    print(f"Target silhouette {USER_SILHOUETTE_TARGET} not reached; "
          f"using best run k={best_k} (silhouette={silhouette})")

# Cluster centers of the selected model.
centers = model.clusterCenters()

0.18814031990732877
550
0.18356274786399263
600
0.18035840981069617
650
0.1749976854324765
700
0.17435568514783414
750
0.17011348778244245
800
0.17119754179364438
850
0.16510175083222026
900
0.1636112824483007
950
0.15183096480012453
1000
0.15152003587223878
1050
0.1580156906779924
1100
0.14681886167850464
1150
0.14907871570965597
1200
0.1401155562563767
1250
0.13974322546473347
1300
0.13969458569696455
1350
0.1378766917073383
1400
0.13596598159615783
1450
0.136254452313531
1500
0.13081719317581658
1550
0.1271333318687027
1600
0.13319065091593438
1650
0.12807905058181598
1700
0.12818801615355094
1750
0.12809500624188505
1800
0.12131581799565457
1850
0.1228136853803991
1900
0.12055126667824317
1950
0.12441217825415313
2000
0.11567248881383743
2050
0.11707076446539322
2100
0.11055488774406748
2150
0.1098276759837881
2200
0.10894991862815843
2250
0.1135189091607504
2300
0.1091829923389721
2350
0.10597133825708113
2400
0.10848063253762387
2450
0.11144113064634127
2500
0.10878105740021306
2

# Set the user ID we want to make recommendations to

In [None]:
# ID of the user to generate recommendations for; later cells read `req_user_id`.
req_user_id = 3475

# Most Similar Users to req_user_id

In [None]:

def print_other_users_in_cluster(user_id):
    """Print and return the IDs of the other users in ``user_id``'s cluster.

    Reads the module-level ``predictions`` DataFrame, which must contain the
    ``user_id`` and ``prediction`` columns.

    Args:
        user_id: ID of the user whose cluster is inspected.

    Returns:
        list: IDs of all users assigned to the same cluster, excluding ``user_id``.

    Raises:
        IndexError: If ``user_id`` does not appear in ``predictions``. The
            exception type matches what the previous ``collect()[0]`` lookup
            raised, but now carries an explanatory message.
    """
    # first() fetches a single row instead of collecting every match to the driver.
    user_row = (
        predictions
        .filter(predictions["user_id"] == user_id)
        .select("prediction")
        .first()
    )
    if user_row is None:
        raise IndexError(f"user_id {user_id!r} was not found in predictions")
    user_cluster = user_row["prediction"]

    # Every member of that cluster (still including the requested user at this point).
    cluster_users = (
        predictions
        .filter(predictions["prediction"] == user_cluster)
        .select("user_id")
        .collect()
    )

    print(f"Other user IDs in cluster {user_cluster} for user {user_id}:")

    user_list = [row["user_id"] for row in cluster_users if row["user_id"] != user_id]
    for other_user_id in user_list:
        print(other_user_id)
    return user_list


users_sim = print_other_users_in_cluster(req_user_id)
print(users_sim)

Other user IDs in cluster 178 for user 3475:
148
3182
4867
[148, 3182, 4867]


In [None]:
def get_high_rated_movies_for_user(user_id, rating=5):
    """Return the IDs of the movies a user gave a specific rating.

    Reads the module-level ``ratings`` DataFrame.

    Args:
        user_id: ID of the user whose ratings are inspected.
        rating: Rating value to match exactly. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        list: ``movie_id`` values exactly as stored in ``ratings``.
    """
    # Column expressions replace the concatenated SQL string, matching how the
    # rest of the notebook filters `ratings`.
    high_rated_movies = (
        ratings
        .filter((ratings["user_id"] == user_id) & (ratings["rating"] == rating))
        .select("movie_id")
    )

    # Collect rows directly; no Python RDD map is needed to unwrap one column.
    return [row["movie_id"] for row in high_rated_movies.collect()]
# Accumulate the top-rated movies of every user in `users_sim`, printing each
# user's own list along the way.
all_movies = []
for i in users_sim:
    high_rated_movies = get_high_rated_movies_for_user(i)
    print("High-rated movies for user", i, ":", high_rated_movies)
    all_movies.extend(high_rated_movies)

High-rated movies for user 148 : ['2987', '1320', '3791', '1253', '3868', '2059', '587', '589', '1', '2', '3871', '1408', '733', '1197', '736', '1198', '592', '597', '2072', '1270', '671', '673', '3028', '2086', '1287', '3107', '3035', '3109', '1291', '2094', '832', '2097', '908', '836', '1299', '3111', '3114', '910', '2174', '1381', '780', '3062', '2406', '1608', '2193', '2194', '2268', '861', '2416', '1616', '2424', '3155', '1703', '2366', '965', '899', '3175', '971', '3255', '3256', '110', '1727', '1586', '3408', '2399', '1597', '2470', '2476', '1687', '3430', '3360', '3507', '1833', '150', '2496', '158', '2571', '2648', '2720', '3524', '3386', '1784', '316', '318', '2804', '2664', '3396', '3397', '1876', '260', '11', '15', '3624', '2752', '3555', '1019', '1957', '349', '1020', '2762', '356', '34', '1101', '500', '1036', '364', '368', '1042', '1049', '379', '3591', '2791', '3668', '1127', '527', '454', '457', '2942', '2874', '605', '539', '3751', '2012', '1210', '3753', '2953', '202

In [None]:
# Movies the target user has already rated (any rating). Kept as a list because
# later cells read `req_movie_ids`.
req_movie_ids = [
    row["movie_id"]
    for row in ratings.filter(ratings["user_id"] == req_user_id).select("movie_id").collect()
]

# De-duplicate the candidates gathered from the similar users.
all_movies = list(set(all_movies))

# Set copy gives constant-time membership checks instead of scanning the list
# once per candidate.
_rated_by_req_user = set(req_movie_ids)
recommendations = [x for x in all_movies if x not in _rated_by_req_user]


# Movies from the 2 most similar users

In [None]:


# Function to compute cosine similarity between two feature vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: Object exposing ``dot(other)`` and ``norm(p)`` (the methods this
            function calls), e.g. a vector built with ``Vectors.dense``.
        vec2: Same kind of object as ``vec1``.

    Returns:
        float: ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when
        either vector has zero length. The direction of a zero vector is
        undefined, and the unguarded division raised ``ZeroDivisionError``.
    """
    norm_product = float(vec1.norm(2)) * float(vec2.norm(2))
    if norm_product == 0.0:
        return 0.0
    return float(vec1.dot(vec2)) / norm_product

# Bring every row's feature vector to the driver as a {user_id: dense vector} map.
dense_vectors = predictions.rdd.map(lambda row: (row["user_id"], Vectors.dense(row["features"]))).collectAsMap()

# Score every other user present in `predictions` against the target user.
# Note: this scans all users, not only the members of the target user's cluster.
req_user_features = dense_vectors[req_user_id]
similarities = {
    user_id: cosine_similarity(req_user_features, features)
    for user_id, features in dense_vectors.items()
    if user_id != req_user_id
}

# Rank from most to least similar and keep the first two entries.
sorted_similarities = sorted(
    similarities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
top_similar_users = sorted_similarities[:2]

# Report the selected users together with their similarity scores.
print(f"Top 2 most similar users to user {req_user_id}:")
for user_id, similarity in top_similar_users:
    print(f"User ID: {user_id}, Similarity: {similarity}")

Top 2 most similar users to user 3475:
User ID: 1780, Similarity: 0.9999980356450433
User ID: 5888, Similarity: 0.9999969473524409


In [None]:
# Gather the top-rated movies of the most similar users, de-duplicated.
all_movies = []
for user_id, similarity in top_similar_users:
    all_movies += get_high_rated_movies_for_user(user_id)
all_movies = list(set(all_movies))

# Candidates = those movies minus everything the target user has already rated.
# Previously this list was computed and then overwritten by a global top-30,
# so the similar users never influenced the result.
_rated_by_req_user = set(req_movie_ids)
candidate_movie_ids = [int(x) for x in all_movies if x not in _rated_by_req_user]

# `watches` may still be a string column at this point (the numeric cast of
# `movies` happens in a later cell), so cast explicitly to sort numerically
# rather than lexicographically.
watches_desc = F.col("watches").cast("double").desc()

# Step 1: Among the candidates, choose the 30 most-watched movies
top_30_movies = (
    movies
    .filter(F.col("movie_id").cast("int").isin(candidate_movie_ids))
    .orderBy(watches_desc)
    .limit(30)
)

# Step 2: Join with orig_movies to get additional information
top_30_movies_with_info = top_30_movies.join(orig_movies, on="movie_id")

# Step 3: Select the relevant columns; re-apply the ordering because a join
# does not guarantee row order.
recommendations = (
    top_30_movies_with_info
    .select("movie_id", "year", "watches", "Movie_Title", "Genre")
    .orderBy(watches_desc)
)

# Show the recommendations
recommendations.show()

+--------+----+-------+--------------------+--------------------+
|movie_id|year|watches|         Movie_Title|               Genre|
+--------+----+-------+--------------------+--------------------+
|       1|1995| 2077.0|    Toy Story (1995)|Animation|Childre...|
|     110|1995| 2443.0|   Braveheart (1995)|    Action|Drama|War|
|     260|1977| 2991.0|Star Wars: Episod...|Action|Adventure|...|
|     296|1994| 2171.0| Pulp Fiction (1994)|         Crime|Drama|
|     318|1994| 2227.0|Shawshank Redempt...|               Drama|
|     356|1994| 2194.0| Forrest Gump (1994)|  Comedy|Romance|War|
|     480|1993| 2672.0|Jurassic Park (1993)|Action|Adventure|...|
|     527|1993| 2304.0|Schindler's List ...|           Drama|War|
|     589|1991| 2649.0|Terminator 2: Jud...|Action|Sci-Fi|Thr...|
|     593|1991| 2578.0|Silence of the La...|      Drama|Thriller|
|     608|1996| 2513.0|        Fargo (1996)|Crime|Drama|Thriller|
|     858|1972| 2223.0|Godfather, The (1...|  Action|Crime|Drama|
|    1097|

In [None]:
# Number of rows in the `recommendations` DataFrame.
recommendations.count()

30

# Movie Clustering

In [None]:
# Preview the movie feature table used as input for the movie clustering.
movies.show()

+--------+----+-------+------------------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+------------------+--------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|        avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popular

In [None]:
# Cast the movie feature columns to numeric types: `movie_id` and `year` become
# integers, every other column becomes a float.
# (This is a dtype cast only — no one-hot encoding is performed here.)
MOVIE_INTEGER_COLUMNS = {"movie_id", "year"}

# A single select() keeps the column order and avoids stacking one projection per
# column. The comprehension variable is deliberately not called `col`, so the
# `col` function imported from pyspark.sql.functions is not overwritten.
movies = movies.select([
    movies[name]
    .cast(IntegerType() if name in MOVIE_INTEGER_COLUMNS else FloatType())
    .alias(name)
    for name in movies.columns
])
movies.show()

+--------+----+-------+----------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+-----------------+--------------------+--------------------+---------------------+-----------------+----------------+----------------------+----------------+------------------+--------------------+-----------------+------------------+------------------+------------------+-----------------+-------------------+--------------+------------------+
|movie_id|year|watches|avg_rating|popularity_per_Action|popularity_per_Adventure|popularity_per_Animation|popularity_per_Children's|popularity_per_Comedy|popularity_per_Crime|popularity_per_Documentary|pop

In [None]:

# Search settings for the movie clustering.
MOVIE_SILHOUETTE_TARGET = 0.575  # stop at the first seed whose silhouette reaches this
MOVIE_NUM_CLUSTERS = 100         # k is fixed; only the seed varies
MOVIE_MAX_SEEDS = 50             # upper bound so the search always terminates

silhouette = 0
seed = 0

# Vectorize the features
feature_columns = movies.columns
feature_columns.remove("movie_id")  # Exclude the movie_id column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(movies)

# The evaluator does not depend on the seed, so build it once.
evaluator = ClusteringEvaluator()
best_run = None  # (silhouette, seed, model, predictions) of the best attempt so far

# Try successive seeds until the target silhouette is reached or the seed budget is spent.
while silhouette < MOVIE_SILHOUETTE_TARGET and seed < MOVIE_MAX_SEEDS:
    # Change seed
    seed += 1

    # Fit KMeans with the fixed number of clusters and the current seed
    kmeans = KMeans().setK(MOVIE_NUM_CLUSTERS).setSeed(seed)
    model = kmeans.fit(data)

    # Assign every movie to a cluster
    predictions = model.transform(data)

    # Evaluate clustering by computing Silhouette score
    silhouette = evaluator.evaluate(predictions)

    if best_run is None or silhouette > best_run[0]:
        best_run = (silhouette, seed, model, predictions)

if silhouette < MOVIE_SILHOUETTE_TARGET:
    # Target never reached: fall back to the best attempt instead of the last one.
    silhouette, seed, model, predictions = best_run
    print(f"Target silhouette {MOVIE_SILHOUETTE_TARGET} not reached after "
          f"{MOVIE_MAX_SEEDS} seeds; using seed={seed} (silhouette={silhouette})")

# Cluster centers of the selected model.
centers = model.clusterCenters()

In [None]:

# 5-star ratings given by the target user.
req_highly_rated_movies = ratings.filter((ratings["user_id"] == req_user_id) & (ratings["rating"] == 5))

# Attach each of those movies' cluster assignment.
# NOTE(review): assumes `predictions` currently holds the movie clustering output
# (it must have a `movie_id` column) — run the movie clustering cell first.
joined_data = req_highly_rated_movies.join(predictions, on="movie_id")

# Count the number of distinct 5-star movies in each cluster
cluster_counts = joined_data.groupBy("prediction").agg(F.countDistinct("movie_id").alias("count_5_rating"))

# Pick the cluster with the most 5-star movies; ties go to the lowest cluster id
# so that reruns select the same cluster.
top_cluster_row = cluster_counts.orderBy(F.desc("count_5_rating"), F.asc("prediction")).first()
if top_cluster_row is None:
    # first() returns None on an empty result; fail with a clear message instead
    # of a TypeError from subscripting None.
    raise ValueError(f"User {req_user_id} has no 5-star ratings matching a clustered movie")
most_rated_cluster = top_cluster_row["prediction"]

# Print the cluster with the most movies rated 5
print("Cluster with the most movies rated 5:", most_rated_cluster)

Cluster with the most movies rated 5: 20


In [None]:
# Movies the target user has already rated, whatever the rating.
user_rated_movies = (
    ratings
    .filter(ratings["user_id"] == req_user_id)
    .select("movie_id")
    .distinct()
)

# Every movie assigned to the cluster holding most of the user's 5-star movies.
req_cluster_movies = predictions.filter(predictions["prediction"] == most_rated_cluster)

# Add title/genre from orig_movies, remove anything the user already rated
# (anti-join), and keep only the columns worth displaying.
movies_in_cluster_except_rated = (
    req_cluster_movies
    .join(orig_movies, on="movie_id")
    .join(user_rated_movies, on="movie_id", how="left_anti")
    .select("movie_id", "year", "watches", "Movie_Title", "Genre")
)

# Display what is left of the cluster.
movies_in_cluster_except_rated.show()

+--------+----+-------+--------------------+--------------------+
|movie_id|year|watches|         Movie_Title|               Genre|
+--------+----+-------+--------------------+--------------------+
|     253|1994|  738.0|Interview with th...|        Drama|Horror|
|     292|1995|  733.0|     Outbreak (1995)|Action|Drama|Thri...|
|    1077|1973|  744.0|      Sleeper (1973)|       Comedy|Sci-Fi|
|    1275|1986|  741.0|   Highlander (1986)|    Action|Adventure|
|    1513|1997|  755.0|Romy and Michele'...|              Comedy|
|    1965|1984|  751.0|     Repo Man (1984)|       Comedy|Sci-Fi|
|    2094|1991|  736.0|Rocketeer, The (1...|Action|Adventure|...|
|    2288|1982|  741.0|   Thing, The (1982)|Action|Horror|Sci...|
|    2826|1999|  750.0|13th Warrior, The...|Action|Horror|Thr...|
|    2872|1981|  742.0|    Excalibur (1981)|Action|Drama|Fant...|
|    3698|1987|  730.0|Running Man, The ...|Action|Adventure|...|
+--------+----+-------+--------------------+--------------------+

