In [133]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder

In [134]:
# Task 1: read both CSV files and show their first rows

print("Task 1: Loading and Previewing Data\n")

# Read the two lab files into DataFrames (movies first, then ratings)
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ("lab8files/movies.csv", "lab8files/ratings.csv")
)

# First rows of the movies table; the extra "\n" argument leaves a blank
# line before the next heading
movies_preview = movies.head()
print("Movies Dataset Preview:")
print(movies_preview, "\n")

# First rows of the ratings table
ratings_preview = ratings.head()
print("Ratings Dataset Preview:")
print(ratings_preview)


Task 1: Loading and Previewing Data

Movies Dataset Preview:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy   

Ratings Dataset Preview:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [135]:
#Task 2: Preprocess the 'movies.csv' dataset

print("\nTask 2: Preprocessing 'movies.csv'\n")

# Handle Missing Values (Check for NaNs)
print("Checking for missing values")
print(movies.isna().sum())

# Replace any missing genres/titles with placeholder strings
movies['genres'] = movies['genres'].fillna("Unknown")
movies['title'] = movies['title'].fillna("Untitled")

# One-hot encode the pipe-separated 'genres' column: one 0/1 column per genre.
# The encoding is built from `movies` (the frame loaded in Task 1). `movies_df`
# is created here for the first time, so it must not be read before this point;
# doing so raises NameError on a fresh kernel.
genres_matrix = movies['genres'].str.get_dummies('|')
movies_df = pd.concat([movies, genres_matrix], axis=1)

print("\nMovies DataFrame after one-hot encoding genres (preview):")
print(movies_df.head())
print("\nShape of movies_df after processing:", movies_df.shape)


Task 2: Preprocessing 'movies.csv'

Checking for missing values
movieId    0
title      0
genres     0
dtype: int64

Movies DataFrame after one-hot encoding genres (preview):
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  (no genres listed)  Action  \
0  Adventure|Animation|Children|Comedy|Fantasy                   0       0   
1                   Adventure|Children|Fantasy                   0       0   
2                               Comedy|Romance                   0       0   
3                         Comedy|Drama|Romance                   0       0   
4                                       Comedy                   0       0   

   Adventure  Animation  Children  Comedy  Crime  ... 

In [136]:
#Task 3: Create a user-item interaction matrix from 'ratings.csv'

print("\nTask 3: Creating User-Item Interaction Matrix\n")

# Drop the 'timestamp' column.
# The ratings frame loaded in Task 1 is named `ratings`; there is no
# `ratings_df` in this notebook.
ratings_df_clean = ratings.drop('timestamp', axis=1)

# Pivot to a users x movies table. Cells are NaN where a user has not rated
# a movie. `pivot` raises ValueError if a (userId, movieId) pair is repeated.
user_item_unfilled = ratings_df_clean.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

# Count each user's ratings BEFORE filling the gaps: once NaNs are replaced by
# 0, `count(axis=1)` would report the number of columns for every user.
ratings_per_user = user_item_unfilled.count(axis=1)

# Utility matrix used downstream: missing ratings are represented as 0
user_item_matrix = user_item_unfilled.fillna(0)

print("User-Item Interaction Matrix (Utility Matrix) Preview:")
print(user_item_matrix.head())
print("\nShape of User-Item Interaction Matrix (Users x Movies):", user_item_matrix.shape)
print("\nSummary of the Utility Matrix:")
print(ratings_per_user.describe())


Task 3: Creating User-Item Interaction Matrix

User-Item Interaction Matrix (Utility Matrix) Preview:
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.

In [137]:
#Task 4 Content-Based Filtering Code

print("\nTask 4: Content-Based Model (Genres + TF-IDF)")

# Clean genres text: turn "A|B|C" into "A B C" so each genre is a word
movies['genres_clean'] = movies['genres'].fillna("").str.replace("|", " ", regex=False)

# TF-IDF matrix (one row per movie)
# NOTE(review): TfidfVectorizer's default token pattern splits on punctuation,
# so hyphenated genres such as "Sci-Fi" presumably become two tokens each --
# confirm whether that is intended.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['genres_clean'])

# Cosine similarity (movie x movie)
content_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print()
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Content-based similarity matrix shape:", content_sim.shape)

# Mapping titles to index.
# Keep only the first row for each title. `Series.drop_duplicates()` would
# compare the VALUES (the row indices, which are already unique) and therefore
# leave repeated titles in place, making `title_to_idx[title]` return a Series
# instead of a single position for those titles.
title_to_idx = pd.Series(movies.index, index=movies['title'])
title_to_idx = title_to_idx[~title_to_idx.index.duplicated(keep='first')]

def get_content_recommendations(title, n_recs=10):
    """Return the movies most similar to `title` by genre TF-IDF cosine similarity.

    Relies on the module-level `title_to_idx`, `content_sim` and `movies`.

    Parameters
    ----------
    title : str
        Exact movie title to use as the seed.
    n_recs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies` (columns movieId, title, genres) ordered from most to
        least similar; the seed movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not a label in `title_to_idx`.
    """
    idx = title_to_idx[title]
    if isinstance(idx, pd.Series):
        # The title occurs more than once in the lookup: use its first row.
        idx = idx.iloc[0]

    scores = np.asarray(content_sim[idx])
    # Stable descending sort: tied scores keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the seed by index rather than by position: with tied scores
    # (e.g. identical genre strings) the seed is not guaranteed to sort first.
    ranked = ranked[ranked != idx][:n_recs]
    return movies[['movieId','title','genres']].iloc[ranked]

# Sample run: five content-based neighbours of Toy Story
print("\nContent-based recommendations for Toy Story:")
toy_story_content_recs = get_content_recommendations("Toy Story (1995)", 5)
print(toy_story_content_recs)



Task 4: Content-Based Model (Genres + TF-IDF)

TF-IDF matrix shape: (9742, 24)
Content-based similarity matrix shape: (9742, 9742)

Content-based recommendations for Toy Story:
      movieId                                           title  \
1706     2294                                     Antz (1998)   
2355     3114                              Toy Story 2 (1999)   
2809     3754  Adventures of Rocky and Bullwinkle, The (2000)   
3000     4016                Emperor's New Groove, The (2000)   
3568     4886                           Monsters, Inc. (2001)   

                                           genres  
1706  Adventure|Animation|Children|Comedy|Fantasy  
2355  Adventure|Animation|Children|Comedy|Fantasy  
2809  Adventure|Animation|Children|Comedy|Fantasy  
3000  Adventure|Animation|Children|Comedy|Fantasy  
3568  Adventure|Animation|Children|Comedy|Fantasy  


In [139]:
# Task 5: collaborative filtering from item-item cosine similarity

print("\nTask 5: Collaborative Filtering (Item-Based KNN)")

# Put the rating columns in the same order as the rows of `movies`;
# movieIds that are not columns of the matrix are introduced as NaN and
# then replaced by 0.
user_item_matrix_aligned = user_item_matrix.reindex(movies['movieId'], axis='columns')
user_item_filled = user_item_matrix_aligned.fillna(value=0)

# Flip the table so each row is a movie and each column a user
item_user_matrix = user_item_filled.transpose()

# Pairwise cosine similarity between the movies' rating vectors
collab_sim = cosine_similarity(item_user_matrix, item_user_matrix)

print()
print("Item-user matrix shape:", item_user_matrix.shape)
print("Collaborative similarity matrix shape:", collab_sim.shape)

def get_collab_recommendations(title, n_recs=10):
    """Return the movies whose rating vectors are most similar to `title`'s.

    Relies on the module-level `title_to_idx`, `collab_sim` and `movies`.

    Parameters
    ----------
    title : str
        Exact movie title to use as the seed.
    n_recs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies` (columns movieId, title, genres) ordered from most to
        least similar; the seed movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not a label in `title_to_idx`.
    """
    idx = title_to_idx[title]
    if isinstance(idx, pd.Series):
        # The title occurs more than once in the lookup: use its first row.
        idx = idx.iloc[0]

    scores = np.asarray(collab_sim[idx])
    # Stable descending sort: tied scores keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the seed by index rather than by position: a movie with an all-zero
    # rating vector has similarity 0 to itself, and ties can push the seed away
    # from the first slot.
    ranked = ranked[ranked != idx][:n_recs]
    return movies[['movieId','title','genres']].iloc[ranked]

# Sample run: five collaborative neighbours of Toy Story
print("\nCollaborative recommendations for Toy Story:")
toy_story_collab_recs = get_collab_recommendations("Toy Story (1995)", 5)
print(toy_story_collab_recs)



Task 5: Collaborative Filtering (Item-Based KNN)

Item-user matrix shape: (9742, 610)
Collaborative similarity matrix shape: (9742, 9742)

Collaborative recommendations for Toy Story:
      movieId                                      title  \
2355     3114                         Toy Story 2 (1999)   
418       480                       Jurassic Park (1993)   
615       780       Independence Day (a.k.a. ID4) (1996)   
224       260  Star Wars: Episode IV - A New Hope (1977)   
314       356                        Forrest Gump (1994)   

                                           genres  
2355  Adventure|Animation|Children|Comedy|Fantasy  
418              Action|Adventure|Sci-Fi|Thriller  
615              Action|Adventure|Sci-Fi|Thriller  
224                       Action|Adventure|Sci-Fi  
314                      Comedy|Drama|Romance|War  


In [140]:
#Task 6 hybrid recommender system

print("\nTask 6: Hybrid Recommender (Weighted Content + Collaborative)")

def get_hybrid_recommendations(title, alpha=0.5, n_recs=10):
    """Recommend movies using a weighted blend of content and collaborative similarity.

    The score of every movie is
    ``alpha * content_sim[seed] + (1 - alpha) * collab_sim[seed]``.
    Relies on the module-level `title_to_idx`, `content_sim`, `collab_sim`
    and `movies`.

    Parameters
    ----------
    title : str
        Exact movie title to use as the seed.
    alpha : float, default 0.5
        Weight given to the content-based similarity; the collaborative
        similarity receives ``1 - alpha``.
    n_recs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies` (columns movieId, title, genres) plus a
        'hybrid_score' column, ordered from highest to lowest score; the seed
        movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not a label in `title_to_idx`.
    """
    idx = title_to_idx[title]
    if isinstance(idx, pd.Series):
        # The title occurs more than once in the lookup: use its first row.
        idx = idx.iloc[0]

    # Weighted merge of similarities
    hybrid_scores = np.asarray(alpha * content_sim[idx] + (1 - alpha) * collab_sim[idx])

    # Stable descending sort: tied scores keep their original row order.
    ranked = np.argsort(-hybrid_scores, kind="stable")
    # Drop the seed by index rather than by position, since ties (or a zero
    # self-similarity) mean it is not guaranteed to sort first.
    ranked = ranked[ranked != idx][:n_recs]

    result = movies[['movieId','title','genres']].iloc[ranked].copy()
    result['hybrid_score'] = hybrid_scores[ranked]
    return result

# Sample run: ten hybrid neighbours of Toy Story with equal weights
print("\nHybrid recommendations for Toy Story (alpha=0.5):")
toy_story_hybrid_recs = get_hybrid_recommendations("Toy Story (1995)", alpha=0.5, n_recs=10)
print(toy_story_hybrid_recs)



Task 6: Hybrid Recommender (Weighted Content + Collaborative)

Hybrid recommendations for Toy Story (alpha=0.5):
      movieId                             title  \
2355     3114                Toy Story 2 (1999)   
3568     4886             Monsters, Inc. (2001)   
3194     4306                      Shrek (2001)   
1706     2294                       Antz (1998)   
4360     6377               Finding Nemo (2003)   
1757     2355              Bug's Life, A (1998)   
3000     4016  Emperor's New Groove, The (2000)   
8900   134853                 Inside Out (2015)   
5374     8961           Incredibles, The (2004)   
3745     5218                    Ice Age (2002)   

                                                 genres  hybrid_score  
2355        Adventure|Animation|Children|Comedy|Fantasy      0.786301  
3568        Adventure|Animation|Children|Comedy|Fantasy      0.752325  
3194  Adventure|Animation|Children|Comedy|Fantasy|Ro...      0.730632  
1706        Adventure|Animation|Chil

In [144]:
#Task 7 Extended content-based model with descriptions
# This cell only reports that the task is skipped; no model is built here.

print("\nTask 7: Extended content-based model with descriptions")
print("No description available")

#Task 8 Add user demographics
# This cell only reports that the task is skipped; no model is built here.

print("\nTask 8: Add user demographics to collaborative model")
print("No users.csv file")


Task 7: Extended content-based model with descriptions
No description avaliable

Task 8: Add user demographics to collaborative model
No users.csv file


In [145]:
# Task 9: evaluation metrics (precision, recall, F1)

print("\nTask 9: Implement evaluation metrics (precision, recall, F1)")


def precision_recall_f1(recommended_ids, relevant_ids):
    """Compare a recommendation list with a relevant list as sets.

    Duplicates in either input are ignored because both are converted to sets.

    Parameters
    ----------
    recommended_ids : iterable
        movieIds produced by a model.
    relevant_ids : iterable
        movieIds considered relevant.

    Returns
    -------
    tuple
        ``(precision, recall, f1, tp, fp, fn)``. Each ratio is 0.0 when its
        denominator is zero.
    """
    rec_set = set(recommended_ids)
    rel_set = set(relevant_ids)

    # Confusion counts derived from the overlap of the two sets
    tp = len(rec_set & rel_set)
    fp = len(rec_set) - tp
    fn = len(rel_set) - tp

    precision = 0.0
    if tp + fp > 0:
        precision = tp / (tp + fp)

    recall = 0.0
    if tp + fn > 0:
        recall = tp / (tp + fn)

    f1 = 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1, tp, fp, fn


# Score one recommender against one user's highly rated movies.
def evaluate_model_for_user(model_name, recommend_func, movie_title, user_id,
                            n_recs=10, rating_threshold=4.0, **kwargs):
    """Print and return precision/recall/F1 for one model, seed movie and user.

    Relies on the module-level `movies`, `ratings` and `precision_recall_f1`.

    Parameters
    ----------
    model_name : str
        Label shown in the printed report.
    recommend_func : callable
        Called as ``recommend_func(movie_title, n_recs=n_recs, **kwargs)``; must
        return a DataFrame with 'movieId' and 'title' columns.
    movie_title : str
        Seed movie; it is removed from the relevant set.
    user_id
        Value matched against ``ratings['userId']``.
    n_recs : int, default 10
        Number of recommendations requested from `recommend_func`.
    rating_threshold : float, default 4.0
        A rated movie is relevant when its rating is >= this value.
    **kwargs
        Extra keyword arguments forwarded to `recommend_func`.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``.
    """
    print(f"\nEvaluating {model_name}")

    # Ask the model for its recommendations
    rec_df = recommend_func(movie_title, n_recs=n_recs, **kwargs)
    recommended_ids = rec_df['movieId'].tolist()
    top5_recommended = rec_df[['movieId', 'title']].head(5)

    # Movies this user rated at or above the threshold, minus the seed movie
    seed_movie_id = movies.loc[movies['title'] == movie_title, 'movieId'].iloc[0]
    user_rows = ratings[ratings['userId'] == user_id]
    liked_ids = user_rows.loc[user_rows['rating'] >= rating_threshold, 'movieId']
    relevant_ids = liked_ids[liked_ids != seed_movie_id].tolist()

    # First five relevant movies in the row order of `movies`
    is_relevant = movies['movieId'].isin(relevant_ids)
    top5_relevant = movies.loc[is_relevant, ['movieId', 'title']].head(5)

    precision, recall, f1, tp, fp, fn = precision_recall_f1(recommended_ids, relevant_ids)

    for heading, table in (("\nTop 5 Recommended:", top5_recommended),
                           ("\nTop 5 Relevant:", top5_relevant)):
        print(heading)
        print(table.to_string(index=False))

    for report_line in (
        f"\nTP={tp}, FP={fp}, FN={fn}",
        f"Precision: {precision:.3f}",
        f"Recall   : {recall:.3f}",
        f"F1-score : {f1:.3f}",
    ):
        print(report_line)

    return precision, recall, f1

# Example run: score the content-based model for user 1 with Toy Story as seed
example_movie_title = "Toy Story (1995)"
example_user_id = 1

evaluate_model_for_user(
    "Content-Based (genres)",
    get_content_recommendations,
    example_movie_title,
    example_user_id,
    n_recs=10,
    rating_threshold=4.0,
)


Task 9: Implement evaluation metrics (precision, recall, F1)

Evaluating Content-Based (genres)

Top 5 Recommended:
 movieId                                          title
    2294                                    Antz (1998)
    3114                             Toy Story 2 (1999)
    3754 Adventures of Rocky and Bullwinkle, The (2000)
    4016               Emperor's New Groove, The (2000)
    4886                          Monsters, Inc. (2001)

Top 5 Relevant:
 movieId                       title
       3     Grumpier Old Men (1995)
       6                 Heat (1995)
      47 Seven (a.k.a. Se7en) (1995)
      50  Usual Suspects, The (1995)
     101        Bottle Rocket (1996)

TP=0, FP=10, FN=199
Precision: 0.000
Recall   : 0.000
F1-score : 0.000


(0.0, 0.0, 0.0)

In [146]:
# Task 10: run the same evaluation for the content, collaborative and hybrid models

print("\nTask 10: Compare the performance of the three model")

# Settings shared by all three evaluations
movie_title = "Toy Story (1995)"   # seed movie for the recommendations
target_user = 1                    # user whose ratings define relevance
top_k = 10                         # number of recommendations per model
rating_thr = 4.0                   # minimum rating for a movie to be relevant
alpha_hybrid = 0.5                 # content weight in the hybrid model

# Keyword arguments common to every evaluate_model_for_user call
shared_eval_args = dict(n_recs=top_k, rating_threshold=rating_thr)

# 1. Content-based
p_c, r_c, f1_c = evaluate_model_for_user(
    "Content-Based", get_content_recommendations,
    movie_title, target_user, **shared_eval_args
)

# 2. Collaborative
p_cf, r_cf, f1_cf = evaluate_model_for_user(
    "Collaborative (Item-based KNN)", get_collab_recommendations,
    movie_title, target_user, **shared_eval_args
)

# 3. Hybrid (alpha is forwarded to the recommender through **kwargs)
p_h, r_h, f1_h = evaluate_model_for_user(
    f"Hybrid (alpha={alpha_hybrid})", get_hybrid_recommendations,
    movie_title, target_user, alpha=alpha_hybrid, **shared_eval_args
)

# One summary line per model
summary_lines = (
    "\nSummary: Precision / Recall / F1 for each model",
    f"Content-Based              -> P={p_c:.3f},  R={r_c:.3f},  F1={f1_c:.3f}",
    f"Collaborative (Item KNN)   -> P={p_cf:.3f}, R={r_cf:.3f}, F1={f1_cf:.3f}",
    f"Hybrid (alpha={alpha_hybrid}) -> P={p_h:.3f},  R={r_h:.3f},  F1={f1_h:.3f}",
)
for summary_line in summary_lines:
    print(summary_line)



Task 10: Compare the performance of the three model

Evaluating Content-Based

Top 5 Recommended:
 movieId                                          title
    2294                                    Antz (1998)
    3114                             Toy Story 2 (1999)
    3754 Adventures of Rocky and Bullwinkle, The (2000)
    4016               Emperor's New Groove, The (2000)
    4886                          Monsters, Inc. (2001)

Top 5 Relevant:
 movieId                       title
       3     Grumpier Old Men (1995)
       6                 Heat (1995)
      47 Seven (a.k.a. Se7en) (1995)
      50  Usual Suspects, The (1995)
     101        Bottle Rocket (1996)

TP=0, FP=10, FN=199
Precision: 0.000
Recall   : 0.000
F1-score : 0.000

Evaluating Collaborative (Item-based KNN)

Top 5 Recommended:
 movieId                                     title
    3114                        Toy Story 2 (1999)
     480                      Jurassic Park (1993)
     780      Independence Day (a.k.a.