<a href="https://colab.research.google.com/github/farhodibr/CUNY-SPS-MSDS/blob/main/project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA 612 Project 2


In [88]:
import pandas as pd
import numpy as np
# !pip install tmdbsimple
import tmdbsimple as tmdb
import os
import time
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from typing import Set, Union
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


## here we are loading data for movies and genres

In [89]:
ratings = pd.read_csv(
    "http://files.grouplens.org/datasets/movielens/ml-100k/u.data",
    sep="\t", names=["user_id", "movie_id", "rating", "timestamp"]
)

movie_cols = ["movie_id", "title", "release_date", "video_release_date", "IMDb_URL"] + \
             ["unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
              "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
              "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
genre_cols = movie_cols[5:]

movies = pd.read_csv(
    "http://files.grouplens.org/datasets/movielens/ml-100k/u.item",
    sep="|", names=movie_cols, encoding="latin-1"
)[["movie_id", "title"] + genre_cols]

print(movies.head())
print(ratings.head())

   movie_id              title  unknown  Action  Adventure  Animation  \
0         1   Toy Story (1995)        0       0          0          1   
1         2   GoldenEye (1995)        0       1          1          0   
2         3  Four Rooms (1995)        0       0          0          0   
3         4  Get Shorty (1995)        0       1          0          0   
4         5     Copycat (1995)        0       0          0          0   

   Children's  Comedy  Crime  Documentary  ...  Fantasy  Film-Noir  Horror  \
0           1       1      0            0  ...        0          0       0   
1           0       0      0            0  ...        0          0       0   
2           0       0      0            0  ...        0          0       0   
3           0       1      0            0  ...        0          0       0   
4           0       0      1            0  ...        0          0       0   

   Musical  Mystery  Romance  Sci-Fi  Thriller  War  Western  
0        0        0        0 

## This function converts the binary genre flags into a space-separated string.

In [90]:
def genres_to_string(row):
    return ' '.join([genre for genre in genre_cols if row[genre] == 1])

movies["genre_str"] = movies[genre_cols].apply(genres_to_string, axis=1)

movies[['title', 'genre_str']].head()


Unnamed: 0,title,genre_str
0,Toy Story (1995),Animation Children's Comedy
1,GoldenEye (1995),Action Adventure Thriller
2,Four Rooms (1995),Thriller
3,Get Shorty (1995),Action Comedy Drama
4,Copycat (1995),Crime Drama Thriller


## TF-IDF & Cosine similarity

In [91]:

movies["pseudo_plot"] = movies["title"] + " " + movies["genre_str"]

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies["pseudo_plot"])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

title_to_index = pd.Series(movies.index, index=movies["title"])

def recommend_by_cosine(title, top_n=5):
    try:
        idx = title_to_index[title]
    except KeyError:
        print(f"Error: Movie '{title}' not found in the dataset.")
        return None

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    top_recommendations = sim_scores[1:top_n+1]

    movie_indices = [i[0] for i in top_recommendations]
    similarity_scores = [i[1] for i in top_recommendations]

    recommendations_df = movies.iloc[movie_indices].copy()
    recommendations_df['similarity'] = similarity_scores

    return recommendations_df[["title", "genre_str", "similarity"]]



## Jaccard Similarity (Title + Genre)

In [92]:
def create_token_set(row):
    genres = {genre for genre in genre_cols if row[genre] == 1}
    title_words = set(row['title'].lower().split(' (')[0].split())
    return genres.union(title_words)

movies['tokens'] = movies.apply(create_token_set, axis=1)

def recommend_by_jaccard(title, top_n=5):
    try:
        target_tokens = movies.loc[movies['title'] == title, 'tokens'].iloc[0]
    except IndexError:
        print(f"Error: Movie '{title}' not found in the dataset.")
        return None

    sim_scores = []
    for index, row in movies.iterrows():
        intersection = len(target_tokens.intersection(row['tokens']))
        union = len(target_tokens.union(row['tokens']))
        score = intersection / union if union > 0 else 0
        sim_scores.append((index, score))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    top_recommendations = sim_scores[1:top_n+1]

    movie_indices = [i[0] for i in top_recommendations]
    similarity_scores = [i[1] for i in top_recommendations]

    recommendations_df = movies.iloc[movie_indices].copy()
    recommendations_df['similarity'] = similarity_scores

    return recommendations_df[["title", "genre_str", "similarity"]]



## testing models (title + genre)

In [93]:
print("--- Testing 'Toy Story (1995)' ---")
print("\n[Cosine Similarity Recommendations]")
print(recommend_by_cosine("Toy Story (1995)"))
print("\n[Jaccard Similarity Recommendations]")
print(recommend_by_jaccard("Toy Story (1995)"))

--- Testing 'Toy Story (1995)' ---

[Cosine Similarity Recommendations]
                                  title                             genre_str  \
1071  Pyromaniac's Love Story, A (1995)                        Comedy Romance   
1065                       Balto (1995)                  Animation Children's   
1218              Goofy Movie, A (1995)   Animation Children's Comedy Romance   
547   NeverEnding Story III, The (1994)                    Children's Fantasy   
541                   Pocahontas (1995)  Animation Children's Musical Romance   

      similarity  
1071    0.352262  
1065    0.345902  
1218    0.313701  
547     0.310664  
541     0.306716  

[Jaccard Similarity Recommendations]
                    title                                      genre_str  \
94         Aladdin (1992)            Animation Children's Comedy Musical   
992       Hercules (1997)  Adventure Animation Children's Comedy Musical   
242  Jungle2Jungle (1997)                              Childr

## Cosine similarity (genres only)

In [94]:
vectorizer_genre = TfidfVectorizer()
tfidf_matrix_genre = vectorizer_genre.fit_transform(movies["genre_str"])
cosine_sim_genre = linear_kernel(tfidf_matrix_genre, tfidf_matrix_genre)

def recommend_by_cosine_genre_only(title: str, top_n: int = 5) -> Union[pd.DataFrame, None]:
    try:
        idx = title_to_index[title]
    except KeyError:
        print(f"Error: Movie '{title}' not found.")
        return None

    sim_scores = list(enumerate(cosine_sim_genre[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_recommendations = sim_scores[1:top_n+1]

    movie_indices = [i[0] for i in top_recommendations]
    similarity_scores = [i[1] for i in top_recommendations]

    recommendations_df = movies.iloc[movie_indices].copy()
    recommendations_df['similarity'] = similarity_scores

    return recommendations_df[["title", "genre_str", "similarity"]]

## Jaccard similarity (genres only)

In [95]:
def create_token_set_genre_only(row: pd.Series) -> Set[str]:
    return {genre for genre in genre_cols if row[genre] == 1}

movies['tokens_genre_only'] = movies.apply(create_token_set_genre_only, axis=1)

def recommend_by_jaccard_genre_only(title: str, top_n: int = 5) -> Union[pd.DataFrame, None]:
    try:
        target_tokens = movies.loc[movies['title'] == title, 'tokens_genre_only'].iloc[0]
    except IndexError:
        print(f"Error: Movie '{title}' not found.")
        return None

    sim_scores = []
    for index, row in movies.iterrows():
        intersection = len(target_tokens.intersection(row['tokens_genre_only']))
        union = len(target_tokens.union(row['tokens_genre_only']))
        score = intersection / union if union > 0 else 0
        sim_scores.append((index, score))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_recommendations = sim_scores[1:top_n+1]

    movie_indices = [i[0] for i in top_recommendations]
    similarity_scores = [i[1] for i in top_recommendations]

    recommendations_df = movies.iloc[movie_indices].copy()
    recommendations_df['similarity'] = similarity_scores

    return recommendations_df[["title", "genre_str", "similarity"]]




## testing models (genres only)

In [96]:
print("--- Testing 'Toy Story (1995)' with Genre-Only Models ---")
print("\n[Cosine Similarity Recommendations (Genre Only)]")
print(recommend_by_cosine_genre_only("Toy Story (1995)"))
print("\n[Jaccard Similarity Recommendations (Genre Only)]")
print(recommend_by_jaccard_genre_only("Toy Story (1995)"))


--- Testing 'Toy Story (1995)' with Genre-Only Models ---

[Cosine Similarity Recommendations (Genre Only)]
                                      title                    genre_str  \
421  Aladdin and the King of Thieves (1996)  Animation Children's Comedy   
101                  Aristocats, The (1970)         Animation Children's   
403                        Pinocchio (1940)         Animation Children's   
624          Sword in the Stone, The (1963)         Animation Children's   
945           Fox and the Hound, The (1981)         Animation Children's   

     similarity  
421    1.000000  
101    0.936967  
403    0.936967  
624    0.936967  
945    0.936967  

[Jaccard Similarity Recommendations (Genre Only)]
                                       title  \
421   Aladdin and the King of Thieves (1996)   
94                            Aladdin (1992)   
1218                   Goofy Movie, A (1995)   
62                  Santa Clause, The (1994)   
93                         Home Alon

## Movie details from TMDB

In [97]:
tmdb.API_KEY = "fdfd7077d4e4a33a6d9c72409dff2c7e"

ENRICHED_FILE = "movies_enriched.csv"
CHECKPOINT_FILE = "checkpoint_tmdb.csv"

movies_df = movies

def genres_to_string(row):
    return ' '.join([genre for genre in genre_cols if row[genre] == 1])
movies_df["genre_str"] = movies_df.apply(genres_to_string, axis=1)

print(f"Loaded {len(movies_df)} movies")

Loaded 1682 movies


Getting movie details from TMDB and creating one dabase. Saving the database, so we don't have to download details each time (it takes quite a bit of time). Once the dataset created we need to change the source to created csv file

In [98]:
def get_tmdb_info(title: str, release_year: str):
    try:
        search = tmdb.Search()
        search.movie(query=title, year=release_year)
        results = getattr(search, "results", [])

        if not results:
            search.movie(query=title)
            results = getattr(search, "results", [])

        if not results:
            return {'overview': '', 'keywords': '', 'cast': '', 'director': '', 'success': False}

        movie_id = results[0]['id']
        movie = tmdb.Movies(movie_id)
        info = movie.info(append_to_response='keywords,credits')

        overview = info.get('overview', '')
        keywords = ' '.join(kw['name'] for kw in info.get('keywords', {}).get('keywords', []))
        cast = ' '.join(c['name'] for c in info.get('credits', {}).get('cast', [])[:3])
        director = next(
            (crew['name'] for crew in info.get('credits', {}).get('crew', []) if crew.get('job') == 'Director'),
            ''
                        )

        return {'overview': overview, 'keywords': keywords, 'cast': cast, 'director': director, 'success': True}

    except Exception as e:
        print(f"[TMDb ERROR] {title} ({release_year}): {e}")
        return {'overview': '', 'keywords': '', 'cast': '', 'director': '', 'success': False}

if os.path.exists(ENRICHED_FILE):
    print(f"Loading from cached enriched file: {ENRICHED_FILE}")
    enriched_movies_df = pd.read_csv(ENRICHED_FILE)

else:
    print("Starting enrichment process... This will take time.")
    tmdb_data = []

    for i, row in tqdm(movies_df.iterrows(), total=movies_df.shape[0], desc="Enriching Movies"):
        year_str = row['title'][-5:-1] if ' (' in row['title'] else ''
        title_only = row['title'].split(' (')[0]
        info = get_tmdb_info(title_only, year_str)
        tmdb_data.append(info)

        if (i + 1) % 100 == 0:
            pd.DataFrame(tmdb_data).to_csv(CHECKPOINT_FILE, index=False)

        time.sleep(0.25)

    tmdb_df = pd.DataFrame(tmdb_data)
    enriched_movies_df = pd.concat([movies_df.reset_index(drop=True), tmdb_df.reset_index(drop=True)], axis=1)
    enriched_movies_df.to_csv(ENRICHED_FILE, index=False)
    print(f"✅ Saved enriched data to {ENRICHED_FILE}")

enriched_movies_df[["title", "overview", "keywords", "cast", "director"]].head(10)

Loading from cached enriched file: movies_enriched.csv


Unnamed: 0,title,overview,keywords,cast,director
0,Toy Story (1995),"Led by Woody, Andy's toys live happily in his ...",rescue friendship mission jealousy villain bul...,Tom Hanks Tim Allen Don Rickles,John Lasseter
1,GoldenEye (1995),When a powerful satellite system falls into th...,computer virus cuba falsely accused secret int...,Pierce Brosnan Sean Bean Izabella Scorupco,Martin Campbell
2,Four Rooms (1995),It's Ted the Bellhop's first night on the job....,hotel new year's eve witch bet sperm hotel roo...,Tim Roth Jennifer Beals Antonio Banderas,Allison Anders
3,Get Shorty (1995),Chili Palmer is a Miami mobster who gets sent ...,"based on novel or book miami, florida gambling...",John Travolta Gene Hackman Rene Russo,Barry Sonnenfeld
4,Copycat (1995),An agoraphobic psychologist and a female detec...,police brutality psychology police operation p...,Sigourney Weaver Holly Hunter Dermot Mulroney,Jon Amiel
5,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,"Shanghai, China, 1930. When young Shuisheng ar...","shanghai, china chinese mafia coming of age mi...",Gong Li Li Baotian Sun Chun,Zhang Yimou
6,Twelve Monkeys (1995),"In the year 2035, convict James Cole reluctant...","biological weapon philadelphia, pennsylvania s...",Bruce Willis Madeleine Stowe Brad Pitt,Terry Gilliam
7,Babe (1995),Babe is a little pig who doesn't quite know hi...,farm sheep pig alarm clock cat duck heroism af...,Christine Cavanaugh Miriam Margolyes Danny Mann,Chris Noonan
8,Dead Man Walking (1995),A death row inmate turns for spiritual guidanc...,prison death penalty right and justice rape ra...,Susan Sarandon Sean Penn Robert Prosky,Tim Robbins
9,Richard III (1995),A murderous lust for the British throne sees R...,england kidnapping murder king,Ian McKellen Annette Bening Jim Broadbent,Richard Loncraine


In [100]:
ENRICHED_URL = "https://raw.githubusercontent.com/farhodibr/CUNY-SPS-MSDS/refs/heads/main/DATA612/PROJECT2/movies_enriched.csv"
LOCAL_FILE = "movies_enriched.csv"
checkpoint_freq = 100
tmdb_data = []

# Try to download enriched file from GitHub
try:
    enriched_movies_df = pd.read_csv(ENRICHED_URL)
    print("Loaded enriched TMDb data from GitHub.")
except Exception as e:
    print("Failed to load from GitHub. Generating enriched data locally...")
    for i, row in tqdm(movies_df.iterrows(), total=movies_df.shape[0], desc="Enriching Movies"):
        year_str = row['title'][-5:-1] if ' (' in row['title'] else ''
        title_only = row['title'].split(' (')[0]
        tmdb_info = get_tmdb_info(title_only, year_str)
        tmdb_data.append(tmdb_info)

        if (i + 1) % checkpoint_freq == 0:
            pd.DataFrame(tmdb_data).to_csv("checkpoint_tmdb.csv", index=False)

    tmdb_df = pd.DataFrame(tmdb_data)
    enriched_movies_df = pd.concat([movies_df.reset_index(drop=True), tmdb_df.reset_index(drop=True)], axis=1)
    enriched_movies_df.to_csv(LOCAL_FILE, index=False)
    print(f"Enriched data saved locally as {LOCAL_FILE}")


✅ Loaded enriched TMDb data from GitHub.


## here we can change the weights on text features. we need to try different weights to get best recommendations.

In [101]:
cols_to_fill = ['overview', 'keywords', 'cast', 'director', 'genre_str']
enriched_movies_df[cols_to_fill] = enriched_movies_df[cols_to_fill].fillna('')
enriched_movies_df['enriched'] = enriched_movies_df['success']

def create_weighted_soup(x):
    return ' '.join([
        x['overview'],
        x['keywords'] * 2 ,
        x['cast'] * 3,
        x['director'] * 3,
        x['genre_str'] * 2
    ])

enriched_movies_df['soup'] = enriched_movies_df.apply(create_weighted_soup, axis=1)
enriched_movies_df.head()


Unnamed: 0,movie_id,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,pseudo_plot,tokens,tokens_genre_only,overview,keywords,cast,director,success,enriched,soup
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,Toy Story (1995) Animation Children's Comedy,"{""Children's"", 'story', 'Animation', 'Comedy',...","{'Animation', 'Comedy', ""Children's""}","Led by Woody, Andy's toys live happily in his ...",rescue friendship mission jealousy villain bul...,Tom Hanks Tim Allen Don Rickles,John Lasseter,True,True,"Led by Woody, Andy's toys live happily in his ..."
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,GoldenEye (1995) Action Adventure Thriller,"{'goldeneye', 'Thriller', 'Adventure', 'Action'}","{'Thriller', 'Adventure', 'Action'}",When a powerful satellite system falls into th...,computer virus cuba falsely accused secret int...,Pierce Brosnan Sean Bean Izabella Scorupco,Martin Campbell,True,True,When a powerful satellite system falls into th...
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,Four Rooms (1995) Thriller,"{'rooms', 'Thriller', 'four'}",{'Thriller'},It's Ted the Bellhop's first night on the job....,hotel new year's eve witch bet sperm hotel roo...,Tim Roth Jennifer Beals Antonio Banderas,Allison Anders,True,True,It's Ted the Bellhop's first night on the job....
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,Get Shorty (1995) Action Comedy Drama,"{'Drama', 'get', 'Action', 'Comedy', 'shorty'}","{'Comedy', 'Drama', 'Action'}",Chili Palmer is a Miami mobster who gets sent ...,"based on novel or book miami, florida gambling...",John Travolta Gene Hackman Rene Russo,Barry Sonnenfeld,True,True,Chili Palmer is a Miami mobster who gets sent ...
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,Copycat (1995) Crime Drama Thriller,"{'Crime', 'Thriller', 'Drama', 'copycat'}","{'Crime', 'Thriller', 'Drama'}",An agoraphobic psychologist and a female detec...,police brutality psychology police operation p...,Sigourney Weaver Holly Hunter Dermot Mulroney,Jon Amiel,True,True,An agoraphobic psychologist and a female detec...


## here we calculate cosine similarities for random 3 movies using TD-IDF

In [102]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(enriched_movies_df['soup'])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

title_to_index = pd.Series(enriched_movies_df.index, index=enriched_movies_df['title']).drop_duplicates()

def recommend(title, top_n=5):
    if title not in title_to_index:
        return f"Movie '{title}' not found in the dataset."

    idx = title_to_index[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i[0] for i in sim_scores[1:top_n+1]]

    recs = enriched_movies_df.iloc[top_indices][['title', 'genre_str']]
    recs['similarity'] = [sim_scores[i+1][1] for i in range(top_n)]

    return recs

random_titles = enriched_movies_df['title'].sample(5, random_state=52).tolist()

for title in random_titles:
    print(f"\n🎬 Recommendations for: {title}")
    display(recommend(title))


🎬 Recommendations for: Death and the Maiden (1994)


Unnamed: 0,title,genre_str,similarity
551,Species (1995),Sci-Fi,0.129294
317,Schindler's List (1993),Drama War,0.119887
1267,Bitter Moon (1992),Drama,0.11091
4,Copycat (1995),Crime Drama Thriller,0.102321
526,Gandhi (1982),Drama,0.101152



🎬 Recommendations for: Age of Innocence, The (1993)


Unnamed: 0,title,genre_str,similarity
240,"Last of the Mohicans, The (1992)",Action Romance War,0.14923
675,"Crucible, The (1996)",Drama,0.137341
508,My Left Foot (1989),Drama,0.126347
211,"Unbearable Lightness of Being, The (1988)",Drama,0.117675
411,"Very Brady Sequel, A (1996)",Comedy,0.096407



🎬 Recommendations for: Jackal, The (1997)


Unnamed: 0,title,genre_str,similarity
244,"Devil's Own, The (1997)",Action Drama Thriller War,0.188216
781,Little Odessa (1994),Drama,0.171799
630,"Crying Game, The (1992)",Action Drama Romance War,0.146363
316,In the Name of the Father (1993),Drama,0.145338
981,Maximum Risk (1996),Action Adventure Thriller,0.135898



🎬 Recommendations for: Citizen Kane (1941)


Unnamed: 0,title,genre_str,similarity
652,Touch of Evil (1958),Crime Film-Noir Thriller,0.180491
512,"Third Man, The (1949)",Mystery Thriller,0.095024
1295,Indian Summer (1996),Comedy Drama,0.076531
493,His Girl Friday (1940),Comedy,0.075466
1162,"Portrait of a Lady, The (1996)",Drama,0.073281



🎬 Recommendations for: Grumpier Old Men (1995)


Unnamed: 0,title,genre_str,similarity
1282,Out to Sea (1997),Comedy,0.198485
136,Big Night (1996),Drama,0.141467
254,My Best Friend's Wedding (1997),Comedy Romance,0.112255
159,Glengarry Glen Ross (1992),Drama,0.110837
1119,I'm Not Rappaport (1996),Comedy,0.107428


# Collaborative Filtering

In [None]:
# Merge data frames so we have the movie title as part of the user movie ratings
user_movie_ratings = pd.merge(ratings, movies, on="movie_id")

# Only focus on user_id, title, and rating for now
user_movie_ratings = user_movie_ratings[['user_id', 'title', 'rating']]

# Remove any duplicate observations
user_movie_ratings = user_movie_ratings.drop_duplicates()

user_movie_ratings.head()

Check for missing values:

In [None]:
# Get info
print(user_movie_ratings.info())

We now have 99,739 unique observations, with zero missing values.

Check for multiple different ratings for the same movie by the same user:

In [None]:
# Check for multiple ratings for a single movie by a single user
users_and_titles = user_movie_ratings[['user_id', 'title']]
print(users_and_titles[users_and_titles.duplicated(keep=False)])

Looks like we do have some users that rated a movie more than once. Let's double check an example:



In [None]:
print(user_movie_ratings[(user_movie_ratings['user_id'] == 50) & (user_movie_ratings['title'] == "Chasing Amy (1997)")])

We can see that `user_id` `50` has rated `Chasing Amy (1997)` twice - the first time a rating of `3` and a second time a rating of `4`. To address this, when we pivot the data frame into a matrix, let's take the mean of any user who rated a single movie more than once:

In [None]:
# Pivot the data
# If there are multiple ratings for a single movie by a user, take the average of the ratings
user_movie_ratings_pivot = user_movie_ratings.pivot_table(index='user_id',
                                                    columns='title',
                                                    values='rating',
                                                    aggfunc='mean')
user_movie_ratings_pivot.head()

#### User-based

##### Fill in missing data

There's now missing values which makes sense as not every user can rate every single movie. So to avoid skewed results, let's fill in the incomplete data:

In [None]:
# Mean rating per user
average_ratings = user_movie_ratings_pivot.mean(axis=1)

# Center each users ratings around 0
user_movie_ratings_centered = user_movie_ratings_pivot.sub(average_ratings, axis=0)

# Fill in the missing data with 0s
user_movie_ratings_normalized = user_movie_ratings_centered.fillna(0)

user_movie_ratings_normalized.head()

##### Finding Similarities



Find similarites using cosine:

In [None]:
# Create the similarity matrix
user_similarities = cosine_similarity(user_movie_ratings_normalized)

# Create data frame
users_cosine_sim_df = pd.DataFrame(user_similarities, index=user_movie_ratings_normalized.index, columns=user_movie_ratings_normalized.index)
users_cosine_sim_df.head()

##### Testing




Let's test for user `1`:

In [None]:
print("--- Testing for user 1 ---")
print("\n[Cosine Similarity User-Based Recommendations]")

# Grab the similarities for a movie
cosine_similarity_series = users_cosine_sim_df.loc[1]

# Sort similarities
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

print(ordered_similarities.head())

In [None]:
print(user_movie_ratings[(user_movie_ratings['user_id']).isin([1, 773])])

##### K-nearest neighbors

Let's predict a user's rating for a particular movie using k-nearest neighbors (users). `user_id` `8` has not seen Toy Story, so let's predict that rating:

In [None]:
# Predict Toy Story rating for user 8
# Drop the movie to predict
user_movie_ratings_normalized_copy = user_movie_ratings_normalized.copy()
user_movie_ratings_normalized_copy.drop("Toy Story (1995)", axis=1, inplace=True)

# Grab the test data for user 8 (index == 8)
test_user_x = user_movie_ratings_normalized_copy.loc[[8]]

# Grab the training target data
training_data_y = user_movie_ratings_pivot["Toy Story (1995)"]

# Grab the users who have seen the movie
training_data_x = user_movie_ratings_normalized_copy[training_data_y.notnull()]

# Only focus on users who have seen the movie
training_data_y.dropna(inplace=True)

In [None]:
# Create the KNN model
users_knn_model = KNeighborsRegressor(metric='cosine', n_neighbors=10)

# Fit the model using training data
users_knn_model.fit(training_data_x, training_data_y)

# Make prediction using the test data
prediction = users_knn_model.predict(test_user_x)
print("The user-user KNN model predicted {}".format(prediction))

The predicted rating for `Toy Story (1995)` is `4` for `user_id` `8`, therefore, this movie would be recommended for this user.

### Item-based

In [None]:
# Transpose the data frame to make movies the index
movie_ratings = user_movie_ratings_normalized.T
movie_ratings.head()

##### Finding Similarities

Cosine similarity:

In [None]:
# Create the similarity matrix
movie_similarities = cosine_similarity(movie_ratings)

# Create data frame
cosine_sim_df = pd.DataFrame(movie_similarities, index=movie_ratings.index, columns=movie_ratings.index)
cosine_sim_df.head()

##### Testing

In [None]:
print("--- Testing 'Toy Story (1995)' ---")
print("\n[Cosine Similarity Item-Based Recommendations]")

# Grab the similarities for a movie
cosine_similarity_series = cosine_sim_df.loc['Toy Story (1995)']

# Sort similarities
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

print(ordered_similarities.head())

##### K-nearest neighbors

In [None]:
# Predict Toy Story rating for user 8
movie_ratings_copy = movie_ratings.copy()
movie_ratings_copy.drop(1, axis=1, inplace=True)

# Grab the test data x
test_movie_x = movie_ratings_copy.loc[["Toy Story (1995)"]]

# Grab the training target data (user's data)
training_data_y_movies = user_movie_ratings_pivot.T[8]

# Grab the users who have seen the movie
training_data_x_movies = movie_ratings_copy[training_data_y_movies.notnull()]

# Only focus on users who have seen the movie
training_data_y_movies.dropna(inplace=True)

# Create the KNN model
movies_knn_model = KNeighborsRegressor(metric='cosine', n_neighbors=10)

# Fit the model using training data
movies_knn_model.fit(training_data_x_movies, training_data_y_movies)

# Make prediction using the test data
prediction_item_based = movies_knn_model.predict(test_movie_x)
print("The item-item KNN model predicted {}".format(prediction_item_based))

The item-item based KNN model predicted 4.5, so slightly higher than the user-user model prediction of 4.

## Quantitative Evaluation (RMSE)

Right now, we're just doing qualitive analysis of the results. To properly measure our model's performance, we need a quantitative metric. We will use Root Mean Squared Error (RMSE), which tells us, on average, how far off our predicted ratings are from the actual ratings. A lower RMSE is better.

 To do this, we will:
 1.  Split our known ratings into a `training set` and a `testing set`.
 2.  Build the user-item matrix using **only** the training set.
 3.  Apply regularization to the user averages before normalizing the data.
 4.  Loop through the test set to predict ratings and calculate the final RMSE.

# 1. Split the data

We'll use an 80/20 split. `random_state` ensures we get the same split every time.

In [None]:
train_df, test_df = train_test_split(user_movie_ratings, test_size=0.2, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

# 2. Build User-Item Matrix on Training Data ONLY

In [None]:
train_pivot = train_df.pivot_table(index="user_id",
                                   columns="title",
                                   values="rating",
                                   aggfunc="mean"
                                )
train_avg_ratings  = train_pivot.mean(axis=1)
train_centered  = train_pivot.sub(train_avg_ratings, axis=0)
train_normalized = train_centered.fillna(0)

# 3. Loop, predict, and evaluate

 For demonstration, we'll run on a sample of 100 ratings from the test set.
Remove `.sample(100, random_state=42)` to run on the full test set (will be very slow).

In [None]:
user_similarity = cosine_similarity(train_normalized)
user_sim_df = pd.DataFrame(user_similarity, index=train_normalized.index, columns=train_normalized.index)

true_ratings = []
predicted_ratings = []

Here we prepare data for that specific movie

In [None]:
test_sample = test_df.sample(100, random_state=42)


for _,  row in test_sample.iterrows():
    user_id = row['user_id']
    movie_title = row['title']
    actual_rating = row['rating']

    # Make sure the user and movie exist in our training matrix
    if user_id not in train_normalized.index or movie_title not in train_normalized.columns:
        continue

    # 1. Identify Similar Users
    # Find the top 10 most similar users to our target user (excluding the user themselves)
    n = 10
    similar_users = user_sim_df[user_id].sort_values(ascending=False).iloc[1:n+1]

    # If no similar users found, we can't predict
    if similar_users.empty:
        continue

    # 2. Filter for Relevant Ratings
    # Find which of these similar users have actually rated the target movie
    neighbor_ratings = train_pivot.loc[similar_users.index, movie_title].dropna()

    # If none of the neighbors have rated the movie, we can't predict
    if neighbor_ratings.empty:
        continue

    # 3. & 4. We will calculate the weighted average.
    # The "weight" is the similarity score of the neighbor.
    # Get the similarity scores for just the neighbors who rated the movie
    neighbor_similarities = user_sim_df.loc[neighbor_ratings.index, user_id]

    # Calculate the weighted average of the neighbors' ratings
    # (sum of (similarity * rating)) / (sum of similarities)
    numerator = np.dot(neighbor_ratings, neighbor_similarities)
    denominator = neighbor_similarities.sum()

    if denominator == 0:
        continue # Avoid division by zero

    predicted_rating = numerator / denominator

    predicted_ratings.append(predicted_rating)
    true_ratings.append(actual_rating)


In [None]:
if len(true_ratings) > 0:
    global_rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    print(f"\n--- Global Model Performance (with Regularization) ---")
    print(f"Global Root Mean Squared Error (RMSE): {global_rmse:.4f}")
    print(f"Evaluated on {len(true_ratings)} ratings from the test set sample.")
else:
    print("\nCould not find any evaluatable ratings in the test set sample.")

# Conclusions