In [1]:
# ──────────────────────────────────────────────────────────────
# 02_content_tfidf.ipynb
# Content-Based Filtering – TF-IDF Baseline
# Day 3 of Hybrid Movie Recommender Project
# ──────────────────────────────────────────────────────────────

# ──────────────────────────────────────────────────────────────
# 1. Imports
# ──────────────────────────────────────────────────────────────

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import joblib
from pathlib import Path

In [2]:
# Project locations. Everything hangs off PROJECT_ROOT so the machine-specific
# absolute path only has to be edited in one place when the repo moves.
PROJECT_ROOT = Path('/Users/dhirkatre/code/movie-recommender')
PROCESSED_DATA_PATH = PROJECT_ROOT / 'data' / 'processed' / 'movies_with_plots.csv'
MODELS_DIR = PROJECT_ROOT / 'models'

# parents=True: on a fresh checkout the intermediate directories may not exist
# yet; without it mkdir raises FileNotFoundError instead of creating them.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# ──────────────────────────────────────────────────────────────
# 2. Load the enriched dataset
# ──────────────────────────────────────────────────────────────

print("Loading processed data...")
df = pd.read_csv(PROCESSED_DATA_PATH)

# Quick look at what was loaded: dimensions, column names, first rows.
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print("Columns:", column_names)
display(df.head(3))

# Basic validation: stop here if a column used by later cells is absent.
for required_col in ('movieId', 'title', 'genres', 'overview'):
    assert required_col in df.columns, f"{required_col} column missing"

Loading processed data...
Dataset shape: (9734, 7)
Columns: ['movieId', 'title', 'genres', 'genres_list', 'tmdbId', 'overview', 'title_clean']


Unnamed: 0,movieId,title,genres,genres_list,tmdbId,overview,title_clean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"['Adventure', 'Animation', 'Children', 'Comedy...",862,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,"['Adventure', 'Children', 'Fantasy']",8844,When siblings Judy and Peter discover an encha...,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,"['Comedy', 'Romance']",15602,A family wedding reignites the ancient feud be...,Grumpier Old Men


In [4]:
# ──────────────────────────────────────────────────────────────
# 3. Prepare text for TF-IDF
# ──────────────────────────────────────────────────────────────

print("Preparing content text...")

# Placeholders shown to humans when a field is missing.
MISSING_OVERVIEW = "No plot summary available."
MISSING_GENRES = "(no genres listed)"

# Rows without a real plot. Matching the placeholder as well as NaN keeps this
# cell idempotent: on a re-run the NaNs have already been replaced below.
overview_missing = df['overview'].isna() | df['overview'].eq(MISSING_OVERVIEW)

# Fill missing values (display columns keep a readable placeholder)
df['overview'] = df['overview'].fillna(MISSING_OVERVIEW)
df['genres'] = df['genres'].fillna(MISSING_GENRES)

# The placeholder sentence must NOT reach the vectorizer: its words
# ("plot", "summary", "available") would become TF-IDF features and make every
# plot-less movie look similar to every other plot-less movie. Those rows
# contribute an empty overview instead, so only their genres are used.
overview_for_model = df['overview'].mask(overview_missing, "")

# Combine genres and overview → single text field
# Replace | with space so TF-IDF treats them as separate terms
df['content_text'] = (
    df['genres'].str.replace('|', ' ', regex=False) + " " + overview_for_model
)

# Minimal cleaning: lowercase and trim (trim also removes the trailing space
# left behind when the overview part is empty).
df['content_text'] = df['content_text'].str.lower().str.strip()

# Preview
print("\nExample content_text:")
print(df[['title', 'content_text']].head(3))

Preparing content text...

Example content_text:
                     title                                       content_text
0         Toy Story (1995)  adventure animation children comedy fantasy le...
1           Jumanji (1995)  adventure children fantasy when siblings judy ...
2  Grumpier Old Men (1995)  comedy romance a family wedding reignites the ...


In [5]:
# ──────────────────────────────────────────────────────────────
# 4. Build TF-IDF matrix
# ──────────────────────────────────────────────────────────────

print("Fitting TF-IDF vectorizer...")

# Vectorizer settings gathered in one place so they are easy to tune.
tfidf_params = dict(
    max_features=5000,      # cap the vocabulary to control memory & speed
    stop_words='english',   # drop common English function words
    min_df=3,               # skip terms found in fewer than 3 documents
    max_df=0.85,            # skip terms found in more than 85% of documents
    ngram_range=(1, 2),     # unigrams plus bigrams, to capture short phrases
)
tfidf = TfidfVectorizer(**tfidf_params)

# One sparse row per movie, one column per retained term.
tfidf_matrix = tfidf.fit_transform(df['content_text'])

# Fetch the learned vocabulary once and reuse it for both reports below.
feature_names = tfidf.get_feature_names_out()

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(feature_names)}")

# Optional: peek at a slice of the vocabulary
print("Sample features:", feature_names[100:110])

Fitting TF-IDF vectorizer...
TF-IDF matrix shape: (9734, 5000)
Vocabulary size: 5000
Sample features: ['accused' 'ace' 'achieve' 'acquaintances' 'act' 'acting' 'action'
 'action adventure' 'action animation' 'action children']


In [6]:
# ──────────────────────────────────────────────────────────────
# 5. Compute similarity matrix
# ──────────────────────────────────────────────────────────────

print("Computing cosine similarity...")

# The vectorizer above is built without overriding `norm`, and TfidfVectorizer
# L2-normalises rows by default, so the plain dot product computed by
# linear_kernel already equals the cosine similarity; cosine_similarity(tfidf_matrix)
# would only repeat the normalisation. With a single argument, linear_kernel
# compares the matrix against itself.
similarity_matrix = linear_kernel(tfidf_matrix)

print(f"Similarity matrix shape: {similarity_matrix.shape}")

Computing cosine similarity...
Similarity matrix shape: (9734, 9734)


In [7]:
# ──────────────────────────────────────────────────────────────
# 6. Save models & important artifacts
# ──────────────────────────────────────────────────────────────

print("Saving models...")

# Objects persisted with joblib, keyed by the file name they are written to.
joblib_artifacts = {
    'tfidf_vectorizer.joblib': tfidf,
    'content_similarity_matrix.joblib': similarity_matrix,
}
for artifact_name, artifact in joblib_artifacts.items():
    joblib.dump(artifact, MODELS_DIR / artifact_name)

# Movie lookup table (movieId plus both title variants), written in the same
# row order as df so row k here corresponds to row/column k of the matrix.
index_file_name = 'movie_indices.csv'
movie_index = df[['movieId', 'title', 'title_clean']].copy()
movie_index.to_csv(MODELS_DIR / index_file_name, index=False)

# Report what was written.
print("Saved:")
for saved_name in [*joblib_artifacts, index_file_name]:
    print(f" - {saved_name}")

Saving models...
Saved:
 - tfidf_vectorizer.joblib
 - content_similarity_matrix.joblib
 - movie_indices.csv


In [8]:
# ──────────────────────────────────────────────────────────────
# 7. Define recommendation function
# ──────────────────────────────────────────────────────────────

def get_similar_movies(
    movie_id: int,
    n: int = 10,
    similarity_mat=None,
    movies_df=None
) -> pd.DataFrame:
    """
    Return the top N movies most similar to ``movie_id`` by TF-IDF cosine similarity.

    Args:
        movie_id: MovieLens movieId of the query movie.
        n: number of recommendations to return; values <= 0 yield an empty frame.
        similarity_mat: square 2-D array of pairwise scores whose row/column k
            corresponds to the k-th row (by position) of ``movies_df``.
            Defaults to the notebook-level ``similarity_matrix``, looked up at
            call time so a recomputed matrix is picked up without redefining
            this function.
        movies_df: DataFrame with at least 'movieId', 'title' and 'genres'
            columns. Defaults to the notebook-level ``df`` (looked up at call time).

    Returns:
        DataFrame with columns movieId, title, genres, similarity_score
        (rounded to 4 decimals), best match first, with a fresh 0..k-1 index.
        The query movie itself is never included.

    Raises:
        ValueError: if ``movie_id`` is not present in ``movies_df['movieId']``.
    """
    if similarity_mat is None:
        similarity_mat = similarity_matrix
    if movies_df is None:
        movies_df = df

    # Locate the movie by POSITION, not index label: the similarity matrix and
    # .iloc below are positional, so this stays correct for any DataFrame index.
    # fillna(False) covers nullable (Int64) ids, where the comparison can yield <NA>.
    is_match = (movies_df['movieId'] == movie_id).fillna(False).to_numpy(dtype=bool)
    matches = np.flatnonzero(is_match)
    if matches.size == 0:
        raise ValueError(f"Movie ID {movie_id} not found in dataset")
    pos = int(matches[0])  # first occurrence if the id is duplicated

    scores = np.asarray(similarity_mat[pos], dtype=float).ravel()

    # Descending by score; the stable sort keeps ties in original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. It is NOT guaranteed to sort first:
    # a duplicate entry ties with it at the top, and a movie with an all-zero
    # TF-IDF vector has self-similarity 0.
    ranked = ranked[ranked != pos][:max(int(n), 0)]

    # Build result
    result = movies_df.iloc[ranked][['movieId', 'title', 'genres']].copy()
    result['similarity_score'] = np.round(scores[ranked], 4)

    return result.reset_index(drop=True)

In [12]:
# 1. Check dtype and sample values
# Diagnostic cell: investigates why a lookup for movieId 603 finds nothing.
# NOTE(review): the dataset also has a 'tmdbId' column; 603 may be a TMDB id
# rather than a MovieLens movieId — confirm before relying on it.
# NOTE(review): step 3 rewrites df['movieId'] in place (nullable Int64) after the
# artifacts above were saved — confirm this side effect is intended.

# 1. Check dtype and sample values
movie_ids = df['movieId']
print("movieId dtype:", movie_ids.dtype)
print("First 10 movieIds:", movie_ids.head(10).tolist())
print("Any movieId == 603 exactly?", (movie_ids == 603).any())
print("Any movieId close to 603?", ((movie_ids >= 600) & (movie_ids <= 610)).sum())

# 2. Search by title (bypass ID issue)
title_has_matrix = df['title'].str.contains('Matrix', case=False, na=False)
print("\nSearch for 'Matrix' in title:")
print(df.loc[title_has_matrix, ['movieId', 'title', 'genres']])

# 3. Force movieId to a nullable integer; unparseable values become <NA>
coerced_ids = pd.to_numeric(df['movieId'], errors='coerce')
df['movieId'] = coerced_ids.astype('Int64')
print("\nAfter forcing int:")
print("movieId dtype now:", df['movieId'].dtype)
print("Now check for 603:", (df['movieId'] == 603).any())

# If found now, show the matching row
found_603 = (df['movieId'] == 603).any()
if found_603:
    print(df[df['movieId'] == 603][['movieId', 'title', 'genres', 'overview']])

movieId dtype: int64
First 10 movieIds: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Any movieId == 603 exactly? False
Any movieId close to 603? 7

Search for 'Matrix' in title:
      movieId                           title  \
1937     2571              Matrix, The (1999)   
4347     6365     Matrix Reloaded, The (2003)   
4635     6934  Matrix Revolutions, The (2003)   
5664    27660           Animatrix, The (2003)   

                                     genres  
1937                 Action|Sci-Fi|Thriller  
4347  Action|Adventure|Sci-Fi|Thriller|IMAX  
4635  Action|Adventure|Sci-Fi|Thriller|IMAX  
5664          Action|Animation|Drama|Sci-Fi  

After forcing int:
movieId dtype now: Int64
Now check for 603: False


In [14]:
# ──────────────────────────────────────────────────────────────
# 8. Quick sanity check (before manual testing)
# ──────────────────────────────────────────────────────────────

# (label, movieId, number of recommendations) for each spot check. The printed
# id is taken from the same value that is passed to the function, so the label
# cannot drift from the call (the Matrix label previously read 2751 while the
# call used 2571).
sanity_cases = [
    ("Toy Story", 1, 8),
    ("The Matrix", 2571, 5),
]

for case_label, case_movie_id, case_n in sanity_cases:
    print(f"\nSanity check - {case_label} (movieId={case_movie_id})")
    print(get_similar_movies(case_movie_id, n=case_n))



Sanity check - Toy Story (movieId=1)
   movieId                           title  \
0    78499              Toy Story 3 (2010)   
1     3114              Toy Story 2 (1999)   
2    35836  40-Year-Old Virgin, The (2005)   
3    51174             Factory Girl (2006)   
4     2041                Condorman (1981)   
5     7096         Rivers and Tides (2001)   
6     3174          Man on the Moon (1999)   
7     1103    Rebel Without a Cause (1955)   

                                             genres  similarity_score  
0  Adventure|Animation|Children|Comedy|Fantasy|IMAX            0.5501  
1       Adventure|Animation|Children|Comedy|Fantasy            0.5100  
2                                    Comedy|Romance            0.3356  
3                                             Drama            0.2193  
4                  Action|Adventure|Children|Comedy            0.2096  
5                                       Documentary            0.2013  
6                                      Come