# Content-Based Filtering (CBF) with Sentence Transformers

## Overview
This notebook implements a **Content-Based Filtering** recommendation model. Because recommendations are derived from movie content rather than user behavior, it mitigates the item "cold start" problem: a movie can be recommended even when it has no ratings yet.

## Methodology
1. **Data Soup**: Aggregates `Title`, `Genres`, and `Tags` into a single semantic string for each movie.
2. **Vectorization**: Uses **Sentence Transformers (SBERT)** to encode this text into dense, high-dimensional vectors that capture semantic meaning.
3. **Inference**: Calculates **Cosine Similarity** between these vectors to recommend movies that are thematically similar, even if they share no common keywords.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import sys

# Paths are derived from the notebook's working directory, which is expected to be
# .../Netflix-Hybrid-Deep-Recommendation-Model/cf_model/
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
# CBF artifacts live in their own folder so they never clash with the CF model's files
MODEL_DIR = Path("saved_model_cbf")
MODEL_DIR.mkdir(exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious immediately
print(
    f"Project Root: {PROJECT_ROOT}",
    f"Data Directory: {DATA_DIR}",
    f"Model Directory: {MODEL_DIR}",
    sep="\n",
)

# Warn (without stopping) when the dataset has not been downloaded yet
if not DATA_DIR.joinpath("movies.csv").exists():
    print("WARNING: movies.csv not found. Please run onboarding.py first to download data.")

Project Root: C:\Users\Arjun\Documents\Github\Netflix-Hybrid-Deep-Recommendation-Model
Data Directory: C:\Users\Arjun\Documents\Github\Netflix-Hybrid-Deep-Recommendation-Model\data
Model Directory: saved_model_cbf


In [2]:
def load_and_preprocess_data():
    """
    Loads Movies and creates a 'soup' string for semantic encoding.

    Reads ``movies.csv`` (and ``tags.csv`` when it exists) from ``DATA_DIR``.

    Returns:
        pd.DataFrame: the movies frame with a fresh 0..N-1 index and three
        added columns:
          - ``tag``: all tags of the movie joined by spaces ('' when none),
          - ``genres_clean``: ``genres`` with '|' replaced by spaces,
          - ``soup``: "Title: <title> Genre: <genres> Keywords: <tags>".
    """
    print("Loading data...")
    movies_path = DATA_DIR / "movies.csv"
    tags_path = DATA_DIR / "tags.csv"  # Standard MovieLens file

    # 1. Load Movies
    movies = pd.read_csv(movies_path)

    # 2. Load and Aggregate Tags (if available)
    if tags_path.exists():
        print("Aggregating tags...")
        tags = pd.read_csv(tags_path)
        tags = (
            # Keep only tags that belong to a known movie
            tags[tags['movieId'].isin(movies['movieId'])]
            # Drop missing tags BEFORE the str cast: astype(str) would turn NaN
            # into the literal word "nan" and leak it into the soup text
            .dropna(subset=['tag'])
            # .assign builds a new frame, so no write happens on a filtered slice
            .assign(tag=lambda frame: frame['tag'].astype(str))
        )
        # One row per movie: its tags joined with spaces, in file order
        movie_tags = tags.groupby('movieId')['tag'].agg(' '.join).reset_index()
        movies = pd.merge(movies, movie_tags, on='movieId', how='left')
        # Movies without any tag get an empty keyword list
        movies['tag'] = movies['tag'].fillna('')
    else:
        print("tags.csv not found (onboarding.py might not have extracted it). Using titles + genres only.")
        movies['tag'] = ''

    # 3. Create the "Content Soup"
    # We replace pipes in genres with spaces for better tokenization
    print("Creating text metadata...")
    movies['genres_clean'] = movies['genres'].str.replace('|', ' ', regex=False)

    def create_soup(x):
        # Format: "Title: <title> Genre: <genres> Keywords: <tags>"
        return f"Title: {x['title']} Genre: {x['genres_clean']} Keywords: {x['tag']}"

    movies['soup'] = movies.apply(create_soup, axis=1)

    # Reset index to ensure 0..N alignment with embeddings
    movies = movies.reset_index(drop=True)

    print(f"Data prepared: {len(movies)} movies.")
    return movies

# Build the metadata frame once; later cells read it through the name `movies_df`
movies_df = load_and_preprocess_data()
# Preview the first rows to sanity-check the generated columns
movies_df.head()

Loading data...
Aggregating tags...
Creating text metadata...
Data prepared: 27278 movies.


Unnamed: 0,movieId,title,genres,tag,genres_clean,soup
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Watched computer animation Disney animated fea...,Adventure Animation Children Comedy Fantasy,Title: Toy Story (1995) Genre: Adventure Anima...
1,2,Jumanji (1995),Adventure|Children|Fantasy,time travel adapted from:book board game child...,Adventure Children Fantasy,Title: Jumanji (1995) Genre: Adventure Childre...
2,3,Grumpier Old Men (1995),Comedy|Romance,old people that is actually funny sequel fever...,Comedy Romance,Title: Grumpier Old Men (1995) Genre: Comedy R...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,chick flick revenge characters chick flick cha...,Comedy Drama Romance,Title: Waiting to Exhale (1995) Genre: Comedy ...
4,5,Father of the Bride Part II (1995),Comedy,Diane Keaton family sequel Steve Martin weddin...,Comedy,Title: Father of the Bride Part II (1995) Genr...


In [3]:
def build_embeddings(df, model_name='all-MiniLM-L6-v2'):
    """
    Turn every row's 'soup' text into a dense embedding vector.

    Args:
        df: DataFrame that has a 'soup' column holding the text to encode.
        model_name: Name of the Sentence Transformer model to load.

    Returns:
        The encoder output for all soups, requested as a NumPy array
        (``convert_to_numpy=True``); its shape is printed before returning.
    """
    print(f"Loading Sentence Transformer: {model_name}...")
    model = SentenceTransformer(model_name)

    print("Encoding movie metadata (this may take time)...")
    texts = df['soup'].tolist()
    encode_options = dict(
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    vectors = model.encode(texts, **encode_options)

    print(f"Embeddings shape: {vectors.shape}")
    return vectors

# Encoding is the slow step, so reuse embeddings saved by a previous run when possible
embedding_path = MODEL_DIR / "content_embeddings.npy"
metadata_path = MODEL_DIR / "movies_metadata.pkl"

movie_embeddings = None
if embedding_path.exists():
    print("Loading pre-computed embeddings...")
    movie_embeddings = np.load(embedding_path)
    # Row i of the matrix must describe row i of movies_df. A different row count
    # proves the cache was built from another movies.csv, so it is discarded.
    # NOTE: this is a cheap guard only; it cannot detect edited titles/tags when the
    # number of movies is unchanged. Delete the .npy file to force a rebuild.
    if movie_embeddings.shape[0] != len(movies_df):
        print(
            f"Cached embeddings have {movie_embeddings.shape[0]} rows but "
            f"movies_df has {len(movies_df)}; rebuilding."
        )
        movie_embeddings = None

if movie_embeddings is None:
    movie_embeddings = build_embeddings(movies_df)
    # Save for future use; the metadata pickle is written together with the
    # embeddings so the two artifacts always describe the same rows
    np.save(embedding_path, movie_embeddings)
    movies_df.to_pickle(metadata_path)
    print("Embeddings and metadata saved.")

Loading Sentence Transformer: all-MiniLM-L6-v2...
Encoding movie metadata (this may take time)...


Batches:   0%|          | 0/427 [00:00<?, ?it/s]

Embeddings shape: (27278, 384)
Embeddings and metadata saved.


In [4]:
# Reverse lookup: lower-cased title -> index label of that movie in movies_df,
# so titles can be matched regardless of letter case.
# When two movies share a title, the later row wins.
title_to_idx = {
    movie_name.lower(): row_label
    for row_label, movie_name in movies_df['title'].items()
}

def get_content_recommendations(title, top_k=10):
    """
    Given a movie title, returns the top_k most similar movies based on content.

    Args:
        title: Movie title; compared case-insensitively against the keys of
            ``title_to_idx`` (exact match required for a recommendation).
        top_k: Maximum number of recommendations to return.

    Returns:
        - pd.DataFrame with columns ``title``, ``genres`` and
          ``similarity_score``, ordered from most to least similar, when the
          title is found. The query movie itself is never included.
        - None when there is no exact match but some known titles contain the
          query; up to three suggestions are printed.
        - str (an error message) when nothing matches at all.
    """
    title_lower = title.lower()

    if title_lower not in title_to_idx:
        # No exact match: fall back to a substring search, used only to suggest titles
        matches = [t for t in title_to_idx.keys() if title_lower in t]
        if matches:
            print(f"Movie '{title}' not found. Did you mean: {matches[:3]}?")
            return None
        return f"Movie '{title}' not found in database."

    # 1. Get index
    idx = title_to_idx[title_lower]

    # 2. Calculate Cosine Similarity
    # Reshape query to (1, dim)
    query_embedding = movie_embeddings[idx].reshape(1, -1)
    sim_scores = cosine_similarity(query_embedding, movie_embeddings)[0]

    # 3. Rank by descending similarity and remove the query movie BY INDEX.
    # Slicing off the last argsort position is not safe: a movie with an identical
    # (or tied) embedding can outrank the query, which would then be recommended
    # to itself while a genuine neighbour is dropped.
    ranked = sim_scores.argsort()[::-1]
    top_indices = ranked[ranked != idx][:max(top_k, 0)]

    # 4. Format results
    results = movies_df.iloc[top_indices][['title', 'genres']].copy()
    results['similarity_score'] = sim_scores[top_indices]
    return results

print("Recommendation system ready.")

Recommendation system ready.


In [5]:
# Smoke-test the recommender with one representative title per genre family
sample_titles = (
    "Inception (2010)",      # Test Case 1: Sci-Fi / Action
    "Toy Story (1995)",      # Test Case 2: Animation / Children
    "Exorcist, The (1973)",  # Test Case 3: Horror
)

for movie_title in sample_titles:
    print(f"\n--- Recommendations for {movie_title} ---")
    recs = get_content_recommendations(movie_title)
    # None means the lookup failed and a hint was already printed
    if recs is not None:
        display(recs)


--- Recommendations for Inception (2010) ---


Unnamed: 0,title,genres,similarity_score
14869,Shutter Island (2010),Drama|Mystery|Thriller,0.706495
3770,"Cell, The (2000)",Drama|Horror|Thriller,0.690508
10520,Stay (2005),Thriller,0.670623
2515,eXistenZ (1999),Action|Sci-Fi|Thriller,0.66762
4132,Memento (2000),Mystery|Thriller,0.664933
4777,Waking Life (2001),Animation|Drama|Fantasy,0.65367
15541,Knight and Day (2010),Action|Comedy|Thriller,0.649456
15668,"Expendables, The (2010)",Action|Adventure|Thriller,0.647013
14276,Surrogates (2009),Action|Sci-Fi|Thriller,0.639492
14234,Paranormal Activity (2009),Horror|Thriller,0.634062



--- Recommendations for Toy Story (1995) ---


Unnamed: 0,title,genres,similarity_score
3027,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.81254
15401,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,0.776747
2270,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,0.732606
13767,Up (2009),Adventure|Animation|Children|Drama,0.719297
4790,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,0.68379
6271,Finding Nemo (2003),Adventure|Animation|Children|Comedy,0.669834
5121,Ice Age (2002),Adventure|Animation|Children|Comedy,0.656013
25461,Toy Story Toons: Partysaurus Rex (2012),Animation|Children|Comedy,0.642412
2209,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,0.635399
589,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,0.633959



--- Recommendations for Exorcist, The (1973) ---


Unnamed: 0,title,genres,similarity_score
10428,"Exorcism of Emily Rose, The (2005)",Crime|Drama|Horror|Thriller,0.782334
3539,"House of Exorcism, The (a.k.a. Lisa and the De...",Horror,0.757806
25327,Exorcismus (2010),Horror,0.714623
1914,Exorcist II: The Heretic (1977),Horror,0.702385
1320,"Omen, The (1976)",Horror|Mystery|Thriller,0.684798
20608,"Last Exorcism Part II, The (2013)",Horror|Thriller,0.673048
18400,"Devil Inside, The (2012)",Horror|Thriller,0.645692
15597,"Name for Evil, A (1973)",Horror,0.639093
26908,Demonic (2015),Horror|Thriller,0.636613
20777,Horror of the Zombies (1974),Horror,0.635339
