# Movie Recommender System - Phase 3: Content-Based Filtering


In [1]:
import ast
import os
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
warnings.filterwarnings('ignore')

# Load processed data
df = pd.read_csv('../data/processed/movies_processed.csv')

# The list-valued columns are parsed back from their textual form.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# cell raises an error instead of being executed the way eval() would run it.
df['genres'] = df['genres'].apply(ast.literal_eval)
df['keywords'] = df['keywords'].apply(ast.literal_eval)
df['cast'] = df['cast'].apply(ast.literal_eval)

# Reset index to ensure consistency: the similarity matrices built later are
# addressed by row position, so labels must run 0..n-1.
df = df.reset_index(drop=True)

# 1. FEATURE PREPARATION


In [2]:
# Progress banner for the feature-preparation stage.
print("\n[1] Preparing Features...")

def clean_text(x):
    """Normalize a value into lowercase, space-free token text.

    A string is lowercased and has its spaces removed. A list has each item
    converted to ``str``, normalized the same way, and the results joined
    with single spaces. Any other input yields an empty string.
    """
    def squash(token):
        # "Science Fiction" -> "sciencefiction": multi-word names become one token.
        return token.lower().replace(" ", "")

    if isinstance(x, str):
        return squash(x)
    if isinstance(x, list):
        return ' '.join(squash(str(item)) for item in x)
    return ''

# Flatten each list-valued column into one normalized token string per movie.
df['genres_str'] = df['genres'].map(clean_text)
df['keywords_str'] = df['keywords'].map(clean_text)
df['cast_str'] = df['cast'].map(clean_text)

# Director: missing values become '', then the name is lowercased and its
# spaces removed so it forms a single token.
df['director_str'] = (
    df['director']
    .fillna('')
    .astype(str)
    .str.lower()
    .str.replace(" ", "", regex=False)
)

# Overview keeps its spaces (it is vectorized as free text); only the case changes.
df['overview_clean'] = df['overview'].fillna('').str.lower()

print("Features prepared successfully!")
print(f"Sample genres_str: {df['genres_str'].iloc[0]}")
print(f"Sample keywords_str: {df['keywords_str'].iloc[0][:50]}...")


[1] Preparing Features...
Features prepared successfully!
Sample genres_str: action adventure fantasy sciencefiction
Sample keywords_str: cultureclash future spacewar spacecolony society s...


# 2. METHOD 1: OVERVIEW-BASED (TF-IDF)


In [3]:
print("\n[2] Method 1: Overview-Based Recommendations (TF-IDF)")
print("-" * 60)

# Vectorize the overviews with unigrams and bigrams, dropping English stop
# words and terms seen in fewer than 2 documents; vocabulary capped at 5000.
tfidf_overview = TfidfVectorizer(
    stop_words='english', ngram_range=(1, 2), min_df=2, max_features=5000
)
tfidf_matrix_overview = tfidf_overview.fit_transform(df['overview_clean'])
print(f"TF-IDF Matrix shape: {tfidf_matrix_overview.shape}")

# Pairwise similarity of every overview with every other one. Called with a
# single argument, linear_kernel compares the matrix with itself. TF-IDF rows
# are L2-normalized by default, so the dot product is the cosine similarity.
cosine_sim_overview = linear_kernel(tfidf_matrix_overview)
print(f"Cosine Similarity Matrix shape: {cosine_sim_overview.shape}")

def get_recommendations_overview(title, cosine_sim=cosine_sim_overview, top_n=10):
    """
    Get recommendations based on overview similarity

    Parameters:
    -----------
    title : str
        Exact movie title; when several rows share it, the first one is used
    cosine_sim : 2-D array
        Similarity matrix whose rows and columns follow the row order of ``df``
    top_n : int
        Maximum number of recommendations (values below 0 are treated as 0)

    Returns:
    --------
    DataFrame with title, genres, vote_average, release_year and
    similarity_score, most similar first; or a "not found" message string
    when no row has the given title.
    """
    # Work with row positions, not index labels: the matrix and .iloc are positional.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return f"Movie '{title}' not found!"
    idx = matches[0]

    scores = np.asarray(cosine_sim[idx])

    # Stable descending order: tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. It is not guaranteed to sort first:
    # a duplicate entry ties with it, and a movie with an all-zero feature
    # vector has self-similarity 0.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    recommendations = df.iloc[ranked][['title', 'genres', 'vote_average', 'release_year']].copy()
    recommendations['similarity_score'] = scores[ranked]

    return recommendations

# Smoke test: print the 10 closest overview matches for a known title.
print("\n[TEST] Overview-based recommendations for 'The Dark Knight':")
print(get_recommendations_overview('The Dark Knight', top_n=10))


[2] Method 1: Overview-Based Recommendations (TF-IDF)
------------------------------------------------------------
TF-IDF Matrix shape: (4772, 5000)
Cosine Similarity Matrix shape: (4772, 4772)

[TEST] Overview-based recommendations for 'The Dark Knight':
                                        title  \
3                       The Dark Knight Rises   
299                            Batman Forever   
428                            Batman Returns   
3853  Batman: The Dark Knight Returns, Part 2   
2507                                Slow Burn   
1181                                      JFK   
119                             Batman Begins   
879                       Law Abiding Citizen   
1359                                   Batman   
9          Batman v Superman: Dawn of Justice   

                                 genres  vote_average  release_year  \
3      [Action, Crime, Drama, Thriller]           7.6        2012.0   
299            [Action, Crime, Fantasy]           5.2        

# 3. METHOD 2: METADATA-BASED (GENRES + KEYWORDS + CAST + DIRECTOR)


In [4]:
print("\n[3] Method 2: Metadata-Based Recommendations")
print("-" * 60)

# One space-separated "document" per movie. Repeating a field raises its term
# counts and therefore its weight: genres appear twice, the director three
# times, keywords and cast once.
df['metadata'] = (
    df['genres_str'] + ' ' + df['genres_str'] + ' '
    + df['keywords_str'] + ' '
    + df['cast_str'] + ' '
    + df['director_str'] + ' ' + df['director_str'] + ' ' + df['director_str']
)

# Raw term counts (not TF-IDF) so the repetition above is reflected directly.
count_metadata = CountVectorizer(stop_words='english', max_features=8000)
count_matrix_metadata = count_metadata.fit_transform(df['metadata'])
print(f"Metadata Count Matrix shape: {count_matrix_metadata.shape}")

# Pairwise cosine similarity; with one argument the matrix is compared with itself.
cosine_sim_metadata = cosine_similarity(count_matrix_metadata)
print(f"Metadata Cosine Similarity Matrix shape: {cosine_sim_metadata.shape}")

def get_recommendations_metadata(title, cosine_sim=cosine_sim_metadata, top_n=10):
    """
    Get recommendations based on metadata similarity

    Parameters:
    -----------
    title : str
        Exact movie title; when several rows share it, the first one is used
    cosine_sim : 2-D array
        Similarity matrix whose rows and columns follow the row order of ``df``
    top_n : int
        Maximum number of recommendations (values below 0 are treated as 0)

    Returns:
    --------
    DataFrame with title, genres, vote_average, release_year and
    similarity_score, most similar first; or a "not found" message string
    when no row has the given title.
    """
    # Work with row positions, not index labels: the matrix and .iloc are positional.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return f"Movie '{title}' not found!"
    idx = matches[0]

    scores = np.asarray(cosine_sim[idx])

    # Stable descending order: tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. It is not guaranteed to sort first:
    # another movie with identical metadata ties with it at the top score.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    recommendations = df.iloc[ranked][['title', 'genres', 'vote_average', 'release_year']].copy()
    recommendations['similarity_score'] = scores[ranked]

    return recommendations

# Smoke test: print the 10 closest metadata matches for a known title.
print("\n[TEST] Metadata-based recommendations for 'The Dark Knight':")
print(get_recommendations_metadata('The Dark Knight', top_n=10))


[3] Method 2: Metadata-Based Recommendations
------------------------------------------------------------
Metadata Count Matrix shape: (4772, 8000)
Metadata Cosine Similarity Matrix shape: (4772, 4772)

[TEST] Metadata-based recommendations for 'The Dark Knight':
                         title                            genres  \
3        The Dark Knight Rises  [Action, Crime, Drama, Thriller]   
119              Batman Begins            [Action, Crime, Drama]   
4613  Amidst the Devil's Wings            [Drama, Action, Crime]   
4095               Harsh Times  [Crime, Drama, Thriller, Action]   
1196              The Prestige        [Drama, Mystery, Thriller]   
3965               Point Blank  [Action, Crime, Drama, Thriller]   
3331               Harry Brown  [Thriller, Crime, Drama, Action]   
4748                   Rampage  [Action, Drama, Crime, Thriller]   
1503                    Takers  [Action, Crime, Drama, Thriller]   
3358               In Too Deep  [Drama, Action, Thrille

# 4. METHOD 3: HYBRID (OVERVIEW + METADATA)


In [5]:
print("\n[4] Method 3: Hybrid Recommendations (Overview + Metadata)")
print("-" * 60)

# Blend weights: alpha scales the metadata similarity, beta the overview similarity.
alpha, beta = 0.6, 0.4

# Element-wise weighted sum of the two similarity matrices (same shape).
cosine_sim_hybrid = beta * cosine_sim_overview + alpha * cosine_sim_metadata
print(f"Hybrid Cosine Similarity Matrix shape: {cosine_sim_hybrid.shape}")

def get_recommendations_hybrid(title, cosine_sim=cosine_sim_hybrid, top_n=10):
    """
    Get recommendations based on hybrid similarity

    Parameters:
    -----------
    title : str
        Exact movie title; when several rows share it, the first one is used
    cosine_sim : 2-D array
        Similarity matrix whose rows and columns follow the row order of ``df``
    top_n : int
        Maximum number of recommendations (values below 0 are treated as 0)

    Returns:
    --------
    DataFrame with title, genres, vote_average, release_year and
    similarity_score, most similar first; or a "not found" message string
    when no row has the given title.
    """
    # Work with row positions, not index labels: the matrix and .iloc are positional.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return f"Movie '{title}' not found!"
    idx = matches[0]

    scores = np.asarray(cosine_sim[idx])

    # Stable descending order: tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. It is not guaranteed to sort first:
    # a movie whose features are identical ties with it at the top score.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    recommendations = df.iloc[ranked][['title', 'genres', 'vote_average', 'release_year']].copy()
    recommendations['similarity_score'] = scores[ranked]

    return recommendations

# Smoke test: print the 10 closest hybrid matches for a known title.
print("\n[TEST] Hybrid recommendations for 'The Dark Knight':")
print(get_recommendations_hybrid('The Dark Knight', top_n=10))


[4] Method 3: Hybrid Recommendations (Overview + Metadata)
------------------------------------------------------------
Hybrid Cosine Similarity Matrix shape: (4772, 4772)

[TEST] Hybrid recommendations for 'The Dark Knight':
                         title                             genres  \
3        The Dark Knight Rises   [Action, Crime, Drama, Thriller]   
119              Batman Begins             [Action, Crime, Drama]   
4095               Harsh Times   [Crime, Drama, Thriller, Action]   
4613  Amidst the Devil's Wings             [Drama, Action, Crime]   
299             Batman Forever           [Action, Crime, Fantasy]   
2507                 Slow Burn  [Mystery, Crime, Drama, Thriller]   
1253             Kiss of Death   [Action, Crime, Drama, Thriller]   
1196              The Prestige         [Drama, Mystery, Thriller]   
428             Batman Returns                  [Action, Fantasy]   
4748                   Rampage   [Action, Drama, Crime, Thriller]   

      vote_av

# 5. METHOD 4: WEIGHTED HYBRID WITH POPULARITY


In [6]:
print("\n[5] Method 4: Weighted Hybrid with Popularity Boost")
print("-" * 60)

# Calculate weighted rating (IMDB formula):
#   weighted_rating = v / (v + m) * R + m / (v + m) * C
# where R is the movie's vote_average and v its vote_count.
# C is the mean vote_average over all movies.
# m is the 70th percentile of vote_count. The more votes a movie has relative
# to m, the closer its weighted rating is to its own average; movies with few
# votes are pulled toward the global mean C.
C = df['vote_average'].mean()
m = df['vote_count'].quantile(0.70)
df['weighted_rating'] = (df['vote_count'] / (df['vote_count'] + m) * df['vote_average'] +
                         m / (df['vote_count'] + m) * C)

def get_recommendations_weighted(title, top_n=10, percentile=0.70, candidate_pool=50):
    """
    Get recommendations with popularity/quality filter

    The most similar movies (by the hybrid similarity matrix) are collected,
    those with too few votes are discarded, and the remainder is re-ranked
    by a blend of similarity and weighted rating.

    Parameters:
    -----------
    title : str
        Exact movie title; when several rows share it, the first one is used
    top_n : int
        Maximum number of recommendations (values below 0 are treated as 0)
    percentile : float
        Quantile of ``vote_count`` (over the whole dataset) a candidate must
        reach to be kept
    candidate_pool : int
        Number of most-similar movies considered before filtering; raised to
        ``top_n`` when smaller, so a large ``top_n`` is not silently capped

    Returns:
    --------
    DataFrame with title, genres, vote_average, vote_count, similarity_score,
    final_score and release_year, best final_score first; or a "not found"
    message string when no row has the given title. Fewer than ``top_n`` rows
    are returned when the vote-count filter removes too many candidates.
    """
    # Work with row positions, not index labels: the matrix and .iloc are positional.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return f"Movie '{title}' not found!"
    idx = matches[0]
    top_n = max(top_n, 0)

    scores = np.asarray(cosine_sim_hybrid[idx])

    # Stable descending order: tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly (it is not guaranteed to sort first when
    # scores tie), then keep a pool larger than top_n so the filter below has
    # something to choose from.
    ranked = ranked[ranked != idx][:max(candidate_pool, top_n)]

    # Filter by quality threshold
    quality_threshold = df['vote_count'].quantile(percentile)

    recommendations = df.iloc[ranked].copy()
    recommendations['similarity_score'] = scores[ranked]

    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered view of the candidates.
    recommendations = recommendations[
        recommendations['vote_count'] >= quality_threshold
    ].copy()

    # Final score: 70% similarity, 30% weighted rating (divided by 10 to put
    # it on a 0-1 scale comparable to the similarity).
    recommendations['final_score'] = (
        0.7 * recommendations['similarity_score'] +
        0.3 * recommendations['weighted_rating'] / 10
    )

    recommendations = recommendations.sort_values('final_score', ascending=False).head(top_n)

    return recommendations[['title', 'genres', 'vote_average', 'vote_count',
                            'similarity_score', 'final_score', 'release_year']]

# Smoke test: print the 10 best weighted recommendations for a known title.
print("\n[TEST] Weighted recommendations for 'The Dark Knight':")
print(get_recommendations_weighted('The Dark Knight', top_n=10))


[5] Method 4: Weighted Hybrid with Popularity Boost
------------------------------------------------------------

[TEST] Weighted recommendations for 'The Dark Knight':
                      title                            genres  vote_average  \
3     The Dark Knight Rises  [Action, Crime, Drama, Thriller]           7.6   
119           Batman Begins            [Action, Crime, Drama]           7.5   
1196           The Prestige        [Drama, Mystery, Thriller]           8.0   
1850               Scarface  [Action, Crime, Drama, Thriller]           8.0   
879     Law Abiding Citizen          [Drama, Crime, Thriller]           7.2   
428          Batman Returns                 [Action, Fantasy]           6.6   
299          Batman Forever          [Action, Crime, Fantasy]           5.2   
1033               Insomnia        [Crime, Mystery, Thriller]           6.8   
1727         3 Days to Kill  [Action, Drama, Thriller, Crime]           6.0   
1664          Dead Man Down  [Thriller, 

# 6. SAVE MODELS AND CREATE RECOMMENDATION FUNCTION


In [7]:
print("\n[6] Saving models...")

# np.save does not create missing directories, so make sure the target exists
# (no-op when it is already there).
os.makedirs('../results/models/content_based', exist_ok=True)

# Save similarity matrices
np.save('../results/models/content_based/cosine_sim_overview.npy', cosine_sim_overview)
np.save('../results/models/content_based/cosine_sim_metadata.npy', cosine_sim_metadata)
np.save('../results/models/content_based/cosine_sim_hybrid.npy', cosine_sim_hybrid)
print("Similarity matrices saved!")

# Create comprehensive recommendation function
def recommend_movies(title, method='hybrid', top_n=10):
    """
    Single entry point that routes to one of the recommenders

    Parameters:
    -----------
    title : str
        Movie title
    method : str
        Which recommender to use: 'overview', 'metadata', 'hybrid',
        or 'weighted'
    top_n : int
        Number of recommendations

    Returns:
    --------
    Whatever the selected recommender returns (a DataFrame of
    recommendations, or its "not found" message), or an error message
    string when ``method`` is not one of the four supported names.
    """
    # Lambdas keep the lookup lazy: only the selected recommender is resolved.
    dispatch = {
        'overview': lambda: get_recommendations_overview(title, top_n=top_n),
        'metadata': lambda: get_recommendations_metadata(title, top_n=top_n),
        'hybrid': lambda: get_recommendations_hybrid(title, top_n=top_n),
        'weighted': lambda: get_recommendations_weighted(title, top_n=top_n),
    }

    try:
        handler = dispatch[method]
    except (KeyError, TypeError):
        # Unknown name, or an unhashable value that cannot match any key.
        return "Invalid method! Choose from: 'overview', 'metadata', 'hybrid', 'weighted'"

    return handler()


[6] Saving models...
Similarity matrices saved!


# 7. COMPREHENSIVE TESTING


In [8]:
print("\n[7] Comprehensive Testing")
print("=" * 60)

test_movies = ['Avatar', 'Inception', 'The Godfather', 'Toy Story']

for movie in test_movies:
    # Titles that are not in the dataset are skipped without any output.
    if movie not in df['title'].values:
        continue

    print(f"\n{'='*60}")
    print(f"RECOMMENDATIONS FOR: {movie}")
    print(f"{'='*60}")

    # Run every recommender on the same title so the results can be compared.
    for method in ('overview', 'metadata', 'hybrid', 'weighted'):
        print(f"\n[{method.upper()} METHOD]")
        result = recommend_movies(movie, method=method, top_n=5)
        print(result.to_string(index=False))

# Closing banner and a summary of the functions defined in this notebook.
print(
    "\n" + "=" * 60,
    "CONTENT-BASED FILTERING COMPLETED!",
    "=" * 60,
    "\nAvailable functions:",
    "  - recommend_movies(title, method, top_n)",
    "  - get_recommendations_overview(title, top_n)",
    "  - get_recommendations_metadata(title, top_n)",
    "  - get_recommendations_hybrid(title, top_n)",
    "  - get_recommendations_weighted(title, top_n)",
    sep="\n",
)


[7] Comprehensive Testing

RECOMMENDATIONS FOR: Avatar

[OVERVIEW METHOD]
                       title                              genres  vote_average  release_year  similarity_score
                   Apollo 18 [Horror, Thriller, Science Fiction]           5.0        2011.0          0.231656
                The American            [Crime, Drama, Thriller]           5.8        2010.0          0.207522
                     Beowulf      [Adventure, Action, Animation]           5.5        2007.0          0.189334
            Tears of the Sun                [Action, Drama, War]           6.4        2003.0          0.159195
The Adventures of Pluto Nash   [Action, Comedy, Science Fiction]           4.4        2002.0          0.141251

[METADATA METHOD]
                  title                                                genres  vote_average  release_year  similarity_score
              The Abyss        [Adventure, Action, Thriller, Science Fiction]           7.1        1989.0          0