In [55]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [56]:
pd.options.display.max_colwidth = 100

# Load movies metadata
movies = pd.read_csv(
    './source/movies_metadata.csv',
    usecols=['id', 'title', 'genres', 'overview'],
    dtype={'id': 'str'},
    low_memory=False
)

# Convert ID to INT
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies.dropna(subset=['id'])
movies['id'] = movies['id'].astype(int)

# Parse genres to lists
movies['genres'] = movies['genres'].apply(
    lambda x: [g['name'] for g in literal_eval(x)] if isinstance(x, str) else []
)

# Remove empty data
movies = movies[~movies['genres'].apply(lambda x: len(x) == 0)]
movies = movies.dropna(subset=['overview'])
movies['overview'] = movies['overview'].astype(str)  # Force string type


In [57]:
def debug():
    print('Shape')
    print('-'*150)
    print(movies.shape)
    print('Columns')
    print('-'*150)
    print(movies.columns)
    print('Head')
    print('-'*150)
    print(movies.head())

In [58]:
debug()

Shape
------------------------------------------------------------------------------------------------------------------------------------------------------
(42324, 4)
Columns
------------------------------------------------------------------------------------------------------------------------------------------------------
Index(['genres', 'id', 'overview', 'title'], dtype='object')
Head
------------------------------------------------------------------------------------------------------------------------------------------------------
                         genres     id  \
0   [Animation, Comedy, Family]    862   
1  [Adventure, Fantasy, Family]   8844   
2             [Romance, Comedy]  15602   
3      [Comedy, Drama, Romance]  31357   
4                      [Comedy]  11862   

                                                                                              overview  \
0  Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear

In [59]:
ratings = pd.read_csv('./source/ratings.csv')
ratings['movieId'] = ratings['movieId'].astype(int)  # Ensure integer type

# Calculate avg rating and number of reviews per movie
movie_stats = ratings.groupby('movieId').agg(
    avg_rating=('rating', 'mean'),
    num_reviews=('rating', 'count')
).reset_index()

movies = pd.merge(
    movies,
    movie_stats,
    left_on='id',
    right_on='movieId',
    how='left'
)

# Fill missing values for movies without ratings
movies['avg_rating'] = movies['avg_rating'].fillna(0)
movies['num_reviews'] = movies['num_reviews'].fillna(0)

In [60]:
debug()

Shape
------------------------------------------------------------------------------------------------------------------------------------------------------
(42324, 7)
Columns
------------------------------------------------------------------------------------------------------------------------------------------------------
Index(['genres', 'id', 'overview', 'title', 'movieId', 'avg_rating',
       'num_reviews'],
      dtype='object')
Head
------------------------------------------------------------------------------------------------------------------------------------------------------
                         genres     id  \
0   [Animation, Comedy, Family]    862   
1  [Adventure, Fantasy, Family]   8844   
2             [Romance, Comedy]  15602   
3      [Comedy, Drama, Romance]  31357   
4                      [Comedy]  11862   

                                                                                              overview  \
0  Led by Woody, Andy's toys live happily in

In [61]:
# Bayesian Weighted Score https://www.quora.com/How-does-IMDbs-rating-system-work
# https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV?showReportContentLink=false&reportContentLinkPath=%2Fcontact%2Freport#
print("""
How do you calculate the rank of movies and TV shows on the Top 250 Movies and Top 250 TV Show lists?
The following formula is used to calculate the Top Rated 250 titles. This formula provides a true 'Bayesian estimate', which takes into account the number of ratings each title has received, minimum ratings required to be on the list, and the mean rating for all titles:

weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

Where:

R = average for the title (mean) = (rating)

v = number of ratings for the title = (ratings)

m = minimum ratings required to be listed in the Top Rated 250 list (currently 25,000)

C = the mean rating across the whole report

 

Please be aware the Top 250 Movie list only includes feature films: shorts, TV movies, miniseries and documentaries are not included in the Top 250 Movies Chart. The Top 250 TV Shows Chart includes TV Series, but not TV episodes or Movies.
""")
C = movies['avg_rating'].mean()
m = movies['num_reviews'].quantile(0.90)  # Only consider top 10% reviewed

def bayesian_score(row):
    v = row['num_reviews']
    R = row['avg_rating']
    return (v/(v+m) * R) + (m/(v+m) * C)

movies['bayesian_score'] = movies.apply(bayesian_score, axis=1)



How do you calculate the rank of movies and TV shows on the Top 250 Movies and Top 250 TV Show lists?
The following formula is used to calculate the Top Rated 250 titles. This formula provides a true 'Bayesian estimate', which takes into account the number of ratings each title has received, minimum ratings required to be on the list, and the mean rating for all titles:

weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

Where:

R = average for the title (mean) = (rating)

v = number of ratings for the title = (ratings)

m = minimum ratings required to be listed in the Top Rated 250 list (currently 25,000)

C = the mean rating across the whole report

 

Please be aware the Top 250 Movie list only includes feature films: shorts, TV movies, miniseries and documentaries are not included in the Top 250 Movies Chart. The Top 250 TV Shows Chart includes TV Series, but not TV episodes or Movies.



In [62]:
debug()

Shape
------------------------------------------------------------------------------------------------------------------------------------------------------
(42324, 8)
Columns
------------------------------------------------------------------------------------------------------------------------------------------------------
Index(['genres', 'id', 'overview', 'title', 'movieId', 'avg_rating',
       'num_reviews', 'bayesian_score'],
      dtype='object')
Head
------------------------------------------------------------------------------------------------------------------------------------------------------
                         genres     id  \
0   [Animation, Comedy, Family]    862   
1  [Adventure, Fantasy, Family]   8844   
2             [Romance, Comedy]  15602   
3      [Comedy, Drama, Romance]  31357   
4                      [Comedy]  11862   

                                                                                              overview  \
0  Led by Woody, Andy's to

In [63]:
# Increase weight of genres (Optional but give better results)
movies['text'] = movies.apply(
    lambda row: ' '.join(row['genres'] * 3) + ' ' + row['overview'],
    axis=1
)

In [64]:
# TF-IDF Vectorization with bigrams
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),  # Capture phrases like "science fiction"
    max_features=10000
)
tfidf_matrix = tfidf.fit_transform(movies['text'])

# Compute cosine similarity matrix https://web.stanford.edu/class/cs246/slides/03-lsh.pdf
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [65]:
debug()

Shape
------------------------------------------------------------------------------------------------------------------------------------------------------
(42324, 9)
Columns
------------------------------------------------------------------------------------------------------------------------------------------------------
Index(['genres', 'id', 'overview', 'title', 'movieId', 'avg_rating',
       'num_reviews', 'bayesian_score', 'text'],
      dtype='object')
Head
------------------------------------------------------------------------------------------------------------------------------------------------------
                         genres     id  \
0   [Animation, Comedy, Family]    862   
1  [Adventure, Fantasy, Family]   8844   
2             [Romance, Comedy]  15602   
3      [Comedy, Drama, Romance]  31357   
4                      [Comedy]  11862   

                                                                                              overview  \
0  Led by Woody, A

In [66]:
# Combine content-based similarity with Bayesian-adjusted ratings
def hybrid_recommend(
    title, 
    genre_filter=None, 
    similarity_weight=0.6,
    rating_weight=0.4,
    top_n=10
):
    idx = movies.index[movies['title'] == title].tolist()[0]
    
    # Similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Calculate hybrid scores
    recommendations = []
    for i, sim in sim_scores:
        if i == idx:  # Avoid suggesting the selected movie
            continue
        
        # Get Bayesian score
        rating_score = movies.iloc[i]['bayesian_score'] / 5.0
        
        # Calculate hybrid score
        hybrid_score = (similarity_weight * sim) + (rating_weight * rating_score)
        
        # Apply genre filter
        if genre_filter:
            if genre_filter not in movies.iloc[i]['genres']:
                continue
        
        recommendations.append((i, hybrid_score))
    
    # Sort score
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    
    # Get recommendations
    top_indices = [i[0] for i in recommendations[:top_n]]
    
    return movies.iloc[top_indices][['title', 'genres', 'avg_rating', 'num_reviews']]


In [68]:
title = "Titanic"
genre = "Drama"
similarity_weight=0.7
rating_weight=0.3

print(f"Recommendations for '{title}' ({genre}):")
recs = hybrid_recommend(
    title, 
    genre_filter=genre,
    similarity_weight=similarity_weight,
    rating_weight=rating_weight,
    top_n=10
)
print(recs)

Recommendations for 'Titanic' (Drama):
                         title                                      genres  \
1671                Live Flesh                  [Drama, Romance, Thriller]   
31735  Gone, But Not Forgotten                  [Drama, Romance, Thriller]   
4150                 Baise-moi           [Crime, Drama, Romance, Thriller]   
6166                 Music Box           [Crime, Drama, Romance, Thriller]   
8447      The Bride Wore Black  [Mystery, Drama, Romance, Thriller, Crime]   
8080             Bad Education           [Crime, Drama, Romance, Thriller]   
9019        Cruel Intentions 2                  [Drama, Romance, Thriller]   
862                  Notorious                  [Thriller, Drama, Romance]   
9224                   Head-On                            [Drama, Romance]   
3078                 The Beach       [Drama, Adventure, Romance, Thriller]   

       avg_rating  num_reviews  
1671     2.902898       4521.0  
31735    3.128571        105.0  
415