1. Import Libraries

In [39]:
import pandas as pd
import numpy as np
import ast
from fuzzywuzzy import process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score, f1_score

2. Data Exploration

In [40]:
# Load the movie and credits data
movies = pd.read_csv('../datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('../datasets/tmdb_5000_credits.csv')

In [41]:
print(movies.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [42]:
movies.shape

(4803, 20)

In [43]:
print(movies.columns)
print(credits.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [44]:
print(credits.head())

   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                                cast  \
0  [{"cast_id": 242, "character": "Jake Sully", "...   
1  [{"cast_id": 4, "character": "Captain Jack Spa...   
2  [{"cast_id": 1, "character": "James Bond", "cr...   
3  [{"cast_id": 2, "character": "Bruce Wayne / Ba...   
4  [{"cast_id": 5, "character": "John Carter", "c...   

                                                crew  
0  [{"credit_id": "52fe48009251416c750aca23", "de...  
1  [{"credit_id": "52fe4232c3a36847f800b579", "de...  
2  [{"credit_id": "54805967c3a36829b5002c41", "de...  
3  [{"credit_id": "52fe4781c3a36847f81398c3", "de...  
4  [{"credit_id": "52fe479ac3a36847f813eaa3",

2. Data Preprocessing

In [45]:
credits.shape

(4803, 4)

In [46]:
print(credits.columns)

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [47]:
# Rename 'movie_id' to 'id' in the credits dataframe to match the movies dataframe
credits = credits.rename(columns={'movie_id': 'id'})

In [48]:
print(credits.columns)

Index(['id', 'title', 'cast', 'crew'], dtype='object')


In [49]:
# Merge the movies dataframe with the credits dataframe on 'id'
movies = movies.merge(credits, on='id')

In [50]:
# Check the first few rows of the merged dataframe
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [51]:
# Fill any missing values in the 'genres' and 'cast' columns
movies['genres'] = movies['genres'].fillna('')
movies['cast'] = movies['cast'].fillna('')

In [52]:
# Convert genres from string to list
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) else [])

# Convert cast from string to list (first 3 actors for simplicity)
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) else [])

3. Content-Based Filtering

In [53]:
# Combine genres and cast into a single string (for each movie)
movies['combined_features'] = movies['genres'].apply(lambda x: ' '.join(x)) + ' ' + movies['cast'].apply(lambda x: ' '.join(x))

In [54]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the 'combined_features' column
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [55]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [56]:
# Check the columns of the merged DataFrame to verify 'title'
print(movies.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew', 'combined_features'],
      dtype='object')


In [57]:
# Drop 'title_y' column and rename 'title_x' to 'title'
movies = movies.drop(columns=['title_y'])
movies = movies.rename(columns={'title_x': 'title'})

In [58]:
# Function to fetch poster URLs
def fetch_poster(movie_id):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"
    try:
        data = requests.get(url).json()
        poster_path = data.get('poster_path')
        if poster_path:
            return f"https://image.tmdb.org/t/p/w500/{poster_path}"
        else:
            return "https://via.placeholder.com/300x450?text=No+Image"  # Placeholder image
    except:
        return "https://via.placeholder.com/300x450?text=No+Image"  # Handle errors gracefully

# Add poster paths to the dataframe
movies['poster_path'] = movies['id'].apply(fetch_poster)

In [59]:
# Function to combine features into a single string
def combine_features(row):
    # Handle missing or non-string values by converting to string
    genres = " ".join(row['genres']) if isinstance(row['genres'], list) else str(row['genres'])
    keywords = " ".join(row['keywords']) if isinstance(row['keywords'], list) else str(row['keywords'])
    cast = " ".join(row['cast']) if isinstance(row['cast'], list) else str(row['cast'])
    overview = str(row['overview'])  # Ensure 'overview' is a string

    # Combine all features into a single string
    return genres + ' ' + keywords + ' ' + cast + ' ' + overview

# Apply the function to create the 'combined_features' column
movies['combined_features'] = movies.apply(combine_features, axis=1)

In [60]:
# Check the combined features
print(movies['combined_features'].head())

0    Action Adventure Fantasy Science Fiction [{"id...
1    Adventure Fantasy Action [{"id": 270, "name": ...
2    Action Adventure Crime [{"id": 470, "name": "s...
3    Action Crime Drama Thriller [{"id": 849, "name...
4    Action Adventure Science Fiction [{"id": 818, ...
Name: combined_features, dtype: object


In [61]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,combined_features,poster_path
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","Action Adventure Fantasy Science Fiction [{""id...",https://via.placeholder.com/300x450?text=No+Image
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","Adventure Fantasy Action [{""id"": 270, ""name"": ...",https://via.placeholder.com/300x450?text=No+Image
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","Action Adventure Crime [{""id"": 470, ""name"": ""s...",https://via.placeholder.com/300x450?text=No+Image
3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","Action Crime Drama Thriller [{""id"": 849, ""name...",https://via.placeholder.com/300x450?text=No+Image
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","Action Adventure Science Fiction [{""id"": 818, ...",https://via.placeholder.com/300x450?text=No+Image


In [62]:
# Function to extract 'name' from a list of dictionaries
def extract_names(data):
    try:
        # If the data is a string representation of a list, convert it back to a list
        if isinstance(data, str):
            data = ast.literal_eval(data)
        
        # If it's a list of dictionaries, extract the 'name' field from each dictionary
        if isinstance(data, list):
            names = [item['name'] for item in data if isinstance(item, dict)]
            return " ".join(names)  # Join names with a space
        else:
            return ""  # Return empty string if it's not a valid list
    except:
        return ""  # Return empty string if an error occurs

# Function to combine features into a single string
def combine_features(row):
    # Extract the 'name' field for each feature (genres, keywords, cast)
    genres = extract_names(row['genres'])
    keywords = extract_names(row['keywords'])
    cast = extract_names(row['cast'])
    overview = str(row['overview'])  # Ensure 'overview' is a string
    
    # Combine all features into a single string
    return genres + ' ' + keywords + ' ' + cast + ' ' + overview

# Apply the function to create the 'combined_features' column
movies['combined_features'] = movies.apply(combine_features, axis=1)

# Check the combined features
print(movies['combined_features'].head())

0     culture clash future space war space colony s...
1     ocean drug abuse exotic island east india tra...
2     spy based on novel secret agent sequel mi6 br...
3     dc comics crime fighter terrorist secret iden...
4     based on novel mars medallion space travel pr...
Name: combined_features, dtype: object


In [63]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the 'combined_features' column
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [64]:
def clean_keywords(keywords_str):
    # If the keyword string is empty or None, return an empty set
    if not keywords_str:
        return set()  # Return empty set if no keywords exist
    
    try:
        # Safely evaluate the string to convert it to a list of dictionaries
        keywords_list = ast.literal_eval(keywords_str)
    except:
        return set()  # Return empty set if evaluation fails
    
    # Extract 'name' field from each dictionary in the list (if it's a dictionary and has 'name' key)
    cleaned_keywords = [kw['name'] for kw in keywords_list if isinstance(kw, dict) and 'name' in kw]
    
    # Debugging: Print cleaned keywords
    #print(f"Cleaned keywords: {cleaned_keywords}")
    
    # Return a set of cleaned keywords
    return set(cleaned_keywords)

In [65]:
# Perform K-means clustering
def perform_clustering(X, num_clusters=5):
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    return kmeans.labels_

# Get the index of a movie from its title
def get_movie_index(movie_title, movies_df):
    best_match, score = process.extractOne(movie_title, movies_df['title'].values)
    if score < 70:
        print(f"Movie '{movie_title}' not found or doesn't match closely enough.")
        return None
    return movies_df[movies_df['title'] == best_match].index[0]

# This code for relevent movies
def get_relevant_movies_by_similarity(movie_title, similarity_matrix, movies_df, top_n=5):
    # Get the movie index using fuzzy matching
    movie_index = get_movie_index(movie_title, movies_df)
    
    if movie_index is None:
        return []  # Return empty list if no good match is found
    
    # Get similarity scores for all movies
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))
    
    # Sort movies by similarity score
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N relevant movies (skip the first one, as it's the selected movie itself)
    relevant_movies = [movies_df.iloc[i[0]]['title'] for i in sorted_scores[1:top_n+1]]
    
    return relevant_movies


# End of relevent movies 
# Recommend movies based on cluster and similarity
def recommend_with_cluster_and_similarity(movie_title, feature_matrix, movies_df, top_n=5, num_clusters=5):
    # Perform clustering
    cluster_labels = perform_clustering(feature_matrix, num_clusters)
    
    # Get the movie index using fuzzy matching
    movie_index = get_movie_index(movie_title, movies_df)
    if movie_index is None:
        return [], [], None
    
    # Get the cluster of the selected movie
    movie_cluster = cluster_labels[movie_index]
    
    # Get all movies from the same cluster
    cluster_indices = np.where(cluster_labels == movie_cluster)[0]
    
    # Get the similarity matrix for movies in the same cluster
    cluster_similarity_matrix = cosine_similarity(feature_matrix[cluster_indices])
    
    # Get similarity scores for the selected movie within the cluster
    movie_cluster_index = np.where(cluster_indices == movie_index)[0][0]
    similarity_scores = list(enumerate(cluster_similarity_matrix[movie_cluster_index]))
    
    # Sort movies by similarity score within the cluster
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Recommend top N movies from the same cluster
    recommendations = []
    explanations = []
    
    for i in sorted_scores[1:top_n+1]:  # Skip the first movie (the selected one itself)
        similar_movie_index = cluster_indices[i[0]]
        similar_movie_title = movies_df.iloc[similar_movie_index]['title']
        
        # Construct an explanation
        explanation = f"{similar_movie_title} is recommended because it is in the same cluster and shares high similarity."
        
        recommendations.append(similar_movie_title)
        explanations.append(explanation)
    
    return recommendations, explanations

In [66]:
def precision_at_k(recommended_movies, relevant_movies, k=5):
    """Calculate Precision at k"""
    # Ensure only top k recommendations are considered
    top_k_recommendations = recommended_movies[:k]
    relevant_count = sum(1 for movie in top_k_recommendations if movie in relevant_movies)
    return relevant_count / k

def recall_at_k(recommended_movies, relevant_movies, k=5):
    """Calculate Recall at k"""
    # Ensure only top k recommendations are considered
    top_k_recommendations = recommended_movies[:k]
    relevant_count = sum(1 for movie in top_k_recommendations if movie in relevant_movies)
    return relevant_count / len(relevant_movies)

# Example of how to use the function
movie_title = input("Enter the movie name: ")

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)
feature_matrix = tfidf_matrix  # Using the actual TF-IDF matrix as the feature matrix

relevant_movies = get_relevant_movies_by_similarity(movie_title, cosine_sim, movies, top_n=5)  # Dynamically determined
print(relevant_movies)
# Fuzzy matching to ensure movie title exists
best_match, score = process.extractOne(movie_title, movies['title'].values)

if score < 70:
    print(f"Movie '{movie_title}' not found in the dataset with a good match.")
else:
    print(f"Did you mean '{best_match}'?")
    recommendations, explanations = recommend_with_cluster_and_similarity(best_match, feature_matrix, movies, top_n=5)
    if recommendations:
        for i in range(len(recommendations)):
            print(f"\nRecommended Movie: {recommendations[i]}")
            print(f"Explanation: {explanations[i]}")
    else:
        print("No recommendations found.")
    # Calculate Precision at 5
    precision = precision_at_k(recommendations, relevant_movies, k=5)
    print(f"Precision at 5: {precision}")

    # Calculate Recall at 5
    recall = recall_at_k(recommendations, relevant_movies, k=5)
print(f"Recall at 5: {recall}")


(4803, 4803)
["Pirates of the Caribbean: Dead Man's Chest", 'Pirates of the Caribbean: The Curse of the Black Pearl', 'Pirates of the Caribbean: On Stranger Tides', 'The Pirates! In an Adventure with Scientists!', 'The Blue Lagoon']
Did you mean 'Pirates of the Caribbean: At World's End'?


  super()._check_params_vs_input(X, default_n_init=10)



Recommended Movie: Nim's Island
Explanation: Nim's Island is recommended because it is in the same cluster and shares high similarity.

Recommended Movie: Swept Away
Explanation: Swept Away is recommended because it is in the same cluster and shares high similarity.

Recommended Movie: Anna and the King
Explanation: Anna and the King is recommended because it is in the same cluster and shares high similarity.

Recommended Movie: What's Love Got to Do with It
Explanation: What's Love Got to Do with It is recommended because it is in the same cluster and shares high similarity.

Recommended Movie: Joy
Explanation: Joy is recommended because it is in the same cluster and shares high similarity.
Precision at 5: 0.0
Recall at 5: 0.0
