In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

1. Data Exploration

In [10]:
# Load the movie and credits data
movies = pd.read_csv('../datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('../datasets/tmdb_5000_credits.csv')

In [11]:
print(movies.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [12]:
movies.shape

(4803, 20)

In [13]:
print(movies.columns)
print(credits.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [14]:
print(credits.head())

   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                                cast  \
0  [{"cast_id": 242, "character": "Jake Sully", "...   
1  [{"cast_id": 4, "character": "Captain Jack Spa...   
2  [{"cast_id": 1, "character": "James Bond", "cr...   
3  [{"cast_id": 2, "character": "Bruce Wayne / Ba...   
4  [{"cast_id": 5, "character": "John Carter", "c...   

                                                crew  
0  [{"credit_id": "52fe48009251416c750aca23", "de...  
1  [{"credit_id": "52fe4232c3a36847f800b579", "de...  
2  [{"credit_id": "54805967c3a36829b5002c41", "de...  
3  [{"credit_id": "52fe4781c3a36847f81398c3", "de...  
4  [{"credit_id": "52fe479ac3a36847f813eaa3",

2. Data Preprocessing

In [15]:
credits.shape

(4803, 4)

In [16]:
print(credits.columns)

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [17]:
# Rename 'movie_id' to 'id' in the credits dataframe to match the movies dataframe
credits = credits.rename(columns={'movie_id': 'id'})

In [18]:
print(credits.columns)

Index(['id', 'title', 'cast', 'crew'], dtype='object')


In [19]:
# Merge the movies dataframe with the credits dataframe on 'id'
movies = movies.merge(credits, on='id')

In [20]:
# Check the first few rows of the merged dataframe
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [21]:
# Fill any missing values in the 'genres' and 'cast' columns
movies['genres'] = movies['genres'].fillna('')
movies['cast'] = movies['cast'].fillna('')

In [22]:
# Convert genres from string to list
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) else [])

# Convert cast from string to list (first 3 actors for simplicity)
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) else [])

3. Content-Based Filtering

In [23]:
# Combine genres and cast into a single string (for each movie)
movies['combined_features'] = movies['genres'].apply(lambda x: ' '.join(x)) + ' ' + movies['cast'].apply(lambda x: ' '.join(x))


In [24]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the 'combined_features' column
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

4. Cosine Similarity Matrix

In [25]:

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [26]:
# Check the columns of the merged DataFrame to verify 'title'
print(movies.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew', 'combined_features'],
      dtype='object')


In [27]:
# Drop 'title_y' column and rename 'title_x' to 'title'
movies = movies.drop(columns=['title_y'])
movies = movies.rename(columns={'title_x': 'title'})

5. Testing Recommendations Based on Similarity

In [28]:
# Function to get movie recommendations based on similarity
def recommend_movie(movie_title, cosine_sim, movies_df, top_n=5):
    # Get the index of the movie from the title
    idx = movies_df[movies_df['title'] == movie_title].index[0]
    
    # Get similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top_n most similar movies (excluding the movie itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Get movie indices and titles
    movie_indices = [i[0] for i in sim_scores]
    movie_titles = movies_df['title'].iloc[movie_indices]
    
    return movie_titles

# Example: Recommend 5 movies similar to 'The Dark Knight'
recommended_movies = recommend_movie('Iron Man', cosine_sim, movies, top_n=5)
print(recommended_movies)

79                         Iron Man 2
31                         Iron Man 3
361     You Don't Mess with the Zohan
16                       The Avengers
2000                Anywhere But Here
Name: title, dtype: object


In [29]:
# Function to extract 'name' from a list of dictionaries
def extract_names(data):
    try:
        # If the data is a string representation of a list, convert it back to a list
        if isinstance(data, str):
            data = ast.literal_eval(data)
        
        # If it's a list of dictionaries, extract the 'name' field from each dictionary
        if isinstance(data, list):
            names = [item['name'] for item in data if isinstance(item, dict)]
            return " ".join(names)  # Join names with a space
        else:
            return ""  # Return empty string if it's not a valid list
    except: 
        return ""  # Return empty string if an error occurs

# Function to combine features into a single string
def combine_features(row):
    # Extract the 'name' field for each feature (genres, keywords, cast)
    genres = extract_names(row['genres'])
    keywords = extract_names(row['keywords'])
    cast = extract_names(row['cast'])
    overview = str(row['overview'])  # Ensure 'overview' is a string
    
    # Combine all features into a single string
    return genres + ' ' + keywords + ' ' + cast + ' ' + overview

# Apply the function to create the 'combined_features' column
movies['combined_features'] = movies.apply(combine_features, axis=1)

# Check the combined features
print(movies['combined_features'].head())


0     culture clash future space war space colony s...
1     ocean drug abuse exotic island east india tra...
2     spy based on novel secret agent sequel mi6 br...
3     dc comics crime fighter terrorist secret iden...
4     based on novel mars medallion space travel pr...
Name: combined_features, dtype: object


In [30]:
# Initialize the vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Check the shape of the tfidf_matrix
print(tfidf_matrix.shape)

(4803, 23005)


In [31]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Check the cosine similarity matrix
print(cosine_sim.shape)

(4803, 4803)


In [32]:
def clean_keywords(keywords_str):
    # If the keyword string is empty or None, return an empty set
    if not keywords_str:
        return set()  # Return empty set if no keywords exist
    
    try:
        # Safely evaluate the string to convert it to a list of dictionaries
        keywords_list = ast.literal_eval(keywords_str)
    except:
        return set()  # Return empty set if evaluation fails
    
    # Extract 'name' field from each dictionary in the list (if it's a dictionary and has 'name' key)
    cleaned_keywords = [kw['name'] for kw in keywords_list if isinstance(kw, dict) and 'name' in kw]
    
    # Debugging: Print cleaned keywords
    #print(f"Cleaned keywords: {cleaned_keywords}")
    
    # Return a set of cleaned keywords
    return set(cleaned_keywords)

In [33]:
# This code for relevent movies
def get_movie_index(movie_title, movies_df):
    # Use fuzzy matching to find the best match
    best_match, score = process.extractOne(movie_title, movies_df['title'].values)
    
    if score < 70:  # Threshold to avoid incorrect matches
        print(f"Movie '{movie_title}' not found or doesn't match closely enough.")
        return None  # Return None if no good match is found
    
    # Get the index of the matched movie
    movie_index = movies_df[movies_df['title'] == best_match].index[0]
    return movie_index

def get_relevant_movies_by_similarity(movie_title, similarity_matrix, movies_df, top_n=5):
    # Get the movie index using fuzzy matching
    movie_index = get_movie_index(movie_title, movies_df)
    
    if movie_index is None:
        return []  # Return empty list if no good match is found
    
    # Get similarity scores for all movies
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))
    
    # Sort movies by similarity score
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N relevant movies (skip the first one, as it's the selected movie itself)
    relevant_movies = [movies_df.iloc[i[0]]['title'] for i in sorted_scores[1:top_n+1]]
    
    return relevant_movies

# End of relevent movies 


def evaluate_recommendations(recommended_movies, relevant_movies, top_n):
    """
    Evaluate the recommendations using Precision, Recall, and F1-score.
    """
    # If the recommended_movies list is shorter than top_n, pad with empty strings
    if len(recommended_movies) < top_n:
        recommended_movies.extend([""] * (top_n - len(recommended_movies)))
    
    # Convert to binary lists (1 if movie is relevant, 0 if not)
    y_true = [1 if movie in relevant_movies else 0 for movie in recommended_movies[:top_n]]
    y_pred = [1 if movie in recommended_movies[:top_n] else 0 for movie in recommended_movies[:top_n]]
    
    # Use zero_division=1 to avoid the warning and set a default value for undefined recall
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    
    return precision, recall, f1

def recommend_with_explanation(movie_title, similarity_matrix, movies_df, top_n=3, relevant_movies=None):
    # Step 1: Use fuzzy matching to find the best movie title in the dataset
    best_match, score = process.extractOne(movie_title, movies_df['title'].values)
    
    if score < 70:  # You can adjust the threshold as needed
        print(f"Movie '{movie_title}' not found with a good match in the dataset.")
        return [], [], None  # Return empty lists and None for metrics

    movie_index = movies_df[movies_df['title'] == best_match].index[0]
    
    # Step 2: Get similarity scores for all movies
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))
    
    # Step 3: Sort movies by similarity score
    sorted_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    recommendations = []
    explanations = []
    
    for i in sorted_scores[1:top_n+1]:  # Skip the first movie (the selected one itself)
        similar_movie_index = i[0]
        similar_movie_title = movies_df.iloc[similar_movie_index]['title']
        
        # Step 5: Compute overlapping features
        genres_overlap = set(movies_df.iloc[movie_index]['genres']).intersection(
            set(movies_df.iloc[similar_movie_index]['genres'])
        )
        
        keywords_overlap = clean_keywords(movies_df.iloc[movie_index]['keywords']).intersection(
            clean_keywords(movies_df.iloc[similar_movie_index]['keywords'])
        )
        
        cast_overlap = set(movies_df.iloc[movie_index]['cast']).intersection(
            set(movies_df.iloc[similar_movie_index]['cast'])
        )
        
        # Step 6: Construct an explanation
        explanation = f"{similar_movie_title} is recommended because it shares:\n"
        if genres_overlap:
            explanation += f"- Genres: {', '.join(genres_overlap)}\n"
        if keywords_overlap:
            explanation += f"- Keywords: {', '.join(keywords_overlap)}\n"
        if cast_overlap:
            explanation += f"- Cast: {', '.join(cast_overlap)}\n"
        
        # Append recommendation and explanation
        recommendations.append(similar_movie_title)
        explanations.append(explanation)
    
    # Evaluate the recommendations if relevant_movies is provided
    if relevant_movies:
        precision, recall, f1 = evaluate_recommendations(recommendations, relevant_movies, top_n)
        return recommendations, explanations, (precision, recall, f1)
    
    return recommendations, explanations, None

In [36]:
# Input movie title
movie_title = input("Enter the movie name: ")

# Assume the relevant_movies is a list of movie titles that are relevant to the user
relevant_movies = get_relevant_movies_by_similarity(movie_title, cosine_sim, movies, top_n=3)  # This should be dynamically determined

# Ensure the movie title exists in the dataset using fuzzy matching
best_match, score = process.extractOne(movie_title, movies['title'].values)

if score < 70:  # You can adjust the threshold as needed
    print(f"Movie '{movie_title}' not found in the dataset with a good match.")
else:
    print(f"Did you mean '{best_match}'?")
    recommendations, explanations, metrics = recommend_with_explanation(best_match, cosine_sim, movies, top_n=5, relevant_movies=relevant_movies)

    if recommendations:
        for i in range(len(recommendations)):
            print(f"\nRecommended Movie: {recommendations[i]}")
            print(f"Explanation: {explanations[i]}")
        
        if metrics:
            precision, recall, f1 = metrics
            print(f"\nEvaluation Metrics:")
            print(f"Precision: {precision:.2f}")
            print(f"Recall: {recall:.2f}")
            print(f"F1 Score: {f1:.2f}")
    else:
        print("No recommendations found.")

Did you mean 'Pirates of the Caribbean: At World's End'?

Recommended Movie: Pirates of the Caribbean: Dead Man's Chest
Explanation: Pirates of the Caribbean: Dead Man's Chest is recommended because it shares:
- Genres: Action, Fantasy, Adventure
- Keywords: east india trading company, aftercreditsstinger, swashbuckler, pirate, exotic island, ship
- Cast: Lauren Maher, Lee Arenberg, David Schofield, Reggie Lee, Jack Davenport, Keira Knightley, Bill Nighy, Jonathan Pryce, Johnny Depp, Christopher S. Capp, David Bailie, Orlando Bloom, Stellan Skarsgård, Naomie Harris, Peter Donald Badalamenti II, Martin Klebba, Geoffrey Rush, Andy Beckwith, Ho-Kwan Tse, Vanessa Branch, Mackenzie Crook, Kevin McNally, Tom Hollander


Recommended Movie: Pirates of the Caribbean: The Curse of the Black Pearl
Explanation: Pirates of the Caribbean: The Curse of the Black Pearl is recommended because it shares:
- Genres: Action, Fantasy, Adventure
- Keywords: east india trading company, aftercreditsstinger, sw