1. Import Libraries

In [22]:
import pandas as pd
import numpy as np
import ast
from fuzzywuzzy import process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score, recall_score, f1_score

2. Data Exploration

In [23]:
# Load the movie and credits data
movies = pd.read_csv('../datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('../datasets/tmdb_5000_credits.csv')

# Rename 'movie_id' to 'id' in the credits dataframe to match the movies dataframe
credits = credits.rename(columns={'movie_id': 'id'})

# Merge the movies dataframe with the credits dataframe on 'id'
movies = movies.merge(credits, on='id')

# Fill any missing values in the 'genres' and 'cast' columns
movies['genres'] = movies['genres'].fillna('')
movies['cast'] = movies['cast'].fillna('')

# Convert genres from string to list
movies['genres'] = movies['genres'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) else [])

# Convert cast from string to list (first 3 actors for simplicity)
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)] if isinstance(x, str) else [])

3. Content-Based Filtering

In [24]:
# Combine genres and cast into a single string (for each movie)
movies['combined_features'] = movies['genres'].apply(lambda x: ' '.join(x)) + ' ' + movies['cast'].apply(lambda x: ' '.join(x))

# Drop 'title_y' column and rename 'title_x' to 'title'
movies = movies.drop(columns=['title_y'])
movies = movies.rename(columns={'title_x': 'title'})

In [25]:
# Function to combine features into a single string
def combine_features(row):
    # Handle missing or non-string values by converting to string
    genres = " ".join(row['genres']) if isinstance(row['genres'], list) else str(row['genres'])
    keywords = " ".join(row['keywords']) if isinstance(row['keywords'], list) else str(row['keywords'])
    cast = " ".join(row['cast']) if isinstance(row['cast'], list) else str(row['cast'])
    overview = str(row['overview'])  # Ensure 'overview' is a string

    # Combine all features into a single string
    return genres + ' ' + keywords + ' ' + cast + ' ' + overview

# Apply the function to create the 'combined_features' column
movies['combined_features'] = movies.apply(combine_features, axis=1)

In [26]:
# Check the combined features
print(movies['combined_features'].head())

0    Action Adventure Fantasy Science Fiction [{"id...
1    Adventure Fantasy Action [{"id": 270, "name": ...
2    Action Adventure Crime [{"id": 470, "name": "s...
3    Action Crime Drama Thriller [{"id": 849, "name...
4    Action Adventure Science Fiction [{"id": 818, ...
Name: combined_features, dtype: object


In [27]:
# Function to extract 'name' from a list of dictionaries
def extract_names(data):
    try:
        # If the data is a string representation of a list, convert it back to a list
        if isinstance(data, str):
            data = ast.literal_eval(data)
        
        # If it's a list of dictionaries, extract the 'name' field from each dictionary
        if isinstance(data, list):
            names = [item['name'] for item in data if isinstance(item, dict)]
            return " ".join(names)  # Join names with a space
        else:
            return ""  # Return empty string if it's not a valid list
    except:
        return ""  # Return empty string if an error occurs

# Function to combine features into a single string
def combine_features(row):
    # Extract the 'name' field for each feature (genres, keywords, cast)
    genres = extract_names(row['genres'])
    keywords = extract_names(row['keywords'])
    cast = extract_names(row['cast'])
    overview = str(row['overview'])  # Ensure 'overview' is a string
    
    # Combine all features into a single string
    return genres + ' ' + keywords + ' ' + cast + ' ' + overview

# Apply the function to create the 'combined_features' column
movies['combined_features'] = movies.apply(combine_features, axis=1)

# Check the combined features
print(movies['combined_features'].head())

0     culture clash future space war space colony s...
1     ocean drug abuse exotic island east india tra...
2     spy based on novel secret agent sequel mi6 br...
3     dc comics crime fighter terrorist secret iden...
4     based on novel mars medallion space travel pr...
Name: combined_features, dtype: object


In [28]:
# Initialize the vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Check the shape of the tfidf_matrix
print(tfidf_matrix.shape)

(4803, 23005)


In [29]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Check the cosine similarity matrix
print(cosine_sim.shape)

(4803, 4803)


In [30]:
def clean_keywords(keywords_str):
    # If the keyword string is empty or None, return an empty set
    if not keywords_str:
        return set()  # Return empty set if no keywords exist
    
    try:
        # Safely evaluate the string to convert it to a list of dictionaries
        keywords_list = ast.literal_eval(keywords_str)
    except:
        return set()  # Return empty set if evaluation fails
    
    # Extract 'name' field from each dictionary in the list (if it's a dictionary and has 'name' key)
    cleaned_keywords = [kw['name'] for kw in keywords_list if isinstance(kw, dict) and 'name' in kw]
    
    # Debugging: Print cleaned keywords
    #print(f"Cleaned keywords: {cleaned_keywords}")
    
    # Return a set of cleaned keywords
    return set(cleaned_keywords)

4. KNN

In [31]:
# Use KNN to find the most similar movies based on TF-IDF features
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(tfidf_matrix)

In [38]:
# Function to get movie index using fuzzy matching
def get_movie_index(movie_title, movies_df):
    best_match, score = process.extractOne(movie_title, movies_df['title'].values)
    if score >= 70:  # Threshold for a good match
        return movies_df[movies_df['title'] == best_match].index[0]
    else:
        print(f"Movie '{movie_title}' not found or doesn't match closely enough.")
        return None
    
# Function to recommend movies using KNN based on TF-IDF matrix
def get_relevant_movies_knn(movie_title, knn_model, movies_df, tfidf_matrix, top_n=5):
    # Get the movie index using fuzzy matching
    movie_index = get_movie_index(movie_title, movies_df)
    
    if movie_index is None:
        return []  # Return empty list if no good match is found
    
    # Get distances and indices of neighbors
    distances, indices = knn_model.kneighbors(tfidf_matrix[movie_index], n_neighbors=top_n + 1)
    
    # Extract movie titles, skipping the first result (the movie itself)
    relevant_movies = [movies_df.iloc[i]['title'] for i in indices[0][1:]]
    
    return relevant_movies

# List of movie titles to test
movie_title = input("Enter the movie name: ")

# Get the relevant movies
recommended_movies = get_relevant_movies_knn(movie_title, knn, movies, tfidf_matrix, top_n=5)

# Print recommended movies
print(f"\nRecommended Movies for '{movie_title}':")
for i, movie in enumerate(recommended_movies, 1):
    print(f"{i}. {movie}")



Recommended Movies for 'iron man':
1. Iron Man 3
2. Iron Man 2
3. Avengers: Age of Ultron
4. The Incredible Hulk
5. Captain America: Civil War


In [39]:
# Function to get movie index using fuzzy matching
def get_movie_index(movie_title, movies_df):
    best_match, score = process.extractOne(movie_title, movies_df['title'].values)
    if score >= 70:  # Threshold for a good match
        return movies_df[movies_df['title'] == best_match].index[0]
    else:
        print(f"Movie '{movie_title}' not found or doesn't match closely enough.")
        return None

# Function to get relevant movies dynamically based on similarity using cosine similarity
def get_relevant_movies_by_cosine_similarity(movie_title, cosine_sim, movies_df, top_n=5):
    movie_index = get_movie_index(movie_title, movies_df)
    
    if movie_index is None:
        return []  # Return empty list if no good match is found
    
    # Get similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    
    # Sort movies by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N relevant movies (skip the first one, as it's the movie itself)
    relevant_movies = [movies_df.iloc[i[0]]['title'] for i in sim_scores[1:top_n+1]]
    
    return relevant_movies

# Function to recommend movies using KNN based on TF-IDF matrix
def get_relevant_movies_knn(movie_title, knn_model, movies_df, tfidf_matrix, top_n=5):
    # Get the movie index using fuzzy matching
    movie_index = get_movie_index(movie_title, movies_df)
    
    if movie_index is None:
        return []  # Return empty list if no good match is found
    
    # Get distances and indices of neighbors
    distances, indices = knn_model.kneighbors(tfidf_matrix[movie_index], n_neighbors=top_n + 1)
    
    # Extract movie titles, skipping the first result (the movie itself)
    relevant_movies = [movies_df.iloc[i]['title'] for i in indices[0][1:]]
    
    return relevant_movies

# Function to evaluate precision and recall
def evaluate_recommendations(true_relevant_movies, recommended_movies, top_n=5):
    relevant_in_top_n = [movie for movie in recommended_movies[:top_n] if movie in true_relevant_movies]
    precision_at_n = len(relevant_in_top_n) / top_n if top_n > 0 else 0
    recall_at_n = len(relevant_in_top_n) / len(true_relevant_movies) if true_relevant_movies else 0
    return precision_at_n, recall_at_n

# Function to get the recommended movies and evaluate precision & recall
def recommend_and_evaluate(movie_title, knn_model, cosine_sim, tfidf_matrix, movies_df, top_n=5):
    
    # Get relevant movies dynamically using cosine similarity
    true_relevant_movies = get_relevant_movies_by_cosine_similarity(movie_title, cosine_sim, movies_df, top_n)
    '''
    if not true_relevant_movies:
        print("No relevant movies found dynamically.")
        return None, None
    
    # Print relevant movies
    print(f"True Relevant Movies for '{movie_title}':")
    
    for i, movie in enumerate(true_relevant_movies, 1):
        print(f"{i}. {movie}")
    '''
    
    # Get recommended movies using KNN
    recommended_movies = get_relevant_movies_knn(movie_title, knn_model, movies_df, tfidf_matrix, top_n)
    
    # Print recommended movies
    print(f"\nRecommended Movies for '{movie_title}':")
    for i, movie in enumerate(recommended_movies, 1):
        print(f"{i}. {movie}")
    
    # Evaluate precision and recall
    precision, recall = evaluate_recommendations(true_relevant_movies, recommended_movies, top_n)
    
    return recommended_movies, (precision, recall)

# Sample movie title input from user
movie_title_input = input("Enter the movie name: ")  # Example: User enters 'Spiderman'

# Assuming cosine_sim, knn_model, and tfidf_matrix are already computed
# Get the recommended movies and precision & recall for the input movie
recommended_movies, (precision, recall) = recommend_and_evaluate(movie_title_input, knn, cosine_sim, tfidf_matrix, movies, top_n=5)

# Print Precision and Recall values
print(f"\nPrecision at 5: {precision}")
print(f"Recall at 5: {recall}")


Recommended Movies for 'pirates':
1. Pirates of the Caribbean: Dead Man's Chest
2. Pirates of the Caribbean: The Curse of the Black Pearl
3. Pirates of the Caribbean: On Stranger Tides
4. The Pirates! In an Adventure with Scientists!
5. The Blue Lagoon

Precision at 5: 1.0
Recall at 5: 1.0


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_top_keywords(movie_index, top_n=5):
    """Get top contributing keywords for a movie."""
    feature_array = np.array(tfidf.get_feature_names_out())
    tfidf_scores = tfidf_matrix[movie_index].toarray().flatten()
    top_indices = tfidf_scores.argsort()[-top_n:][::-1]
    top_keywords = feature_array[top_indices]
    top_scores = tfidf_scores[top_indices]
    return list(zip(top_keywords, top_scores))

# Example usage
selected_movie_index = get_movie_index("Interstellar", movies)
top_keywords = get_top_keywords(selected_movie_index)
print("Top contributing keywords:", top_keywords)


Top contributing keywords: [('space', 0.33056231238690836), ('interstellar', 0.2947636379517511), ('wormhole', 0.28464281868798796), ('father', 0.18062222648272627), ('single', 0.16804786623259732)]
