In [None]:
import ast

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Step 1: Read the movie metadata and the credits CSV files into DataFrames
movies = pd.read_csv(filepath_or_buffer='datasets/tmdb_5000_movies.csv')
credits = pd.read_csv(filepath_or_buffer='datasets/tmdb_5000_credits.csv')

In [None]:
# The credits table calls its key 'movie_id'; give it the same name ('id')
# that the movies table uses so the two can be joined on it.
credits = credits.rename({'movie_id': 'id'}, axis='columns')

In [None]:
# Step 2: Join movies with their credits on the shared 'id' column
movies = pd.merge(movies, credits, on='id')

In [None]:
# Step 3: Select and preprocess relevant features
def _join_names(raw, limit=None, job=None):
    """Flatten one list-of-dicts cell (stored as text) into a space-separated string of names.

    Args:
        raw: Cell value; expected to be the textual form of a list of dicts
            that each carry a 'name' key.
        limit: Keep only the first `limit` records (None keeps all of them).
        job: When given, keep only records whose 'job' equals this value.

    Returns:
        The selected 'name' values joined by single spaces. Text that is not a
        list literal is returned unchanged, which makes re-running this cell on
        already-flattened columns a no-op. Non-string input yields ''.
    """
    if not isinstance(raw, str):
        return ''
    try:
        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code (unlike eval).
        records = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw
    if not isinstance(records, list):
        return raw
    records = [d for d in records if isinstance(d, dict) and 'name' in d]
    if job is not None:
        records = [d for d in records if d.get('job') == job]
    return ' '.join(str(d['name']) for d in records[:limit])


movies['overview'] = movies['overview'].fillna('')  # Fill missing overviews
movies['genres'] = movies['genres'].apply(_join_names)  # Extract genre names
movies['cast'] = movies['cast'].apply(_join_names, limit=5)  # Top 5 cast members
movies['crew'] = movies['crew'].apply(_join_names, job='Director')  # Director name


In [None]:
# Build one text field per movie: genres, cast, director(s) and overview,
# separated by single spaces.
movies['combined_features'] = movies['genres'].str.cat(
    [movies['cast'], movies['crew'], movies['overview']],
    sep=' ',
)

In [None]:
# Step 4: Vectorise the combined text with TF-IDF
# (unigrams + bigrams, English stop words dropped, vocabulary capped at 5000 terms)
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

In [None]:
# Step 5: Group the TF-IDF vectors into clusters with K-Means
k = 10  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(tfidf_matrix)
# labels_ holds the cluster assigned to each fitted row
movies['cluster'] = kmeans.labels_

In [None]:
# Updated function to handle 'title_x' for movie title
def recommend_movies(movie_title, top_n=5):
    """Recommend movies from the same K-Means cluster as the first title match.

    Reads the module-level `movies` DataFrame and `tfidf_matrix`.

    Args:
        movie_title: Text to search for in `movies['title_x']`
            (case-insensitive `str.contains` match); the first match is used.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with columns 'title_x' and 'combined_features' holding up
        to `top_n` movies from the matched movie's cluster, most similar
        first, or an explanatory string when the 'title_x' column is missing
        or no title matches.
    """
    if 'title_x' not in movies.columns:
        return "The 'title_x' column is missing from the dataset. Check your dataset."

    # Find the input movie by title
    input_movie = movies[movies['title_x'].str.contains(movie_title, case=False, na=False)]
    if input_movie.empty:
        return f"No movie found with title containing '{movie_title}'"

    input_label = input_movie.index[0]
    input_cluster = input_movie.iloc[0]['cluster']

    # Filter movies in the same cluster
    cluster_movies = movies[movies['cluster'] == input_cluster]

    # tfidf_matrix is addressed by row position, so translate index labels
    # into positions instead of assuming the two coincide.
    cluster_rows = movies.index.get_indexer(cluster_movies.index)

    # Position of the input movie inside the cluster. Cluster members are not
    # contiguous in `movies`, so this cannot be derived by subtracting labels.
    input_pos = cluster_movies.index.get_loc(input_label)
    input_row = cluster_rows[input_pos]

    # Only the input movie's similarities are needed, not the full
    # cluster-by-cluster matrix. The slice keeps the row two-dimensional.
    similarities = cosine_similarity(
        tfidf_matrix[input_row:input_row + 1], tfidf_matrix[cluster_rows]
    )[0]

    # Rank by descending similarity and drop the input movie explicitly rather
    # than assuming it sorts first (ties could otherwise let it slip through).
    ranked = similarities.argsort()[::-1]
    ranked = ranked[ranked != input_pos][:top_n]

    return cluster_movies.iloc[ranked][['title_x', 'combined_features']]


In [None]:
# Step 7: Example usage - look up recommendations for one title and show them
movie_title = "Avatar"  # Replace with any movie title
recommendations = recommend_movies(movie_title=movie_title)
print(recommendations)

In [None]:
# List the columns of the merged DataFrame to see which title column(s) exist
# after the merge (the recommender defined earlier in this notebook reads 'title_x').
print(movies.columns)

In [None]:
# Expose the merged 'title_x' column under the plain name 'title'; every other
# column keeps its name. The DataFrame is modified in place.
movies.rename(columns=lambda name: 'title' if name == 'title_x' else name, inplace=True)

In [None]:
# Function to get movie recommendations based on similarity
def recommend_movie(movie_title, cosine_sim, movies_df, top_n=5):
    """Return the titles of the movies most similar to `movie_title`.

    Args:
        movie_title: Exact title to look up in `movies_df['title']`; the first
            matching row is used.
        cosine_sim: Square similarity matrix whose row/column order follows
            the row order of `movies_df`.
        movies_df: DataFrame with a 'title' column.
        top_n: Maximum number of titles to return.

    Returns:
        A Series of up to `top_n` titles, most similar first, never including
        the looked-up row itself.

    Raises:
        IndexError: If no row of `movies_df` has the given title.
    """
    # Work with row positions: the similarity matrix knows nothing about the
    # DataFrame's index labels, which need not be 0..n-1.
    matches = (movies_df['title'] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Movie '{movie_title}' not found in movies_df['title']")
    query_pos = int(matches[0])

    scores = cosine_sim[query_pos]

    # Exclude the query row explicitly; dropping "the first sorted entry" would
    # return the query itself whenever another movie ties with it.
    candidates = [pos for pos in range(len(scores)) if pos != query_pos]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    return movies_df['title'].iloc[candidates[:top_n]]

# Example: Recommend 5 movies similar to 'Iron Man'
# NOTE(review): `cosine_sim` is only assigned in a later cell of this notebook,
# so on a fresh kernel that cell has to run before this call can succeed.
recommended_movies = recommend_movie('Iron Man', cosine_sim, movies, top_n=5)
print(recommended_movies)

In [None]:
# Function to combine features into a single string
def combine_features(row):
    """Concatenate a row's genres, keywords, cast and overview into one string.

    List values are joined with spaces; any other value is passed through
    `str()`. The four pieces are separated by single spaces.
    """
    def as_text(value):
        # Lists become space-separated text, everything else its str() form
        if isinstance(value, list):
            return " ".join(value)
        return str(value)

    pieces = [as_text(row[column]) for column in ('genres', 'keywords', 'cast')]
    pieces.append(str(row['overview']))  # Ensure 'overview' is a string
    return ' '.join(pieces)

# Build the 'combined_features' column by running combine_features on every row
movies['combined_features'] = movies.apply(combine_features, axis='columns')

In [None]:
# Preview the first rows of the 'combined_features' column as a sanity check
print(movies['combined_features'].head())

In [None]:
import ast

# Function to extract 'name' from a list of dictionaries
def extract_names(data):
    """Return the 'name' values found in `data` as one space-separated string.

    Args:
        data: A list of dicts, the textual form of such a list, or text that
            has already been flattened to plain names.

    Returns:
        The names joined by single spaces. Text that cannot be parsed as a
        Python literal is assumed to be already-flattened names and is returned
        unchanged, so columns flattened earlier in the notebook are kept rather
        than silently replaced by ''. Anything that is neither a string nor a
        list (for example NaN), or that parses to a non-list literal, gives ''.
    """
    if isinstance(data, str):
        try:
            # If the data is a string representation of a list, convert it back to a list
            data = ast.literal_eval(data)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a literal: treat it as names that were already extracted
            return data

    if not isinstance(data, list):
        return ""  # Return empty string if it's not a valid list

    # Skip entries without a usable 'name' instead of discarding the whole row
    names = [str(item['name']) for item in data if isinstance(item, dict) and 'name' in item]
    return " ".join(names)  # Join names with a space

# Function to combine features into a single string
def combine_features(row):
    """Build one text blob from a row's genres, keywords, cast and overview.

    The first three fields go through `extract_names`; the overview is
    converted with `str()`. Pieces are separated by single spaces.
    """
    # Extract the 'name' field for each feature (genres, keywords, cast)
    pieces = [extract_names(row[field]) for field in ('genres', 'keywords', 'cast')]
    # Ensure 'overview' is a string
    pieces.append(str(row['overview']))
    return ' '.join(pieces)

# Rebuild the 'combined_features' column by running combine_features on every row
movies['combined_features'] = movies.apply(combine_features, axis='columns')

# Show the first five combined feature strings as a sanity check
print(movies['combined_features'].head(5))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the combined text, English stop words removed
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and vectorise every movie in one pass
tfidf_matrix = tfidf.fit_transform(movies.loc[:, 'combined_features'])

# Report the matrix dimensions
print(tfidf_matrix.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the TF-IDF matrix
# (with a single argument the rows are compared against themselves)
cosine_sim = cosine_similarity(tfidf_matrix)

# Report the matrix dimensions
print(cosine_sim.shape)

In [None]:
import ast

def clean_keywords(keywords_str):
    """Parse the textual form of a list of dicts into the set of its 'name' values.

    Args:
        keywords_str: Text such as '[{"id": 1, "name": "space"}]'.

    Returns:
        The set of 'name' values. An empty set is returned for empty or
        non-string input, for text that is not a valid Python literal, and for
        literals that are not a list or tuple.
    """
    # Covers None, NaN, '' and array-like values, whose truth value is ambiguous
    if not isinstance(keywords_str, str) or not keywords_str:
        return set()

    try:
        # Safely evaluate the string to convert it to a list of dictionaries
        keywords_list = ast.literal_eval(keywords_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return set()  # Return empty set if evaluation fails

    # A parsed scalar such as 5 is not iterable, so reject non-sequences here
    if not isinstance(keywords_list, (list, tuple)):
        return set()

    # Keep the 'name' of every dictionary entry that has one
    return {kw['name'] for kw in keywords_list if isinstance(kw, dict) and 'name' in kw}


def recommend_with_explanation(movie_title, similarity_matrix, movies_df, top_n=3):
    """Recommend the most similar movies and explain each pick by shared features.

    Args:
        movie_title: Exact title to look up in `movies_df['title']`; the first
            matching row is used.
        similarity_matrix: Square similarity matrix whose row/column order
            follows the row order of `movies_df`.
        movies_df: DataFrame with 'title', 'genres', 'keywords' and 'cast' columns.
        top_n: Maximum number of recommendations.

    Returns:
        A `(recommendations, explanations)` pair of equally long lists: the
        recommended titles and one explanation string per title. Both lists
        are empty when the title is not found (a message is printed).
    """
    def _tokens(value):
        # Whitespace-separated text is compared token by token. Calling set()
        # on the string itself would compare single characters.
        if isinstance(value, str):
            return set(value.split())
        if isinstance(value, (list, tuple, set)):
            return set(value)
        return set()

    # Step 1: Find the row position of the selected movie. Positions are used
    # throughout because the similarity matrix is not label-indexed.
    matches = (movies_df['title'] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Movie '{movie_title}' not found!")
        return [], []
    movie_pos = int(matches[0])
    selected = movies_df.iloc[movie_pos]

    # Steps 2-3: Rank all other movies by similarity. The selected movie is
    # excluded explicitly rather than by assuming it sorts first.
    scores = similarity_matrix[movie_pos]
    ranked = [pos for pos in range(len(scores)) if pos != movie_pos]
    ranked.sort(key=lambda pos: scores[pos], reverse=True)

    recommendations = []
    explanations = []

    for similar_pos in ranked[:top_n]:
        candidate = movies_df.iloc[similar_pos]
        similar_movie_title = candidate['title']

        # Step 5: Compute overlapping features
        genres_overlap = _tokens(selected['genres']) & _tokens(candidate['genres'])
        keywords_overlap = clean_keywords(selected['keywords']) & clean_keywords(candidate['keywords'])
        cast_overlap = _tokens(selected['cast']) & _tokens(candidate['cast'])

        # Step 6: Construct an explanation (sorted so the text is deterministic)
        explanation = f"{similar_movie_title} is recommended because it shares:\n"
        if genres_overlap:
            explanation += f"- Genres: {', '.join(sorted(genres_overlap))}\n"
        if keywords_overlap:
            explanation += f"- Keywords: {', '.join(sorted(keywords_overlap))}\n"
        if cast_overlap:
            explanation += f"- Cast: {', '.join(sorted(cast_overlap))}\n"

        # Append recommendation and explanation
        recommendations.append(similar_movie_title)
        explanations.append(explanation)

    return recommendations, explanations


In [None]:
# Ask the user which movie to base the recommendations on
movie_title = input("Enter the movie name: ")

# Only query the recommender when the exact title exists in the dataset
if movie_title in movies['title'].values:
    recommendations, explanations = recommend_with_explanation(movie_title, cosine_sim, movies)

    if not recommendations:
        print("No recommendations found.")
    else:
        # Each recommendation is printed together with its explanation
        for recommended_title, reason in zip(recommendations, explanations):
            print(f"\nRecommended Movie: {recommended_title}")
            print(f"Explanation: {reason}")
else:
    print(f"Movie '{movie_title}' not found in the dataset.")


In [None]:
from fuzzywuzzy import process
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import numpy as np

class MovieRecommendationModel:
    """Content-based recommender over a precomputed movie-to-movie similarity matrix.

    Attributes:
        movies_df: DataFrame with 'title', 'genres', 'keywords' and 'cast' columns.
        similarity_matrix: Square matrix whose row/column order follows the
            row order of `movies_df`.
        top_n: Number of recommendations to produce.
        fuzzy_threshold: Minimum fuzzy-match score for a title to be accepted.
    """

    def __init__(self, movies_df, similarity_matrix, top_n=3, fuzzy_threshold=70):
        self.movies_df = movies_df
        self.similarity_matrix = similarity_matrix
        self.top_n = top_n
        self.fuzzy_threshold = fuzzy_threshold

    def get_movie_index(self, movie_title):
        """
        Use fuzzy matching to get the best matching movie title.

        Returns the row position of the first movie carrying the best-matching
        title, or None (after printing a message) when the best score is below
        `fuzzy_threshold` or nothing could be matched.
        """
        titles = self.movies_df['title']
        match = process.extractOne(movie_title, titles.values)

        if match is None or match[1] < self.fuzzy_threshold:
            print(f"Movie '{movie_title}' not found or doesn't match closely enough.")
            return None  # Return None if no good match is found

        # Row position rather than index label: the similarity matrix and the
        # .iloc look-ups below are positional.
        positions = (titles == match[0]).to_numpy().nonzero()[0]
        return int(positions[0])

    @staticmethod
    def _parse_names(value):
        """Return the names held in a feature cell as a list of strings.

        Accepts a list, the textual form of a list of dicts with a 'name' key,
        or plain whitespace-separated text. Other values give [].
        """
        if isinstance(value, str):
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return value.split()
            if not isinstance(parsed, (list, tuple, set)):
                return value.split()
            value = parsed

        if not isinstance(value, (list, tuple, set)):
            return []

        names = []
        for item in value:
            if isinstance(item, dict) and 'name' in item:
                names.append(str(item['name']))
            elif isinstance(item, str):
                names.append(item)
        return names

    def _top_neighbours(self, movie_index):
        """Return the positions of the `top_n` movies most similar to `movie_index`."""
        scores = self.similarity_matrix[movie_index]
        # Leave the movie itself out explicitly; skipping "the first sorted
        # entry" would return it whenever another movie ties with it.
        others = [pos for pos in range(len(scores)) if pos != movie_index]
        others.sort(key=lambda pos: scores[pos], reverse=True)
        return others[:self.top_n]

    def clean_keywords(self, keywords_str):
        """
        Clean and normalize keywords (optional).

        Returns the set of lower-cased keyword names. Text holding a list of
        dicts is parsed and its 'name' values are used; plain text is split on
        whitespace; missing values (None, NaN) give an empty set.
        """
        return {name.lower() for name in self._parse_names(keywords_str)}

    def get_relevant_movies_by_similarity(self, movie_title):
        """
        Get the top N relevant movies based on the similarity matrix.
        """
        movie_index = self.get_movie_index(movie_title)

        if movie_index is None:
            return []  # Return empty list if no good match is found

        return [self.movies_df.iloc[pos]['title'] for pos in self._top_neighbours(movie_index)]

    def evaluate_recommendations(self, recommended_movies, relevant_movies):
        """
        Evaluate the recommendations using Precision, Recall, and F1-score.

        Precision is the share of the `top_n` recommendation slots filled with
        a relevant movie (missing slots count as misses). Recall is the share
        of relevant movies that were recommended. Undefined ratios default to
        1.0, and F1 is 0.0 when precision and recall are both 0. The input
        lists are not modified.

        Returns a `(precision, recall, f1)` tuple of floats.
        """
        top_recommended = list(recommended_movies[:self.top_n])
        relevant = set(relevant_movies)

        hits = sum(1 for movie in top_recommended if movie in relevant)
        recalled = len(set(top_recommended) & relevant)

        precision = hits / self.top_n if self.top_n > 0 else 1.0
        recall = recalled / len(relevant) if relevant else 1.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        return precision, recall, f1

    def recommend_with_explanation(self, movie_title, relevant_movies=None):
        """
        Generate movie recommendations with explanations.

        Returns `(recommendations, explanations, metrics)`: two equally long
        lists, plus a `(precision, recall, f1)` tuple when a non-empty
        `relevant_movies` is given and None otherwise. Returns `([], [], None)`
        when the title cannot be matched.
        """
        movie_index = self.get_movie_index(movie_title)

        if movie_index is None:
            return [], [], None

        selected = self.movies_df.iloc[movie_index]

        recommendations = []
        explanations = []

        for similar_movie_index in self._top_neighbours(movie_index):
            candidate = self.movies_df.iloc[similar_movie_index]
            similar_movie_title = candidate['title']

            # Step 5: Compute overlapping features. Names are compared as
            # whole tokens; set() on a string would compare single characters.
            genres_overlap = set(self._parse_names(selected['genres'])).intersection(
                self._parse_names(candidate['genres'])
            )

            keywords_overlap = self.clean_keywords(selected['keywords']).intersection(
                self.clean_keywords(candidate['keywords'])
            )

            cast_overlap = set(self._parse_names(selected['cast'])).intersection(
                self._parse_names(candidate['cast'])
            )

            # Step 6: Construct an explanation (sorted so the text is deterministic)
            explanation = f"{similar_movie_title} is recommended because it shares:\n"
            if genres_overlap:
                explanation += f"- Genres: {', '.join(sorted(genres_overlap))}\n"
            if keywords_overlap:
                explanation += f"- Keywords: {', '.join(sorted(keywords_overlap))}\n"
            if cast_overlap:
                explanation += f"- Cast: {', '.join(sorted(cast_overlap))}\n"

            recommendations.append(similar_movie_title)
            explanations.append(explanation)

        # Evaluate the recommendations if relevant_movies is provided
        if relevant_movies:
            precision, recall, f1 = self.evaluate_recommendations(recommendations, relevant_movies)
            return recommendations, explanations, (precision, recall, f1)

        return recommendations, explanations, None

# Example usage: build the model from the `movies` DataFrame and the
# `cosine_sim` similarity matrix, then query it interactively.
movie_model = MovieRecommendationModel(movies, cosine_sim, top_n=3)

# Ask which movie to base the recommendations on
movie_title = input("Enter the movie name: ")

# NOTE: the "relevant" set is ranked with the same similarity matrix as the
# recommendations it is later compared against.
relevant_movies = movie_model.get_relevant_movies_by_similarity(movie_title)

# Recommendations, their explanations and (when available) evaluation metrics
recommendations, explanations, metrics = movie_model.recommend_with_explanation(
    movie_title,
    relevant_movies=relevant_movies,
)

# Report the outcome
if not recommendations:
    print("No recommendations found.")
else:
    for recommended_title, reason in zip(recommendations, explanations):
        print(f"\nRecommended Movie: {recommended_title}")
        print(f"Explanation: {reason}")

    if metrics:
        precision, recall, f1 = metrics
        print("\nEvaluation Metrics:")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")


In [None]:
# Requires the `movies` DataFrame and the `cosine_sim` matrix from earlier cells

# Build the recommendation model (three recommendations per query)
movie_model = MovieRecommendationModel(movies, cosine_sim, top_n=3)


In [None]:
# Get relevant movies (this could be done dynamically based on the user's preferences or some other metric).
# The method takes only the title; the data, the similarity matrix and top_n
# come from the model instance.
relevant_movies = movie_model.get_relevant_movies_by_similarity("Iron Man")

# Get recommendations for the same movie. The model's method is called
# recommend_with_explanation (there is no `recommend` method).
recommended_movies, explanations, metrics = movie_model.recommend_with_explanation(
    movie_title="Iron Man", relevant_movies=relevant_movies
)

# Display recommendations and explanations
for movie, explanation in zip(recommended_movies, explanations):
    print(f"Recommended Movie: {movie}")
    print(f"Explanation: {explanation}")

# Display evaluation metrics if available
if metrics:
    precision, recall, f1 = metrics
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
