In [None]:
import numpy as np
import pandas as pd
import ast
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import KFold
from joblib import Parallel, delayed

In [None]:
# Read the two raw TMDB tables from disk: movie metadata first, then credits
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('data/tmdb_6000_movies.csv', 'data/tmdb_6000_credits.csv')
)

In [None]:
# Join the credits table onto the movie metadata using the shared 'id' key
# (pandas performs an inner join when `how` is not given).
# NOTE(review): any other column name present in both files is suffixed _x/_y
# by pandas — confirm the columns selected afterwards still exist unsuffixed.
movies = pd.merge(movies, credits, on='id')

In [None]:
# Narrow the merged frame to the identifier, title and descriptive fields used below
analysis_columns = ['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'original_language']
movies = movies[analysis_columns]

In [None]:
# Drop rows with any missing field, then drop exact duplicate rows.
# The result is rebound instead of using inplace=True: `movies` comes from a
# column selection, and mutating such a frame in place may be flagged by pandas
# with SettingWithCopyWarning. Rebinding also keeps the step a single chain.
movies = movies.dropna().drop_duplicates()

In [None]:
# Function to safely convert stringified JSON to a list of names
def convert_safe(text):
    """Convert a stringified list of dicts into the list of their 'name' values.

    Args:
        text (str): Python-literal text such as
            "[{'id': 28, 'name': 'Action'}, ...]".

    Returns:
        list: The 'name' value of every dict entry that has one, in the
        original order. An empty list is returned when the text cannot be
        parsed or does not evaluate to a list/tuple.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable input: best-effort contract is "no names"
        return []

    # A valid literal of the wrong shape (e.g. "5" or "'abc'") must not crash either
    if not isinstance(parsed, (list, tuple)):
        return []

    # Skip entries that are not dicts or carry no 'name' instead of raising KeyError/TypeError
    return [entry['name'] for entry in parsed if isinstance(entry, dict) and 'name' in entry]

In [None]:
# Parse the stringified 'genres' and 'keywords' columns into lists of names
movies['genres'] = movies['genres'].map(convert_safe)
movies['keywords'] = movies['keywords'].map(convert_safe)

In [None]:
# Function to extract the top 3 cast members
def convert_cast(text, top_n=3):
    """Return the names of the leading cast members from stringified cast data.

    Args:
        text (str): Python-literal text holding a list of cast dicts, e.g.
            "[{'name': 'Sam Worthington', ...}, ...]".
        top_n (int): How many leading entries of the list to consider
            (non-negative). Defaults to 3.

    Returns:
        list: The 'name' values found among the first `top_n` entries, in
        order. An empty list is returned when the text cannot be parsed or
        does not evaluate to a list/tuple.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable input: best-effort contract is "no names"
        return []

    # A valid literal that is not a sequence of entries cannot be sliced meaningfully
    if not isinstance(parsed, (list, tuple)):
        return []

    # Only the first `top_n` entries are inspected; malformed ones are skipped
    # rather than raising KeyError/TypeError
    return [
        member['name']
        for member in parsed[:top_n]
        if isinstance(member, dict) and 'name' in member
    ]

In [None]:
# Reduce each stringified cast list to the names of its leading members
movies['cast'] = movies['cast'].map(convert_cast)

In [None]:
# Function to fetch the director's name
def fetch_director(text):
    """Fetch the first director's name from stringified crew data.

    Args:
        text (str): Python-literal text holding a list of crew dicts, each
            expected to carry 'job' and 'name' keys.

    Returns:
        list: A one-element list with the name of the first entry whose
        'job' is exactly 'Director', or an empty list when there is none,
        the text cannot be parsed, or it is not a list/tuple.
    """
    try:
        crew = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Unparseable input: best-effort contract is "no director"
        return []

    if not isinstance(crew, (list, tuple)):
        return []

    for member in crew:
        # .get() avoids a KeyError on crew entries that have no 'job' field
        if isinstance(member, dict) and member.get('job') == 'Director' and 'name' in member:
            return [member['name']]
    return []

In [None]:
# Replace each stringified crew list with a (possibly empty) list holding the director
movies['crew'] = movies['crew'].map(fetch_director)

In [None]:
# Tokenise each overview on whitespace; any non-string value becomes an empty list.
# NOTE: values that are already lists are not strings, so re-running this cell
# on already-split data replaces them with empty lists.
movies['overview'] = movies['overview'].map(
    lambda overview: overview.split() if isinstance(overview, str) else []
)

In [None]:
# Function to collapse multi-word names into single tokens
def remove_space(L):
    """Return a new list in which every string of `L` has its space characters removed.

    Args:
        L (list): Strings such as person names or multi-word keywords.

    Returns:
        list: The same strings, in order, with every ' ' deleted
        (e.g. "Tom Hanks" -> "TomHanks").
    """
    squeezed = []
    for item in L:
        # Splitting on the literal space and gluing the pieces back drops every ' '
        squeezed.append("".join(item.split(" ")))
    return squeezed

In [None]:
# Strip spaces inside every name-like token so multi-word names act as single tags
for name_column in ('cast', 'crew', 'genres', 'keywords'):
    movies[name_column] = movies[name_column].apply(remove_space)

In [None]:
# Build 'tags' by concatenating the per-movie lists, row by row, in the order
# overview + genres + keywords + cast + crew
movies['tags'] = sum(
    (movies[list_column] for list_column in ('genres', 'keywords', 'cast', 'crew')),
    movies['overview'],
)

In [None]:
# Keep only the necessary columns and drop rows without tags
new_movies = movies[['id', 'title', 'tags']].dropna(subset=['tags'])

# 'tags' still holds Python lists here, and lists are unhashable, so
# drop_duplicates(subset=['tags']) would raise TypeError. De-duplicate on an
# equivalent hashable key (a tuple of the same tokens) instead.
tag_keys = pd.Series([tuple(tags) for tags in new_movies['tags']], index=new_movies.index)
new_movies = new_movies.loc[~tag_keys.duplicated()]

# Renumber rows 0..n-1 so that index labels coincide with row positions; the
# TF-IDF matrix built from this frame is addressed by position.
new_movies = new_movies.reset_index(drop=True)

In [None]:
# Collapse each token list into one space-separated, lower-cased string
new_movies['tags'] = new_movies['tags'].map(" ".join).str.lower()

In [None]:
# Shared Porter stemmer instance used by stems()
ps = PorterStemmer()

# Function to reduce every word of a text to its stem
def stems(text):
    """Stem each whitespace-separated token of `text` and re-join with single spaces.

    Args:
        text (str): Text whose tokens should be stemmed.

    Returns:
        str: The stemmed tokens joined by one space each.
    """
    stemmed_tokens = map(ps.stem, text.split())
    return " ".join(stemmed_tokens)

In [None]:
# Stem every token of the tag strings
new_movies['tags'] = new_movies['tags'].map(stems)

In [None]:
# TF-IDF vectorization to convert tags into numerical data
# (.toarray() materialises the result as a dense array, one row per movie)
tfidf = TfidfVectorizer(max_features=6000, stop_words='english')
vector = tfidf.fit_transform(new_movies['tags']).toarray()

# Make sure the output directory exists before any artifact is written
os.makedirs('processed_data', exist_ok=True)

# Save the TF-IDF model and vectorized data.
# Context managers close (and flush) each file deterministically instead of
# leaving the handle open until garbage collection.
with open('processed_data/tfidf_vectorizer.pkl', 'wb') as fh:
    pickle.dump(tfidf, fh)
with open('processed_data/vectorized_data.pkl', 'wb') as fh:
    pickle.dump(vector, fh)

# Train a KNN model on the vectorized data
knn = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute')
knn.fit(vector)

# Save the trained KNN model
with open('processed_data/optimized_knn.pkl', 'wb') as fh:
    pickle.dump(knn, fh)

In [None]:
# Calculate the pairwise cosine-similarity matrix of the TF-IDF rows (used for
# visualization and as the reference ranking during evaluation)
similarity = cosine_similarity(vector)

# Persist it; the context manager guarantees the file is flushed and closed
with open('processed_data/similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)

In [None]:
# Load the pre-trained KNN model for recommendations.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# artifacts that this notebook (or another trusted source) produced.
with open('processed_data/optimized_knn.pkl', 'rb') as fh:
    optimized_knn = pickle.load(fh)

In [None]:
# Function to recommend movies using KNN
def recommend_knn(movie, k=5):
    """
    Recommends movies similar to the selected movie using KNN.

    Reads the module-level `new_movies` (titles), `vector` (one feature row
    per movie, in the same row order as `new_movies`) and `optimized_knn`
    (fitted nearest-neighbour model).

    Args:
        movie (str): The title of the selected movie.
        k (int): Number of recommendations to provide.

    Returns:
        list: Up to `k` recommended movie titles ordered from most to least
        similar, with repeated titles removed. If the title is unknown, a
        one-element list containing "Movie not found in dataset.".
    """
    # Locate the movie by ROW POSITION, not by index label: `vector` and the
    # neighbour indices returned by the model are positional, while the
    # frame's index labels need not be 0..n-1.
    matches = (new_movies['title'] == movie).to_numpy().nonzero()[0]
    if matches.size == 0:
        return ["Movie not found in dataset."]
    position = int(matches[0])

    # Ask for one extra neighbour (the query normally matches itself), but
    # never for more neighbours than there are rows.
    n_neighbors = min(k + 1, len(vector))
    distances, indices = optimized_knn.kneighbors([vector[position]], n_neighbors=n_neighbors)

    # Keep each neighbour paired with ITS OWN distance and drop the query by
    # position; it is not guaranteed to come back in the first slot when
    # another row has an identical vector.
    neighbours = [
        (int(row), float(dist))
        for row, dist in zip(indices[0], distances[0])
        if int(row) != position
    ]

    # Lower distance means higher similarity; list.sort is stable, so ties
    # keep the order returned by the model.
    neighbours.sort(key=lambda pair: pair[1])

    titles = [new_movies['title'].iloc[row] for row, _ in neighbours[:k]]

    # Remove duplicate titles while preserving order
    return list(dict.fromkeys(titles))



In [None]:
# Function to score a recommendation list against a ground-truth list
def evaluate_recommendation_performance(true_items, recommended_items):
    """
    Compute set-based Precision and Recall for a list of recommendations.

    Both inputs are reduced to sets first, so repeated items count once.

    Args:
    true_items (list): Ground truth indices or movie titles.
    recommended_items (list): Predicted indices or movie titles.

    Returns:
    dict: {"Precision": hits / distinct recommended items,
           "Recall": hits / distinct true items}; a score is 0 when its
    denominator set is empty.
    """
    relevant = set(true_items)
    recommended = set(recommended_items)

    # Number of recommended items that are also in the ground truth
    hit_count = len(relevant & recommended)

    precision = 0 if not recommended else hit_count / len(recommended)
    recall = 0 if not relevant else hit_count / len(relevant)

    return {"Precision": precision, "Recall": recall}

In [None]:
# Function to visualize similarity scores as a bar chart
def visualize_similarity_scores_dynamic(selected_movie, k=5):
    """
    Dynamically visualize similarity scores for recommended movies as a bar chart.

    Reads the module-level `new_movies`, `vector` and `optimized_knn`. Errors
    are reported with print() rather than raised.

    Args:
    selected_movie (str): The movie selected by the user.
    k (int): Number of recommendations to display.
    """
    try:
        # Locate the movie by row position (not index label): `vector` and the
        # neighbour indices returned by the model are positional.
        matches = (new_movies['title'] == selected_movie).to_numpy().nonzero()[0]
        if matches.size == 0:
            print(f"Error: Movie '{selected_movie}' not found in the dataset.")
            return
        position = int(matches[0])

        # One extra neighbour for the query itself, capped at the number of rows
        n_neighbors = min(k + 1, len(vector))
        distances, indices = optimized_knn.kneighbors([vector[position]], n_neighbors=n_neighbors)

        # Drop the query by position (it may not be in slot 0 when another row
        # has an identical vector) and keep titles and distances paired.
        neighbours = [
            (int(row), float(dist))
            for row, dist in zip(indices[0], distances[0])
            if int(row) != position
        ][:k]

        # Map indices to movie titles
        recommended_titles = [new_movies['title'].iloc[row] for row, _ in neighbours]
        # Convert distance to similarity.
        # NOTE(review): 1 - distance is a cosine similarity only if the model
        # was fitted with metric='cosine' — confirm for the loaded model.
        similarity_scores = [1 - dist for _, dist in neighbours]

        # Visualize the similarity scores
        plt.figure(figsize=(10, 6))
        plt.bar(recommended_titles, similarity_scores, color='skyblue')
        plt.xlabel("Recommended Movies")
        plt.ylabel("Similarity Score")
        plt.title(f"Similarity Scores for Recommendations of '{selected_movie}'")
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best-effort visual helper: report the problem instead of raising
        print(f"An error occurred: {e}")

In [None]:
def visualize_recommended_movies_heatmap(selected_movie, similarity_matrix, movie_titles, k=10):
    """
    Visualize a heatmap showing the similarity scores between the recommended movies.

    Errors are reported with print() rather than raised.

    Args:
    selected_movie (str): The movie selected by the user.
    similarity_matrix (numpy.ndarray): Cosine similarity matrix for the movies.
    movie_titles (list): List of movie titles corresponding to the rows/columns of the matrix.
    k (int): Number of recommendations to consider.
    """
    try:
        # Find the index of the selected movie (first match in the list)
        index = movie_titles.index(selected_movie)

        # Rank every movie by descending similarity to the selected one. The
        # stable sort keeps dataset order among ties.
        ranked = np.argsort(-np.asarray(similarity_matrix[index]), kind='stable')

        # Exclude the selected movie by its index rather than by dropping the
        # first rank: when another movie ties with it, the first rank may be
        # that other movie, which would leave the selected movie in the plot.
        top_k_indices = [int(i) for i in ranked if int(i) != index][:k]

        # Subset similarity matrix for the top k movies
        subset_matrix = similarity_matrix[top_k_indices][:, top_k_indices]
        subset_titles = [movie_titles[i] for i in top_k_indices]

        # Plot heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            subset_matrix,
            xticklabels=subset_titles,
            yticklabels=subset_titles,
            cmap="coolwarm",
            annot=True,
            fmt=".2f",
            cbar=True
        )
        plt.title(f"Cosine Similarity Heatmap for Top {k} Recommendations of '{selected_movie}'")
        plt.xlabel("Movies")
        plt.ylabel("Movies")
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    except ValueError:
        # list.index raises ValueError when the title is absent
        print(f"Error: Movie '{selected_movie}' not found in the dataset.")
    except Exception as e:
        print(f"An error occurred: {e}")


In [None]:
def cross_validate_knn_parallel(vector, k_values=[5, 10, 15], metric_values=['cosine', 'euclidean'], n_splits=5):
    """
    Score every (k, metric) combination with K-fold splits, in parallel.

    For each fold a NearestNeighbors model is fitted on the training rows and
    queried with the held-out rows; the fold score is the mean of all returned
    neighbour distances, and a configuration's score is the mean over folds.

    NOTE(review): the score is a raw mean distance, so it is compared across
    different metrics and different k as-is — confirm this is the intended
    selection criterion.

    Args:
    vector (numpy.ndarray): Feature matrix, one row per item.
    k_values (list): Neighbour counts to try.
    metric_values (list): Distance metrics to try.
    n_splits (int): Number of K-fold splits.

    Returns:
    list: Dicts {'k', 'metric', 'score'} sorted by ascending score.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Score one parameter combination across all folds
    def evaluate_knn(k, metric):
        fold_scores = []
        for train_rows, test_rows in folds.split(vector):
            # Fit on the training rows only
            model = NearestNeighbors(n_neighbors=k, metric=metric, algorithm='brute')
            model.fit(vector[train_rows])

            # Mean distance from the held-out rows to their k nearest training rows
            neighbour_distances, _ = model.kneighbors(vector[test_rows])
            fold_scores.append(np.mean(neighbour_distances))
        return {'k': k, 'metric': metric, 'score': np.mean(fold_scores)}

    # Enumerate the grid (k outer, metric inner) and evaluate it on all cores
    grid = [(k, metric) for k in k_values for metric in metric_values]
    results = Parallel(n_jobs=-1)(delayed(evaluate_knn)(k, metric) for k, metric in grid)
    return sorted(results, key=lambda entry: entry['score'])

In [None]:
def evaluate_knn_model(movie_name, k, similarity_threshold=0.3, max_true=5):
    """
    Evaluate the optimized KNN model for one movie and print Precision/Recall.

    Reads the module-level `new_movies`, `vector`, `optimized_knn` and
    `similarity`. Predictions are the model's nearest neighbours; the ground
    truth is the set of most similar movies according to `similarity`, plus
    any favorites stored for the movie in 'processed_data/favorite.pkl'.
    Errors are reported with print() rather than raised.

    NOTE(review): when `similarity` is computed from the same `vector` the
    model searches, the scores mainly measure agreement between two rankings
    of the same data.

    Args:
    movie_name (str): Title of the movie to evaluate.
    k (int): Number of neighbours to request as predictions.
    similarity_threshold (float): Minimum similarity for a movie to count as
        ground truth. Defaults to 0.3.
    max_true (int): Maximum number of similarity-based ground-truth movies.
        Defaults to 5.
    """
    try:
        print("\nEvaluating model performance with optimized KNN...")

        # Locate the movie by row position (not index label): `vector`,
        # `similarity` and the model's neighbour indices are all positional.
        matches = (new_movies['title'] == movie_name).to_numpy().nonzero()[0]
        if matches.size == 0:
            print("Error: Movie not found in the dataset.")
            return
        position = int(matches[0])

        # Get predictions from the optimized KNN model: one extra neighbour for
        # the query itself, capped at the number of rows.
        n_neighbors = min(k + 1, len(vector))
        _, indices = optimized_knn.kneighbors([vector[position]], n_neighbors=n_neighbors)
        # Drop the query by position; it may not occupy the first slot on ties
        predicted_indices = [int(row) for row in indices[0] if int(row) != position][:k]

        # Generate ground truth: most similar other movies above the threshold
        sorted_scores = sorted(enumerate(similarity[position]), key=lambda x: x[1], reverse=True)
        true_indices = [
            row for row, score in sorted_scores
            if score >= similarity_threshold and row != position
        ][:max_true]

        # Convert indices to movie titles
        predicted_titles = [new_movies['title'].iloc[row] for row in predicted_indices]
        true_titles = [new_movies['title'].iloc[row] for row in true_indices]

        # Load favorites (if the file exists) and add them to the true titles.
        # The context manager closes the file handle deterministically.
        favorites = {}
        if os.path.exists('processed_data/favorite.pkl'):
            with open('processed_data/favorite.pkl', 'rb') as fh:
                favorites = pickle.load(fh)
        if movie_name in favorites:
            for favorite in favorites[movie_name]:
                if favorite not in true_titles:
                    true_titles.append(favorite)

        # Evaluate recommendation metrics
        metrics = evaluate_recommendation_performance(true_titles, predicted_titles)

        # Display evaluation results
        print("\nOptimized KNN Performance Metrics:")
        print(f"Precision: {metrics['Precision']:.4f}")
        print(f"Recall: {metrics['Recall']:.4f}")
        print("\nTrue Titles:", true_titles)
        print("Predicted Titles:", predicted_titles)

    except Exception as e:
        # Best-effort reporting helper: print the problem instead of raising
        print(f"Error during evaluation: {e}")

In [None]:
# Save movie data for future use; the context manager flushes and closes the
# file instead of leaving the handle open until garbage collection
with open('processed_data/movie_list.pkl', 'wb') as fh:
    pickle.dump(new_movies, fh)

In [None]:
# Perform cross-validation to find the best parameters for KNN.
# cross_validate_knn_parallel returns its results sorted by ascending score
# (mean neighbour distance), so the first entry is the lowest-scoring
# (k, metric) configuration.
print("\nPerforming cross-validation...")
cv_results = cross_validate_knn_parallel(vector)
best_params = cv_results[0] # Extract the best parameters
print(f"Best Parameters from Cross-validation: {best_params}")

In [None]:
# Train the optimized KNN model using the best parameters
best_k = best_params['k'] # Optimal number of neighbors
best_metric = best_params['metric'] # Optimal distance metric

# Initialize and train the optimized KNN model (this rebinds `optimized_knn`,
# replacing the model loaded from disk earlier in the notebook)
optimized_knn = NearestNeighbors(n_neighbors=best_k, metric=best_metric, algorithm='brute')
optimized_knn.fit(vector)

# Save the optimized KNN model to a file for later use. This overwrites the
# model saved under the same path earlier; the context manager flushes and
# closes the file deterministically.
with open('processed_data/optimized_knn.pkl', 'wb') as fh:
    pickle.dump(optimized_knn, fh)

In [None]:
# Example movie for evaluation and visualization.
# The title must match an entry of new_movies['title'] exactly; recommend_knn
# compares with ==.
movie_name = "Batman"

# Generate recommendations for the specified movie.
# If the title is unknown, recommend_knn returns a single "Movie not found in
# dataset." message, which the loop below prints like a recommendation.
recommendations = recommend_knn(movie_name, k=best_k)
print(f"\nRecommendations for '{movie_name}':\n")
for rec in recommendations:
    print(rec)

In [None]:
# Visualize similarity scores dynamically as a bar chart.
# k=50 here is independent of best_k: the chart shows the 50 nearest
# neighbours of the example movie.
visualize_similarity_scores_dynamic(movie_name, k=50)

In [None]:
# Load movie titles and similarity matrix for heatmap visualization.
# The title list is taken in row order so that list positions line up with the
# rows/columns of the similarity matrix.
movie_titles = new_movies['title'].tolist()
# The context manager closes the file handle once the matrix is loaded
with open('processed_data/similarity.pkl', 'rb') as fh:
    similarity_matrix = pickle.load(fh)

# Visualize the similarity heatmap for the top recommendations
visualize_recommended_movies_heatmap(movie_name, similarity_matrix, movie_titles, k=10)


In [None]:
# Evaluate the KNN model performance on the selected movie.
# evaluate_knn_model prints Precision/Recall and the compared title lists; it
# does not return a value.
evaluate_knn_model(movie_name, k=best_k)