In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import scipy.sparse
import numpy as np
import random

In [32]:
# Load datasets
movies = pd.read_csv("data/data.csv", sep=';', dtype=str)  # Your main movie dataset
watched = pd.read_csv("data/user_data.csv", sep=';', dtype=str)  # User history dataset

In [33]:
watched_titles = watched["Title"].tolist()  # Get list of watched movie titles

In [34]:
# Function to clean multi-entry columns
def clean_text(text):
    return text.replace(",", "").replace(" ", "")

movies["Genres"] = movies["Genres"].apply(clean_text)
movies["Cast"] = movies["Cast"].apply(clean_text)
movies["Director"] = movies["Director"].apply(clean_text)

In [35]:
# Combine relevant columns into a single text feature
movies["features"] = (
    movies["Genres"] + " " + movies["Director"] + " " + movies["Cast"] + " " + movies["RunningTime"] + " " + movies["Rating"] + " " + movies["Votes"]
)

In [36]:
# Apply TF-IDF Vectorization (sparse format for large datasets)
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)  # Limit features for efficiency
tfidf_matrix = vectorizer.fit_transform(movies["features"])

In [37]:
# Use Nearest Neighbors for efficient similarity search
nn_model = NearestNeighbors(metric="cosine", algorithm="brute")
nn_model.fit(tfidf_matrix)  # Fit the model on TF-IDF matrix

In [38]:
# Function to recommend ONE movie with Title, Year, and Director
def recommend_one_movie(watched_titles):
    similar_movies = {}

    for title in watched_titles:
        if title not in movies["Title"].values:
            continue  # Skip if the movie is not in the database

        idx = movies[movies["Title"] == title].index[0]
        _, indices = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=5)  # Find top 5 similar movies

        for i in indices[0][1:]:  # Skip the first (it's the same movie)
            movie_data = movies.iloc[i][["Title", "Year", "Director"]]
            movie_key = (movie_data["Title"], movie_data["Year"], movie_data["Director"])
            if movie_data["Title"] not in watched_titles:  # Don't recommend watched movies
                similar_movies[movie_key] = similar_movies.get(movie_key, 0) + 1  # Count occurrences

    # Get the most frequently recommended movie
    if similar_movies:
        best_match = max(similar_movies, key=similar_movies.get)
        return {"Title": best_match[0], "Year": best_match[1], "Director": best_match[2]}
    else:
        return "No recommendations available."

# Example Usage
recommended_movie = recommend_one_movie(watched_titles)
print("Recommended Movie:", recommended_movie)

Recommended Movie: {'Title': 'Larry Mahoney', 'Year': '1996', 'Director': 'ChristopherNolan'}


In [39]:
def recommend_one_diverse_movie(watched_titles):
    if not watched_titles:
        return "No watched movies provided."

    similar_movies = {}

    # Consider only the last 5 watched movies for recommendations
    watched_titles = watched_titles[-5:]

    for title in watched_titles:
        if title not in movies["Title"].values:
            continue  # Skip if the movie is not in the database

        idx = movies[movies["Title"] == title].index[0]
        distances, indices = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=10)  # Find top 10 similar movies

        for i, score in zip(indices[0][1:], distances[0][1:]):  # Skip the first (it's the same movie)
            movie_data = movies.iloc[i][["Title", "Year", "Director"]]
            movie_key = (movie_data["Title"], movie_data["Year"], movie_data["Director"])
            
            if movie_data["Title"] not in watched_titles:  # Don't recommend watched movies
                similar_movies[movie_key] = similar_movies.get(movie_key, 0) + (1 - score)  # Higher score = more similar

    # If no movies found, return a message
    if not similar_movies:
        return "No recommendations available."

    # Sort by similarity score (higher is better)
    sorted_movies = sorted(similar_movies.items(), key=lambda x: x[1], reverse=True)

    # Ensure there are at least 3 movies to choose from
    top_movies = sorted_movies[:3] if len(sorted_movies) >= 3 else sorted_movies

    # Extract only the movie keys (Title, Year, Director) for random selection
    movie_choices = [movie[0] for movie in top_movies]

    # Randomly select one movie
    best_match = random.choice(movie_choices)

    return {"Title": best_match[0], "Year": best_match[1], "Director": best_match[2]}

# Example Usage
recommended_movie = recommend_one_diverse_movie(watched_titles)
print("Recommended Movie:", recommended_movie)

Recommended Movie: {'Title': "'Neath the Arizona Skies", 'Year': '1934', 'Director': 'HarryL.Fraser'}
