In [2]:
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
from joblib import Parallel, delayed
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Connect to the SQLite database
conn = sqlite3.connect('movie_ratings_db.sqlite')


In [29]:
# Query a subset of the data to speed up development/testing
# Adjust the LIMIT clause based on your dataset size
query = """
    SELECT r.userId, r.movieId, r.rating, m.title, m.genres
    FROM ratings r
    JOIN movies m ON r.movieId = m.movieId
    LIMIT 10000  -- Adjust this limit based on your dataset size
"""
df = pd.read_sql_query(query, conn)

In [30]:
# Assuming the genres column is in the format "Genre1|Genre2|Genre3"
# Convert genres into a list
df['genres'] = df['genres'].str.split('|')


In [31]:
# Content-Based Filtering (using Genres)
df['genres_str'] = df['genres'].apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Adjust max_features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres_str'])

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming tfidf_matrix_train and tfidf_matrix_test are precomputed TF-IDF matrices

# Calculate cosine similarity between test and train matrices
similarity_matrix = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)

# Number of top recommendations to consider
top_n = 100

# Evaluate recommendations and compute accuracy
correct_predictions = 0
total_test_items = similarity_matrix.shape[0]

for test_item_index in range(total_test_items):
    similar_items_indices = similarity_matrix[test_item_index].argsort()[:-top_n-1:-1]
    actual_preference = test_data.iloc[test_item_index]['title']
    recommended_movies = train_data.iloc[similar_items_indices]['title'].tolist()
    if actual_preference in recommended_movies:
        correct_predictions += 1


accuracy_percentage = (correct_predictions / total_test_items) * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")


Accuracy: 74.50%


In [11]:
# Approximate Nearest Neighbors with NearestNeighbors
nn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')
nn.fit(tfidf_matrix)

In [41]:
# Collaborative Filtering (User-Item Interactions)
user_movie_ratings = df.pivot_table(index='userId', columns='title', values='rating', fill_value=0)
movie_user_ratings = user_movie_ratings.T
movie_similarity = cosine_similarity(movie_user_ratings)

In [47]:
from sklearn.metrics.pairwise import cosine_similarity

# Assuming movie_user_ratings is the user-item interaction matrix

# Calculate cosine similarity between movies
movie_similarity = cosine_similarity(movie_user_ratings)

# Number of top recommendations to consider
top_n = 5

# Evaluate recommendations and compute accuracy
correct_predictions = 0
total_test_users = len(test_data['userId'].unique())

for user_id in test_data['userId'].unique():
    # Movies rated by the user in the test set
    actual_preferences = test_data[test_data['userId'] == user_id]['title'].tolist()
    
    # Find similar movies based on cosine similarity
    similar_movies_indices = movie_similarity[movie_user_ratings.index.get_loc(actual_preferences[0])].argsort()[:-top_n-1:-1]
    recommended_movies = movie_user_ratings.iloc[similar_movies_indices].index.tolist()

    # Check if any actual preferences are among recommended movies
    for movie in actual_preferences:
        if movie in recommended_movies:
            correct_predictions += 1
            break  # Break if at least one correct recommendation is found for the user

accuracy_percentage = (correct_predictions / total_test_users) * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 100.00%


In [13]:
# Function to get similar items based on NearestNeighbors
def get_similar_items_nn(movie_index):
    distances, indices = nn.kneighbors(tfidf_matrix[movie_index])
    similar_items = df.iloc[indices[0]]['title'].tolist()
    return similar_items

In [14]:
# Function to get similar items based on Collaborative Filtering
def get_similar_items_cf(movie_title, top_n=5):
    movie_ratings = user_movie_ratings[movie_title].values.reshape(1, -1)
    
    # Calculate similarity scores using cosine_similarity
    similar_scores = cosine_similarity(movie_ratings, movie_user_ratings)
    
    # Extract the similarity scores for the given movie
    similarity_scores_for_movie = similar_scores.flatten()
    
    # Create a DataFrame with movie titles and similarity scores
    similar_movies_df = pd.DataFrame({'movie': movie_user_ratings.index, 'similarity': similarity_scores_for_movie})
    
    # Sort by similarity and get the top N
    similar_movies_df = similar_movies_df.sort_values(by='similarity', ascending=False).head(top_n)
    
    return similar_movies_df['movie'].tolist()

In [15]:
# Function to get hybrid recommendations (combining CF and CB)
def get_hybrid_recommendations(selected_movies, top_n=5):
    cf_recommendations = Parallel(n_jobs=-1)(delayed(get_similar_items_cf)(movie_title, top_n=top_n) for movie_title in selected_movies)
    cf_recommendations = [item for sublist in cf_recommendations for item in sublist]
    
    cb_recommendations = Parallel(n_jobs=-1)(delayed(get_similar_items_nn)(df[df['title'] == movie_title].index[0]) for movie_title in selected_movies)
    cb_recommendations = [item for sublist in cb_recommendations for item in sublist]
    
    genre_filter = set().union(*(tuple(genre) for movie_title in selected_movies for genre in df[df['title'] == movie_title]['genres']))
    cb_recommendations_filtered = [movie for movie in cb_recommendations if any(set(genre) & genre_filter for genre in df[df['title'] == movie]['genres'])]
    
    hybrid_recommendations = list(set(cf_recommendations + cb_recommendations_filtered))[:top_n]
    
    return hybrid_recommendations


In [34]:
# Function to allow the user to choose movies from a random list
def choose_movies():
    random_movies = random.sample(df['title'].tolist(), 10)
    
    print("Choose up to 5 movies from the list:")
    selected_movies = []
    count = 0
    while count < 5:
        print(f"{count + 1}. {random_movies[count]}")
        choice = input("Enter 'yes' if you like this movie or 'no' if you don't: ").lower()
        if choice == 'yes':
            selected_movies.append(random_movies[count])
            count += 1
        elif choice == 'no':
            random_movies.pop(count)
            random_movies.append(random.choice(df['title'].tolist()))
        elif choice != 'no':
            print("Invalid choice. Please enter 'yes' or 'no'.")
    return selected_movies

In [35]:
# Function to allow the user to redraw recommendations
def redraw_recommendations():
    while True:
        selected_movies = choose_movies()
        hybrid_recommendations = get_hybrid_recommendations(selected_movies, top_n=5)
        print(f'Hybrid recommendations based on user-selected movies:\n{hybrid_recommendations}')

        redraw = input("Do you want to redraw recommendations? Enter 'yes' or 'no': ").lower()
        if redraw != 'yes':
            break

# Call the function to start the recommendation process
redraw_recommendations()

Choose up to 5 movies from the list:
1. Pursuit of Happyness, The (2006)
Invalid choice. Please enter 'yes' or 'no'.
1. Pursuit of Happyness, The (2006)
2. Cyrano de Bergerac (1990)
2. Candyman: Farewell to the Flesh (1995)
Invalid choice. Please enter 'yes' or 'no'.
2. Candyman: Farewell to the Flesh (1995)
2. Batman Forever (1995)
3. Shanghai Noon (2000)
4. Untouchables, The (1987)
4. To Live and Die in L.A. (1985)
4. Gattaca (1997)
Invalid choice. Please enter 'yes' or 'no'.
4. Gattaca (1997)
Invalid choice. Please enter 'yes' or 'no'.
4. Gattaca (1997)
4. Ernest Goes to Camp (1987)
4. Goodfellas (1990)
4. Million Dollar Baby (2004)
5. Pulp Fiction (1994)
Hybrid recommendations based on user-selected movies:
['Fargo (1996)', '3:10 to Yuma (2007)', 'Donnie Darko (2001)', 'The Revenant (2015)', 'True Lies (1994)']


In [24]:
df = temp

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming 'df' is your original DataFrame with user-item interactions

# Splitting data into train and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Function to evaluate recommendations
def evaluate_recommendations(recommendation_function, test_data, top_n=5):
    user_ids = test_data['userId'].unique()
    precision_scores = []
    recall_scores = []

    for user_id in user_ids:
        user_test_data = test_data[test_data['userId'] == user_id]
        selected_movies = user_test_data['title'].tolist()

        recommended_movies = recommendation_function(selected_movies, top_n=top_n)
        
        # Compare recommended movies with test data for this user
        true_positives = len(set(selected_movies) & set(recommended_movies))
        precision = true_positives / top_n
        recall = true_positives / len(selected_movies) if len(selected_movies) > 0 else 0

        precision_scores.append(precision)
        recall_scores.append(recall)

    avg_precision = sum(precision_scores) / len(precision_scores)
    avg_recall = sum(recall_scores) / len(recall_scores)

    return avg_precision, avg_recall

# Example: Evaluating Collaborative Filtering (CF) recommendations
avg_precision_cf, avg_recall_cf = evaluate_recommendations(get_similar_items_cf, test_data)
print(f"Collaborative Filtering - Avg Precision: {avg_precision_cf}, Avg Recall: {avg_recall_cf}")

# Example: Evaluating Hybrid recommendations
avg_precision_hybrid, avg_recall_hybrid = evaluate_recommendations(get_hybrid_recommendations, test_data)
print(f"Hybrid Model - Avg Precision: {avg_precision_hybrid}, Avg Recall: {avg_recall_hybrid}")


KeyError: 'userId'