In [27]:
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [28]:
# Connect to the SQLite database
conn = sqlite3.connect('movie_ratings_db.sqlite')

In [29]:
# Query the data from the ratings table and join with the movies table
query = """
    SELECT r.userId, r.movieId, r.rating, m.title, m.genres
    FROM ratings r
    JOIN movies m ON r.movieId = m.movieId
"""
df = pd.read_sql_query(query, conn)

In [30]:
# Assuming the genres column is in the format "Genre1|Genre2|Genre3"
# Convert genres into a list
df['genres'] = df['genres'].str.split('|')

# Display the processed dataframe
print(df.head())


   userId  movieId  rating                        title  \
0       1        1     4.0             Toy Story (1995)   
1       1        3     4.0      Grumpier Old Men (1995)   
2       1        6     4.0                  Heat (1995)   
3       1       47     5.0  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0   Usual Suspects, The (1995)   

                                              genres  
0  [Adventure, Animation, Children, Comedy, Fantasy]  
1                                  [Comedy, Romance]  
2                          [Action, Crime, Thriller]  
3                                [Mystery, Thriller]  
4                         [Crime, Mystery, Thriller]  


In [31]:
# Content-Based Filtering (using Genres)
df['genres_str'] = df['genres'].apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['genres_str'])

In [32]:
# Approximate Nearest Neighbors with NearestNeighbors
nn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')
nn.fit(tfidf_matrix)

In [33]:
# Function to get similar items based on NearestNeighbors
def get_similar_items_nn(movie_index):
    distances, indices = nn.kneighbors(tfidf_matrix[movie_index])
    similar_items = df.iloc[indices[0]]['title'].tolist()
    return similar_items

In [34]:
# Example: Get similar movies for a given movie index
movie_index = 0  # Replace with the actual movie index
similar_movies_nn = get_similar_items_nn(movie_index)
print(f'Similar movies (Nearest Neighbors) for {df.iloc[movie_index]["title"]}:\n{similar_movies_nn}')

Similar movies (Nearest Neighbors) for Toy Story (1995):
['Toy Story 2 (1999)', 'Monsters, Inc. (2001)', 'Toy Story (1995)', 'Toy Story (1995)', 'Toy Story (1995)']


In [35]:
# Collaborative Filtering (User-Item Interactions)
user_movie_ratings = df.pivot_table(index='userId', columns='title', values='rating')
user_movie_ratings = user_movie_ratings.fillna(0)
movie_user_ratings = user_movie_ratings.T
movie_similarity = cosine_similarity(movie_user_ratings)


In [58]:
# Function to get similar items based on Collaborative Filtering
def get_similar_items_cf(movie_title, top_n=5):
    movie_ratings = user_movie_ratings[movie_title]
    
    # Ensure the movie_ratings is a 2D array (column vector)
    movie_ratings = movie_ratings.values.reshape(-1, 1)
    
    # Calculate similarity scores using a loop
    similar_scores = [np.sum(movie_ratings.T * movie_similarity[:, i].reshape(-1, 1)) for i in range(movie_similarity.shape[0])]
    
    # Create a DataFrame with movie titles and similarity scores
    similar_movies_df = pd.DataFrame({'movie': movie_user_ratings.index, 'similarity': similar_scores})
    
    # Sort by similarity and get the top N
    similar_movies_df = similar_movies_df.sort_values(by='similarity', ascending=False).head(top_n)
    
    return similar_movies_df['movie'].tolist()



In [59]:
# Function to get hybrid recommendations (combining CF and CB)
def get_hybrid_recommendations(movie_title, top_n=5):
    # Get similar movies using collaborative filtering
    cf_recommendations = get_similar_items_cf(movie_title, top_n=top_n)
    
    # Get similar movies using content-based filtering with genre filter
    movie_index = df[df['title'] == movie_title].index[0]
    cb_recommendations = get_similar_items_nn(movie_index)
    
    # Filter content-based recommendations based on genre relevance
    genre_filter = df[df['title'] == movie_title]['genres'].values[0]
    cb_recommendations_filtered = [movie for movie in cb_recommendations if any(genre in df[df['title'] == movie]['genres'].values[0] for genre in genre_filter)]
    
    # Combine the recommendations from both methods
    hybrid_recommendations = list(set(cf_recommendations + cb_recommendations_filtered))[:top_n]
    
    return hybrid_recommendations


In [60]:
# Example: Get hybrid recommendations for a given movie title
movie_title = "Toy Story (1995)"  # Replace with the actual movie title
hybrid_recommendations = get_hybrid_recommendations(movie_title, top_n=5)
print(f'Your recommendations based off {movie_title}:\n{hybrid_recommendations}')


Your recommendations based off Toy Story (1995):
['Toy Story 2 (1999)', 'Monsters, Inc. (2001)', 'Earth Girls Are Easy (1988)', 'SpaceCamp (1986)', 'Woman in Red, The (1984)']
