In [7]:
# Step 1: Data Preprocessing
# We'll start by loading and preprocessing the data.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the data
file_path = 'TMDB_Movies.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Convert release_date to datetime; values that cannot be parsed become NaT
data['release_date'] = pd.to_datetime(data['release_date'], errors='coerce')

# Drop movies without an overview, then rebuild a contiguous 0..n-1 index.
# The TF-IDF matrix built below (and everything derived from it, such as the
# cosine-similarity matrix) is addressed by row *position*. Without the reset,
# the dropped rows leave gaps in the index labels and any later lookup that
# uses a label as a position points at the wrong movie.
data = data.dropna(subset=['overview']).reset_index(drop=True)

# Min-max scale popularity and vote_average
scaler = MinMaxScaler()
data[['popularity', 'vote_average']] = scaler.fit_transform(data[['popularity', 'vote_average']])

# Prepare content features using TF-IDF for 'overview'
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['overview'])

# Append the dense TF-IDF features as extra columns, aligned on the row index
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=data.index)
data = pd.concat([data, tfidf_df], axis=1)

# Save the preprocessed data to a CSV file (the row index is written as the
# first column, as before)
preprocessed_file_path = 'preprocessed_data.csv'
data.to_csv(preprocessed_file_path)


   Unnamed: 0     id                        title  \
0           0    238                The Godfather   
1           1    278     The Shawshank Redemption   
2           2    240        The Godfather Part II   
3           3  19404  Dilwale Dulhania Le Jayenge   
4           4    424             Schindler's List   

                                            overview release_date  popularity  \
0  Spanning the years 1945 to 1955, a chronicle o...   1972-03-14     127.351   
1  Framed in the 1940s for the double murder of h...   1994-09-23      91.282   
2  In the continuing saga of the Corleone crime f...   1974-12-20      67.617   
3  Raj is a rich, carefree, happy-go-lucky second...   1995-10-20      34.208   
4  The true story of how businessman Oskar Schind...   1993-12-15      56.547   

   vote_average  vote_count  
0           8.7       18285  
1           8.7       24196  
2           8.6       11033  
3           8.6        4183  
4           8.6       14301  


In [8]:
# Number of rows left after preprocessing
data.shape[0]

9998

In [9]:
# Step 2: Collaborative Filtering Implementation
# We will use matrix factorization with Singular Value Decomposition (SVD) for collaborative filtering.

from scipy.sparse.linalg import svds

# Assume we have a user-item interaction matrix 'user_item_matrix'
# Example: user_item_matrix[user_id][movie_id] = rating
# For the purpose of this example, we will simulate this matrix
import numpy as np

# Simulated interactions: one row per user, one column per movie in `data`,
# filled with seeded uniform random values so the run is reproducible.
n_users = 100
n_movies = len(data)
np.random.seed(42)
user_item_matrix = np.random.rand(n_users, n_movies)

# Truncated SVD keeping 50 singular values; svds returns them as a 1-D array,
# so they are expanded into a diagonal matrix for the reconstruction below.
U, sigma, Vt = svds(user_item_matrix, k=50)
sigma = np.diag(sigma)

# Low-rank reconstruction: U * Sigma * Vt
predicted_ratings = U.dot(sigma).dot(Vt)

# One column per movie, labelled with the row labels of `data`
predicted_ratings_df = pd.DataFrame(data=predicted_ratings, columns=data.index)


In [10]:
# Step 3: Content-Based Filtering Implementation
# We already prepared the TF-IDF matrix for content-based filtering. Now, we will use cosine similarity to find similar movies.

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of overviews
# (with a single argument, the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get movie recommendations based on content similarity
def get_content_recommendations(movie_id, num_recommendations=10):
    """Return the titles of the movies most similar to a given movie.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    and columns follow the row order of ``data``.

    Args:
        movie_id: Value to look up in the ``id`` column of ``data``.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A ``pandas.Series`` of titles ordered from most to least similar. The
        Series keeps the row labels of ``data`` as its index and never
        contains the seed movie itself.

    Raises:
        IndexError: If ``movie_id`` does not occur in ``data['id']``.
    """
    # cosine_sim is positional, so locate the movie by row position rather
    # than by index label (the two differ whenever rows have been dropped).
    matches = np.flatnonzero((data['id'] == movie_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in data['id']")
    movie_pos = int(matches[0])

    # Stable descending sort: ties keep their original row order.
    scores = np.asarray(cosine_sim[movie_pos])
    ranked = np.argsort(-scores, kind='stable')

    # Remove the seed explicitly instead of assuming it is ranked first; that
    # assumption breaks for all-zero TF-IDF rows and for duplicate overviews.
    ranked = ranked[ranked != movie_pos][:max(num_recommendations, 0)]
    return data['title'].iloc[ranked]

# Example usage: the five titles most similar to the movie whose id is 123
print(get_content_recommendations(123, num_recommendations=5))


6815                                         The Blackout
2567                                              Mummies
39      The Lord of the Rings: The Fellowship of the Ring
45                  The Lord of the Rings: The Two Towers
7258                        Tom and Jerry: The Magic Ring
Name: title, dtype: object


In [11]:
# Step 4: Hybrid Model Construction
# We will combine the collaborative and content-based approaches by averaging their scores.

def hybrid_recommendations(user_id, movie_id, num_recommendations=10):
    """Blend collaborative-filtering and content-based scores into one ranking.

    Candidates are the user's top predicted movies plus the movies returned by
    ``get_content_recommendations`` for ``movie_id``. Each candidate is scored
    with the mean of its predicted rating and its cosine similarity to the
    seed movie.

    Args:
        user_id: Row label of the user in ``predicted_ratings_df``.
        movie_id: Value in ``data['id']`` identifying the seed movie.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of titles ordered by descending combined score. The seed movie
        itself is never returned.

    Raises:
        KeyError: If ``user_id`` is not a row of ``predicted_ratings_df``.
        IndexError: If ``movie_id`` does not occur in ``data['id']``.
    """
    n_top = max(num_recommendations, 0)

    # Column k of predicted_ratings_df, row k of data and row/column k of
    # cosine_sim all describe the same movie, so everything below works on
    # row positions rather than on index labels or raw movie ids.
    user_scores = predicted_ratings_df.loc[user_id].to_numpy()
    cf_positions = np.argsort(-user_scores, kind='stable')[:n_top]

    # Content-based candidates come back as a Series keyed by data's row
    # labels; translate those labels to positions.
    content_recommendations = get_content_recommendations(movie_id, num_recommendations)
    content_positions = data.index.get_indexer(content_recommendations.index)

    # The similarity row must be selected by the seed's position, not its id.
    seed_pos = int(np.flatnonzero((data['id'] == movie_id).to_numpy())[0])
    content_scores = cosine_sim[seed_pos]

    combined_scores = {}
    for pos in [*cf_positions, *content_positions]:
        pos = int(pos)
        # Skip unresolved labels (-1) and the seed movie itself.
        if pos < 0 or pos == seed_pos:
            continue
        combined_scores[pos] = (user_scores[pos] + content_scores[pos]) / 2

    # Sort combined scores
    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)[:n_top]
    return [data['title'].iloc[pos] for pos, _ in ranked]

# Example usage: five blended recommendations for user 10, seeded with movie id 123
print(hybrid_recommendations(10, 123, num_recommendations=5))


['Blood Red Sky', 'Fading Gigolo', 'Death Race 2', "Don't Torture a Duckling", 'The White Sheik']


In [12]:
# Step 5: Model Training and Evaluation
# To evaluate the models, we'll use metrics like RMSE for collaborative filtering and precision/recall for content-based and hybrid models.

from sklearn.metrics import mean_squared_error
from math import sqrt

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Evaluate collaborative filtering
def evaluate_cf(predicted_ratings, actual_ratings):
    """Return the root-mean-squared error of the predictions against the actual ratings."""
    mse = mean_squared_error(actual_ratings, predicted_ratings)
    return sqrt(mse)

# Simulate actual ratings for evaluation
# NOTE(review): this is the very matrix the SVD was fitted on, so an RMSE
# computed against it measures reconstruction error, not held-out accuracy.
actual_ratings = user_item_matrix  # In practice, this would be the actual ratings from a test set

# Evaluate content-based filtering
def _precision_recall(recommended_items, relevant_items):
    """Return (precision, recall) of a recommendation list against the relevant items.

    Precision is hits / number of recommended items and recall is
    hits / number of relevant items, where a hit is a distinct value present
    in both collections. A ratio whose denominator is empty is reported as 0.0
    instead of raising ZeroDivisionError.
    """
    hits = len(set(recommended_items) & set(relevant_items))
    precision = hits / len(recommended_items) if len(recommended_items) else 0.0
    recall = hits / len(relevant_items) if len(relevant_items) else 0.0
    return precision, recall


def evaluate_content_based(movie_id, actual_data, num_recommendations=10):
    """Score the content-based recommendations for one movie.

    Args:
        movie_id: Seed movie passed to ``get_content_recommendations``.
        actual_data: DataFrame whose ``title`` column is treated as the set of
            relevant items.
        num_recommendations: Number of recommendations to request.

    Returns:
        A ``(precision, recall)`` tuple of floats.
    """
    recommendations = get_content_recommendations(movie_id, num_recommendations)
    return _precision_recall(recommendations, actual_data['title'].values)


# Evaluate hybrid model
def evaluate_hybrid(user_id, movie_id, actual_data, num_recommendations=10):
    """Score the hybrid recommendations for one user/movie pair.

    Args:
        user_id: User passed to ``hybrid_recommendations``.
        movie_id: Seed movie passed to ``hybrid_recommendations``.
        actual_data: DataFrame whose ``title`` column is treated as the set of
            relevant items.
        num_recommendations: Number of recommendations to request.

    Returns:
        A ``(precision, recall)`` tuple of floats.
    """
    recommendations = hybrid_recommendations(user_id, movie_id, num_recommendations)
    return _precision_recall(recommendations, actual_data['title'].values)

# Example evaluation
cf_rmse = evaluate_cf(predicted_ratings, actual_ratings)
print(f"Collaborative Filtering RMSE: {cf_rmse}")

# The first movie in `data` seeds both the content-based and the hybrid evaluation
seed_movie_id = data['id'].iloc[0]

content_precision, content_recall = evaluate_content_based(
    movie_id=seed_movie_id,
    actual_data=test_data,
    num_recommendations=10,
)
print(f"Content-Based Filtering Precision: {content_precision}, Recall: {content_recall}")

hybrid_precision, hybrid_recall = evaluate_hybrid(
    user_id=10,
    movie_id=seed_movie_id,
    actual_data=test_data,
    num_recommendations=10,
)
print(f"Hybrid Model Precision: {hybrid_precision}, Recall: {hybrid_recall}")


Collaborative Filtering RMSE: 0.1952269266979494
Content-Based Filtering Precision: 0.1, Recall: 0.0005
Hybrid Model Precision: 0.5, Recall: 0.0025


In [13]:
# Step 6: Save the Model Components

import pickle
import numpy as np

# Pickle the fitted TF-IDF vectorizer and the three SVD factors, one file each
for pickle_path, component in (
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('U_matrix.pkl', U),
    ('sigma_matrix.pkl', sigma),
    ('Vt_matrix.pkl', Vt),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(component, f)

# The cosine similarity matrix goes to NumPy's binary .npy format
np.save('cosine_sim.npy', cosine_sim)

# The predicted ratings DataFrame is pickled through pandas
predicted_ratings_df.to_pickle('predicted_ratings_df.pkl')


In [14]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
import threading
import time

# Load existing components and models
# NOTE: unpickling can execute arbitrary code; only load files that were
# written by a trusted run of this notebook.

def _load_pickle(path):
    """Read and return the single object pickled at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


data = pd.read_csv('preprocessed_data.csv')

tfidf_vectorizer = _load_pickle('tfidf_vectorizer.pkl')
U = _load_pickle('U_matrix.pkl')
sigma = _load_pickle('sigma_matrix.pkl')
Vt = _load_pickle('Vt_matrix.pkl')

cosine_sim = np.load('cosine_sim.npy')
predicted_ratings_df = pd.read_pickle('predicted_ratings_df.pkl')

def get_content_recommendations(movie_id, num_recommendations=10):
    """Return the movies most similar to a given movie as (id, title) pairs.

    Similarity is read from the module-level ``cosine_sim`` matrix, which is
    addressed by the row position of each movie in ``data``.

    Args:
        movie_id: Value to look up in the ``id`` column of ``data``.
        num_recommendations: Maximum number of pairs to return.

    Returns:
        A list of ``(movie_id, title)`` tuples ordered from most to least
        similar; the seed movie itself is never included.

    Raises:
        IndexError: If ``movie_id`` does not occur in ``data['id']``.
    """
    # Locate the movie by row position so the lookup does not depend on the
    # index labels of `data`.
    matches = np.flatnonzero((data['id'] == movie_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in data['id']")
    movie_pos = int(matches[0])

    # Stable descending sort: ties keep their original row order.
    scores = np.asarray(cosine_sim[movie_pos])
    ranked = np.argsort(-scores, kind='stable')

    # Remove the seed explicitly instead of dropping whatever is ranked first:
    # with duplicate overviews or an all-zero TF-IDF row the seed is not
    # guaranteed to lead its own similarity list.
    ranked = ranked[ranked != movie_pos][:max(num_recommendations, 0)]
    return [(int(data['id'].iloc[pos]), data['title'].iloc[pos]) for pos in ranked]

def get_popularity_recommendations(num_recommendations=10):
    """Return the most popular movies as (id, title) pairs.

    Args:
        num_recommendations: Number of movies to return.

    Returns:
        A list of ``(movie_id, title)`` tuples ordered by descending
        ``popularity``.
    """
    # Read id/title straight from the sorted rows. Taking the sorted index
    # labels and feeding them to .iloc is only correct while the labels
    # happen to equal the row positions.
    top_movies = data.sort_values('popularity', ascending=False).head(num_recommendations)
    return [(int(movie_id), title) for movie_id, title in zip(top_movies['id'], top_movies['title'])]

def hybrid_recommendations(user_id=None, movie_id=None, num_recommendations=10):
    """Recommend movies, choosing a strategy from the information available.

    * Unknown or missing user, movie given: content-based recommendations.
    * Unknown or missing user, no movie: most popular movies.
    * Known user, no movie: the user's highest predicted ratings.
    * Known user and movie: the user's top predictions plus the content-based
      candidates, ranked by the mean of predicted rating and cosine similarity
      to the seed movie (the seed itself is excluded).

    Args:
        user_id: Row label in ``predicted_ratings_df``, or None.
        movie_id: Value in ``data['id']`` identifying the seed movie, or None.
        num_recommendations: Maximum number of recommendations to return.

    Returns:
        A list of ``(movie_id, title)`` tuples.

    Raises:
        IndexError: If ``movie_id`` is given but does not occur in ``data['id']``.
    """
    # Cold start: no usable rating row for this user.
    if user_id is None or user_id not in predicted_ratings_df.index:
        if movie_id is not None:
            return get_content_recommendations(movie_id, num_recommendations)
        return get_popularity_recommendations(num_recommendations)

    def as_pairs(positions):
        # Row positions in `data` -> (movie id, title) pairs
        return [(int(data['id'].iloc[pos]), data['title'].iloc[pos]) for pos in positions]

    def score_at(scores, pos):
        # Same fallback as before: a position outside the array contributes 0
        return scores[pos] if pos < len(scores) else 0

    n_top = max(num_recommendations, 0)

    # Predicted ratings are handled by column *position*: column k is assumed
    # to describe the movie in row k of `data`, whatever the column label is.
    user_scores = predicted_ratings_df.loc[user_id].to_numpy()
    cf_positions = [int(pos) for pos in np.argsort(-user_scores, kind='stable')[:n_top]]

    if movie_id is None:
        return as_pairs(cf_positions)

    # get_content_recommendations yields (movie id, title) pairs; map the ids
    # back to row positions before using them as indices.
    content_recommendations = get_content_recommendations(movie_id, num_recommendations)
    movie_ids = data['id'].to_numpy()
    seed_pos = int(np.flatnonzero(movie_ids == movie_id)[0])
    content_positions = [
        int(np.flatnonzero(movie_ids == rec_id)[0]) for rec_id, _ in content_recommendations
    ]
    content_scores = cosine_sim[seed_pos]

    combined_scores = {}
    for pos in [*cf_positions, *content_positions]:
        if pos == seed_pos:
            continue  # never recommend the seed movie back
        combined_scores[pos] = (score_at(user_scores, pos) + score_at(content_scores, pos)) / 2

    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)[:n_top]
    return as_pairs(pos for pos, _ in ranked)



In [15]:
# Example usage for different scenarios
new_user_movie_id = 299534  # Movie ID for "Avengers: Endgame"
existing_user_id = 10  # Assuming user ID 10 exists in predicted_ratings_df

# Each scenario pairs the heading to print with the arguments to pass
example_scenarios = [
    (
        f"Recommendations for new user based on movie ID {new_user_movie_id}:",
        {'user_id': None, 'movie_id': new_user_movie_id},
    ),
    (
        f"Recommendations for existing user with user ID {existing_user_id}:",
        {'user_id': existing_user_id, 'movie_id': None},
    ),
    (
        f"Recommendations for existing user with user ID {existing_user_id} and movie ID {new_user_movie_id}:",
        {'user_id': existing_user_id, 'movie_id': new_user_movie_id},
    ),
    (
        "Recommendations without user ID or movie ID:",
        {'user_id': None, 'movie_id': None},
    ),
]

for heading, scenario_kwargs in example_scenarios:
    recommendations = hybrid_recommendations(num_recommendations=5, **scenario_kwargs)
    print(heading)
    print(recommendations)


Recommendations for new user based on movie ID 299534:
[(299536, 'Avengers: Infinity War'), (271110, 'Captain America: Civil War'), (14613, 'Next Avengers: Heroes of Tomorrow'), (340382, 'Attack on Titan II: End of the World'), (299537, 'Captain Marvel')]
Recommendations for existing user with user ID 10:
[(214030, 'Fading Gigolo'), (760883, 'Blood Red Sky'), (49361, "Don't Torture a Duckling"), (51620, 'Death Race 2'), (43361, 'The White Sheik')]
Recommendations for existing user with user ID 10 and movie ID 299534:
[(760883, 'Blood Red Sky'), (214030, 'Fading Gigolo'), (49361, "Don't Torture a Duckling"), (51620, 'Death Race 2'), (43361, 'The White Sheik')]
Recommendations without user ID or movie ID:
[(346698, 'Barbie'), (298618, 'The Flash'), (667538, 'Transformers: Rise of the Beasts'), (1040148, 'Ruby Gillman, Teenage Kraken'), (447365, 'Guardians of the Galaxy Vol. 3')]
