In [1]:
# --- IMPORTS ---
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# --- DATA PREPARATION ---
def load_and_prepare_ubcf_data(ratings_path, movies_path, links_path, min_movie_ratings=15, min_user_ratings=15):
    """Load the ratings/movies/links CSVs and drop sparse users and movies.

    Args:
        ratings_path: Path to a CSV with at least ``userId`` and ``movieId``
            columns (one row per rating).
        movies_path: Path to a CSV with a ``movieId`` column.
        links_path: Path to a CSV with ``movieId`` and ``tmdbId`` columns.
        min_movie_ratings: A movie is kept only if it has at least this many
            ratings in the unfiltered data.
        min_user_ratings: A user is kept only if they gave at least this many
            ratings in the unfiltered data.

    Returns:
        A ``(ratings, movies)`` tuple: the filtered ratings frame and the
        movies frame with a ``tmdbId`` column attached.
    """
    print("Loading datasets from user-based-datasets folder...")
    ratings = pd.read_csv(ratings_path)
    movies = pd.read_csv(movies_path)
    links = pd.read_csv(links_path)

    # Attach tmdbId for posters later. Keep one link row per movie so the
    # left merge can never multiply movie rows.
    movie_links = links[['movieId', 'tmdbId']].drop_duplicates(subset='movieId')
    movies = pd.merge(movies, movie_links, on='movieId', how='left')

    # --- COMBATING SPARSITY ---
    # Both counts are taken on the unfiltered ratings and applied in a single
    # pass, so a surviving user/movie may end up slightly below its threshold
    # once the other filter has removed some of its rows.
    movie_counts = ratings['movieId'].value_counts()
    user_counts = ratings['userId'].value_counts()

    popular_movies = movie_counts[movie_counts >= min_movie_ratings].index
    active_users = user_counts[user_counts >= min_user_ratings].index

    ratings = ratings[
        ratings['movieId'].isin(popular_movies) &
        ratings['userId'].isin(active_users)
    ]

    print(f"Data filtered. Remaining ratings: {len(ratings)}")
    return ratings, movies

# --- MATRIX CREATION & MATH ---
def create_ubcf_matrices(ratings):
    """Build the user-item matrices and the user-user similarity table.

    Args:
        ratings: Frame with ``userId``, ``movieId`` and ``rating`` columns.
            ``pivot`` requires each (userId, movieId) pair to appear once.

    Returns:
        A 4-tuple ``(raw, centered, means, similarity)``:
        * raw -- users x movies matrix of ratings, NaN where unrated.
        * centered -- raw minus each user's mean rating, NaN where unrated.
        * means -- per-user mean rating (NaNs skipped).
        * similarity -- users x users cosine similarity of the centered
          rows after unrated cells are set to 0.
    """
    print("Creating User-Item matrices...")

    # Wide layout: one row per user, one column per movie.
    raw_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

    # Centering removes each user's personal rating bias (harsh vs. generous).
    per_user_mean = raw_matrix.mean(axis=1)
    centered_matrix = raw_matrix.sub(per_user_mean, axis=0)

    print("Calculating Pearson Correlation (Mean-Centered Cosine Similarity)...")
    # After centering, 0 stands for "rated exactly at this user's average",
    # so it is used as the neutral fill for unrated cells. Note that the
    # cosine is taken over every movie column, not only co-rated ones, so
    # this approximates rather than equals a co-rated Pearson correlation.
    user_ids = raw_matrix.index
    similarity_frame = pd.DataFrame(
        cosine_similarity(centered_matrix.fillna(0)),
        index=user_ids,
        columns=user_ids,
    )

    return raw_matrix, centered_matrix, per_user_mean, similarity_frame

# --- UPGRADED RECOMMENDATION LOGIC ---
def get_user_recommendations(target_user_id, user_movie_matrix, user_movie_matrix_norm, user_means, user_similarity_df, movies_df, num_recommendations=5, num_neighbors=15):
    """Recommend unseen movies to a user from their most similar neighbours.

    The predicted rating for a movie is the target user's mean rating plus
    the similarity-weighted average of the neighbours' mean-centered ratings
    for that movie, using only neighbours who actually rated it.

    Args:
        target_user_id: Label of the user in ``user_similarity_df``.
        user_movie_matrix: users x movies raw ratings, NaN where unrated.
        user_movie_matrix_norm: Same layout, each row minus that user's mean.
        user_means: Per-user mean rating, indexed by user id.
        user_similarity_df: Square users x users similarity table.
        movies_df: Catalogue with ``movieId``, ``tmdbId``, ``title`` and
            ``genres`` columns.
        num_recommendations: Maximum number of movies to return.
        num_neighbors: Maximum number of most-similar users to consult.

    Returns:
        A DataFrame with columns ``movieId``, ``tmdbId``, ``title``,
        ``genres`` ordered from highest to lowest predicted rating, or an
        explanatory string when the user is unknown or has no neighbour with
        strictly positive similarity.
    """
    if target_user_id not in user_similarity_df.index:
        return "Target user not found."

    # 1. Target user's baseline and the movies they have already rated.
    target_mean = user_means[target_user_id]
    target_user_movies = user_movie_matrix.loc[target_user_id].dropna().index

    # 2. Top-N neighbours with strictly positive similarity.
    # The target is removed by label rather than by skipping the first sorted
    # entry: a tie or floating-point noise can sort another user ahead of the
    # target's self-similarity, and slicing [1:] would then discard a real
    # neighbour while keeping the target itself.
    similar_users = (
        user_similarity_df[target_user_id]
        .drop(labels=target_user_id, errors='ignore')
        .sort_values(ascending=False)
        .head(num_neighbors)
    )
    similar_users = similar_users[similar_users > 0]

    if similar_users.empty:
        return "No strictly positive matches found for this user."

    # 3. Neighbour ratings restricted to movies the target has not rated.
    neighbor_ids = similar_users.index
    neighbors_norm_ratings = user_movie_matrix_norm.loc[neighbor_ids].drop(columns=target_user_movies, errors='ignore')
    neighbors_raw_ratings = user_movie_matrix.loc[neighbor_ids].drop(columns=target_user_movies, errors='ignore')

    weights = similar_users.values

    # 4. Numerator: sum of (centered rating * similarity). NaN -> 0 so that
    # neighbours who did not rate a movie contribute nothing to its sum.
    weighted_norm_ratings = neighbors_norm_ratings.fillna(0).multiply(weights, axis=0).sum(axis=0)

    # 5. Denominator: sum of the weights of neighbours who rated the movie.
    # All weights are > 0 here, so this equals the sum of absolute weights.
    sum_of_weights = neighbors_raw_ratings.notna().multiply(weights, axis=0).sum(axis=0)

    # 6. Prediction. A zero denominator means no neighbour rated the movie;
    # it becomes NaN and is dropped rather than dividing by zero.
    predicted_ratings = target_mean + (weighted_norm_ratings / sum_of_weights.replace(0, np.nan))
    predicted_ratings = predicted_ratings.dropna().sort_values(ascending=False)

    # Only movies present in the catalogue can be returned; filtering before
    # head() avoids a KeyError and still fills the list when possible.
    catalog = movies_df.set_index('movieId')
    predicted_ratings = predicted_ratings[predicted_ratings.index.isin(catalog.index)]

    # A plain list keeps the catalogue's 'movieId' index name, so
    # reset_index() always yields a 'movieId' column.
    recommended_movie_ids = list(predicted_ratings.head(num_recommendations).index)
    recommendations = catalog.loc[recommended_movie_ids].reset_index()

    return recommendations[['movieId', 'tmdbId', 'title', 'genres']]

# =====================================================================
# EXECUTION & EXPORT BLOCK
# =====================================================================
RATING_PATH = 'datasets/user-based-datasets/ratings.csv'
MOVIES_PATH = 'datasets/user-based-datasets/movies.csv'
LINKS_PATH = 'datasets/user-based-datasets/links.csv'

# Stage 1: read the CSVs and drop sparse users/movies.
ratings_df, movies_df = load_and_prepare_ubcf_data(RATING_PATH, MOVIES_PATH, LINKS_PATH)

# Stage 2: build the raw matrix, centered matrix, user means and similarity table.
user_movie_matrix, user_movie_matrix_norm, user_means, similarity_df = create_ubcf_matrices(ratings_df)

print("\nExporting models...")
os.makedirs('models/user-based-models', exist_ok=True)

# Stage 3: pickle every object the recommender needs, one file per object.
# The pairs are written in the listed order.
for artifact_path, artifact in (
    ('models/user-based-models/user_item_matrix.pkl', user_movie_matrix),
    ('models/user-based-models/user_item_matrix_norm.pkl', user_movie_matrix_norm),
    ('models/user-based-models/user_means.pkl', user_means),
    ('models/user-based-models/user_similarity.pkl', similarity_df),
    ('models/user-based-models/ubcf_movies.pkl', movies_df),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("✅ Success! Upgraded User-Based models have been saved to 'models/user-based-models/'.")

Loading datasets from user-based-datasets folder...
Data filtered. Remaining ratings: 73845
Creating User-Item matrices...
Calculating Pearson Correlation (Mean-Centered Cosine Similarity)...

Exporting models...
✅ Success! Upgraded User-Based models have been saved to 'models/user-based-models/'.
