# Hybrid Movie Recommendation System
#### This script implements a hybrid recommender system using both content-based filtering (TF-IDF + Naive Bayes) and collaborative filtering (user-item matrix + cosine similarity).



### Imports
Core Python libraries, data processing, ML tools, and NLP modules.

In [None]:
import logging
import os
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import ast
import ssl

### NLTK Setup
Download and prepare stopwords and the Porter stemmer.

In [None]:
# Download the NLTK stopword corpus so that stopwords.words() below can load it.
nltk.download('stopwords')
# English stopwords kept as a set; text_cleaning tests membership against it.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by every text_cleaning call.
stemmer = PorterStemmer()

### Logging
Configure logging to output both to file and console.

In [None]:
def setup_logging():
    """Send INFO-level log records to ``training.log`` and to the console.

    The function is safe to call more than once (for example when a notebook
    cell is re-run): the console handler is tagged with a fixed name and is
    only attached if no handler with that name is already on the root logger,
    so log lines are never duplicated.
    """
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    console_handler_name = 'hybrid_recsys_console'

    # basicConfig installs the file handler; it does nothing if the root
    # logger already has handlers.
    logging.basicConfig(
        filename='training.log',
        level=logging.INFO,
        format=log_format
    )

    root_logger = logging.getLogger()
    already_attached = any(
        handler.get_name() == console_handler_name
        for handler in root_logger.handlers
    )
    if not already_attached:
        # create and configure a console handler
        console_handler = logging.StreamHandler()
        console_handler.set_name(console_handler_name)
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(logging.Formatter(log_format))

        # adding console handler to the root logger
        root_logger.addHandler(console_handler)

    logging.info("Logging initialized.")

## Database Connection
Connects to the SQLite database containing ratings and metadata.

In [None]:
def connect_db(db_path='movies.db'):
    """Open and return a SQLite connection for the file at ``db_path``.

    A failed connection is logged and the original ``sqlite3.Error`` is
    re-raised to the caller.
    """
    try:
        connection = sqlite3.connect(db_path)
    except sqlite3.Error as exc:
        logging.error(f"Database connection failed: {exc}")
        raise
    else:
        logging.info("Connected to SQLite database.")
        return connection

## Data Loading
Load ratings and movie metadata from the database or cached CSV files.

In [None]:
def load_data(conn):

    if os.path.exists("ratings_pre.csv") and os.path.exists("items_post.csv"):
        ratings = pd.read_csv("ratings.csv")
        items = pd.read_csv("items_post.csv")
        logging.info("Data loaded from CSV files.")
        return ratings, items
    
    tqdm.pandas()

    # loading ratings data
    ratings_query = """
    SELECT userId, movieId, rating FROM ratings_small
    """


    ratings = pd.read_sql(ratings_query, conn)
    ratings.to_csv("ratings_pre.csv", index=False, sep=',')

    logging.info(f"Ratings loaded: {ratings.shape[0]} rows.")

    # loading movie metadata
    movies_query = """
    SELECT m.id, m.title, m.overview, m.genres, c.cast, c.crew, k.keywords
    FROM movies_metadata m
    INNER JOIN credits c ON c.id = m.id
    INNER JOIN keywords k ON k.id = m.id
    """
    items = pd.read_sql(movies_query, conn)

    items.to_csv("items_pre.csv", index=False, sep=',')

    # turning JSON like columns into strings
    items['genres'] = items['genres'].apply(
        lambda x: " ".join(sorted(genre['name'] for genre in ast.literal_eval(x))) if pd.notnull(x) else ""
    )

    # for cast: removing internal spaces from names, sort, and join
    items['cast'] = items['cast'].apply(
        lambda x: " ".join(
            sorted(member["name"].replace(" ", "") for member in ast.literal_eval(x))
        ) if pd.notnull(x) else ""
    )
    
    # for crew: filter for Director, Screenplay, and Original Story, removing internal spaces from names, sort, and join
    items['crew'] = items['crew'].apply(
        lambda x: " ".join(
            sorted(member["name"].replace(" ", "") for member in ast.literal_eval(x)
                   if member["job"] in ["Director", "Screenplay", "Original Story"])
        ) if pd.notnull(x) else ""
    )
   
    # for keywords: extract  'name' from each dict and join with a space
    items['keywords'] = items['keywords'].apply(
        lambda x: " ".join(d['name'] for d in ast.literal_eval(x)) if pd.notnull(x) else ""
    )
    items.to_csv("items_post.csv", index=False, sep=',')
    
    logging.info(f"Movie metadata loaded: {items.shape[0]} rows.")
    return ratings, items

## Text Cleaning
Basic NLP preprocessing: lowercasing, stemming, stopword removal.

In [None]:
def text_cleaning(text):
    """Normalise free text for vectorisation.

    Lowercases the input, drops every character that is not ``a-z`` or
    whitespace, removes stopwords and stems the remaining tokens.
    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        stemmed = (
            stemmer.stem(token)
            for token in letters_only.split()
            if token not in stop_words
        )
        return ' '.join(stemmed)
    return ""

## Add Tags
Merge external user tags with the items dataframe, if available.

In [None]:
def add_tags(items, path="tags.csv"):
    """Attach a ``tag`` column holding each movie's user tags.

    When ``path`` exists, the tags of each ``movieId`` are joined with spaces
    and left-merged onto ``items``; movies without tags get ``''``. When the
    file is missing, a warning is logged and ``tag`` is set to ``''`` for
    every row.
    """
    if not os.path.exists(path):
        logging.warning(f"{path} not found. Skipping tag integration.")
        items['tag'] = ''
        return items

    raw_tags = pd.read_csv(path)
    raw_tags = raw_tags[['movieId', 'tag']].dropna()
    per_movie = (
        raw_tags.groupby('movieId')['tag']
        .apply(lambda group: ' '.join(map(str, group)))
        .reset_index()
    )
    logging.info(f"Loaded and aggregated tags: {per_movie.shape[0]} movies with tags.")

    merged = items.merge(per_movie, how='left', on='movieId')
    merged['tag'] = merged['tag'].fillna('')
    return merged

## Text Preprocessing
Combines relevant text fields and applies TF-IDF vectorization.

In [None]:
def preprocess_text(items, min_df_threshold=2):
    """Build the text columns on ``items`` and vectorise them with TF-IDF.

    ``items`` is modified in place: missing values are replaced by a single
    space, ``clean_text`` holds the cleaned overview and ``combined_text``
    holds all text fields joined by spaces.

    Args:
        items: Movie metadata containing the text fields listed below.
        min_df_threshold: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        Tuple ``(items, item_features)`` where ``item_features`` is the
        TF-IDF matrix fitted on ``combined_text``.
    """
    # replace missing values everywhere so the string join below never sees NaN
    items.fillna(' ', inplace=True)

    # cleaned overview
    # NOTE(review): clean_text is stored but the vectorizer below is fitted on
    # the uncleaned combined_text -- confirm whether that is intended.
    cleaned_overview = items['overview'].apply(text_cleaning)
    items['clean_text'] = cleaned_overview

    # one space-joined string per movie from every text field
    columns_to_join = ['title', 'overview', 'genres', 'cast', 'crew', 'keywords', 'tag']
    as_strings = items[columns_to_join].astype(str)
    items['combined_text'] = as_strings.agg(' '.join, axis=1)

    # TF-IDF over the combined text
    vectorizer = TfidfVectorizer(max_features=50000, min_df=min_df_threshold)
    item_features = vectorizer.fit_transform(items['combined_text'])
    logging.info("Text data preprocessed and vectorized using TF-IDF.")
    return items, item_features

## Label Creation
Create labels by taking the most common rating for each movie. Used as the target variable for the classifier.

In [None]:
def create_labels(ratings):
    """Return one label per movie: its most frequent rating.

    When several ratings are equally frequent the first value returned by
    ``Series.mode`` is used; a movie whose mode is empty is labelled 0.
    The result has the columns ``movieId`` and ``label``.
    """
    def most_common_rating(movie_ratings):
        modes = movie_ratings.mode()
        if modes.empty:
            return 0
        return modes[0]

    per_movie = ratings.groupby('movieId')['rating'].apply(most_common_rating)
    item_labels = per_movie.reset_index(name='label')
    logging.info("Labels created from ratings.")
    return item_labels

## Merge Labels
Merges labels with the item (movie) metadata dataframe.

In [None]:
def merge_labels(items, labels_df):
    """Left-join per-movie labels onto the movie metadata.

    ``items`` is modified in place first: ``id`` is cast to int and copied
    into ``movieId``. Movies without a label receive 0, and ``label`` is
    returned as an integer column.
    """
    items['id'] = items['id'].astype(int)
    items['movieId'] = items['id']

    merged = items.merge(labels_df, how='left', left_on='id', right_on='movieId')
    merged['label'] = merged['label'].fillna(0).astype(int)

    # both frames carry a movieId column, so the merge suffixes them;
    # keep the metadata side under the plain name
    merged = merged.drop(columns=['movieId_y'], errors='ignore')
    merged = merged.rename(columns={'movieId_x': 'movieId'})

    logging.info("Labels merged with movie metadata.")
    return merged

## Grid Search for Naive Bayes
Tests different values of `alpha` and `fit_prior` using 5-fold cross-validation and selects the best model based on accuracy.

In [None]:
def grid_search(item_features, labels):
    """Tune a ``MultinomialNB`` classifier with 5-fold cross-validation.

    Every combination of ``alpha`` in {0.1, 0.25, 0.5} and ``fit_prior`` in
    {True, False} is scored by accuracy. The best parameters and score are
    logged and printed.

    Returns:
        The best estimator found by the search.
    """
    search_space = {
        'alpha': [0.1, 0.25, 0.5],
        'fit_prior': [True, False],
    }
    search = GridSearchCV(MultinomialNB(), search_space, scoring='accuracy', cv=5)
    search.fit(item_features, labels)

    best_params = search.best_params_
    best_score = search.best_score_
    logging.info(f"Best parameters: {best_params}")
    logging.info(f"Best CV score: {best_score:.4f}")
    print(f"Naive Bayes best parameters: {best_params}")
    print(f"Naive Bayes best CV score: {best_score:.4f}")

    return search.best_estimator_

## Create Rating Matrix
Builds a user-item matrix for collaborative filtering.

In [None]:
def create_rating_matrix(ratings):
    """Pivot the ratings into a user x movie matrix.

    Rows are ``userId``, columns are ``movieId`` and cells hold the rating;
    pairs without a rating are filled with 0.
    """
    pivoted = ratings.pivot(values='rating', index='userId', columns='movieId')
    rating_matrix = pivoted.fillna(0)
    logging.info("Rating matrix created for collaborative filtering.")
    return rating_matrix

## Normalize Ratings
Normalizes ratings by subtracting each user's mean rating.

In [None]:
def normalize_ratings(rating_matrix):
    """Mean-centre each user's ratings, ignoring unrated movies.

    In ``rating_matrix`` a 0 marks a movie the user has not rated. The user
    mean is therefore taken over the non-zero cells only; averaging the whole
    row would drag every mean toward 0 and turn unrated cells into negative
    ratings after centring.

    Args:
        rating_matrix: user x movie DataFrame with 0 for "not rated".

    Returns:
        Tuple ``(norm_ratings, user_means)``. ``norm_ratings`` has the same
        shape as the input, with rated cells shifted by the user's mean and
        unrated cells left at 0. ``user_means`` is a Series indexed by user;
        a user with no ratings gets a mean of 0.
    """
    rated = rating_matrix != 0

    # mean over rated cells only (unrated cells become NaN and are skipped)
    user_means = rating_matrix.where(rated).mean(axis=1).fillna(0.0)

    # centre the rated cells, keep unrated cells neutral
    norm_ratings = rating_matrix.subtract(user_means, axis=0).where(rated, 0.0)

    logging.info("User ratings normalized.")
    return norm_ratings, user_means

## Compute Item Similarity
Calculates cosine similarity between items.

In [None]:
def item_similarity(norm_ratings):
    """Return the item-item cosine similarity matrix.

    The normalised rating matrix is transposed so that each row passed to
    ``cosine_similarity`` is one movie's vector of ratings.
    """
    ratings_by_item = norm_ratings.T
    similarities = cosine_similarity(ratings_by_item)
    logging.info("Item similarity matrix computed. ")
    return similarities

## Movie ID to Index Mapping
Maps movieId to its corresponding column index in the rating matrix.

In [None]:
def id_mapping(rating_matrix):
    """Map every column label of ``rating_matrix`` to its column position."""
    movie_ids = rating_matrix.columns
    movie_id_to_idx = dict(zip(movie_ids, range(len(movie_ids))))
    logging.info("Movie ID to index mapping created.")
    return movie_id_to_idx

## Hybrid Prediction
CF uses item-item similarity and mean-centered user ratings. NB uses text features and a trained classifier. Final prediction is chosen based on:
- NB confidence (difference between top predicted class probabilities)
- Similarity between NB and CF predictions
- Fallbacks when one method lacks enough data

In [None]:
def _cf_prediction(user_id, target_movie_id, rating_matrix, user_means, item_sim,
                   movie_id_to_idx, min_common):
    """Return the item-based CF estimate, or None when none can be computed."""
    if target_movie_id not in movie_id_to_idx or user_id not in rating_matrix.index:
        return None

    user_ratings = rating_matrix.loc[user_id]
    user_mean = user_means.loc[user_id]
    target_idx = movie_id_to_idx[target_movie_id]

    # movies this user has rated (0 marks "not rated"); filtering up front
    # keeps ids, indices, similarities and ratings aligned position by position
    rated_ids = [m_id for m_id in user_ratings[user_ratings != 0].index
                 if m_id in movie_id_to_idx]
    if not rated_ids:
        return None

    rated_indices = [movie_id_to_idx[m_id] for m_id in rated_ids]
    user_norm_ratings = user_ratings[rated_ids].values - user_mean

    # shrink similarities that rest on few co-rating users: the weight grows
    # linearly with the number of users who rated both movies, capped at 1
    target_rated = rating_matrix[target_movie_id] != 0
    co_rated = (rating_matrix.loc[target_rated, rated_ids] != 0).sum(axis=0).to_numpy()
    weights = np.minimum(1.0, co_rated / min_common)
    sim_scores = item_sim[target_idx][rated_indices] * weights

    # similarity-weighted average of the user's mean-centred ratings
    denominator = np.sum(np.abs(sim_scores))
    if denominator > 0:
        return user_mean + np.dot(sim_scores, user_norm_ratings) / denominator
    return None


def _nb_prediction(target_movie_id, items, item_features, nb, user_means):
    """Return ``(prediction, confidence, probas)`` from the NB classifier.

    ``probas`` is None when the movie has no row in ``items``; in that case
    the prediction falls back to the mean of ``user_means`` with confidence 0.
    """
    # positional lookup: rows of item_features are expected to follow the row
    # order of items, whatever the DataFrame index labels are
    positions = np.flatnonzero((items['movieId'] == target_movie_id).to_numpy())
    if positions.size == 0:
        return user_means.mean(), 0, None

    features = item_features[positions[0]]
    nb_pred = nb.predict(features)[0]
    probas = nb.predict_proba(features)[0]
    sorted_probs = np.sort(probas)

    # confidence = gap between the two most probable classes
    if len(sorted_probs) > 1:
        nb_confidence = sorted_probs[-1] - sorted_probs[-2]
    else:
        nb_confidence = 0
    return nb_pred, nb_confidence, probas


def predict_rating(user_id, target_movie_id, rating_matrix, user_means, item_sim,
                   movie_id_to_idx, items, item_features, nb, alpha=0.25, beta=0.4,
                   min_common=50):
    """Predict a rating by combining collaborative filtering and Naive Bayes.

    Decision order:
      1. no CF estimate available      -> NB prediction
      2. NB confidence above ``alpha`` -> NB prediction
      3. NB and CF closer than ``beta`` -> NB prediction
      4. otherwise, the first most-probable NB class label that lies within
         ``beta`` of the CF estimate, else the CF estimate itself.

    Args:
        user_id: User to predict for.
        target_movie_id: Movie to predict.
        rating_matrix: user x movie DataFrame, 0 meaning "not rated".
        user_means: Series of per-user mean ratings.
        item_sim: Item-item similarity matrix indexed via ``movie_id_to_idx``.
        movie_id_to_idx: Mapping from movie id to row/column of ``item_sim``.
        items: Movie metadata with a ``movieId`` column, row-aligned with
            ``item_features``.
        item_features: Feature matrix, one row per row of ``items``.
        nb: Fitted classifier exposing ``predict`` and ``predict_proba``
            (and ``classes_`` for mapping probability columns to labels).
        alpha: NB confidence threshold.
        beta: Maximum distance at which two predictions count as agreeing.
        min_common: Number of co-rating users at which a similarity receives
            full weight.

    Returns:
        The prediction clipped to the range [0, 5].
    """
    cf_pred = _cf_prediction(user_id, target_movie_id, rating_matrix, user_means,
                             item_sim, movie_id_to_idx, min_common)
    nb_pred, nb_confidence, probas = _nb_prediction(
        target_movie_id, items, item_features, nb, user_means)

    if cf_pred is None:
        # CF has no prediction (user/movie not known), fall back to NB
        pred = nb_pred
    elif nb_confidence > alpha:
        # NB is confident, use its prediction
        pred = nb_pred
    elif abs(nb_pred - cf_pred) < beta:
        # NB and CF predictions are close, use NB
        pred = nb_pred
    elif probas is None:
        # no metadata for this movie, so there are no class probabilities to
        # break the tie with; trust CF
        pred = cf_pred
    else:
        # tie-breaking: among the most probable NB classes, take the first
        # whose *label* (not its column index) is close to the CF estimate
        classes = getattr(nb, 'classes_', None)
        if classes is None:
            classes = np.arange(len(probas))
        top_prob = np.max(probas)

        pred = cf_pred
        for class_label, prob in zip(classes, probas):
            if np.isclose(prob, top_prob) and abs(class_label - cf_pred) < beta:
                pred = class_label
                break

    # Ensure prediction is in valid rating range
    return np.clip(pred, 0, 5.0)

## Evaluation
Computes MAE, ROC AUC, and coverage for the hybrid recommendation model.

In [None]:
def evaluate_model(ratings, rating_matrix, user_means, item_sim,
                   movie_id_to_idx, items, item_features, nb):
    """Score the hybrid model on every row of ``ratings``.

    Computes:
      * MAE between true ratings and hybrid predictions;
      * ROC AUC, using the prediction as the score and "true rating above
        the user's mean" as the positive class (``None`` if it cannot be
        computed);
      * coverage: the share of movies in ``items`` that appear in
        ``ratings`` and therefore received at least one prediction.

    The metrics are logged and printed.

    Returns:
        Dict with the keys ``mae``, ``roc_auc`` and ``coverage``.
    """
    predictions = []   # also used as the scores for ROC AUC
    true_values = []
    binary_true = []   # 1 if rating > user average, else 0

    logging.info("Starting model evaluation.")

    # goes through each user-movie-rating in the dataset
    for _, row in tqdm(ratings.iterrows(), total=len(ratings), desc="Evaluating"):
        user = row['userId']
        movie = row['movieId']
        true_rating = row['rating']

        # gets prediction from hybrid model
        pred = predict_rating(user, movie, rating_matrix, user_means, item_sim,
                              movie_id_to_idx, items, item_features, nb)

        predictions.append(pred)
        true_values.append(true_rating)

        # creates binary label: 1 if rating above user's average
        binary_true.append(1 if true_rating > user_means[user] else 0)

    # MAE
    mae = mean_absolute_error(true_values, predictions)

    # ROC AUC for binary classification; best effort, e.g. it is undefined
    # when only one class is present
    try:
        roc_auc = roc_auc_score(binary_true, predictions)
    except Exception as e:
        roc_auc = None
        logging.error(f"ROC AUC calculation failed: {e}")

    # coverage: only movies that exist in the catalogue count as covered, so
    # the ratio cannot exceed 1
    total_movies = set(items['movieId'])
    predicted_movies = set(ratings['movieId']) & total_movies
    coverage = len(predicted_movies) / len(total_movies) if total_movies else 0

    logging.info("Predictions generated.")
    logging.info(f"Hybrid Model MAE: {mae:.4f}")
    if roc_auc is not None:
        logging.info(f"Hybrid Model ROC AUC: {roc_auc:.4f}")
    logging.info(f"Coverage (movies predicted): {coverage:.4f}")

    print(f"Hybrid Model MAE: {mae:.4f}")
    if roc_auc is not None:
        print(f"Hybrid Model ROC AUC: {roc_auc:.4f}")
    print(f"Coverage (movies predicted): {coverage:.4f}")

    return {'mae': mae, 'roc_auc': roc_auc, 'coverage': coverage}


## Main Execution
Coordinates all steps: data loading, preprocessing, training, evaluation.

In [None]:
def main():
    """Run the whole pipeline: load, preprocess, train, evaluate.

    If the database cannot be reached the error is logged and the run
    continues with ``conn=None`` so that ``load_data`` can still use its CSV
    cache. The connection, when one was opened, is closed even if a later
    step fails.
    """
    setup_logging()

    # connecting to the database; fall back to the CSV cache on failure
    conn = None
    try:
        conn = connect_db()
    except Exception as e:
        logging.error(f"Database not reached, try csv backup. Error: {e}")

    try:
        ratings, items = load_data(conn)

        items['movieId'] = items['id']
        items = add_tags(items)

        # preprocessing and computing TF-IDF features
        items, item_features = preprocess_text(items)

        # merging labels with movie metadata
        labels_df = create_labels(ratings)
        items = merge_labels(items, labels_df)

        # binarise: 1 when the most common rating is at least 3
        items['label'] = items['label'].apply(lambda x: 1 if x >= 3 else 0)

        # Pick the labelled rows by position BEFORE resetting the index, so
        # the feature matrix keeps exactly the rows of the labelled movies
        # (indexing with the reset index would take the first N rows).
        # NOTE(review): after this filter every remaining label is 1, so the
        # classifier only ever sees one class -- confirm that is intended.
        labeled_positions = np.flatnonzero((items['label'] > 0).to_numpy())
        labeled_items = items.iloc[labeled_positions].reset_index(drop=True)
        labels = labeled_items['label']
        item_features = item_features[labeled_positions]
        ratings = ratings[ratings['movieId'].isin(labeled_items['movieId'])]

        # tuning via grid search
        nb = grid_search(item_features, labels)
        logging.info("Naive Bayes classifier retrained with optimal hyperparameters.")

        # components for collaborative filtering
        rating_matrix = create_rating_matrix(ratings)
        norm_ratings, user_means = normalize_ratings(rating_matrix)
        item_sim = item_similarity(norm_ratings)
        movie_id_to_idx = id_mapping(rating_matrix)

        # evaluation
        evaluate_model(ratings, rating_matrix, user_means, item_sim,
                       movie_id_to_idx, labeled_items, item_features, nb)
    finally:
        if conn is not None:
            conn.close()
            logging.info("Database connection closed.")

    logging.info("Training process finished.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
