In [None]:
import numpy as np
import pandas as pd

from pathlib import Path


In [None]:
# Location of the pre-built ranking dataset, relative to this notebook.
rank_path = "../data/processed/ranking_dataset.parquet"

# Read the parquet file into a DataFrame and preview its first rows.
df_rank = pd.read_parquet(path=rank_path)
df_rank.head()


Basic Cleaning & Feature Selection

In [None]:
# Column the classifier is trained to predict, and the model inputs.
target_col = "label"
feature_cols = [
    "user_total_ratings",
    "user_avg_rating",
    "user_rating_std",
    "movie_popularity",
    "movie_avg_rating",
    "svd_score",
]

# Replacement values for missing entries, keyed by column:
#   - user_rating_std  -> 0.0
#   - movie_avg_rating -> the column's own mean
# NOTE(review): the mean is taken over the whole frame, i.e. before the
# train/validation split performed later in this notebook.
fill_values = {
    "user_rating_std": 0.0,
    "movie_avg_rating": df_rank["movie_avg_rating"].mean(),
}
for column, value in fill_values.items():
    df_rank[column] = df_rank[column].fillna(value)

# Feature matrix and target vector used by the cells below.
X = df_rank[feature_cols]
y = df_rank[target_col]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Show the resulting shapes as the cell output.
(X_train.shape, X_val.shape)


Train LightGBM Classifier

In [None]:
import lightgbm as lgb

# LightGBM classifier trained on the ranking features.
lgbm = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,  # -1 means no limit on tree depth
    subsample=0.8,
    # LightGBM ignores `subsample` unless `subsample_freq` is > 0 (its default
    # is 0, which disables bagging). Re-sample rows at every iteration so the
    # 0.8 row fraction above actually takes effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# Fit on the training split and track AUC on the validation split.
# The `verbose` argument of fit() was removed in LightGBM 4.0; per-iteration
# evaluation logging is silenced with the log_evaluation callback instead
# (period=0 disables the log output).
lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    callbacks=[lgb.log_evaluation(period=0)],
)


Quick Evaluation (AUC + Feature Importance)

In [None]:
from sklearn.metrics import roc_auc_score

# Probability of the second class (column 1 of predict_proba) for each
# validation row.
# NOTE(review): assumes `label` is binary so that column 1 is the positive class.
val_pred_proba = lgbm.predict_proba(X_val)[:, 1]

# ROC AUC on the held-out validation split. The stray bare `auc` expression
# that used to sit here was removed: only a cell's last expression is
# displayed, so it had no effect.
auc = roc_auc_score(y_val, val_pred_proba)
print(f"Validation AUC: {auc:.4f}")


Feature Importance

In [None]:
import matplotlib.pyplot as plt

# Per-feature importance scores from the fitted model.
# NOTE(review): pairing with feature_cols assumes the scores come back in the
# same order as the columns of X_train.
importances = lgbm.feature_importances_

# Print the features from most to least important. A reversed sort keeps ties
# in their original order, matching a sort on the negated score.
ranked_features = sorted(
    zip(feature_cols, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature_name, score in ranked_features:
    print(feature_name, score)

# Horizontal bar chart of the same scores, in feature_cols order.
fig, ax = plt.subplots()
ax.barh(feature_cols, importances)
ax.set_xlabel("Importance")
ax.set_title("LightGBM Feature Importances")
plt.show()


Save Model with joblib

In [None]:
import joblib

# Destination of the serialized model, relative to this notebook.
model_path = "../models/lgbm_ranker.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists; otherwise a fresh checkout fails here on Restart & Run All.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier and echo the path as the cell output.
joblib.dump(lgbm, model_path)
model_path


In [None]:
from recommender.ranker import score_candidates

# Smoke test: run the project's scoring helper on the first five validation rows.
# NOTE(review): score_candidates is defined in recommender.ranker, outside this
# notebook; presumably it loads the model saved above. Confirm the expected
# input columns against that module.
sample = X_val.iloc[:5]
scores = score_candidates(sample)
scores
