### ***`Supervised Machine Learning`***

In [69]:
import json
import joblib
import logging
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from functools import lru_cache
from collections import Counter
from datetime import datetime, timezone
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

##### `Reading modelpoints dataset`

In [70]:
# Reads model points data from local directory
def loading_modelpoint_data(file_name: str) -> pd.DataFrame:
    """Load a model-points parquet file from the processed-data folder.

    The file is looked up in ``data/processed`` under the parent of the
    current working directory.

    Args:
        file_name: Name of the parquet file, e.g. ``'modelpoint_train.parquet'``.

    Returns:
        The file contents as a DataFrame.
    """
    processed_dir = Path.cwd().parent / 'data/processed'
    return pd.read_parquet(processed_dir / file_name)

##### `Split data modelpoints`

In [71]:
# Load the training and evaluation model points (train first, then eval).
modelpoint_train, modelpoint_eval = (
    loading_modelpoint_data('modelpoint_train.parquet'),
    loading_modelpoint_data('modelpoint_eval.parquet'),
)
# Optional filter, currently disabled:
# modelpoint_train = modelpoint_train[modelpoint_train.item_id != 'no_item_id']

# Preview the first rows of the training frame.
modelpoint_train.head()

Unnamed: 0,user_id,target,item_id,user_item_lifestyle_rate,day_sin,month_sin,week_of_month,is_month_start,item_type_lifestyle,item_type_all,...,total_user_int_by_time_of_day,user_active_rate,active_mode_active,user_unique_segment,weekend_interaction_rate,segment,prev_user_int_freq,item_type_transact,screen_page_screen1,user_unique_items
0,4521,click,ibab,0,-2.449294e-16,0.866025,1,0,0.0,0.0,...,5,0,0.0,1,0.6,1,0.307323,0.0,1.0,2
1,4521,checkout,ibab,0,-2.449294e-16,0.866025,1,0,0.0,0.0,...,5,0,0.0,1,0.6,1,0.13042,0.0,1.0,2
2,14454,click,cafm,0,0.4338837,0.866025,2,0,0.0,0.0,...,3,1,1.0,1,0.0,2,0.228236,1.0,0.0,2
3,14454,checkout,cafm,0,0.4338837,0.866025,2,0,0.0,0.0,...,3,1,1.0,1,0.0,2,0.033729,1.0,0.0,2
4,15000,click,carf,0,0.9749279,0.5,5,0,0.0,0.0,...,3,0,0.0,1,0.0,0,0.307323,0.0,0.0,2


In [72]:
# Data split
# Optional subsample for quick experiments, currently disabled:
# data_point_size = 20000
# X_points = modelpoint_train.iloc[:data_point_size]

# Work on a copy so the loaded frame is left untouched.
X_points = modelpoint_train.copy()

# Label column, and the feature matrix without the label / identifier columns.
y = X_points['target']
X = X_points.drop(['target', 'item_id', 'user_id'], axis=1).astype(float)

# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Encode the labels as integers; the encoder is fitted on the training labels only.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

### **`ML Experiments`**

In [73]:
from inspect import signature
from dataclasses import dataclass
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import (
    StratifiedKFold, cross_val_score
)
from sklearn.metrics import (
    f1_score, accuracy_score, 
    roc_auc_score, classification_report
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [74]:
# Save the fitted model, its metadata and the classification report under the local 'models/classifiers' folder
def save_model_and_metadata(model: ClassifierMixin, metadata: dict, classifier_name: str, version: float, report_df: pd.DataFrame):
    """Persist a fitted model together with its metadata and classification report.

    All three artefacts are written to ``models/classifiers`` under the parent
    of the current working directory; the folder (and any missing parent) is
    created on demand.

    Args:
        model: Fitted estimator, serialised with ``joblib.dump``.
        metadata: JSON-serialisable dictionary describing the run.
        classifier_name: Prefix used in the three file names.
        version: Version tag embedded in the file names (``..._v{version}``).
        report_df: Classification report, written as CSV.

    Returns:
        Tuple ``(model_path, meta_path, report_path)`` of the written files.
    """
    file_directory = Path.cwd().parent / "models/classifiers"
    # parents=True: on a fresh checkout 'models' itself may not exist yet, and
    # without it mkdir raises FileNotFoundError after training has already run.
    file_directory.mkdir(parents=True, exist_ok=True)

    model_path = file_directory / f"{classifier_name}_model_v{version}.pkl"
    joblib.dump(model, model_path)

    meta_path = file_directory / f"{classifier_name}_metadata_v{version}.json"
    with open(meta_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=4)

    report_path = file_directory / f"{classifier_name}_classifier_report_v{version}.csv"
    report_df.to_csv(report_path)

    logger.info(f"Saved model to: {model_path}")
    logger.info(f"Saved metadata to: {meta_path}")
    logger.info(f"Saved report classifier to: {report_path}")

    return model_path, meta_path, report_path

# Extract model parameters (non-defaults)
def model_non_default_params(model: BaseEstimator):
    """Return the parameters of ``model`` whose values differ from the ``__init__`` defaults.

    Only parameters that have a default in the signature of the model class's
    ``__init__`` are considered; anything else reported by ``get_params()`` is
    ignored.

    Args:
        model: Object exposing ``get_params()`` (scikit-learn estimator API).

    Returns:
        Dict mapping parameter name to its current, non-default value.
    """
    def _differs(value, default) -> bool:
        # Same object: unchanged (also covers a NaN default passed straight through).
        if value is default:
            return False
        # NaN never compares equal to itself; two NaNs still mean "unchanged".
        if (isinstance(value, float) and isinstance(default, float)
                and value != value and default != default):
            return False
        try:
            return bool(value != default)
        except (ValueError, TypeError):
            # Element-wise comparisons (e.g. numpy arrays) have no single truth value.
            try:
                return not np.array_equal(value, default)
            except Exception:
                return True

    sig = signature(type(model).__init__)
    defaults = {
        name: param.default
        for name, param in sig.parameters.items()
        if param.default is not param.empty
    }
    return {
        name: value
        for name, value in model.get_params().items()
        if name in defaults and _differs(value, defaults[name])
    }

# Main model trainer
@dataclass
class ModelTrainer:
    """Cross-validate, fit, evaluate and persist a single classifier.

    Attributes:
        classifier: Estimator to train; must provide ``fit``, ``predict`` and
            ``predict_proba``.
        X_train, X_test: Feature matrices (DataFrame or ndarray).
        y_train, y_test: Matching labels.
        data_size: Number recorded in the metadata under ``'data_size'``.
        version: Version tag used in the names of the saved artefacts.
        decimal: Number of decimals the stored metrics are rounded to.
    """
    classifier: ClassifierMixin | BaseEstimator
    X_train: np.ndarray | pd.DataFrame
    X_test: np.ndarray | pd.DataFrame
    y_train: np.ndarray | pd.Series
    y_test: np.ndarray | pd.Series
    data_size: int
    version: float = 0.1
    decimal: int = 3

    def _feature_importances(self, classifier_name: str, top_n: int = 20) -> pd.Series:
        """Return sorted feature importances and save a bar chart of the top ``top_n``.

        Returns an empty float Series when the fitted classifier has no
        ``feature_importances_`` attribute. The chart is written to the
        current working directory.
        """
        if not hasattr(self.classifier, "feature_importances_"):
            # Explicit dtype avoids the dtype-less empty-Series deprecation warning.
            return pd.Series(dtype=float)

        if isinstance(self.X_train, pd.DataFrame):
            feature_names = self.X_train.columns
        else:
            # No column names available: fall back to positional indices.
            feature_names = np.arange(self.X_train.shape[1])
        feat_imp = pd.Series(
            self.classifier.feature_importances_, index=feature_names
        ).sort_values(ascending=False)

        # Save to file
        plt.style.use('ggplot')
        plt.figure(figsize=(10, 6))
        feat_imp.head(top_n).plot(kind='barh', title=f'Top {top_n} Feature Importances')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        file_img_path = f'{classifier_name}_top_feature_importances_v{self.version}.png'
        plt.savefig(file_img_path, dpi=300)
        plt.close()
        return feat_imp

    def train_model(self):
        """Run cross-validation, fit on the training set, evaluate and save everything.

        Steps:
            1. 5-fold stratified cross-validation (weighted F1) on the training data.
            2. Fit on the full training set; predict on train and test.
            3. Compute accuracy, weighted F1 and ROC-AUC for both splits.
            4. Extract feature importances (when available) and the test
               classification report.
            5. Persist model, metadata and report via ``save_model_and_metadata``.

        Returns:
            The metadata dictionary extended with ``'model path'``,
            ``'metadata path'`` and ``'report path'``. The historical,
            misspelled key ``'metdata path'`` is kept as an alias.
        """
        classifier_name = self.classifier.__class__.__name__.lower()
        logger.info(f'Model training [{classifier_name}] started ... ')

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.classifier, self.X_train, self.y_train, cv=skf, scoring='f1_weighted')

        print(f"CV Weighted F1 scores: {cv_scores}")
        print(f"Mean CV Weighted F1 score: {cv_scores.mean():.4f}")

        self.classifier.fit(self.X_train, self.y_train)
        train_y_pred = self.classifier.predict(self.X_train)
        # Column 1 of predict_proba is used as the score for ROC-AUC;
        # NOTE(review): this presumes a binary target - confirm before multiclass use.
        train_y_pred_proba = self.classifier.predict_proba(self.X_train)[:, 1]
        test_y_pred = self.classifier.predict(self.X_test)
        test_y_pred_proba = self.classifier.predict_proba(self.X_test)[:, 1]

        train_acc_score = accuracy_score(self.y_train, train_y_pred)
        train_f_score = f1_score(self.y_train, train_y_pred, average='weighted')
        train_ra_score = roc_auc_score(self.y_train, train_y_pred_proba)

        test_acc_score = accuracy_score(self.y_test, test_y_pred)
        test_f_score = f1_score(self.y_test, test_y_pred, average='weighted')
        test_ra_score = roc_auc_score(self.y_test, test_y_pred_proba)

        feat_imp = self._feature_importances(classifier_name)

        report_dict = classification_report(self.y_test, test_y_pred, output_dict=True)
        report_df = pd.DataFrame(report_dict).transpose()

        def rounded(value):
            return np.round(value, self.decimal)

        metadata = {
            'data_size': self.data_size,
            'classifier': classifier_name,
            'params': model_non_default_params(self.classifier),
            'cv weighted f1': rounded(cv_scores.mean()),
            'accuracy score': {'train': rounded(train_acc_score), 'test': rounded(test_acc_score)},
            'weighted f1 score': {'train': rounded(train_f_score), 'test': rounded(test_f_score)},
            'roc-auc score': {'train': rounded(train_ra_score), 'test': rounded(test_ra_score)},
            # str keys / native values: positional (numpy integer) feature names
            # would otherwise make json.dump fail on the dictionary keys.
            'feature importance': dict(zip(map(str, feat_imp.index), feat_imp.tolist())),
        }
        model_path, meta_path, report_path = save_model_and_metadata(
            self.classifier, metadata, classifier_name, self.version, report_df
        )

        # Label each split explicitly: the unlabelled numbers used to be the
        # TRAIN metrics printed directly above the TEST classification report.
        print(f"\nResults for {classifier_name}:")
        print("Train Accuracy Score:", train_acc_score)
        print("Train F1 Score:", train_f_score)
        print("Train Roc-Auc Score:", train_ra_score)
        print("Test Accuracy Score:", test_acc_score)
        print("Test F1 Score:", test_f_score)
        print("Test Roc-Auc Score:", test_ra_score)
        print(classification_report(self.y_test, test_y_pred))

        return {
            **metadata,
            'model path': model_path,
            'metadata path': meta_path,
            # Misspelled key kept so existing consumers keep working.
            'metdata path': meta_path,
            'report path': report_path
        }


`Experiment on Different Models`

In [75]:
# Compare model performance metrics using Logistic Regression as the baseline
seed = 43
data_point_size = len(modelpoint_train)
# NOTE(review): the recorded output shows the Logistic Regression baseline
# predicting a single class (recall 0.00 for class 0) - feature scaling or a
# higher max_iter may be needed; confirm before relying on it as a baseline.
models = {
    'Logistic Regression': LogisticRegression(max_iter=100, random_state=seed),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=10, class_weight='balanced', random_state=seed),
    # random_state added so every candidate is seeded the same way.
    'XGBoost': XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=seed),
    'LightGBM': LGBMClassifier(n_estimators=200, learning_rate=0.1, class_weight='balanced', random_state=seed, verbosity=-1)
}

# Train, predict, and evaluate; keep every model's results, not just the last one.
experiment_results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model_trainer = ModelTrainer(model, X_train, X_test, y_train_enc, y_test_enc, data_point_size, version=0.1)
    trainer = model_trainer.train_model()
    experiment_results[name] = trainer

2025-06-26 01:29:28,404 - INFO - Model training [logisticregression] started ... 



Training Logistic Regression...
CV Weighted F1 scores: [0.39255345 0.39259079 0.39257406 0.39257406 0.39257406]
Mean CV Weighted F1 score: 0.3926


2025-06-26 01:29:34,886 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/logisticregression_model_v0.1.pkl
2025-06-26 01:29:34,888 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/logisticregression_metadata_v0.1.json
2025-06-26 01:29:34,892 - INFO - Saved report classifier to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/logisticregression_classifier_report_v0.1.csv
2025-06-26 01:29:34,908 - INFO - Model training [randomforestclassifier] started ... 



Results for logisticregression:
Accuracy Score: 0.5519260361263464
F1 Score: 0.3925732828279465
Roc-Auc Score: 0.5248555217904204
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     17544
           1       0.55      1.00      0.71     21611

    accuracy                           0.55     39155
   macro avg       0.28      0.50      0.36     39155
weighted avg       0.30      0.55      0.39     39155


Training Random Forest...
CV Weighted F1 scores: [0.60313432 0.60908946 0.60235925 0.60152152 0.6047266 ]
Mean CV Weighted F1 score: 0.6042


2025-06-26 01:32:20,466 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/randomforestclassifier_model_v0.1.pkl
2025-06-26 01:32:20,468 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/randomforestclassifier_metadata_v0.1.json
2025-06-26 01:32:20,469 - INFO - Saved report classifier to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/randomforestclassifier_classifier_report_v0.1.csv
2025-06-26 01:32:20,487 - INFO - Model training [xgbclassifier] started ... 



Results for randomforestclassifier:
Accuracy Score: 0.6339222434346208
F1 Score: 0.6271011851826916
Roc-Auc Score: 0.7241420185692343
              precision    recall  f1-score   support

           0       0.54      0.78      0.64     17544
           1       0.73      0.47      0.57     21611

    accuracy                           0.61     39155
   macro avg       0.64      0.63      0.61     39155
weighted avg       0.64      0.61      0.60     39155


Training XGBoost...
CV Weighted F1 scores: [0.67072466 0.6654609  0.66655335 0.67047336 0.66687442]
Mean CV Weighted F1 score: 0.6680


2025-06-26 01:32:36,560 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/xgbclassifier_model_v0.1.pkl
2025-06-26 01:32:36,561 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/xgbclassifier_metadata_v0.1.json
2025-06-26 01:32:36,563 - INFO - Saved report classifier to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/xgbclassifier_classifier_report_v0.1.csv
2025-06-26 01:32:36,596 - INFO - Model training [lgbmclassifier] started ... 



Results for xgbclassifier:
Accuracy Score: 0.7019161393718434
F1 Score: 0.6996194950130216
Roc-Auc Score: 0.7809618485153436
              precision    recall  f1-score   support

           0       0.66      0.58      0.62     17544
           1       0.69      0.75      0.72     21611

    accuracy                           0.68     39155
   macro avg       0.67      0.67      0.67     39155
weighted avg       0.67      0.68      0.67     39155


Training LightGBM...
CV Weighted F1 scores: [0.6627169  0.66110156 0.66123664 0.66489927 0.66064478]
Mean CV Weighted F1 score: 0.6621


2025-06-26 01:32:58,572 - INFO - Saved model to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/lgbmclassifier_model_v0.1.pkl
2025-06-26 01:32:58,573 - INFO - Saved metadata to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/lgbmclassifier_metadata_v0.1.json
2025-06-26 01:32:58,574 - INFO - Saved report classifier to: /mnt/d/research-workspace/workx-projects/fnb-dataquest-recosys/models/classifiers/lgbmclassifier_classifier_report_v0.1.csv



Results for lgbmclassifier:
Accuracy Score: 0.6858003920391784
F1 Score: 0.6851509451257946
Roc-Auc Score: 0.7737841049733397
              precision    recall  f1-score   support

           0       0.60      0.76      0.67     17544
           1       0.75      0.59      0.66     21611

    accuracy                           0.67     39155
   macro avg       0.68      0.68      0.67     39155
weighted avg       0.68      0.67      0.67     39155



`Hyperparameter Tuning of the Best Classifier (Optuna)`

In [76]:
# import optuna
# import numpy as np
# import pandas as pd
# from lightgbm import LGBMClassifier
# from sklearn.metrics import f1_score
# from sklearn.model_selection import StratifiedKFold

# # Define Optuna objective
# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 100),
#         'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#     }
#     model = LGBMClassifier(**params, random_state=42, class_weight='balanced', verbosity=-1, n_jobs=-1)
#     kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     scores = []
#     for train_idx, val_idx in kf.split(X_train, y_train):
#         model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
#         preds = model.predict(X_train.iloc[val_idx])
#         scores.append(f1_score(y_train.iloc[val_idx], preds, average='weighted'))
#     return np.mean(scores)

# # Run optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# # study.best_params

`Retraining With Tuned Parameters`

In [77]:
# params = study.best_params
# model = LGBMClassifier(**params, random_state=42, class_weight='balanced', verbosity=-1, n_jobs=-1)
# model_trainer = ModelTrainer(model, X_train, X_test, y_train_enc, y_test_enc, data_point_size, version=0.6)
# trainer = model_trainer.train_model()