In [13]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import ast

import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

DATA_DIR = './library-data'

In [14]:
df = pd.read_csv(f'{DATA_DIR}/model_ready_dataset.csv')

print(df.columns)
df.shape

Index(['book_pages', 'customer_library_distance', 'book_price', 'book_title',
       'book_authors_wendy nelson espeland',
       'book_authors_oscar adolph leutwiler',
       'book_authors_axel petrus johnson', 'book_authors_william stokes',
       'book_categories_libraries', 'book_authors_kim starkey jonker',
       'book_authors_murthy', 'book_authors_les carlson',
       'book_authors_samuel milroy ballard', 'book_authors_samuel newth',
       'book_categories_water supply',
       'book_categories_language arts & disciplines',
       'book_authors_stanisław lem', 'book_authors_richard schmalensee',
       'book_authors_bruce m. becker', 'book_authors_michael sauder',
       'book_authors_gary copeland', 'book_categories_geology', 'book_age',
       'book_authors_gordon w. stewart',
       'book_authors_sir william james moore', 'is_late'],
      dtype='object')


(1544, 26)

In [15]:
df['is_late'].value_counts()

is_late
0    1383
1     161
Name: count, dtype: int64

In [16]:
X = df.drop(columns=['is_late'])
y = df['is_late']

In [17]:
model_grid = {
    "Logistic Regression": (
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=25),
        {
            'clf__penalty': ['l2'],
            'clf__C': [0.01, 0.1, 1, 10, 100],
            'clf__solver': ['liblinear', 'saga']
        }
    ),

    "Random Forest": (
        RandomForestClassifier(class_weight='balanced', random_state=25),
        {
            'clf__n_estimators': [100, 200, 300],
            'clf__max_depth': [20, 30],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4],
            'clf__max_features': ['sqrt', 'log2']
        }
    ),

    "XGBoost": (
        XGBClassifier(eval_metric='logloss', random_state=25),
        {
            'clf__n_estimators': [100, 200],
            'clf__max_depth': [3, 6, 9],
            'clf__learning_rate': [0.01, 0.1, 0.2],
            'clf__subsample': [0.6, 0.8, 1.0],
            'clf__colsample_bytree': [0.6, 0.8, 1.0],
            'clf__scale_pos_weight': [5, 10] 
        }
    ),

    "LightGBM": (
        LGBMClassifier(class_weight='balanced', random_state=25),
        {
            'clf__n_estimators': [100, 200],
            'clf__max_depth': [-1, 10, 20],
            'clf__learning_rate': [0.01, 0.1],
            'clf__num_leaves': [31, 50],
            'clf__min_child_samples': [30, 40],
            'clf__subsample': [0.6, 0.8, 1.0],
            'clf__colsample_bytree': [0.6, 1.0]
        }
    ),

    "SVM": (
        SVC(probability=True, class_weight='balanced', random_state=25),
        {
            'clf__C': [0.1, 1, 10, 100],
            'clf__kernel': ['rbf', 'linear'],
            'clf__gamma': ['scale', 'auto'] 
        }
    )
}


In [18]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

def create_ensemble_model_entry(name, model, param_grid):

    return {name: (model, param_grid)}

def get_ensemble_models(seed=25):

    bagging_rf_model = BaggingClassifier(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=seed),
        random_state=seed
    )
    
    bagging_rf_params = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__max_depth': [10, 20],
        'clf__estimator__min_samples_leaf': [1, 2],
        'clf__n_estimators': [5, 10], 
        'clf__max_samples': [0.5, 1.0],
        'clf__max_features': [0.5, 1.0]
    }
    
    adaboost_rf_model = AdaBoostClassifier(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=seed),
        random_state=seed
    )
    
    adaboost_rf_params = {
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__max_depth': [5, 10],
        'clf__n_estimators': [50, 100],
        'clf__learning_rate': [0.01, 0.1, 1.0]
    }
    
    bagging_entry = create_ensemble_model_entry("Bagging + RandomForest", bagging_rf_model, bagging_rf_params)
    adaboost_entry = create_ensemble_model_entry("AdaBoost + RandomForest", adaboost_rf_model, adaboost_rf_params)
    
    ensemble_models = {}
    ensemble_models.update(bagging_entry)
    ensemble_models.update(adaboost_entry)
    
    return ensemble_models


In [19]:
def run_random_search_cv(X, y, model_grid, n_iter=25, n_splits=5, seed=25):

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    
    results = []
    
    for name, (model, param_grid) in model_grid.items():
        print(f"Running RandomizedSearchCV for {name}...")

        pipe = ImbPipeline([
            ('smote', SMOTE(random_state=seed)),
            ('scaler', StandardScaler()),
            ('clf', model)
        ])

        search = RandomizedSearchCV(
            estimator=pipe,
            param_distributions=param_grid,
            n_iter=n_iter,
            scoring=make_scorer(f1_score),
            cv=skf,
            n_jobs=-1,
            verbose=1,
            random_state=seed
        )
        
        search.fit(X, y)
        best_model = search.best_estimator_
        
        y_pred = cross_val_predict(best_model, X, y, cv=skf, method='predict')
        y_proba = cross_val_predict(best_model, X, y, cv=skf, method='predict_proba')[:, 1]

        f1 = f1_score(y, y_pred)
        precision = precision_score(y, y_pred)
        recall = recall_score(y, y_pred)
        pr_auc = average_precision_score(y, y_proba)
        
        results.append({
            'Model': name,
            'F1 Score': round(f1, 4),
            'PR AUC': round(pr_auc, 4),
            'Recall': round(recall, 4),
            'Precision': round(precision, 4),
            'Best Params': search.best_params_
        })

    results_df = pd.DataFrame(results).sort_values(by='F1 Score', ascending=False)
    return results_df

In [20]:
ensemble_models = get_ensemble_models()

model_grid.update(ensemble_models)

%time results_df = run_random_search_cv(X, y, model_grid, n_iter=25, n_splits=5, seed=25)


Running RandomizedSearchCV for Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Running RandomizedSearchCV for Random Forest...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Running RandomizedSearchCV for XGBoost...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Running RandomizedSearchCV for LightGBM...
Fitting 5 folds for each of 25 candidates, totalling 125 fits


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Dask dataframe query

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Number of positive: 1107, number of negative: 1107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1002
[LightGBM] [Info] Number of positive: 1107, number of negative: 1107
[LightGBM] [Info] Number of positive: 1107, number of negative: 1107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004459 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1002
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 5
[LightGBM] [Info] Number of positi



Running RandomizedSearchCV for Bagging + RandomForest...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Running RandomizedSearchCV for AdaBoost + RandomForest...
Fitting 5 folds for each of 24 candidates, totalling 120 fits




CPU times: user 2min, sys: 1min 21s, total: 3min 21s
Wall time: 4min 41s


In [21]:
try:
    from IPython.display import display
    display(results_df.style.background_gradient(cmap='Blues').format(precision=4))
except:
    pass

Unnamed: 0,Model,F1 Score,PR AUC,Recall,Precision,Best Params
3,LightGBM,0.3478,0.2465,0.4969,0.2676,"{'clf__subsample': 0.6, 'clf__num_leaves': 31, 'clf__n_estimators': 100, 'clf__min_child_samples': 30, 'clf__max_depth': 20, 'clf__learning_rate': 0.01, 'clf__colsample_bytree': 0.6}"
5,Bagging + RandomForest,0.3447,0.3027,0.472,0.2714,"{'clf__n_estimators': 5, 'clf__max_samples': 0.5, 'clf__max_features': 1.0, 'clf__estimator__n_estimators': 100, 'clf__estimator__min_samples_leaf': 2, 'clf__estimator__max_depth': 20}"
6,AdaBoost + RandomForest,0.3447,0.2863,0.5031,0.2621,"{'clf__n_estimators': 100, 'clf__learning_rate': 0.1, 'clf__estimator__n_estimators': 50, 'clf__estimator__max_depth': 5}"
1,Random Forest,0.3434,0.298,0.5652,0.2466,"{'clf__n_estimators': 100, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 4, 'clf__max_features': 'log2', 'clf__max_depth': 30}"
4,SVM,0.2994,0.2638,0.6211,0.1972,"{'clf__kernel': 'rbf', 'clf__gamma': 'scale', 'clf__C': 10}"
0,Logistic Regression,0.2939,0.2814,0.6025,0.1944,"{'clf__solver': 'saga', 'clf__penalty': 'l2', 'clf__C': 0.1}"
2,XGBoost,0.2889,0.2173,0.4845,0.2058,"{'clf__subsample': 0.6, 'clf__scale_pos_weight': 5, 'clf__n_estimators': 100, 'clf__max_depth': 6, 'clf__learning_rate': 0.2, 'clf__colsample_bytree': 0.6}"


In [22]:
results_df.to_csv(f"{DATA_DIR}/evaluation_results", index=False)