# Model training

In [1]:
%pip install -r requirements.txt -q

## Prepare data

In [None]:
# Suppress all Python warnings for the rest of the session. The filter is
# installed before the remaining imports, so warnings raised at import time
# are hidden as well.
# NOTE(review): a blanket 'ignore' also hides warnings that may matter for
# model quality — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')
# NOTE(review): `np` is not referenced in the cells visible here — confirm it
# is still needed.
import numpy as np
import pandas as pd
# Project helper that builds the train/test split and the preprocessor.
from utils.model_training import prepare_data_for_training


In [3]:
# Load the cached, already merged-and-cleaned dataset if it is available;
# otherwise rebuild it from the four raw CSV files with the project
# preprocessing helpers.

path_to_merged_clean_data = "data/clean_merged_data.csv"
path_to_books = "data/books.csv"
path_to_customers = "data/customers.csv"
path_to_libraries = "data/libraries.csv"
path_to_checkouts = "data/checkouts.csv"

try:
    data = pd.read_csv(path_to_merged_clean_data)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache file triggers the rebuild. Any other
    # failure (permission error, malformed CSV, KeyboardInterrupt, a typo in
    # this cell) now surfaces instead of being silently masked by a rebuild.
    # The preprocessing helpers are imported lazily because they are needed
    # only on this fallback path.
    from utils.preprocess import (
        merge_data,
        preprocess_books_df,
        preprocess_checkouts_df,
        preprocess_customers_df,
        preprocess_libraries_df,
    )

    books_df = preprocess_books_df(pd.read_csv(path_to_books))
    customers_df = preprocess_customers_df(pd.read_csv(path_to_customers))
    libraries_df = preprocess_libraries_df(pd.read_csv(path_to_libraries))
    checkouts_df = preprocess_checkouts_df(pd.read_csv(path_to_checkouts))
    data = merge_data(
        checkouts_df=checkouts_df,
        customers_df=customers_df,
        books_df=books_df,
        libraries_df=libraries_df,
    )

# Show the available columns so the feature lists below can be checked
# against them.
print(data.columns)

Index(['id', 'patron_id', 'library_id', 'date_checkout', 'date_returned',
       'days_borrowed', 'late_return', 'checkout_month', 'checkout_dayofweek',
       'return_due_date', 'return_due_date_dayofweek', 'is_holiday_checkout',
       'is_holiday_for_return', 'customer_id', 'customer_name',
       'customer_street_address', 'customer_city', 'customer_state',
       'customer_zipcode', 'customer_birth_date', 'customer_gender',
       'customer_education', 'customer_occupation', 'customer_age',
       'customer_age_group', 'book_id', 'book_title', 'book_authors',
       'book_publisher', 'book_publishedDate', 'book_categories', 'book_price',
       'book_pages', 'book_age', 'book_expensive', 'library_name',
       'library_street_address', 'library_city', 'library_region',
       'library_postal_code', 'same_city'],
      dtype='object')


In [4]:
# Class balance of the target: number of rows with late_return == 1 versus
# late_return == 0. The `=` f-string specifier echoes each expression's source
# text, so the printed labels mirror this line verbatim (including the extra
# space inside the second pair of braces).
print(f"{len(data[data.late_return==1])=}, { len(data[data.late_return==0])=}")

len(data[data.late_return==1])=117,  len(data[data.late_return==0])=1098


In [5]:
# Full pool of features that was considered. Kept under separate names for
# reference so that these lists are no longer dead assignments overwritten by
# the selection below.
candidate_categorical_cols = [
    "customer_gender", "customer_education", "customer_occupation",
    "book_categories", "customer_zipcode", "book_authors", "book_publisher",
    "library_id", "library_city", "checkout_dayofweek", "checkout_month",
    "is_holiday_for_return",
]
candidate_numerical_cols = ["customer_age", "book_price", "book_pages", "book_age"]

# Subset that gave the best results; these are the lists actually used for
# training in the rest of the notebook.
categorical_cols = ["customer_zipcode", "library_city", "is_holiday_for_return", "checkout_dayofweek"]
numerical_cols = ["book_pages", "book_age", "customer_age"]

# Build the shared train/test split and the preprocessor from the selected
# columns, using the project helper.
X_train, X_test, y_train, y_test, preprocessor = prepare_data_for_training(
    data, categorical_cols, numerical_cols
)

## Random forest

In [6]:
from utils.model_training import train_random_forest

In [7]:
# Run the project helper `train_random_forest` on the shared split and keep
# the returned model. The helper prints accuracy, a classification report and
# the confusion matrix (see the cell output).
# NOTE(review): threshold=0.1 is presumably the probability cutoff for the
# positive class — confirm in utils.model_training.
random_forest_model = train_random_forest(X_train, X_test, y_train, y_test, threshold=0.1)

Accuracy: 0.72
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.71      0.83       223
           1       0.22      0.90      0.35        20

    accuracy                           0.72       243
   macro avg       0.60      0.80      0.59       243
weighted avg       0.92      0.72      0.79       243

Confusion Matrix:
 [[158  65]
 [  2  18]]


## XGBoost

In [8]:
# Run the project helper `train_xgboost` on the shared split and keep the
# returned model. The helper prints accuracy, a classification report and the
# confusion matrix (see the cell output).
# NOTE(review): threshold=0.1 is presumably the probability cutoff for the
# positive class — confirm in utils.model_training.
from utils.model_training import train_xgboost
xgboost_model = train_xgboost(X_train, X_test, y_train, y_test, threshold=0.1)

Accuracy: 0.78
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.78      0.87       223
           1       0.23      0.75      0.36        20

    accuracy                           0.78       243
   macro avg       0.60      0.77      0.61       243
weighted avg       0.91      0.78      0.82       243

Confusion Matrix:
 [[174  49]
 [  5  15]]


## Gradient boosting

In [9]:
from utils.model_training import train_gradient_boosting

In [10]:
# Run the project helper `train_gradient_boosting` on the shared split and
# keep the returned model. The helper prints accuracy, a classification report
# and the confusion matrix (see the cell output).
# NOTE(review): threshold=0.1 is presumably the probability cutoff for the
# positive class — confirm in utils.model_training.
gradient_boost_model = train_gradient_boosting(X_train, X_test, y_train, y_test, threshold=0.1)

Accuracy: 0.69
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.68      0.80       223
           1       0.18      0.80      0.30        20

    accuracy                           0.69       243
   macro avg       0.58      0.74      0.55       243
weighted avg       0.91      0.69      0.76       243

Confusion Matrix:
 [[151  72]
 [  4  16]]


## Logistic regression

In [11]:
from utils.model_training import train_logistic_regression

In [12]:
# Run the project helper `train_logistic_regression` on the shared split and
# keep the returned model. Unlike the other model cells, no `threshold`
# argument is passed here, so the helper's own default applies.
# NOTE(review): confirm that default before comparing these metrics with the
# threshold=0.1 runs.
logistic_regression_model = train_logistic_regression(X_train, X_test, y_train, y_test)

Accuracy: 0.81
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.81      0.88       223
           1       0.27      0.80      0.41        20

    accuracy                           0.81       243
   macro avg       0.62      0.80      0.64       243
weighted avg       0.92      0.81      0.85       243

Confusion Matrix:
 [[180  43]
 [  4  16]]


## KNN

In [13]:
from utils.model_training import train_knn

In [14]:
# KNN is trained on its own split, prepared with balancing="hybrid". Every
# result of this call is bound to a *_knn name so the shared `preprocessor`
# (and X_train/X_test/y_train/y_test) used by the other models is not
# silently rebound by this cell.
X_train_knn, X_test_knn, y_train_knn, y_test_knn, preprocessor_knn = prepare_data_for_training(
    data, categorical_cols, numerical_cols, balancing="hybrid"
)
# NOTE(review): n_neighbors=1 is hard-coded; no search over k is visible in
# this notebook.
knn_model = train_knn(X_train_knn, X_test_knn, y_train_knn, y_test_knn, n_neighbors=1)

Accuracy: 0.66
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.67      0.78       223
           1       0.13      0.55      0.21        20

    accuracy                           0.66       243
   macro avg       0.54      0.61      0.50       243
weighted avg       0.88      0.66      0.74       243

Confusion Matrix:
 [[149  74]
 [  9  11]]


## Neural network

In [15]:
from utils.model_training import train_neural_network_imbalanced

In [16]:
# Run the project helper `train_neural_network_imbalanced` on the shared split
# and keep the returned model. The helper prints accuracy, a classification
# report and the confusion matrix (see the cell output).
# NOTE(review): threshold=0.1 is presumably the probability cutoff for the
# positive class — confirm in utils.model_training.
nn_model = train_neural_network_imbalanced(X_train, X_test, y_train, y_test, threshold=0.1)

2025-02-10 01:59:01.260449: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
Accuracy: 0.72
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.71      0.82       223
           1       0.20      0.80      0.32        20

    accuracy                           0.72       243
   macro avg       0.59      0.75      0.57       243
weighted avg       0.91      0.72      0.78       243

Confusion Matrix:
 [[158  65]
 [  4  16]]
