In [25]:

# Import necessary libraries
import sqlite3
import pandas as pd
import time
import os

os.environ["LOKY_MAX_CPU_COUNT"] = "4"

import setup.setup as setup
import setup.duration_cal as duration_cal
import EDA.eda_step as EDA
import model_select.model_select as model_select
import model_eval.model_eval as model_eval

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from typing import Dict
from tqdm import tqdm

In [26]:
# Load configuration and setup parameters
start_time = time.time()

(
    db_path,
    target_col,
    num_map_dict,
    standard_list,
    one_hot_list,
    model_test_size,
    model_random_state,
    model_search_method,
    model_cv_num,
    model_scoring,
    model_num_iter,
    model_num_jobs,
    model_param_dict,
) = setup.setup_stage()

print("Configuration loaded successfully!")

Configuration loaded successfully!


In [27]:
# Create connection to SQL database
print("1. Connecting to SQL database....")
conn = sqlite3.connect(db_path)
print("Connection done!")

1. Connecting to SQL database....
Connection done!


In [28]:
part1_time = time.time()
part1_duration, part1_tag = duration_cal.duration_cal(part1_time - start_time)
print(f"Part 1 has run for {part1_duration:.3f} {part1_tag}!")

Part 1 has run for 0.017 sec!


In [29]:
# Get data from 'noshow' table
print("2. Extract SQL database table as DataFrame...")

noshow_data_query = "SELECT * FROM noshow;"
noshow_data_df = pd.read_sql_query(noshow_data_query, conn)

print("Extraction done!")

2. Extract SQL database table as DataFrame...
Extraction done!


In [30]:
part2_time = time.time()

part2_duration, part2_tag = duration_cal.duration_cal(part2_time - part1_time)

print(f"Part 2 has run for {part2_duration:.3f} {part2_tag}!")

Part 2 has run for 0.344 sec!


In [31]:
# Using analysis from task_1 EDA, perform data preprocessing, feature data standardization and one-hot encoding
print("3. Performing EDA on DataFrame...")

fil_noshow_data_df, preprocessor, X_train, X_test, Y_train, Y_test = EDA.ml_eda_step(
    noshow_data_df,
    target_col,
    num_map_dict,
    standard_list,
    one_hot_list,
    model_test_size,
    model_random_state,
)

print("EDA done!")

3. Performing EDA on DataFrame...
EDA done!


In [32]:
part3_time = time.time()

part3_duration, part3_tag = duration_cal.duration_cal(part3_time - part2_time)

print(f"Part 3 has run for {part3_duration:.3f} {part3_tag}!")

Part 3 has run for 0.823 sec!


In [33]:
best_estimators_dict = {}
# Pre-select a few models and train models to get best optimized parameters
print("4. Training machine learning models...")

4. Training machine learning models...


In [23]:
# Train Logistic Regression
if "Logistic Regression" in model_param_dict:
    model_start_time = time.time()
    print("Processing Logistic Regression now...")
    
    model = LogisticRegression(random_state=model_random_state)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    if model_search_method == "grid":
        search = GridSearchCV(
            pipeline,
            param_grid=model_param_dict["Logistic Regression"],
            cv=model_cv_num,
            scoring=model_scoring,
            n_jobs=model_num_jobs,
            verbose=3
        )
    elif model_search_method == "random":
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=model_param_dict["Logistic Regression"],
            n_iter=model_num_iter,
            cv=model_cv_num,
            scoring=model_scoring,
            random_state=model_random_state,
            n_jobs=model_num_jobs,
        )
    with tqdm(total=100, desc="Progress", bar_format="{l_bar}{bar} [ time left: {remaining} ]") as pbar:
        search.fit(X_train, Y_train)
        pbar.update(100)

    # Save best model
    best_estimators_dict["Logistic Regression"] = search.best_estimator_
    print("Best parameters for Logistic Regression:", search.best_params_)

    model_end_time = time.time()
    model_total_time = model_end_time - model_start_time
    model_duration, model_tag = duration_cal.duration_cal(model_total_time)
    print(f"Logistic Regression has run tuning for {model_duration:.3f} {model_tag}")


Processing Logistic Regression now...
Fitting 5 folds for each of 480 candidates, totalling 2400 fits
Best parameters for Logistic Regression: {'model__C': 0.1, 'model__class_weight': 'balanced', 'model__max_iter': 100, 'model__solver': 'lbfgs'}
Logistic Regression has run tuning for 24.084 min!


In [25]:
# Train Random Forest
if "Random Forest" in model_param_dict:
    model_start_time = time.time()
    print("Processing Random Forest now...")
    
    model = RandomForestClassifier(random_state=model_random_state)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    if model_search_method == "grid":
        search = GridSearchCV(
            pipeline,
            param_grid=model_param_dict["Random Forest"],
            cv=model_cv_num,
            scoring=model_scoring,
            n_jobs=model_num_jobs,
            verbose=3
        )
    elif model_search_method == "random":
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=model_param_dict["Random Forest"],
            n_iter=model_num_iter,
            cv=model_cv_num,
            scoring=model_scoring,
            random_state=model_random_state,
            n_jobs=model_num_jobs,
        )

    search.fit(X_train, Y_train)

    # Save best model
    best_estimators_dict["Random Forest"] = search.best_estimator_
    print("Best parameters for Random Forest:", search.best_params_)

    model_end_time = time.time()
    model_total_time = model_end_time - model_start_time
    model_duration, model_tag = duration_cal.duration_cal(model_total_time)
    print(f"Random Forest has run tuning for {model_duration:.3f} {model_tag}")


Processing Random Forest now...
Fitting 5 folds for each of 416 candidates, totalling 2080 fits
Best parameters for Random Forest: {'model__class_weight': 'balanced', 'model__max_depth': 40, 'model__n_estimators': 400}
Random Forest has run tuning for 2.044 hr!


In [None]:
# Train SVC
if "SVC" in model_param_dict:
    model_start_time = time.time()
    print("Processing SVC now...")
    
    model = SVC()
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    if model_search_method == "grid":
        search = GridSearchCV(
            pipeline,
            param_grid=model_param_dict["SVC"],
            cv=model_cv_num,
            scoring=model_scoring,
            n_jobs=model_num_jobs,
            verbose=10
        )
    elif model_search_method == "random":
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=model_param_dict["SVC"],
            n_iter=model_num_iter,
            cv=model_cv_num,
            scoring=model_scoring,
            random_state=model_random_state,
            n_jobs=model_num_jobs,
        )

    search.fit(X_train, Y_train)

    # Save best model
    best_estimators_dict["SVC"] = search.best_estimator_
    print("Best parameters for SVC:", search.best_params_)

    model_end_time = time.time()
    model_total_time = model_end_time - model_start_time
    model_duration, model_tag = duration_cal.duration_cal(model_total_time)
    print(f"SVC has run tuning for {model_duration:.3f} {model_tag}")

In [None]:
# Train MLP
if "MLP" in model_param_dict:
    model_start_time = time.time()
    print("Processing MLP now...")
    
    model = MLPClassifier(random_state=model_random_state)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    if model_search_method == "grid":
        search = GridSearchCV(
            pipeline,
            param_grid=model_param_dict["MLP"],
            cv=model_cv_num,
            scoring=model_scoring,
            n_jobs=model_num_jobs,
            verbose=3
        )
    elif model_search_method == "random":
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=model_param_dict["MLP"],
            n_iter=model_num_iter,
            cv=model_cv_num,
            scoring=model_scoring,
            random_state=model_random_state,
            n_jobs=model_num_jobs,
        )

    search.fit(X_train, Y_train)

    # Save best model
    best_estimators_dict["MLP"] = search.best_estimator_
    print("Best parameters for MLP:", search.best_params_)

    model_end_time = time.time()
    model_total_time = model_end_time - model_start_time
    model_duration, model_tag = duration_cal.duration_cal(model_total_time)
    print(f"MLP has run tuning for {model_duration:.3f} {model_tag}")
    print()


In [13]:
# Train Naive Bayes
if "Naive Bayes" in model_param_dict:
    model_start_time = time.time()
    print("Processing Naive Bayes now...")
    
    model = BernoulliNB()
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    if model_search_method == "grid":
        search = GridSearchCV(
            pipeline,
            param_grid=model_param_dict["Naive Bayes"],
            cv=model_cv_num,
            scoring=model_scoring,
            n_jobs=model_num_jobs,
            verbose=3
        )
    elif model_search_method == "random":
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=model_param_dict["Naive Bayes"],
            n_iter=model_num_iter,
            cv=model_cv_num,
            scoring=model_scoring,
            random_state=model_random_state,
            n_jobs=model_num_jobs,
        )

    search.fit(X_train, Y_train)

    # Save best model
    best_estimators_dict["Naive Bayes"] = search.best_estimator_
    print("Best parameters for Naive Bayes:", search.best_params_)

    model_end_time = time.time()
    model_total_time = model_end_time - model_start_time
    model_duration, model_tag = duration_cal.duration_cal(model_total_time)
    print(f"Naive Bayes has run tuning for {model_duration:.3f} {model_tag}")
    print()


Processing Naive Bayes now...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for Naive Bayes: {'model__alpha': 3.0}
Naive Bayes has run tuning for 6.195 sec



In [14]:
# Train XG Boost
if "XG Boost" in model_param_dict:
    model_start_time = time.time()
    print("Processing XG Boost now...")
    
    model = XGBClassifier(objective="reg:squarederror", random_state=model_random_state)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    if model_search_method == "grid":
        search = GridSearchCV(
            pipeline,
            param_grid=model_param_dict["XG Boost"],
            cv=model_cv_num,
            scoring=model_scoring,
            n_jobs=model_num_jobs,
            verbose=3
        )
    elif model_search_method == "random":
        search = RandomizedSearchCV(
            pipeline,
            param_distributions=model_param_dict["XG Boost"],
            n_iter=model_num_iter,
            cv=model_cv_num,
            scoring=model_scoring,
            random_state=model_random_state,
            n_jobs=model_num_jobs,
        )

    search.fit(X_train, Y_train)

    # Save best model
    best_estimators_dict["XG Boost"] = search.best_estimator_
    print("Best parameters for XG Boost:", search.best_params_)

    model_end_time = time.time()
    model_total_time = model_end_time - model_start_time
    model_duration, model_tag = duration_cal.duration_cal(model_total_time)
    print(f"XG Boost has run tuning for {model_duration:.3f} {model_tag}")
    print()


Processing XG Boost now...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for XG Boost: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__subsample': 0.8}
XG Boost has run tuning for 1.411 min



In [None]:
print("Training done!")

In [None]:
part4_time = time.time()
part4_duration, part4_tag = duration_cal.duration_cal(part4_time - part3_time)
print(f"Part 4 has run for {part4_duration:.3f} {part4_tag}!")
print()

In [None]:
# Evaluate pre-selected models to get mean-squared error and r^2 values to determine which model is better for current dataset
print("5. Evaluating machine learning model...")
model_eval.model_evaluation(X_test, Y_test, best_estimators_dict)
print("Evaluation done!")

In [None]:
part5_time = time.time()
part5_duration, part5_tag = duration_cal.duration_cal(part5_time - part4_time)
print(f"Part 5 has run for {part5_duration:.3f} {part5_tag}!")
print()
print()

In [None]:
end_time = time.time()
final_time = end_time - start_time
final_duration, final_tag = duration_cal.duration_cal(final_time)

print("Script has reached end of line - It will terminate now!")
print(f"Script has run for {final_duration:.3f} {final_tag}!")

In [None]:
# Save results to a CSV file
results_df = pd.DataFrame(best_estimators_dict).T
results_df.to_csv("model_results.csv", index=True)
print("Results saved to 'model_results.csv'")