# Binary Classification with a Bank Dataset: Model Training
link: https://www.kaggle.com/competitions/playground-series-s5e8/data

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import clone
from collections import Counter

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import csv
import os
import pickle

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from imblearn.ensemble import BalancedBaggingClassifier


In [54]:
def create_dir(dir):
    if os.path.exists(dir):
        pass
    else:
        os.makedirs(dir)

In [55]:
def load_data(type, dir='modified_training_data'):
    search_dir = os.path.join(dir, type)

    if os.path.exists(search_dir):
        x_data_path = os.path.join(search_dir, 'x_train.csv')
        y_data_path = os.path.join(search_dir, 'y_train.csv')

        x_data = pd.read_csv(x_data_path)
        y_data = pd.read_csv(y_data_path)

        if 'id' in x_data.columns:
            x_data.set_index('id') 
        if 'id' in y_data.columns:
            y_data.set_index('id')

        return x_data, y_data


In [62]:
# loading data 
X_train, y_train = load_data('base')
X_test, y_test = load_data('test')

X_over, y_over = load_data('oversampling')
X_under, y_under = load_data('undersampling')
X_smote, y_smote = load_data('smote')

In [63]:
y_train

Unnamed: 0,y
0,0
1,1
2,0
3,0
4,0
...,...
674995,0
674996,0
674997,1
674998,0


## Model

In [None]:
def save_result_data(result_data, csv_file_path="training-log/model_result.csv"):
    if os.path.exists(csv_file_path):
        with open(csv_file_path, 'a') as csvfile:
            csvwriter = csv.writer(csvfile)   
            csvwriter.writerow(result_data)

    else:
        with open(csv_file_path, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)   
            csvwriter.writerow(['Name', 'Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC_AUC Score'])
            csvwriter.writerow(result_data)

    csvfile.close()

In [68]:
def model_save(model,file_name, dir='model'):
    path = os.path.join(dir, file_name)

    create_dir(dir)
        
    with open(path, 'wb') as file:
        pickle.dump(model, file)

In [69]:
def evaluate_model(model, model_name):
    model_base = clone(model)
    model_oversample = clone(model)
    model_undersample = clone(model)
    model_smote = clone(model)

    model_base.fit(X_train, y_train)
    model_oversample.fit(X_over, y_over)
    model_undersample.fit(X_under, y_under)
    model_smote.fit(X_smote, y_smote)

    print("Training start!")

    y_pred_base = model_base.predict(X_test)
    print("Training model_base done")
    y_pred_oversample = model_oversample.predict(X_test)
    print("Training model_oversample done")
    y_pred_undersample = model_undersample.predict(X_test)
    print("Training model_undersample done")
    y_pred_smote = model_smote.predict(X_test)
    print("Training model_smote done")

    print()

    for type_name, y_pred in zip(["Base", "OverSampled", "UnderSampled", "SMOTE"],[y_pred_base, y_pred_oversample, y_pred_undersample, y_pred_smote]):
        print(f"Type: {type_name}")
        result = [
            model_name + "_" + type_name,
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            f1_score(y_test, y_pred),
            roc_auc_score(y_test, y_pred)
        ]
        print(result)
        save_result_data(result)    

    for model, type_name in zip([model_base, model_oversample, model_undersample, model_smote],["Base", "OverSampled", "UnderSampled", "SMOTE"]):
        file_name = model_name + "_" + type_name + ".pkl"
        model_save(model, file_name)


In [70]:
# Models
model_logistic_regression = LogisticRegression()
model_clf = RandomForestClassifier(n_estimators=100, random_state=42)
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
model_ada = AdaBoostClassifier(n_estimators=100, random_state=42)
model_dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
model_dt_en = DecisionTreeClassifier(criterion='entropy', random_state=42)
model_knn = KNeighborsClassifier(n_neighbors=5)
model_svc = SVC() 
model_gnb = GaussianNB()
model_bnb = BernoulliNB()
model_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)

# model_list = [model_logistic_regression, model_clf, model_gb, model_ada, model_dt_gini, model_dt_en, model_knn, model_svc, model_gnb, model_bnb, model_mlp]
# model_name = ["logistic_regression", "clf", "gb", "ada", "dt_gini", "dt_en", "knn", "svc", "gnb", "bnb", "mlp"]

model_list = [model_gnb, model_bnb, model_mlp]
model_name = ["gnb", "bnb", "mlp"]

for model, name in zip(model_list, model_name):
    evaluate_model(model, name)

Training start!
Training model_base done
Training model_oversample done
Training model_undersample done
Training model_smote done

Type: Base
['gnb_Base', 0.85932, 0.43936702728887456, 0.6013924190518289, 0.5077676696990903, np.float64(0.748051063887486)]
Type: OverSampled
['gnb_OverSampled', 0.8123066666666666, 0.37436281859070464, 0.827826279146867, 0.5155717677827868, np.float64(0.8190017659778852)]
Type: UnderSampled
['gnb_UnderSampled', 0.8104266666666666, 0.372060789069848, 0.8305890153608133, 0.5139145299145299, np.float64(0.8191246239788706)]
Type: SMOTE
['gnb_SMOTE', 0.7104533333333334, 0.2684023841737668, 0.8111393524146314, 0.40334102648642706, np.float64(0.7538888828910658)]
Training start!
Training model_base done
Training model_oversample done
Training model_undersample done
Training model_smote done

Type: Base
['bnb_Base', 0.86016, 0.359390267735001, 0.20322687589788926, 0.2596357475645913, np.float64(0.5767616540487763)]
Type: OverSampled
['bnb_OverSampled', 0.70804, 0

In [71]:
base_classifier = RandomForestClassifier(random_state=42)
balanced_bagging_classifier = BalancedBaggingClassifier(
    base_classifier,
    sampling_strategy='auto', 
    replacement=False,  
    random_state=42
)
                                       
evaluate_model(balanced_bagging_classifier, "bbc")

Training start!
Training model_base done
Training model_oversample done
Training model_undersample done
Training model_smote done

Type: Base
['bbc_Base', 0.8889733333333333, 0.5227559253656077, 0.9164548568902641, 0.6657568337815598, np.float64(0.9008287536714362)]
Type: OverSampled
['bbc_OverSampled', 0.9945333333333334, 0.9566550375303944, 1.0, 0.9778474173330451, np.float64(0.9968916316659339)]
Type: UnderSampled
['bbc_UnderSampled', 0.89572, 0.5366392170903449, 0.9938114708807603, 0.6969426899678381, np.float64(0.9380362717476385)]
Type: SMOTE
['bbc_SMOTE', 0.9178, 0.6263803680981596, 0.7898110288429661, 0.6986656239307885, np.float64(0.8625860651333752)]
