In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE
import pickle

In [2]:
def check_directory(directory_path):
    if not os.path.exists(directory_path):
        try:
            os.makedirs(directory_path)
            print(f"Directory '{directory_path}' created successfully.")
        except OSError as e:
            print(f"Error creating directory '{directory_path}': {e}")
    else:
        print(f"Directory '{directory_path}' already exists.")
    return directory_path

In [10]:
# X_train, y_train, X_test, y_test have already been vectorized and stored in csv files
def train_model(model, X_train, y_train, X_test, y_test, over_sample):
    if over_sample:
        sm = SMOTE(random_state = 2)
        X_train, y_train = sm.fit_resample(X_train, y_train.ravel())
        print("Oversampling Done for Training Data.")

    model = model.fit(X_train, y_train)
    print("Model Fitted Successfully.")

    # calculating y_pred
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)
    roc_auc = round(roc_auc_score(y_test, y_pred_prob[:, 1]), 2)

    print(f"\033[1mROC-AUC Score\033[0m \t\t: {roc_auc*100} %")

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1], pos_label=1)
    
    gmeans = np.sqrt(tpr * (1-fpr))
    
    ix = np.argmax(gmeans)
    print('\033[1mBest Threshold\033[0m \t\t: %.3f \n\033[1mG-Mean\033[0m \t\t\t: %.3f' % (thresholds[ix], gmeans[ix]))

    y_pred = (y_pred > thresholds[ix])

    accuracy = accuracy_score(y_test, y_pred)
    print("\033[1mModel Accuracy\033[0m \t\t:", round(accuracy,2,)*100, "%")

    print("\033[1m\nClassification Report:\033[0m")
    print(classification_report(y_test, y_pred))

    return model, y_pred

In [17]:
def save_model(model, dataset_name, model_name):
    data = {
    "model": model
    }
    with open(f'../output/{dataset_name}/saved/trained_{model_name}.pkl', 'wb') as file:
        pickle.dump(data, file)

In [6]:
dataset_name = "blinks" # to be modified with argparse in .py file

check_directory(f"../output/{dataset_name}/training")
check_directory(f"../output/{dataset_name}/saved")

#X_train_path = f"../output/{dataset_name}/training/{dataset_name}-Xtrain.csv"
#y_train_path = f"../output/{dataset_name}/training/{dataset_name}-ytrain.csv"
#X_test_path = f"../output/{dataset_name}/training/{dataset_name}-Xtest.csv"
#y_test_path = f"../output/{dataset_name}/training/{dataset_name}-ytest.csv"

train_path = f"../output/{dataset_name}/training/{dataset_name}-train.csv"
test_path = f"../output/{dataset_name}/training/{dataset_name}-test.csv"

df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

X_train, y_train = df_train.drop("target", axis=1), df_train["target"]
X_test, y_test = df_test.drop("target", axis=1), df_test["target"]

Directory '../output/blinks/training' already exists.
Directory '../output/blinks/saved' already exists.


In [18]:
model_LR = LogisticRegression(max_iter=1000)
model_LR, y_pred = train_model(model_LR, X_train, y_train, X_test, y_test, True)
save_model(model_LR, dataset_name, "LogisticRegression")

Oversampling Done for Training Data.
Model Fitted Successfully.
[1mROC-AUC Score[0m 		: 67.0 %
[1mBest Threshold[0m 		: 0.528 
[1mG-Mean[0m 			: 0.624
[1mModel Accuracy[0m 		: 62.0 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.64      0.63      0.64      1586
           1       0.59      0.61      0.60      1410

    accuracy                           0.62      2996
   macro avg       0.62      0.62      0.62      2996
weighted avg       0.62      0.62      0.62      2996



In [19]:
model_RF = RandomForestClassifier(n_jobs=3, oob_score=True, n_estimators=100, criterion="gini")
model_RF, y_pred = train_model(model_RF, X_train, y_train, X_test, y_test, True)
save_model(model_RF, dataset_name, "RandomForests")

Oversampling Done for Training Data.
Model Fitted Successfully.
[1mROC-AUC Score[0m 		: 98.0 %
[1mBest Threshold[0m 		: 0.440 
[1mG-Mean[0m 			: 0.928
[1mModel Accuracy[0m 		: 92.0 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1586
           1       0.94      0.90      0.92      1410

    accuracy                           0.92      2996
   macro avg       0.93      0.92      0.92      2996
weighted avg       0.93      0.92      0.92      2996



In [21]:
model_SVM = SVC()
model_SVM, y_pred = train_model(model_SVM, X_train, y_train, X_test, y_test, True)
save_model(model_SVM, dataset_name, "SVM")

Oversampling Done for Training Data.
Model Fitted Successfully.
[1mROC-AUC Score[0m 		: 50.0 %
[1mBest Threshold[0m 		: 0.500 
[1mG-Mean[0m 			: 0.025
[1mModel Accuracy[0m 		: 48.0 %
[1m
Classification Report:[0m
              precision    recall  f1-score   support

           0       0.67      0.03      0.06      1586
           1       0.47      0.98      0.64      1410

    accuracy                           0.48      2996
   macro avg       0.57      0.51      0.35      2996
weighted avg       0.58      0.48      0.33      2996

