In [None]:
import math
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.metrics import geometric_mean_score
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn import neighbors
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
#################################### Common Functions ####################################
def get_results(y, predicted, pred_prob):
    """Score binary predictions with the eight metrics reported in this notebook.

    Labels are expected to be encoded as 0 (negative / majority class) and
    1 (positive / minority class).

    Parameters
    ----------
    y : array-like
        True labels.
    predicted : array-like
        Hard class predictions for the same samples.
    pred_prob : array-like
        Predicted probability of class 1, used only for ROC-AUC.

    Returns
    -------
    tuple
        ``(acc, pre, rec, spe, f1, gmean, bacc, rauc)``, each rounded to four
        decimals. All eight values are 0 when ``predicted`` contains NaN.
        ``spe`` is 0 when ``y`` holds no negative samples, and ``rauc`` is 0
        when ``roc_auc_score`` rejects the input with a ``ValueError``.
    """
    # A model that produced NaN predictions cannot be scored; report all zeros.
    if np.isnan(predicted).any():
        return 0, 0, 0, 0, 0, 0, 0, 0

    # Fixing labels=[0, 1] always yields a 2x2 matrix, so cell [0, 0] really is
    # TN even when only one class occurs in `y` and `predicted`.
    cm = metrics.confusion_matrix(y, predicted, labels=[0, 1])
    TN, FP = cm[0, 0], cm[0, 1]

    acc = np.round(accuracy_score(y, predicted), 4)
    pre = np.round(precision_score(y, predicted), 4)
    rec = np.round(recall_score(y, predicted), 4)
    # Specificity is undefined without negative samples; report 0 instead of NaN.
    negatives = TN + FP
    spe = np.round(TN / negatives, 4) if negatives else 0
    f1 = np.round(f1_score(y, predicted), 4)
    gmean = np.round(geometric_mean_score(y, predicted), 4)
    bacc = np.round(balanced_accuracy_score(y, predicted), 4)
    try:
        rauc = np.round(roc_auc_score(y, pred_prob), 4)
    except ValueError:
        # Raised when the AUC cannot be computed from the given labels/scores.
        rauc = 0

    return acc, pre, rec, spe, f1, gmean, bacc, rauc

In [None]:
from sklearn.model_selection import StratifiedKFold

# Shared splitter for 5-fold stratified cross-validation; shuffling is seeded,
# so every call to skf.split(...) on the same data yields the same folds.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2,
)

In [None]:
# IDs of the datasets to process; each ID selects 'data_newest/ds<ID>_new.csv'.
final_list = [
    18, 41, 14, 43, 53, 28, 20, 63, 69, 56,
    19, 25, 6, 24, 80, 32, 22, 15, 27, 33,
    58, 46, 29, 64, 62, 17, 47, 13, 44, 9,
    49, 55, 3, 35, 67, 54, 12, 7, 39, 36,
    4, 79, 59, 52, 5, 57, 21, 50, 45, 42,
    11, 1, 51, 38, 34, 16, 10, 2, 26, 91,
]
dataset_total = len(final_list)
print(dataset_total, final_list)

## Validation (70% of the data, split train:val = 56:14 by 5-fold CV) and Test (30%)

In [None]:
# Oversampling levels evaluated for every resampling method. Each value is embedded
# verbatim in the resampled-data file names and in the result column names, and levels
# below a dataset's `min_strategy` are skipped (their results are stored as zeros).
# NOTE(review): presumably the target minority/majority ratio after oversampling — confirm.
Strategy = [0.2, 0.4, 0.6, 0.8, 1.0]

In [None]:
# Candidate classifiers, one instance per hyper-parameter combination.
# Within each family the first parameter listed varies slowest and the last
# one fastest; the model names built afterwards are numbered in this order.
Model = []

# Logistic Regression: 7 candidates (class_weight is left at its default)
param_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
Model.extend(
    LogisticRegression(random_state=100, max_iter=100000, C=c_value)
    for c_value in param_lr['C']
)

# Decision Tree: 4*3*3 = 36 candidates (class_weight is left at its default)
param_dt = {'max_depth': [10, 20, 30, 40],
            'min_samples_split': [2, 4, 6],
            'min_samples_leaf': [1, 2, 3]}
Model.extend(
    DecisionTreeClassifier(random_state=100,
                           max_depth=depth,
                           min_samples_split=split_size,
                           min_samples_leaf=leaf_size)
    for depth in param_dt['max_depth']
    for split_size in param_dt['min_samples_split']
    for leaf_size in param_dt['min_samples_leaf']
)

# SVM: 3*2*4 = 24 candidates; probability=True so predict_proba is available
param_svm = {'C': [0.1, 1, 10],
             'kernel': ['rbf', 'sigmoid'],
             'gamma': ['scale', 'auto', 0.1, 1]}
Model.extend(
    svm.SVC(random_state=100, probability=True,
            C=c_value,
            kernel=kernel_name,
            gamma=gamma_value)
    for c_value in param_svm['C']
    for kernel_name in param_svm['kernel']
    for gamma_value in param_svm['gamma']
)

# KNN: 4*2*3 = 24 candidates
param_knn = {'n_neighbors': [3, 5, 7, 9],
             'p': [1, 2],
             'metric': ['euclidean', 'manhattan', 'minkowski']}
Model.extend(
    neighbors.KNeighborsClassifier(n_neighbors=neighbor_count,
                                   p=power,
                                   metric=distance_name)
    for neighbor_count in param_knn['n_neighbors']
    for power in param_knn['p']
    for distance_name in param_knn['metric']
)

# LGBM: 3*2*2*4 = 48 candidates (is_unbalance is left at its default)
param_lg = {'boosting_type': ['gbdt', 'dart', 'goss'],
            'max_depth': [10, 20],
            'learning_rate': [0.01, 0.05],
            'n_estimators': [50, 100, 150, 200]}
Model.extend(
    LGBMClassifier(random_state=100, objective='binary',
                   boosting_type=booster,
                   max_depth=depth,
                   learning_rate=rate,
                   n_estimators=tree_count)
    for booster in param_lg['boosting_type']
    for depth in param_lg['max_depth']
    for rate in param_lg['learning_rate']
    for tree_count in param_lg['n_estimators']
)

print(len(Model))

In [None]:
# Short display names for the candidate models: a family prefix followed by a
# 1-based running number, with as many names per family as its parameter grid
# has combinations (LR1..LR7, DT1..DT36, SVM1.., KNN1.., LG1..).
ModelName = []

for family_prefix, family_grid in (('LR', param_lr),
                                   ('DT', param_dt),
                                   ('SVM', param_svm),
                                   ('KNN', param_knn),
                                   ('LG', param_lg)):
    # Number of combinations = product of the lengths of all value lists.
    combination_count = 1
    for candidate_values in family_grid.values():
        combination_count *= len(candidate_values)
    ModelName.extend(family_prefix + str(number)
                     for number in range(1, combination_count + 1))

print(len(ModelName))

In [None]:
# Row labels of every result table: the eight metrics for the train folds (mean),
# the validation folds (mean) and the held-out test set, in that order.
RESULT_METRICS = ['Acc', 'Pre', 'Rec', 'Spe', 'F1', 'Gmean', 'B_Acc', 'R-AUC']
RESULT_ROWS = [metric + suffix for suffix in ('_tr', '_val', '_t') for metric in RESULT_METRICS]

# One entry per pre-computed oversampling method:
# (log label, result-column prefix, per-fold CSV pattern, full-training CSV pattern).
# Patterns are filled with (dataset id, strategy[, fold number]).
RESAMPLED_SOURCES = [
    ('SMOTE', 'S', 'SMOTE_over/ds{}_S_{}_{}th.csv', 'SMOTE_over/ds{}_S_{}_full.csv'),
    ('LLM', 'L', 'LLM_over/ds{}_L_{}_{}th.csv', 'LLM_over/ds{}_L_{}_comb.csv'),
]


def _as_float(data):
    """Convert a DataFrame/Series/array to a float NumPy array."""
    return np.array(data).astype(float)


def _class_counts(labels):
    """Return (number of 0 labels, number of 1 labels, total) for logging."""
    values = list(labels)
    return values.count(0), values.count(1), len(values)


def _load_resampled(path, fill_values):
    """Read an oversampled training CSV and return (features, labels).

    The strings 'False'/'FALSE' are mapped to the boolean False and missing
    values are filled from `fill_values` before the last column is split off
    as the label.
    """
    over_df = pd.read_csv(path)
    over_df = over_df.replace('False', False)
    over_df = over_df.replace('FALSE', False)
    over_df = over_df.fillna(fill_values)
    return over_df.iloc[:, :-1], over_df.iloc[:, -1]


def _score(model, features, labels):
    """Score a fitted model on (features, labels) with get_results."""
    predicted = model.predict(features)
    prob_of_one = model.predict_proba(features)[:, 1]  # probability of class 1
    return get_results(labels, predicted, prob_of_one)


def _evaluate_model(model, X, y, X_test, y_test,
                    fold_train_sets=None, final_train_set=None, verbose=False):
    """Cross-validate `model` on (X, y), then refit it and score it on the test set.

    Parameters
    ----------
    model : estimator with fit / predict / predict_proba
    X, y : DataFrame, Series
        Validation portion of the data; folds come from the shared `skf` splitter.
    X_test, y_test : float arrays
        Held-out test data.
    fold_train_sets : list of (features, labels), optional
        Resampled training data per fold (index 0 = fold 1). When given it
        replaces the fold's own training rows; validation rows are unchanged.
    final_train_set : (features, labels), optional
        Resampled data for the final fit; defaults to the whole of (X, y).
    verbose : bool
        Print the class counts of the first fold and of the final fit.

    Returns
    -------
    list
        24 values ordered like RESULT_ROWS.
    """
    train_scores, val_scores = [], []
    for n_iter, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_val, y_val = X.iloc[val_index], y.iloc[val_index]
        show = verbose and n_iter == 1
        if show:
            print("TRAIN(0/1/total):", *_class_counts(y_train))
        if fold_train_sets is not None:
            # Train on the pre-computed resampled version of this fold instead.
            X_train, y_train = fold_train_sets[n_iter - 1]
            if show:
                print("TRAIN_over(0/1/total):", *_class_counts(y_train))
        if show:
            print("VALIDATION(0/1/total):", *_class_counts(y_val))

        X_train, y_train = _as_float(X_train), _as_float(y_train)
        X_val, y_val = _as_float(X_val), _as_float(y_val)
        model.fit(X_train, y_train)
        train_scores.append(_score(model, X_train, y_train))
        val_scores.append(_score(model, X_val, y_val))

    # Final fit on the full validation portion (or its resampled version), scored on the test set.
    if final_train_set is None:
        X_fit, y_fit = X, y
    else:
        X_fit, y_fit = final_train_set
        if verbose:
            print("TRAIN_over(0/1/total):", *_class_counts(y_fit))
    if verbose:
        print("TEST(0/1/total):", *_class_counts(y_test))
    model.fit(_as_float(X_fit), _as_float(y_fit))
    test_scores = _score(model, X_test, y_test)

    column = [np.mean(values) for values in zip(*train_scores)]
    column += [np.mean(values) for values in zip(*val_scores)]
    column += list(test_scores)
    return column


for i in final_list:
    start = time.time()
    df = pd.read_csv('data_newest/ds'+ str(i) +'_new.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)

    # Make major class as '0' and minor class as '1'
    label_counts = df.iloc[:,-1].value_counts()
    MAJOR = label_counts[label_counts == max(label_counts)].index[0]  # majority label
    minor = label_counts[label_counts != max(label_counts)].index[0]  # minority label
    # -100 is a temporary placeholder so that swapping 0 <-> 1 cannot collide.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    print('<Modified Class>\n', df.iloc[:,-1].value_counts())
    print('<Imabalance ratio>\n', "1:{: .2f}".format(df.iloc[:,-1].value_counts()[1]/df.iloc[:,-1].value_counts()[0]))

    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]        # For validation
    y = df_val.iloc[:, -1]         # For validation
    X_test = _as_float(df_test.iloc[:, :-1])  # For test
    y_test = _as_float(df_test.iloc[:, -1])   # For test

    # Result columns are collected here and turned into a DataFrame once at the end.
    results = {'Dataset': [i] * len(RESULT_ROWS)}

    ##################### Lowest applicable oversampling level #######################
    # Levels below the class ratio already present in the training data are skipped.
    train_counts = y.value_counts()
    ind = int((train_counts[1]/train_counts[0])//0.2)
    ind = min(ind, len(Strategy) - 1)  # a ratio of 1.0 would otherwise index past the list
    min_strategy = Strategy[ind]
    if i == 33:
        # Dataset 33 is hard-coded to start from Strategy[1]; the reason is not recorded here.
        min_strategy = Strategy[1]
    print("<min_strategy>:",min_strategy)

    ##################### Original (non-resampled) data #######################
    print("==========", "Original", "==========")
    for k, model in enumerate(Model):
        results['{}'.format(ModelName[k])] = _evaluate_model(model, X, y, X_test, y_test,
                                                             verbose=(k == 0))

    ##################### Pre-computed oversampled data #######################
    fill_values = df.mean()  # used to fill NaN cells in the resampled CSVs
    for method_label, column_prefix, fold_pattern, final_pattern in RESAMPLED_SOURCES:
        for strategy in Strategy:
            print("==========", "{}_{}".format(method_label, strategy), "==========")

            if min_strategy > strategy:
                # Level not applicable to this dataset: store zeros for every model.
                for name in ModelName:
                    results['{}_{}_{}'.format(column_prefix, strategy, name)] = [0.0] * 16 + [0] * 8
                continue

            # Read each resampled file once per level instead of once per model.
            fold_train_sets = [
                _load_resampled(fold_pattern.format(i, strategy, fold_number), fill_values)
                for fold_number in range(1, skf.get_n_splits() + 1)
            ]
            final_train_set = _load_resampled(final_pattern.format(i, strategy), fill_values)

            for k, model in enumerate(Model):
                results['{}_{}_{}'.format(column_prefix, strategy, ModelName[k])] = _evaluate_model(
                    model, X, y, X_test, y_test,
                    fold_train_sets=fold_train_sets,
                    final_train_set=final_train_set,
                    verbose=(k == 0))

    res_df = pd.DataFrame(results, index=RESULT_ROWS)
    end = time.time()
    print(end-start)
    # Append this dataset's table (with its own header row) to the shared results file.
    res_df.to_csv("validation_test.csv", mode = 'a', float_format='%.4g')

In [None]:
for i in final_list: 
    df = pd.read_csv('data_newest/ds'+ str(i) +'_new.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    
    # Make major class as '0' and minor class as '1'
    MAJOR = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() == max(df.iloc[:,-1].value_counts())].index[0] # Moj Label
    minor = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() != max(df.iloc[:,-1].value_counts())].index[0] # min Label    
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    print('<Modified Class>\n', df.iloc[:,-1].value_counts())
    print('<Imabalance ratio>\n', "1:{: .2f}".format(df.iloc[:,-1].value_counts()[1]/df.iloc[:,-1].value_counts()[0]))
        
    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]        # For validation
    y = df_val.iloc[:, -1]         # For validation 
    X_test = df_test.iloc[:, :-1]  # For test
    y_test = df_test.iloc[:, -1]   # For test
    X_test = np.array(X_test)
    X_test = X_test.astype(float)
    y_test = np.array(y_test)
    y_test = y_test.astype(float)
    
    res_df = pd.DataFrame({'Dataset':[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, 
                       index = ['Acc_tr','Pre_tr','Rec_tr','Spe_tr','F1_tr','Gmean_tr','B_Acc_tr','R-AUC_tr',
                                'Acc_val','Pre_val','Rec_val','Spe_val','F1_val','Gmean_val','B_Acc_val','R-AUC_val',
                                'Acc_t','Pre_t','Rec_t','Spe_t','F1_t','Gmean_t','B_Acc_t','R-AUC_t'])
    res_df.iloc[:,0] = [i for b in range(24)]
    
    ##################### For Loop for Every Loss Functions ####################### 
    ind = int((y.value_counts()[1]/y.value_counts()[0])//0.2)
    min_strategy = Strategy[ind]
    adj_strategy = Strategy[ind+1]  # original min_strategy-> LLM oversmaple, so SMOTE is used from the next 
    if i == 33:
        min_strategy = Strategy[1]
        adj_strategy = Strategy[2]
    print("<min_strategy>:",min_strategy)   
    
    for j in range(len(Strategy)):
        print("==========", "LLM_SMOTE_{}".format(Strategy[j]), "==========") 
        for k in range(len(Model)):  
            train_acc = []
            train_pre = []
            train_rec = []
            train_spe = []
            train_f1 = []
            train_gmean = []
            train_bacc = []
            train_rauc = []    
            list_acc = []
            list_pre = []
            list_rec = []
            list_spe = []
            list_f1 = []
            list_gmean = []
            list_bacc = []
            list_rauc = []
                
            if adj_strategy > Strategy[j]:
                train_acc.append(0)
                train_pre.append(0)
                train_rec.append(0)
                train_spe.append(0)
                train_f1.append(0)
                train_gmean.append(0)
                train_bacc.append(0)
                train_rauc.append(0)
                list_acc.append(0)
                list_pre.append(0)
                list_rec.append(0)
                list_spe.append(0)
                list_f1.append(0)
                list_gmean.append(0)
                list_bacc.append(0)
                list_rauc.append(0)
                acc_t = 0
                pre_t = 0
                rec_t = 0
                spe_t = 0
                f1_t = 0
                gmean_t = 0
                bacc_t = 0
                rauc_t = 0
            
            else:
                # 5-fold-CV
                n_iter=0
                for train_index, val_index in skf.split(X, y):
                    model = Model[k]
                    n_iter += 1
                    X_train = X.iloc[train_index]
                    y_train= y.iloc[train_index]
                    if k == 0 and n_iter == 1:
                        print("TRAIN(0/1/total):", list(y_train).count(0), list(y_train).count(1), len(y_train))     
                    # Loading Resmapled Data
                    over_df = pd.read_csv('LLM_SMOTE/'
                                          +'ds'+str(i)+'_LS_'+str(Strategy[j])+'_'+str(n_iter)+'th.csv')
                    over_df = over_df.replace('False', False)
                    over_df = over_df.fillna(df.mean())
                    X_train = over_df.iloc[:, :-1]
                    y_train = over_df.iloc[:, -1]     
                    if k == 0 and n_iter == 1:
                        print("TRAIN_over(0/1/total):", list(y_train).count(0), list(y_train).count(1), len(y_train))    
                    X_val = X.iloc[val_index]
                    y_val= y.iloc[val_index]
                    if k == 0 and n_iter == 1:
                        print("VALIDATION(0/1/total):", list(y_val).count(0), list(y_val).count(1), len(y_val))
                    # Array
                    X_train = np.array(X_train)
                    X_train = X_train.astype(float)
                    y_train = np.array(y_train)
                    y_train = y_train.astype(float)
                    X_val = np.array(X_val)
                    X_val = X_val.astype(float)
                    y_val = np.array(y_val)
                    y_val = y_val.astype(float)
                    # Learning
                    model.fit(X_train, y_train)
                    train_result = model.predict(X_train)
                    train_result_prob = model.predict_proba(X_train)[:, 1] # Get probability of class 1
                    acc, pre, rec, spe, f1, gmean, bacc, rauc = get_results(y_train, train_result, train_result_prob)
                    train_acc.append(acc)
                    train_pre.append(pre)
                    train_rec.append(rec)
                    train_spe.append(spe)
                    train_f1.append(f1)
                    train_gmean.append(gmean)
                    train_bacc.append(bacc)
                    train_rauc.append(rauc)
                    # Results 
                    result = model.predict(X_val)
                    result_prob = model.predict_proba(X_val)[:, 1] # Get probability of class 1
                    acc, pre, rec, spe, f1, gmean, bacc, rauc = get_results(y_val, result, result_prob)
                    list_acc.append(acc)
                    list_pre.append(pre)
                    list_rec.append(rec)
                    list_spe.append(spe)
                    list_f1.append(f1)
                    list_gmean.append(gmean)
                    list_bacc.append(bacc)
                    list_rauc.append(rauc)
                            
                # Test
                model = Model[k]
                # Loading Resmapled Data
                over_df = pd.read_csv(r'LLM_SMOTE/'
                                      +'ds'+str(i)+'_LS_'+str(Strategy[j])+'_'+str('comb')+'.csv')
                over_df = over_df.replace('False', False)
                over_df = over_df.fillna(df.mean())
                X_over = over_df.iloc[:, :-1]
                y_over = over_df.iloc[:, -1]     
                if k == 0:
                    print("TRAIN_over(0/1/total):", list(y_over).count(0), list(y_over).count(1), len(y_over)) 
                    print("TEST(0/1/total):", list(y_test).count(0), list(y_test).count(1), len(y_test)) 
                model.fit(np.array(X_over).astype(float), np.array(y_over).astype(float))
                result = model.predict(X_test)
                result_prob = model.predict_proba(X_test)[:, 1] # Get probability of class 1
                acc_t, pre_t, rec_t, spe_t, f1_t, gmean_t, bacc_t, rauc_t = get_results(y_test, result, result_prob)


            res_df['LS_{}_{}'.format(Strategy[j],ModelName[k])] = [np.mean(train_acc),np.mean(train_pre), np.mean(train_rec), np.mean(train_spe),
                                                                   np.mean(train_f1), np.mean(train_gmean), np.mean(train_bacc), np.mean(train_rauc),
                                                                   np.mean(list_acc),np.mean(list_pre), np.mean(list_rec), np.mean(list_spe),
                                                                   np.mean(list_f1), np.mean(list_gmean), np.mean(list_bacc), np.mean(list_rauc),
                                                                   acc_t, pre_t, rec_t, spe_t, f1_t, gmean_t, bacc_t, rauc_t]
            
    res_df.to_csv("validation_test_LS.csv", mode = 'a', float_format='%.4g')

# 3. Very Imbalanced

In [None]:
# Dataset family evaluated by the "very imbalanced" cells below.
# Only one family is processed per run; switch families by editing this constant.
# Available families: 'ds8', 'ds58', 'ds14', 'ds44'.
VERY_IMB_FAMILY = 'ds44'

# The order of the three variants matters: the benchmark cell skips SMOTE for
# very_imb_list[1:] and skips training on the original data for very_imb_list[2:].
# NOTE(review): the '05' / '01' / '00' suffixes presumably encode a decreasing
# minority-class share -- confirm against the data-preparation step.
very_imb_list = ['{}_new_{}'.format(VERY_IMB_FAMILY, variant) for variant in ('05', '01', '00')]

In [None]:
# For every dataset in `very_imb_list`: relabel the classes (majority -> 0, minority -> 1),
# split the data into a development part (cross-validated) and a hold-out test part, then
# record fold-averaged train / validation scores AND test scores of every classifier on
#   * the original training data,
#   * the pre-generated SMOTE-oversampled training data ('SMOTE_over/...'),
#   * the pre-generated LLM-oversampled training data ('LLM_over/...').
# One result table per dataset is appended to "veryimbalance_validation_test.csv".
# Relies on `Strategy`, `Model`, `ModelName`, `skf` and `get_results` from earlier cells.
# The helpers are defined inside this cell so that it can be run on its own.


def _to_float_array(data):
    """Convert a DataFrame / Series / sequence to a float NumPy array."""
    return np.array(data).astype(float)


def _class_counts(labels):
    """Return (number of 0 labels, number of 1 labels, total number of labels)."""
    labels = list(labels)
    return labels.count(0), labels.count(1), len(labels)


def _load_oversampled(csv_path, reference_df):
    """Read a pre-generated resampled training set and return (features, labels).

    The last CSV column is taken as the label. NaNs are filled with the column
    means of `reference_df`.
    NOTE(review): the callers pass the complete relabelled dataset (test rows and
    label column included) as `reference_df`, so the imputation values are not
    restricted to the training fold -- kept as in the original, confirm it is intended.
    """
    over_df = pd.read_csv(csv_path)
    over_df = over_df.replace('False', False)        # the string 'False' sometimes appears
    over_df = over_df.fillna(reference_df.mean())    # NaN sometimes appears
    return over_df.iloc[:, :-1], over_df.iloc[:, -1]


def _skipped_scores():
    """Score column for a configuration that cannot be trained: all 24 entries are 0."""
    return [0.0] * 16 + [0] * 8


def _evaluate_model(model, X, y, X_test, y_test, val_add, reference_df=None,
                    csv_prefix=None, full_tag=None, verbose=False):
    """Cross-validate `model`, then refit it and score it on the hold-out test set.

    Parameters
    ----------
    model : estimator with fit / predict / predict_proba; it is refitted in place.
    X, y : DataFrame / Series holding the development features and labels; they are
        split with the notebook-level `skf` splitter.
    X_test, y_test : float arrays holding the hold-out test set.
    val_add : DataFrame (label in the last column) whose rows are prepended to every
        validation fold.
    reference_df : DataFrame whose column means fill NaNs of the resampled CSV files
        (only used when `csv_prefix` is given).
    csv_prefix : None to train on the original folds. Otherwise the common path prefix
        of the resampled training sets: fold n is read from
        ``csv_prefix + str(n) + 'th.csv'`` and the full training set from
        ``csv_prefix + full_tag + '.csv'``. Train metrics are then computed on the
        resampled data.
    full_tag : file tag of the resampled full training set (only used with `csv_prefix`).
    verbose : if True, print the class counts of the first fold and of the test phase.

    Returns
    -------
    list of 24 values: fold-averaged train metrics (8), fold-averaged validation
    metrics (8) and test metrics (8), each group in the order returned by `get_results`.
    """
    train_scores = []
    val_scores = []
    for n_iter, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
        log_fold = verbose and n_iter == 1
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        if log_fold:
            print("TRAIN(0/1/total):", *_class_counts(y_train))
        if csv_prefix is not None:
            # Replace the original training fold by its pre-generated resampled version.
            X_train, y_train = _load_oversampled(csv_prefix + str(n_iter) + 'th.csv', reference_df)
            if log_fold:
                print("TRAIN_over(0/1/total):", *_class_counts(y_train))
        # The validation fold is extended with the additional validation rows.
        X_val = pd.concat([val_add.iloc[:, :-1], X.iloc[val_index]], axis=0)
        y_val = pd.concat([val_add.iloc[:, -1], y.iloc[val_index]], axis=0)
        if log_fold:
            print("VALIDATION(0/1/total):", *_class_counts(y_val))

        X_train, y_train = _to_float_array(X_train), _to_float_array(y_train)
        X_val, y_val = _to_float_array(X_val), _to_float_array(y_val)

        model.fit(X_train, y_train)
        # predict_proba(...)[:, 1] is the probability of class 1.
        train_scores.append(get_results(y_train, model.predict(X_train),
                                        model.predict_proba(X_train)[:, 1]))
        val_scores.append(get_results(y_val, model.predict(X_val),
                                      model.predict_proba(X_val)[:, 1]))

    # Test: refit on the whole development set (or on its resampled version).
    if csv_prefix is None:
        X_fit, y_fit = X, y
    else:
        X_fit, y_fit = _load_oversampled(csv_prefix + str(full_tag) + '.csv', reference_df)
        if verbose:
            print("TRAIN_over(0/1/total):", *_class_counts(y_fit))
    if verbose:
        print("TEST(0/1/total):", *_class_counts(y_test))
    model.fit(_to_float_array(X_fit), _to_float_array(y_fit))
    test_scores = get_results(y_test, model.predict(X_test), model.predict_proba(X_test)[:, 1])

    # zip(*scores) regroups the per-fold tuples into one sequence per metric.
    return ([np.mean(metric) for metric in zip(*train_scores)]
            + [np.mean(metric) for metric in zip(*val_scores)]
            + list(test_scores))


for i in very_imb_list:
    df = pd.read_csv('data_newest/'+ str(i) +'.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)

    # Make major class as '0' and minor class as '1'
    label_counts = df.iloc[:,-1].value_counts()
    MAJOR = label_counts[label_counts == max(label_counts)].index[0]      # majority label
    try:
        minor = label_counts[label_counts != max(label_counts)].index[0]  # minority label
    except IndexError:
        # No label with a different count than the majority (e.g. a single-class dataset).
        minor = "NONE"
    # -100 is a temporary placeholder so that swapping the labels cannot collide
    # when the majority label is 1 or the minority label is 0.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    relabeled_counts = df.iloc[:,-1].value_counts()
    print('<Modified Class>\n', relabeled_counts)
    try:
        print('<Imabalance ratio>\n', "1:{: .2f}".format(relabeled_counts[1]/relabeled_counts[0]))
    except (KeyError, IndexError):
        # Label 1 is absent from this dataset.
        print('<Imabalance ratio>\n',"1: 0.00")

    # For validation & test (adding more data points removed from original dataset)
    # NOTE(review): replace(2, 1) assumes the minority class is encoded as 2 in the
    # *_val.csv / *_test.csv files -- confirm against the data-preparation step.
    val_add = pd.read_csv('data_newest/'+str(i)+'_val.csv')
    val_add.iloc[:,-1] = val_add.iloc[:,-1].replace(2, 1)
    val_add.rename(columns={val_add.columns[-1]:'NEW_LABEL'}, inplace=True)
    test_add = pd.read_csv('data_newest/'+str(i)+'_test.csv')
    test_add.iloc[:,-1] = test_add.iloc[:,-1].replace(2, 1)
    test_add.rename(columns={test_add.columns[-1]:'NEW_LABEL'}, inplace=True)

    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]        # For validation
    y = df_val.iloc[:, -1]         # For validation
    df_test = pd.concat([test_add, df_test], axis=0)
    X_test = _to_float_array(df_test.iloc[:, :-1])  # For test
    y_test = _to_float_array(df_test.iloc[:, -1])   # For test

    # Result table: 8 train, 8 validation and 8 test metrics per evaluated configuration.
    # The 'Dataset' column is created directly with the dataset name instead of
    # overwriting an integer column with strings afterwards.
    res_index = ['Acc_tr','Pre_tr','Rec_tr','Spe_tr','F1_tr','Gmean_tr','B_Acc_tr','R-AUC_tr',
                 'Acc_val','Pre_val','Rec_val','Spe_val','F1_val','Gmean_val','B_Acc_val','R-AUC_val',
                 'Acc_t','Pre_t','Rec_t','Spe_t','F1_t','Gmean_t','B_Acc_t','R-AUC_t']
    res_df = pd.DataFrame({'Dataset': [i] * len(res_index)}, index=res_index)

    ##################### Loop over every resampling strategy and model #######################
    # Position in `Strategy` derived from the minority/majority ratio of the development set.
    # NOTE(review): the 0.2 step presumably matches the spacing of `Strategy` -- confirm.
    try:
        dev_counts = y.value_counts()
        ind = int((dev_counts[1]/dev_counts[0])//0.2)
    except (KeyError, IndexError):
        # Label 1 is absent from the development set.
        ind = 0
    min_strategy = Strategy[ind]
    print("<min_strategy>:",min_strategy)

    print("==========", "Original", "==========")
    no_minority_in_training = i in very_imb_list[2:]  # no label '1' in training set, no training available
    for k, model in enumerate(Model):
        if no_minority_in_training:
            scores = _skipped_scores()
        else:
            scores = _evaluate_model(model, X, y, X_test, y_test, val_add, verbose=(k == 0))
        res_df['{}'.format(ModelName[k])] = scores

    smote_unavailable = i in very_imb_list[1:]  # SMOTE cannot generate
    for strategy in Strategy:
        print("==========", "SMOTE_{}".format(strategy), "==========")
        csv_prefix = 'SMOTE_over/' + str(i) + '_S_' + str(strategy) + '_'
        skip = min_strategy > strategy or smote_unavailable
        for k, model in enumerate(Model):
            if skip:
                scores = _skipped_scores()
            else:
                scores = _evaluate_model(model, X, y, X_test, y_test, val_add, reference_df=df,
                                         csv_prefix=csv_prefix, full_tag='full', verbose=(k == 0))
            res_df['S_{}_{}'.format(strategy, ModelName[k])] = scores

    for strategy in Strategy:
        print("==========", "LLM_{}".format(strategy), "==========")
        csv_prefix = 'LLM_over/' + str(i) + '_L_' + str(strategy) + '_'
        skip = min_strategy > strategy
        for k, model in enumerate(Model):
            if skip:
                scores = _skipped_scores()
            else:
                scores = _evaluate_model(model, X, y, X_test, y_test, val_add, reference_df=df,
                                         csv_prefix=csv_prefix, full_tag='comb', verbose=(k == 0))
            res_df['L_{}_{}'.format(strategy, ModelName[k])] = scores

    # mode='a' appends one table (header included) per dataset and per run.
    res_df.to_csv("veryimbalance_validation_test.csv", mode = 'a', float_format='%.4g')

# 4. Very Imbalanced with LLM+SMOTE

In [None]:
# For every dataset in `very_imb_list`: relabel the classes (majority -> 0, minority -> 1),
# split the data into a development part (cross-validated) and a hold-out test part, then
# record fold-averaged train / validation scores AND test scores of every classifier trained
# on the pre-generated LLM+SMOTE resampled data ('LLM_SMOTE/...').
# One result table per dataset is appended to "veryimbalance_validation_test_LS.csv".
# Relies on `Strategy`, `Model`, `ModelName`, `skf` and `get_results` from earlier cells.
# The helpers are defined inside this cell so that it can be run on its own.


def _to_float_array(data):
    """Convert a DataFrame / Series / sequence to a float NumPy array."""
    return np.array(data).astype(float)


def _class_counts(labels):
    """Return (number of 0 labels, number of 1 labels, total number of labels)."""
    labels = list(labels)
    return labels.count(0), labels.count(1), len(labels)


def _load_oversampled(csv_path, reference_df):
    """Read a pre-generated resampled training set and return (features, labels).

    The last CSV column is taken as the label. NaNs are filled with the column
    means of `reference_df`.
    NOTE(review): the callers pass the complete relabelled dataset (test rows and
    label column included) as `reference_df`, so the imputation values are not
    restricted to the training fold -- kept as in the original, confirm it is intended.
    """
    over_df = pd.read_csv(csv_path)
    over_df = over_df.replace('False', False)        # the string 'False' may appear in the file
    over_df = over_df.fillna(reference_df.mean())    # missing values may appear in the file
    return over_df.iloc[:, :-1], over_df.iloc[:, -1]


def _skipped_scores():
    """Score column for a configuration that cannot be trained: all 24 entries are 0."""
    return [0.0] * 16 + [0] * 8


def _evaluate_model(model, X, y, X_test, y_test, val_add, reference_df=None,
                    csv_prefix=None, full_tag=None, verbose=False):
    """Cross-validate `model`, then refit it and score it on the hold-out test set.

    Parameters
    ----------
    model : estimator with fit / predict / predict_proba; it is refitted in place.
    X, y : DataFrame / Series holding the development features and labels; they are
        split with the notebook-level `skf` splitter.
    X_test, y_test : float arrays holding the hold-out test set.
    val_add : DataFrame (label in the last column) whose rows are prepended to every
        validation fold.
    reference_df : DataFrame whose column means fill NaNs of the resampled CSV files
        (only used when `csv_prefix` is given).
    csv_prefix : None to train on the original folds. Otherwise the common path prefix
        of the resampled training sets: fold n is read from
        ``csv_prefix + str(n) + 'th.csv'`` and the full training set from
        ``csv_prefix + full_tag + '.csv'``. Train metrics are then computed on the
        resampled data.
    full_tag : file tag of the resampled full training set (only used with `csv_prefix`).
    verbose : if True, print the class counts of the first fold and of the test phase.

    Returns
    -------
    list of 24 values: fold-averaged train metrics (8), fold-averaged validation
    metrics (8) and test metrics (8), each group in the order returned by `get_results`.
    """
    train_scores = []
    val_scores = []
    for n_iter, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
        log_fold = verbose and n_iter == 1
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        if log_fold:
            print("TRAIN(0/1/total):", *_class_counts(y_train))
        if csv_prefix is not None:
            # Replace the original training fold by its pre-generated resampled version.
            X_train, y_train = _load_oversampled(csv_prefix + str(n_iter) + 'th.csv', reference_df)
            if log_fold:
                print("TRAIN_over(0/1/total):", *_class_counts(y_train))
        # The validation fold is extended with the additional validation rows.
        X_val = pd.concat([val_add.iloc[:, :-1], X.iloc[val_index]], axis=0)
        y_val = pd.concat([val_add.iloc[:, -1], y.iloc[val_index]], axis=0)
        if log_fold:
            print("VALIDATION(0/1/total):", *_class_counts(y_val))

        X_train, y_train = _to_float_array(X_train), _to_float_array(y_train)
        X_val, y_val = _to_float_array(X_val), _to_float_array(y_val)

        model.fit(X_train, y_train)
        # predict_proba(...)[:, 1] is the probability of class 1.
        train_scores.append(get_results(y_train, model.predict(X_train),
                                        model.predict_proba(X_train)[:, 1]))
        val_scores.append(get_results(y_val, model.predict(X_val),
                                      model.predict_proba(X_val)[:, 1]))

    # Test: refit on the whole development set (or on its resampled version).
    if csv_prefix is None:
        X_fit, y_fit = X, y
    else:
        X_fit, y_fit = _load_oversampled(csv_prefix + str(full_tag) + '.csv', reference_df)
        if verbose:
            print("TRAIN_over(0/1/total):", *_class_counts(y_fit))
    if verbose:
        print("TEST(0/1/total):", *_class_counts(y_test))
    model.fit(_to_float_array(X_fit), _to_float_array(y_fit))
    test_scores = get_results(y_test, model.predict(X_test), model.predict_proba(X_test)[:, 1])

    # zip(*scores) regroups the per-fold tuples into one sequence per metric.
    return ([np.mean(metric) for metric in zip(*train_scores)]
            + [np.mean(metric) for metric in zip(*val_scores)]
            + list(test_scores))


for i in very_imb_list:
    df = pd.read_csv('data_newest/'+ str(i) +'.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)

    # Make major class as '0' and minor class as '1'
    label_counts = df.iloc[:,-1].value_counts()
    MAJOR = label_counts[label_counts == max(label_counts)].index[0]      # majority label
    try:
        minor = label_counts[label_counts != max(label_counts)].index[0]  # minority label
    except IndexError:
        # No label with a different count than the majority (e.g. a single-class dataset).
        minor = "NONE"
    # -100 is a temporary placeholder so that swapping the labels cannot collide
    # when the majority label is 1 or the minority label is 0.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    relabeled_counts = df.iloc[:,-1].value_counts()
    print('<Modified Class>\n', relabeled_counts)
    try:
        print('<Imabalance ratio>\n', "1:{: .2f}".format(relabeled_counts[1]/relabeled_counts[0]))
    except (KeyError, IndexError):
        # Label 1 is absent from this dataset.
        print('<Imabalance ratio>\n',"1: 0.00")

    # For validation & test (adding more data points removed from original dataset)
    # NOTE(review): replace(2, 1) assumes the minority class is encoded as 2 in the
    # *_val.csv / *_test.csv files -- confirm against the data-preparation step.
    val_add = pd.read_csv('data_newest/'+str(i)+'_val.csv')
    val_add.iloc[:,-1] = val_add.iloc[:,-1].replace(2, 1)
    val_add.rename(columns={val_add.columns[-1]:'NEW_LABEL'}, inplace=True)
    test_add = pd.read_csv('data_newest/'+str(i)+'_test.csv')
    test_add.iloc[:,-1] = test_add.iloc[:,-1].replace(2, 1)
    test_add.rename(columns={test_add.columns[-1]:'NEW_LABEL'}, inplace=True)

    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]        # For validation
    y = df_val.iloc[:, -1]         # For validation
    df_test = pd.concat([test_add, df_test], axis=0)
    X_test = _to_float_array(df_test.iloc[:, :-1])  # For test
    y_test = _to_float_array(df_test.iloc[:, -1])   # For test

    # Result table: 8 train, 8 validation and 8 test metrics per evaluated configuration.
    # The 'Dataset' column is created directly with the dataset name instead of
    # overwriting an integer column with strings afterwards.
    res_index = ['Acc_tr','Pre_tr','Rec_tr','Spe_tr','F1_tr','Gmean_tr','B_Acc_tr','R-AUC_tr',
                 'Acc_val','Pre_val','Rec_val','Spe_val','F1_val','Gmean_val','B_Acc_val','R-AUC_val',
                 'Acc_t','Pre_t','Rec_t','Spe_t','F1_t','Gmean_t','B_Acc_t','R-AUC_t']
    res_df = pd.DataFrame({'Dataset': [i] * len(res_index)}, index=res_index)

    ##################### Loop over every resampling strategy and model #######################
    # Position in `Strategy` derived from the minority/majority ratio of the development set.
    # NOTE(review): the 0.2 step presumably matches the spacing of `Strategy` -- confirm.
    try:
        dev_counts = y.value_counts()
        ind = int((dev_counts[1]/dev_counts[0])//0.2)
    except (KeyError, IndexError):
        # Label 1 is absent from the development set.
        ind = 0
    min_strategy = Strategy[ind]
    # original min_strategy -> LLM oversample, so SMOTE is used from the next strategy on.
    # NOTE(review): raises IndexError when `ind` is the last position of `Strategy`.
    adj_strategy = Strategy[ind+1]
    print("<min_strategy>:",min_strategy)

    for strategy in Strategy:
        print("==========", "LLM_SMOTE_{}".format(strategy), "==========")
        csv_prefix = 'LLM_SMOTE/' + str(i) + '_LS_' + str(strategy) + '_'
        skip = adj_strategy > strategy  # strategies below adj_strategy are recorded as all-zero scores
        for k, model in enumerate(Model):
            if skip:
                scores = _skipped_scores()
            else:
                scores = _evaluate_model(model, X, y, X_test, y_test, val_add, reference_df=df,
                                         csv_prefix=csv_prefix, full_tag='comb', verbose=(k == 0))
            res_df['LS_{}_{}'.format(strategy, ModelName[k])] = scores

    # mode='a' appends one table (header included) per dataset and per run.
    res_df.to_csv("veryimbalance_validation_test_LS.csv", mode = 'a', float_format='%.4g')