In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from datagen.oversampler import oversample   # LLM oversample
# Shared stratified 5-fold splitter (shuffled, fixed seed) used by the cross-validation cells in this notebook.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state = 2)   # 5-fold-cross validation

In [None]:
# Dataset ids to process; id `n` is loaded from 'ds<n>_new.csv' by the generation cells.
final_list = [
    18, 41, 14, 43, 53, 28, 20, 63, 69, 56,
    19, 25, 6, 24, 80, 32, 22, 15, 27, 33,
    58, 46, 29, 64, 62, 17, 47, 13, 44, 9,
    49, 55, 3, 35, 67, 54, 12, 7, 39, 36,
    4, 79, 59, 52, 5, 57, 21, 50, 45, 42,
    11, 1, 51, 38, 34, 16, 10, 2, 26, 91,
]
# Show how many datasets are selected, followed by the ids themselves.
print(len(final_list), final_list)

# 1.SMOTE-5CV (48% x 5 times)

In [None]:
# Data Generation: SMOTE-oversampled training folds for 5-fold cross validation.
# For each dataset id the labels are normalised (majority -> 0, minority -> 1),
# 70% of the rows are kept as the working pool, and every CV training fold is
# oversampled with SMOTE at each sampling ratio that is not below `min_strategy`.
for i in final_list:
    df = pd.read_csv(f'ds{i}_new.csv')
    print('+'*35, f'{i}th Dataset', '+'*35)

    # Make major class as '0' and minor class as '1'
    label_counts = df.iloc[:,-1].value_counts()
    top_count = max(label_counts)
    MAJOR = label_counts[label_counts == top_count].index[0]  # majority label
    minor = label_counts[label_counts != top_count].index[0]  # first label whose count is below the maximum
    # -100 is a temporary placeholder so the majority label cannot collide with the new 1.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    label_counts = df.iloc[:,-1].value_counts()
    print('<Modified Class>\n', label_counts)
    print('<Imabalance ratio>\n', "1:{: .3f}".format(label_counts[1]/label_counts[0]))

    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X, y = df_val.iloc[:, :-1], df_val.iloc[:, -1]

    ##################### For Validation Set #######################
    Strategy = [0.2, 0.4, 0.6, 0.8, 1.0]
    # Index of the first ratio in `Strategy` that lies above the current minority/majority ratio.
    ind = int((y.value_counts()[1]/y.value_counts()[0])//0.2)
    min_strategy = Strategy[ind]
    if i == 33:
        # Dataset 33 is pinned to the second ratio.
        min_strategy = Strategy[1]
    print("<min_strategy>:",min_strategy)

    for j, ratio in enumerate(Strategy):
        print("==========", f"SMOTE_{ratio}", "==========")
        over = SMOTE(sampling_strategy=ratio, random_state=0)
        if min_strategy > ratio:
            # Ratio is below the smallest applicable one: nothing to generate.
            continue

        for n_iter, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
            print("=======", f"{n_iter}th-cv", "=======")
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            first_fold = (n_iter == 1)
            if first_fold:
                # Class counts before resampling (reported for the first fold only).
                print(list(y_train).count(0), list(y_train).count(1), len(y_train))
            df_train = pd.concat([X_train, y_train], axis=1)
            minority_data = df_train[df_train["NEW_LABEL"] == 1]
            n_needed = int((len(df_train)-len(minority_data))*ratio-len(minority_data))
            print("Nedeed Samples:", n_needed)
            # Resampling
            X_train, y_train = over.fit_resample(X_train, y_train)
            if first_fold:
                # Class counts after resampling.
                print(list(y_train).count(0), list(y_train).count(1), len(y_train))
            over_df = pd.concat([X_train, y_train], axis=1)
            over_df.to_csv(f"SMOTE_over/ds{i}_S_{ratio}_{n_iter}th.csv", index=False)

# 2.SMOTE-Train (70%)

In [None]:
# Data Generation: SMOTE on the whole 70% pool (no cross-validation folds).
# Labels are normalised (majority -> 0, minority -> 1) and the pool is oversampled
# once per sampling ratio that is not below `min_strategy`.
for i in final_list:
    df = pd.read_csv(f'ds{i}_new.csv')
    print('+'*35, f'{i}th Dataset', '+'*35)

    # Make major class as '0' and minor class as '1'
    label_counts = df.iloc[:,-1].value_counts()
    top_count = max(label_counts)
    MAJOR = label_counts[label_counts == top_count].index[0]  # majority label
    minor = label_counts[label_counts != top_count].index[0]  # first label whose count is below the maximum
    # -100 is a temporary placeholder so the majority label cannot collide with the new 1.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    label_counts = df.iloc[:,-1].value_counts()
    print('<Modified Class>\n', label_counts)
    print('<Imabalance ratio>\n', "1:{: .3f}".format(label_counts[1]/label_counts[0]))

    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X, y = df_val.iloc[:, :-1], df_val.iloc[:, -1]

    ##################### For Validation Set #######################
    Strategy = [0.2, 0.4, 0.6, 0.8, 1.0]
    # Index of the first ratio in `Strategy` that lies above the current minority/majority ratio.
    ind = int((y.value_counts()[1]/y.value_counts()[0])//0.2)
    min_strategy = Strategy[ind]
    if i == 33:
        # Dataset 33 is pinned to the second ratio.
        min_strategy = Strategy[1]
    print("<min_strategy>:",min_strategy)

    for j, ratio in enumerate(Strategy):
        print("==========", f"SMOTE_{ratio}", "==========")
        over = SMOTE(sampling_strategy=ratio, random_state=0)
        if min_strategy > ratio:
            # Ratio is below the smallest applicable one: nothing to generate.
            continue

        print("=======", "full-train", "=======")
        # Class counts before resampling.
        print(list(y).count(0), list(y).count(1), len(y))
        minority_data = df_val[df_val["NEW_LABEL"] == 1]
        n_needed = int((len(df_val)-len(minority_data))*ratio-len(minority_data))
        print("Nedeed Samples:", n_needed)
        # Resampling
        X_over, y_over = over.fit_resample(X, y)
        # Class counts after resampling.
        print(list(y_over).count(0), list(y_over).count(1), len(y_over))
        over_df = pd.concat([X_over, y_over], axis=1)
        over_df.to_csv(f"SMOTE_over/ds{i}_S_{ratio}_full.csv", index=False)

# 3.LLM-5CV (48% x 5 times)

In [None]:
# Data Generation: LLM-oversampled training folds for 5-fold cross validation.
# For each dataset id the labels are normalised (majority -> 0, minority -> 1),
# 70% of the rows are kept as the working pool, and for every CV training fold
# `oversample` is asked for enough synthetic minority rows to reach each
# sampling ratio that is not below `min_strategy`.
for i in final_list:
    df = pd.read_csv('ds'+ str(i) +'_new.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    
    # Make major class as '0' and minor class as '1'
    MAJOR = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() == max(df.iloc[:,-1].value_counts())].index[0] # Major label
    minor = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() != max(df.iloc[:,-1].value_counts())].index[0] # Minor label
    # -100 is a temporary placeholder so the majority label cannot collide with the new 1.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    print('<Modified Class>\n', df.iloc[:,-1].value_counts())
    print('<Imabalance ratio>\n', "1:{: .2f}".format(df.iloc[:,-1].value_counts()[1]/df.iloc[:,-1].value_counts()[0]))
        
    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]
    y = df_val.iloc[:, -1]
    
    ##################### For Validation Set #######################
    Strategy = [0.2, 0.4, 0.6, 0.8, 1.0]
    # Index of the first ratio in `Strategy` that lies above the current minority/majority ratio.
    ind = int((y.value_counts()[1]/y.value_counts()[0])//0.2)
    min_strategy = Strategy[ind]
    if i == 33:
        # Dataset 33 is pinned to the second ratio.
        min_strategy = Strategy[1]
    print("<min_strategy>:",min_strategy)   
    
    for h in range(len(Strategy)):
        print("==========", "LLM_{}".format(Strategy[h]), "==========")       
        if min_strategy > Strategy[h]:
            continue

        n_iter=0
        for train_index, val_index in skf.split(X, y):
            n_iter += 1
            print("=======", "{}th-cv".format(n_iter), "=======")
            X_train = X.iloc[train_index]
            y_train= y.iloc[train_index]
            if n_iter == 1:
                print(list(y_train).count(0), list(y_train).count(1), len(y_train))
            # Resampling
            df_train = pd.concat([X_train, y_train], axis=1)
            minority_data = df_train[df_train["NEW_LABEL"] == 1]
            # Number of synthetic minority rows needed to reach the target ratio (computed once, used twice).
            n_needed = int((len(df_train)-len(minority_data))*Strategy[h]-len(minority_data))
            print("Nedeed Samples:", n_needed)
            try:
                new_data = oversample(minority_data, 
                                      n_needed, # number of generated data samples
                                      "NEW_LABEL",
                                      1, # minor class
                                      single_desc=True, single_vars=True
                                      )
            except Exception as err:
                # Best effort: skip this fold, but report why. A bare `except:` also
                # swallowed KeyboardInterrupt, so the loop could not be stopped by hand.
                print("oversample failed for ds{} ratio {} fold {}: {!r}".format(i, Strategy[h], n_iter, err))
                continue
            over_df = pd.concat([df_train, new_data], axis=0)
            # NOTE(review): unlike the SMOTE cells this writes the DataFrame index as an
            # unnamed leading column (no index=False). Left unchanged because other
            # readers of these files may rely on the current layout - confirm.
            over_df.to_csv("LLM_over/ds{}_L_{}_{}th.csv".format(i, Strategy[h], n_iter))

# 4. LLM-Train (70%) 

In [None]:
# Data Generation: LLM oversampling of the whole 70% pool up to a 1:1 class ratio.
# For each dataset id the labels are normalised (majority -> 0, minority -> 1) and
# `oversample` is asked for as many synthetic minority rows as are needed to match
# the majority count.
for i in final_list:
    df = pd.read_csv('ds'+ str(i) +'_new.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    
    # Make major class as '0' and minor class as '1'
    MAJOR = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() == max(df.iloc[:,-1].value_counts())].index[0] # Major label
    minor = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() != max(df.iloc[:,-1].value_counts())].index[0] # Minor label
    # -100 is a temporary placeholder so the majority label cannot collide with the new 1.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    print('<Modified Class>\n', df.iloc[:,-1].value_counts())
    print('<Imabalance ratio>\n', "1:{: .2f}".format(df.iloc[:,-1].value_counts()[1]/df.iloc[:,-1].value_counts()[0]))
        
    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]
    y = df_val.iloc[:, -1]
    
    ##################### For Validation Set #######################     
    print("=======", "full-train", "=======")
    print(list(y).count(0), list(y).count(1), len(y))
    minority_data = df_val[df_val["NEW_LABEL"] == 1]
    # Majority count minus minority count: rows needed for a 1:1 ratio.
    n_needed = int((len(df_val)-len(minority_data))*1-len(minority_data))
    print("Nedeed Samples:", n_needed)
    # Resampling
    try:
        new_data = oversample(minority_data, 
                              n_needed, # number of generated data samples
                              "NEW_LABEL",
                              1, # minor class
                              single_desc=True, single_vars=True
                              )
    except Exception as err:
        # Best effort: skip this dataset, but report why. A bare `except:` also
        # swallowed KeyboardInterrupt, so the loop could not be stopped by hand.
        print("oversample failed for ds{}: {!r}".format(i, err))
        continue
    over_df = pd.concat([df_val, new_data], axis=0)
    # The path literal previously opened with ' and closed with ", which is a
    # SyntaxError that prevented this whole cell from running.
    # NOTE(review): the DataFrame index is written as an unnamed leading column
    # (no index=False), matching the per-fold LLM files - confirm this is intended.
    over_df.to_csv("LLM_over/ds{}_L_1.0_full.csv".format(i))

# 5. LLM + SMOTE - CV (48% x 5 times)

In [None]:
# Data Generation: LLM + SMOTE for the cross-validation folds.
# The LLM-oversampled fold files at `min_strategy` are loaded from 'LLM_over/' and
# topped up with SMOTE to every sampling ratio from `adj_strategy` upwards.
for i in final_list:
    df = pd.read_csv('ds'+ str(i) +'_new.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    
    # Make major class as '0' and minor class as '1'
    MAJOR = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() == max(df.iloc[:,-1].value_counts())].index[0] # Major label
    minor = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() != max(df.iloc[:,-1].value_counts())].index[0] # Minor label
    # -100 is a temporary placeholder so the majority label cannot collide with the new 1.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    print('<Modified Class>\n', df.iloc[:,-1].value_counts())
    print('<Imabalance ratio>\n', "1:{: .2f}".format(df.iloc[:,-1].value_counts()[1]/df.iloc[:,-1].value_counts()[0]))
        
    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]
    y = df_val.iloc[:, -1]
    print(list(y).count(0), list(y).count(1), len(y))
    print(list(y).count(0)*0.8, list(y).count(1)*0.8, len(y)*0.8)
    ##################### For Validation Set #######################
    Strategy = [0.2, 0.4, 0.6, 0.8, 1.0]
    ind = int((y.value_counts()[1]/y.value_counts()[0])//0.2)
    min_strategy = Strategy[ind]   
    # original min_strategy -> LLM oversample, so SMOTE is used from the next ratio.
    # When min_strategy is already the last ratio there is no next one; the plain
    # Strategy[ind+1] lookup raised IndexError there and aborted the whole loop.
    adj_strategy = Strategy[ind+1] if ind + 1 < len(Strategy) else None
    if i == 33:
        # Dataset 33 is pinned to the second / third ratios.
        min_strategy = Strategy[1]
        adj_strategy = Strategy[2]
    print("<min_strategy>:",min_strategy)   
    if adj_strategy is None:
        print("No sampling ratio above {} left for SMOTE; skipping ds{}".format(min_strategy, i))
        continue
    
    for j in range(len(Strategy)):
        print("==========", "LLM_SMOTE_{}".format(Strategy[j]), "==========") 
        if adj_strategy > Strategy[j]:
            continue
        over = SMOTE(sampling_strategy=Strategy[j], random_state=0)
        for n_iter in range(1,6):
            print("=======", "{}th-cv".format(n_iter), "=======")
            df_lm = pd.read_csv('LLM_over/'+'ds'+str(i)+'_L_'+str(min_strategy)+'_'+str(n_iter)+'th.csv')
            # The per-fold LLM files are written with the DataFrame index as an unnamed
            # leading column; drop it so the row index is not fed to SMOTE as a feature.
            # The width check keeps files that were saved without the index untouched.
            if df_lm.shape[1] == df.shape[1] + 1:
                df_lm = df_lm.iloc[:, 1:]
            print(str(i), str(min_strategy), str(n_iter))
            df_lm = df_lm.replace('False', False)  # sometimes the generated rows contain the string 'False'
            df_lm = df_lm.replace('FALSE', False)  # ... or 'FALSE'
            df_lm = df_lm.fillna(df.mean())   # sometimes NaN appears; fill with the original column means
            X_lm = df_lm.iloc[:, :-1]
            y_lm = df_lm.iloc[:, -1]
            if n_iter == 1:
                print(list(y_lm).count(0), list(y_lm).count(1), len(y_lm)) 
            minority_data = df_lm[df_lm["NEW_LABEL"] == 1]
            print(len(df_lm), len(minority_data), Strategy[j])
            print("Nedeed Samples:",int((len(df_lm)-len(minority_data))*Strategy[j]-len(minority_data)))
            # Resampling
            X_lm, y_lm = over.fit_resample(X_lm, y_lm)
            if n_iter == 1:
                print(list(y_lm).count(0), list(y_lm).count(1), len(y_lm))
            over_df = pd.concat([X_lm, y_lm], axis=1)
            over_df.to_csv("LLM_SMOTE/ds{}_LS_{}_{}th.csv".format(i, Strategy[j], n_iter), index=False)

# 6. LLM + SMOTE - Train (70%)

In [None]:
# Data Generation: LLM + SMOTE on the whole 70% pool.
# The LLM-oversampled '..._comb.csv' file at `min_strategy` is loaded from 'LLM_over/'
# and topped up with SMOTE to every sampling ratio from `adj_strategy` upwards.
for i in final_list:
    df = pd.read_csv('ds'+ str(i) +'_new.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    
    # Make major class as '0' and minor class as '1'
    MAJOR = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() == max(df.iloc[:,-1].value_counts())].index[0] # Major label
    minor = df.iloc[:,-1].value_counts()[df.iloc[:,-1].value_counts() != max(df.iloc[:,-1].value_counts())].index[0] # Minor label
    # -100 is a temporary placeholder so the majority label cannot collide with the new 1.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    df.rename(columns={df.columns[-1]:'NEW_LABEL'}, inplace=True)
    print('<Modified Class>\n', df.iloc[:,-1].value_counts())
    print('<Imabalance ratio>\n', "1:{: .2f}".format(df.iloc[:,-1].value_counts()[1]/df.iloc[:,-1].value_counts()[0]))
        
    ##################### Validation:Test = 70:30 #######################
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=100, stratify=df.iloc[:,-1])
    X = df_val.iloc[:, :-1]
    y = df_val.iloc[:, -1]

    ##################### For Validation Set #######################
    Strategy = [0.2, 0.4, 0.6, 0.8, 1.0]
    ind = int((y.value_counts()[1]/y.value_counts()[0])//0.2)
    min_strategy = Strategy[ind]   
    # original min_strategy -> LLM oversample, so SMOTE is used from the next ratio.
    # When min_strategy is already the last ratio there is no next one; the plain
    # Strategy[ind+1] lookup raised IndexError there and aborted the whole loop.
    adj_strategy = Strategy[ind+1] if ind + 1 < len(Strategy) else None
    if i == 33:
        # Dataset 33 is pinned to the second / third ratios.
        min_strategy = Strategy[1]
        adj_strategy = Strategy[2]
    print("<min_strategy>:",min_strategy)   
    if adj_strategy is None:
        print("No sampling ratio above {} left for SMOTE; skipping ds{}".format(min_strategy, i))
        continue
    
    for j in range(len(Strategy)):
        print("==========", "LLM_SMOTE_{}".format(Strategy[j]), "==========") 
        if adj_strategy > Strategy[j]:
            continue
        over = SMOTE(sampling_strategy=Strategy[j], random_state=0)
        print("=======", "full-train", "=======")
        print(list(y).count(0), list(y).count(1), len(y))
        # NOTE(review): no cell visible in this notebook writes a '*_comb.csv' file
        # (the LLM full-train cell targets '*_L_1.0_full.csv') - confirm where this input comes from.
        df_lm = pd.read_csv('LLM_over/'+'ds'+str(i)+'_L_'+str(min_strategy)+'_'+str('comb')+'.csv')
        # If the file carries one extra leading column (a saved DataFrame index),
        # drop it so the row index is not fed to SMOTE as a feature.
        if df_lm.shape[1] == df.shape[1] + 1:
            df_lm = df_lm.iloc[:, 1:]
        # This cell has no fold counter; it previously printed `n_iter`, which is not
        # defined here (NameError on a fresh kernel, stale value otherwise).
        print(str(i), str(min_strategy), 'comb')
        df_lm = df_lm.replace('False', False)  # sometimes the generated rows contain the string 'False'
        df_lm = df_lm.replace('FALSE', False)  # ... or 'FALSE'
        df_lm = df_lm.fillna(df.mean())   # sometimes NaN appears; fill with the original column means
        X_lm = df_lm.iloc[:, :-1]
        y_lm = df_lm.iloc[:, -1] 
        minority_data = df_lm[df_lm["NEW_LABEL"] == 1]
        print(len(df_lm), len(minority_data), Strategy[j])
        print("Nedeed Samples:",int((len(df_lm)-len(minority_data))*Strategy[j]-len(minority_data)))
        # Resampling
        X_over, y_over = over.fit_resample(X_lm, y_lm)
        print(list(y_over).count(0), list(y_over).count(1), len(y_over))
        over_df = pd.concat([X_over, y_over], axis=1)
        over_df.to_csv("LLM_SMOTE/ds{}_LS_{}_comb.csv".format(i, Strategy[j]), index=False)

# 7. Very Imbalanced Data Generation

In [None]:
# Load the ds8 table; the following cells derive the artificially imbalanced variants from it.
df = pd.read_csv('data_newest/ds8_new.csv')  # ds58, ds14, ds44
# Bare expression: shows the loaded frame as the cell output.
df

In [None]:
# Row count per distinct value of the last column (the column the following cell filters on).
df.iloc[:,-1].value_counts()

In [None]:
# We intentionally make it imbalanced.
# df_0: rows whose last column equals 1; df_1: rows whose last column equals 2.
df_0, df_1 = (df[df.iloc[:, -1] == class_value] for class_value in (1, 2))
# Sizes of both groups and the df_1 : df_0 ratio.
print(len(df_0), len(df_1), len(df_1) / len(df_0))

In [None]:
# Names of the three ds8 variants (suffixes 05 / 01 / 00).
very_imb_list = ['ds8_new_' + suffix for suffix in ('05', '01', '00')]

In [None]:
# 1:0.05 variant: keep every df_0 row and add a random df_1 sample sized at 5% of len(df_0).
N = round(len(df_0)*0.05)  # 1:0.05
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_05 = pd.concat([df_0, df_1_samp], axis=0)
print(df_05.iloc[:,-1].value_counts())
df_05.to_csv('data_newest/ds8_new_05.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds8_new_05_val.csv', index = False)
df_1_te.to_csv('data_newest/ds8_new_05_test.csv', index = False)

In [None]:
# 1:0.01 variant: keep every df_0 row and add a random df_1 sample sized at 1% of len(df_0).
N = round(len(df_0)*0.01)  # 1:0.01
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_01 = pd.concat([df_0, df_1_samp], axis=0)
print(df_01.iloc[:,-1].value_counts())
df_01.to_csv('data_newest/ds8_new_01.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds8_new_01_val.csv', index = False)
df_1_te.to_csv('data_newest/ds8_new_01_test.csv', index = False)

In [None]:
# 1:0.00 variant: N is 0, so the saved table contains the df_0 rows only.
N = round(len(df_0)*0.00)  # 1:0.00
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_00 = pd.concat([df_0, df_1_samp], axis=0)
print(df_00.iloc[:,-1].value_counts())
df_00.to_csv('data_newest/ds8_new_00.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds8_new_00_val.csv', index = False)
df_1_te.to_csv('data_newest/ds8_new_00_test.csv', index = False)

In [None]:
############################################## data58 ##############################################

In [None]:
# Load the ds58 table; the following cells derive the artificially imbalanced variants from it.
df = pd.read_csv('data_newest/ds58_new.csv')
# Bare expression: shows the loaded frame as the cell output.
df

In [None]:
# Row count per distinct value of the last column (the column the following cell filters on).
df.iloc[:,-1].value_counts()

In [None]:
# We intentionally make it imbalanced.
# df_0: rows whose last column equals 1; df_1: rows whose last column equals 2.
df_0, df_1 = (df[df.iloc[:, -1] == class_value] for class_value in (1, 2))
# Sizes of both groups and the df_1 : df_0 ratio.
print(len(df_0), len(df_1), len(df_1) / len(df_0))

In [None]:
# Names of the three ds58 variants (suffixes 05 / 01 / 00).
very_imb_list = ['ds58_new_' + suffix for suffix in ('05', '01', '00')]

In [None]:
# 1:0.05 variant: keep every df_0 row and add a random df_1 sample sized at 5% of len(df_0).
N = round(len(df_0)*0.05)  # 1:0.05
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_05 = pd.concat([df_0, df_1_samp], axis=0)
print(df_05.iloc[:,-1].value_counts())
df_05.to_csv('data_newest/ds58_new_05.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds58_new_05_val.csv', index = False)
df_1_te.to_csv('data_newest/ds58_new_05_test.csv', index = False)

In [None]:
# 1:0.01 variant: keep every df_0 row and add a random df_1 sample sized at 1% of len(df_0).
N = round(len(df_0)*0.01)  # 1:0.01
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_01 = pd.concat([df_0, df_1_samp], axis=0)
print(df_01.iloc[:,-1].value_counts())
df_01.to_csv('data_newest/ds58_new_01.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds58_new_01_val.csv', index = False)
df_1_te.to_csv('data_newest/ds58_new_01_test.csv', index = False)

In [None]:
# 1:0.00 variant: N is 0, so the saved table contains the df_0 rows only.
N = round(len(df_0)*0.00)  # 1:0.00
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_00 = pd.concat([df_0, df_1_samp], axis=0)
print(df_00.iloc[:,-1].value_counts())
df_00.to_csv('data_newest/ds58_new_00.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds58_new_00_val.csv', index = False)
df_1_te.to_csv('data_newest/ds58_new_00_test.csv', index = False)

In [None]:
############################################## data14 ##############################################

In [None]:
# Load the ds14 table; the following cells derive the artificially imbalanced variants from it.
df = pd.read_csv('data_newest/ds14_new.csv')
# Drop the first column. NOTE(review): presumably a saved index column - confirm against the file.
df = df.iloc[:,1:]
# Bare expression: shows the resulting frame as the cell output.
df

In [None]:
# Row count per distinct value of the last column (the column the following cell filters on).
df.iloc[:,-1].value_counts()

In [None]:
# We intentionally make it imbalanced.
# df_0: rows whose last column equals 1; df_1: rows whose last column equals 2.
df_0, df_1 = (df[df.iloc[:, -1] == class_value] for class_value in (1, 2))
# Sizes of both groups and the df_1 : df_0 ratio.
print(len(df_0), len(df_1), len(df_1) / len(df_0))

In [None]:
# Names of the three ds14 variants (suffixes 05 / 01 / 00).
very_imb_list = ['ds14_new_' + suffix for suffix in ('05', '01', '00')]

In [None]:
# 1:0.05 variant: keep every df_0 row and add a random df_1 sample sized at 5% of len(df_0).
N = round(len(df_0)*0.05)  # 1:0.05
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_05 = pd.concat([df_0, df_1_samp], axis=0)
print(df_05.iloc[:,-1].value_counts())
df_05.to_csv('data_newest/ds14_new_05.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds14_new_05_val.csv', index = False)
df_1_te.to_csv('data_newest/ds14_new_05_test.csv', index = False)

In [None]:
# 1:0.01 variant: keep every df_0 row and add a random df_1 sample sized at 1% of len(df_0).
N = round(len(df_0)*0.01)  # 1:0.01
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_01 = pd.concat([df_0, df_1_samp], axis=0)
print(df_01.iloc[:,-1].value_counts())
df_01.to_csv('data_newest/ds14_new_01.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds14_new_01_val.csv', index = False)
df_1_te.to_csv('data_newest/ds14_new_01_test.csv', index = False)

In [None]:
# 1:0.00 variant: N is 0, so the saved table contains the df_0 rows only.
N = round(len(df_0)*0.00)  # 1:0.00
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_00 = pd.concat([df_0, df_1_samp], axis=0)
print(df_00.iloc[:,-1].value_counts())
df_00.to_csv('data_newest/ds14_new_00.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds14_new_00_val.csv', index = False)
df_1_te.to_csv('data_newest/ds14_new_00_test.csv', index = False)

In [None]:
############################################## data44 ##############################################

In [None]:
# Load the ds44 table; the following cells derive the artificially imbalanced variants from it.
df = pd.read_csv('data_newest/ds44_new.csv')
# Bare expression: shows the loaded frame as the cell output.
df

In [None]:
# Row count per distinct value of the last column (the column the following cell filters on).
df.iloc[:,-1].value_counts()

In [None]:
# We intentionally make it imbalanced.
# df_0: rows whose last column equals 1; df_1: rows whose last column equals 2.
df_0, df_1 = (df[df.iloc[:, -1] == class_value] for class_value in (1, 2))
# Sizes of both groups and the df_1 : df_0 ratio.
print(len(df_0), len(df_1), len(df_1) / len(df_0))

In [None]:
# Names of the three ds44 variants (suffixes 05 / 01 / 00).
very_imb_list = ['ds44_new_' + suffix for suffix in ('05', '01', '00')]

In [None]:
# 1:0.05 variant: keep every df_0 row and add a random df_1 sample sized at 5% of len(df_0).
N = round(len(df_0)*0.05)  # 1:0.05
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_05 = pd.concat([df_0, df_1_samp], axis=0)
print(df_05.iloc[:,-1].value_counts())
df_05.to_csv('data_newest/ds44_new_05.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds44_new_05_val.csv', index = False)
df_1_te.to_csv('data_newest/ds44_new_05_test.csv', index = False)

In [None]:
# 1:0.01 variant: keep every df_0 row and add a random df_1 sample sized at 1% of len(df_0).
N = round(len(df_0)*0.01)  # 1:0.01
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_01 = pd.concat([df_0, df_1_samp], axis=0)
print(df_01.iloc[:,-1].value_counts())
df_01.to_csv('data_newest/ds44_new_01.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds44_new_01_val.csv', index = False)
df_1_te.to_csv('data_newest/ds44_new_01_test.csv', index = False)

In [None]:
# 1:0.00 variant: N is 0, so the saved table contains the df_0 rows only.
N = round(len(df_0)*0.00)  # 1:0.00
df_1_samp = df_1.sample(n=N, random_state = 100)
print(len(df_1_samp))
df_00 = pd.concat([df_0, df_1_samp], axis=0)
print(df_00.iloc[:,-1].value_counts())
df_00.to_csv('data_newest/ds44_new_00.csv', index = False)

# To add val(14%) & test(30%)
# The df_1 rows not used above are split 14:30 into extra validation / test rows.
samp_ind = df_1_samp.index
df_1_va_te = df_1.drop(samp_ind)
print(len(df_1_va_te))
N_va = round(len(df_1_va_te)*(14/(14+30)))
df_1_va = df_1_va_te.sample(n=N_va, random_state = 100)
N_te = len(df_1_va_te)-N_va
# Test rows are whatever was not drawn for validation. Sampling again from the
# same frame with the same random_state re-draws the same leading rows, which put
# every validation row inside the test set as well.
df_1_te = df_1_va_te.drop(df_1_va.index)
assert len(df_1_te) == N_te, "validation and test rows must partition the leftover df_1 rows"
print(len(df_1_va), len(df_1_te))
df_1_va.to_csv('data_newest/ds44_new_00_val.csv', index = False)
df_1_te.to_csv('data_newest/ds44_new_00_test.csv', index = False)