In [None]:
import math
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

## Check whether any dataset contains null values

In [None]:
# Walk through the 105 dataset files, report each one's shape and column names,
# and stop at the first file that contains missing values (after printing the
# per-column missing counts for that file).
for i in range(1, 106):
    df = pd.read_csv('ds{}.csv'.format(i))
    print('=' * 50)
    print('shape of {}:'.format(i), df.shape)
    print(df.columns)                            # features names and label
    missing_per_column = df.isnull().sum()
    if missing_per_column.sum() == 0:            # nothing missing -> next dataset
        continue
    print(missing_per_column)
    break

## Experiment setting

In [None]:
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from matplotlib import pyplot as plt
from keras.layers import BatchNormalization, Dropout
from keras.layers import Activation
from keras import optimizers
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import EarlyStopping

In [None]:
# Hyper-parameters shared by the experiments below.
epochs, batch_size = 50, 32
hidden_node = 2          # number of units in the hidden Dense layer of mlp_model()
learning_rate = 1e-3     # step size handed to the Adam optimizer

# 5-fold stratified splitter; shuffling uses a fixed seed so the folds repeat.
skf = StratifiedKFold(shuffle=True, random_state=0, n_splits=5)

def mlp_model(input_dim=None, hidden_nodes=None):
    """Build the (uncompiled) one-hidden-layer MLP used by the experiments.

    Architecture: Dense(hidden_nodes) -> BatchNormalization -> sigmoid ->
    Dense(1, sigmoid). The hidden layer uses a He-normal initializer with a
    fixed seed (100); the output layer uses the Keras default initializer.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, falls back to ``X.shape[1]``
        of the notebook-level ``X`` (the original behaviour), so ``X`` must
        already hold the current dataset's features in that case.
    hidden_nodes : int, optional
        Number of hidden units. When omitted, the notebook-level
        ``hidden_node`` setting is used (the original behaviour).

    Returns
    -------
    keras Sequential model; the caller is responsible for ``compile``/``fit``.
    """
    # Passing the sizes explicitly avoids depending on whichever `X` happens
    # to be in the kernel; the fallbacks keep existing `mlp_model()` calls working.
    if input_dim is None:
        input_dim = X.shape[1]
    if hidden_nodes is None:
        hidden_nodes = hidden_node

    model = Sequential()
    model.add(Dense(hidden_nodes, input_dim=input_dim, kernel_initializer=keras.initializers.he_normal(seed=100)))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    return model

# 1. Generating Probabilistic Labels for 102 datasets

In [None]:
# Experiments for 105-3 Datasets
#
# For every usable dataset: recode the label column (majority class -> 0,
# minority class -> 1), standardize the features, fit the MLP with binary
# cross-entropy, and save the model's predicted probability for each row ('y')
# next to the recoded hard label ('y_hard').

PROB_DIR = "Prob_102datasets"
os.makedirs(PROB_DIR, exist_ok=True)   # to_csv does not create missing directories

for i in range(1, 106):
    if i in (23, 82, 84):   # the three datasets left out of the experiment (105 - 3 = 102)
        continue
    df = pd.read_csv(r'ds'+ str(i) +'.csv')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    print('<Original Class>\n', df.iloc[:,-1].value_counts())

    # Make major class as '0' and minor class as '1'
    class_counts = df.iloc[:,-1].value_counts()   # sorted, most frequent class first
    if len(class_counts) < 2:
        raise ValueError('ds{}.csv has a single class; a binary problem cannot be built'.format(i))
    MAJOR = class_counts.index[0]
    rarer = class_counts[class_counts != class_counts.iloc[0]]
    # Perfectly balanced data has no strictly rarer class; use the second class then.
    minor = rarer.index[0] if len(rarer) > 0 else class_counts.index[1]
    # -100 is a temporary placeholder so the recoding also works when the
    # original labels are already 0/1 but with the roles swapped.
    df.iloc[:,-1] = df.iloc[:,-1].replace(MAJOR, -100)
    df.iloc[:,-1] = df.iloc[:,-1].replace(minor, 1)
    df.iloc[:,-1] = df.iloc[:,-1].replace(-100, 0)
    y_hard = list(df.iloc[:,-1])
    recoded_counts = df.iloc[:,-1].value_counts()
    print('<Modified Class>\n', recoded_counts)
    print('<Imabalance ratio>\n', "{: .2f}:1".format(recoded_counts[0]/recoded_counts[1]))

    X = df.iloc[:, :-1]
    # Features // Standardization. A constant column has std 0, which would turn
    # the whole column (and then the model output) into NaN; divide by 1 instead
    # so such a column simply becomes all zeros.
    feature_std = X.std().replace(0, 1)
    X = (X - X.mean())/feature_std
    y = df.iloc[:, -1]

    # mlp_model() sizes its input layer from the notebook-level X assigned above.
    model = mlp_model()
    opt = optimizers.Adam(learning_rate = learning_rate)
    model.compile(loss='BinaryCrossentropy', optimizer=opt, metrics=['accuracy'])
    # batch_size comes from the experiment settings (no local override).
    history = model.fit(X, y, validation_split=0.2, epochs=epochs, verbose=0, batch_size=batch_size)
    # To inspect convergence, plot history.history['loss'] here.

    # The probabilistic label of a row is the fitted model's output for that row.
    result = model.predict(X, verbose=0)
    df = df.rename(columns={df.columns[-1]:'y'})
    df['y'] = list(result.reshape(len(X),))
    df['y_hard'] = y_hard
    # Overwrite rather than append: appending would add a second header and a
    # duplicate copy of the rows every time this cell is re-run.
    df.to_csv(os.path.join(PROB_DIR, "ds{}_prob.csv".format(i)), float_format='%.4g')

# 1-1. Focal(Hard) and SLS(Hard/alpha)

In [None]:
c = 0.3  # criterion decides easy/hard: a sample is "easy" when its probabilistic label lies within c of its hard label
EDGE, NORMAL = 2, 0   # codes stored in the 'y_edge' column (2 = hard/edge sample, 0 = easy/normal sample)


def _run_cv(features, train_labels, hard_labels, edge_labels, loss):
    """Cross-validate one (training labels, loss) combination with the shared `skf`.

    Parameters
    ----------
    features : ndarray, shape (n_samples, n_features)
        Feature matrix used for training and testing.
    train_labels : ndarray, shape (n_samples,)
        Targets used for fitting (hard labels or smoothed SLS labels).
    hard_labels : ndarray, shape (n_samples,)
        0/1 labels; every accuracy is measured against these.
    edge_labels : ndarray, shape (n_samples,)
        EDGE/NORMAL code per sample; used to stratify the folds and to split
        each test fold into its edge and normal parts.
    loss : str
        Keras loss identifier passed to ``compile``.

    Returns
    -------
    list of three floats: fold-averaged accuracy on the whole test fold, on
    its edge samples and on its normal samples.

    Notes
    -----
    Relies on the notebook-level ``skf``, ``epochs``, ``batch_size``,
    ``learning_rate`` and ``mlp_model`` (which in turn reads the notebook-level
    ``X`` for the input width).
    """
    # Many small models are built in this notebook; drop Keras' accumulated
    # global state so memory use does not grow with every model.
    keras.backend.clear_session()

    acc_total, acc_edge, acc_normal = [], [], []
    for n_iter, (train_index, test_index) in enumerate(skf.split(features, edge_labels), start=1):  # stratification by y_edge
        X_train, X_test = features[train_index], features[test_index]
        y_train = train_labels[train_index]
        y_test = hard_labels[test_index]          # test with hard labels
        is_edge = edge_labels[test_index] == EDGE
        if n_iter == 1:
            print(pd.Series(y_train).value_counts())

        # Build a fresh model for every fold. compile() does not reset weights,
        # so reusing one model would carry over weights fitted on earlier
        # folds, whose training parts contain this fold's test samples.
        model = mlp_model()
        model.compile(loss=loss, optimizer=optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, verbose=0, batch_size=batch_size)

        # Predict the test fold once; the edge/normal scores are subsets of it.
        predicted = np.round(model.predict(X_test, verbose=0)).ravel()
        acc_total.append(metrics.accuracy_score(y_test, predicted))
        acc_edge.append(metrics.accuracy_score(y_test[is_edge], predicted[is_edge]))
        acc_normal.append(metrics.accuracy_score(y_test[~is_edge], predicted[~is_edge]))
    return [np.mean(acc_total), np.mean(acc_edge), np.mean(acc_normal)]


for i in range(1, 106):
    if i in (23, 82, 84):   # the three datasets left out of the experiment
        continue
    try:
        df_data = pd.read_csv(r'ds'+str(i)+'_prob'+'.csv', index_col='Unnamed: 0')
    except FileNotFoundError:
        # Section 1 writes its output into Prob_102datasets/; look there when
        # no copy exists in the working directory.
        df_data = pd.read_csv("Prob_102datasets/ds{}_prob.csv".format(i), index_col='Unnamed: 0')
    print('+'*35, '{}th Dataset'.format(i), '+'*35)
    n_features = df_data.shape[1] - 2   # every column except the trailing 'y' and 'y_hard'
    y = df_data.iloc[:, -2]             # probabilistic label
    y_hard = df_data.iloc[:, -1]        # hard 0/1 label

    prob = y.to_numpy(dtype=float)
    hard = y_hard.to_numpy()
    if not np.isin(hard, (0, 1)).all():
        raise ValueError('Dataset {}: y_hard must contain only 0 and 1'.format(i))

    # Easy sample: class 0 with probability <= c, or class 1 with probability >= 1-c.
    is_easy = np.where(hard == 0, prob <= c, prob >= 1 - c)
    y_edge = np.where(is_easy, NORMAL, EDGE)
    df_data['y_edge'] = y_edge
    df_data['y_sls'] = 0

    n_normal = int((y_edge == NORMAL).sum())
    n_edge = int((y_edge == EDGE).sum())
    # Skip when either group is empty (the split would mean nothing) or has
    # at most 4 samples (too few to appear in both train and test of 5 folds).
    if n_normal <= 4 or n_edge <= 4:
        continue
    r = n_normal / n_edge  # normal/edge
    if r <= 2:
        r = 2  # although edge cases are more than normal cases, we want to give more weights to edge cases
    alpha = (r-1)/(2*r)
    print("r:", r, "alpha:", alpha)

    # Features // Standardization. The standardized values are what the models
    # are trained on; a constant column (std 0) is divided by 1 so it becomes
    # zeros instead of NaN.
    # NOTE(review): mean/std are computed over all rows, including those that
    # later land in test folds — confirm this is acceptable for the study.
    X = df_data.iloc[:, :n_features]
    X = (X - X.mean())/X.std().replace(0, 1)
    features = X.to_numpy(dtype=float)
    hard_float = hard.astype(float)

    res = pd.DataFrame({'Dataset': [i, i, i]}, index = ['Total','Edge','Normal'])

    # Focal: train with hard labels and focal loss
    print('#'*50,'Focal','#'*50)
    focal_scores = _run_cv(features, hard_float, hard_float, y_edge, 'BinaryFocalCrossentropy')
    res['Focal'] = focal_scores
    print(focal_scores)

    # Hard/SLS with alpha: easy samples get label b (class 0) or 1-b (class 1),
    # hard samples keep their 0/1 label; b = 0 reproduces plain hard labels.
    for column, b in (('SLS(0.0)', 0.00), ('SLS(alpha)', alpha)):
        print('#'*50,'SLS',b,'#'*50)
        y_sls = np.where(is_easy, np.where(hard == 0, b, 1 - b), hard_float)
        df_data['y_sls'] = y_sls  # update the column (y_sls)
        sls_scores = _run_cv(features, y_sls, hard_float, y_edge, 'BinaryCrossentropy')
        res[column] = sls_scores
        print(sls_scores)
    res['alpha'] = [alpha, alpha, alpha]
    # Appended on purpose: every dataset adds its own three-row table to one results file.
    res.to_csv("102datasets_5CV(SLS_c0.3)_alphaimproved.csv", mode = 'a', float_format='%.4g')