<a href="https://colab.research.google.com/github/duonghung86/Injury-severity-classification/blob/main/VCA_2_3_MLP_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from psutil import virtual_memory,cpu_percent
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
print('Current system-wide CPU utilization %: ',cpu_percent())
#Remove all warning
import warnings
warnings.filterwarnings("ignore")

Your runtime has 270.0 gigabytes of available RAM

Current system-wide CPU utilization %:  25.0


In [2]:
# Basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import os

#INFO and WARNING messages are not printed
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
# Preprocessing
from sklearn.preprocessing import StandardScaler # Standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Machine learning algos
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from collections import Counter
from imblearn.over_sampling import SMOTE, RandomOverSampler,BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler,NearMiss,EditedNearestNeighbours
# Metrics
from imblearn.metrics import geometric_mean_score
# Tensorflow
import tensorflow as tf
print(tf.__version__)
from tensorflow import feature_column  # for data wrangling
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow_addons.metrics import CohenKappa,F1Score,FBetaScore
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.initializers import Zeros, RandomNormal

2.1.0


# Load data

In [3]:
url = 'https://github.com/duonghung86/Injury-severity-classification/blob/main/Prepared%20Texas%202019.zip?raw=true' 
data_path = tf.keras.utils.get_file(origin=url, fname=url.split('/')[-1].split('?')[0], extract=True)
data_path = data_path.replace('%20',' ').replace('.zip','.csv')

In [4]:
# Load data
df = pd.read_csv(data_path)
print(df.shape)
df.head(3)

(949856, 19)


Unnamed: 0,Prsn_Injry_Sev,Prsn_Age,Prsn_Gndr,Wthr_Cond,Light_Cond,Surf_Cond,Veh_Body_Styl,Prsn_Rest,Prsn_Drg_Rslt,Harm_Evnt,Rural,Crash_Speed_Limit,Road_Algn,Veh_Mod_Year,Weekend,Crash_season,Part_of_day,Collsn_type,Collsn_name
0,0,26,MALE,CLEAR,DAYLIGHT,DRY,"PASSENGER CAR, 2-DOOR",SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,0,-1,"STRAIGHT, LEVEL",33,1,3,3,SAME DIRECTION,ONE STRAIGHT-ONE LEFT TURN
1,0,52,MALE,CLEAR,DAYLIGHT,DRY,"PASSENGER CAR, 2-DOOR",SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,0,-1,"STRAIGHT, LEVEL",19,1,3,3,SAME DIRECTION,ONE STRAIGHT-ONE LEFT TURN
2,0,27,FEMALE,CLEAR,DAYLIGHT,DRY,PICKUP,SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,1,-1,"CURVE, LEVEL",16,1,3,4,SAME DIRECTION,BOTH LEFT TURN


In [5]:
# Let's just use 80% of the total dataset
df, _ = train_test_split(df, test_size=0.80,stratify = df['Prsn_Injry_Sev'])
#df.shape

In [6]:
y = df['Prsn_Injry_Sev']
print('All target values:')
print(y.value_counts())
X = df.drop(columns=['Prsn_Injry_Sev'])

All target values:
0    158512
1     20482
2      9048
3      1590
4       339
Name: Prsn_Injry_Sev, dtype: int64


In [7]:
# %% Data wrangling -------------
# Classify variable type
emb_vars, ind_vars, num_vars = [], [], []
for var in X.columns:
    if X[var].dtypes == 'O':
        if len(X[var].unique()) > 5:
            emb_vars.append(var)
        else:
            ind_vars.append(var)
    else:
        num_vars.append(var)
print('Numerical variables are ', num_vars)
print('Categorical variables that have at most 5 categories are ', ind_vars)
print('Categorical variables that have more than 5 categories are ', emb_vars)

# Create feature columns
feature_columns = []
# numeric cols
for header in num_vars:
    feature_columns.append(feature_column.numeric_column(header))
# bucketized cols
# age = feature_column.numeric_column('Prsn_Age')
# age_buckets = feature_column.bucketized_column(age, boundaries=[16, 22, 35, 55, 65])
# feature_columns.append(age_buckets)
# indicator_columns
for col_name in ind_vars:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, X[col_name].unique())
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)
# embedding columns
for col_name in emb_vars:
    emb_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, X[col_name].unique())
    col_embedding = feature_column.embedding_column(emb_column, dimension=5)
    feature_columns.append(col_embedding)

# Convert all setup into new dataset
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
X = feature_layer(dict(X)).numpy()
print('New shape of the input data set:',X.shape)

Numerical variables are  ['Prsn_Age', 'Rural', 'Crash_Speed_Limit', 'Veh_Mod_Year', 'Weekend', 'Crash_season', 'Part_of_day']
Categorical variables that have at most 5 categories are  ['Prsn_Gndr', 'Prsn_Drg_Rslt', 'Collsn_type']
Categorical variables that have more than 5 categories are  ['Wthr_Cond', 'Light_Cond', 'Surf_Cond', 'Veh_Body_Styl', 'Prsn_Rest', 'Harm_Evnt', 'Road_Algn', 'Collsn_name']
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
New shape of the input data set: (189971, 59)


In [8]:
# %% Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=48)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=48)

print('Training features shape:', X_train.shape)
print('Validation features shape:', X_val.shape)
print('Test features shape:', X_test.shape)

# %% standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

Training features shape: (121580, 59)
Validation features shape: (30396, 59)
Test features shape: (37995, 59)


In [9]:
# %% Function to compare the prediction and true labels
def get_accs(label, prediction, tr_time,  show=False):
    cm = confusion_matrix(label, prediction)
    length = cm.shape[0]
    num_cases = len(label)
    # global accuracy
    glb_acc = np.trace(cm) / len(label)
    ind_accs = cm / np.sum(cm, axis=1)[:, np.newaxis]
    accs = [ind_accs[i, i] for i in range(length)]
    index = ['Class {}'.format(i) for i in range(length)]
    # Global accuracy
    accs.append(glb_acc)
    #index.append
    # G-mean
    accs.append(geometric_mean_score(label, prediction, correction=0.001))
    #index.append('G-mean')
    # Average perf
    accs.append((glb_acc + accs[-1]) / 2)
    #index.append('Avg_Pfm')
    # Training time
    accs.append(np.round(tr_time,3))
    index = index + ['Overall Accuracy','G-mean','Avg_Pfm','Training Time']
    # Plot confusion matrix
    plot_dict = {'Confusion matrix': (cm,'g'),
                 'Normalized confusion matrix': (ind_accs,'.2f')}
    if show:
        plt.figure(figsize=(14, 6))
        i = 1
        for key, value in plot_dict.items():
            plt.subplot(1, 2, i)
            sns.heatmap(value[0], xticklabels=np.arange(length), yticklabels=np.arange(length),
                        annot=True, fmt=value[1], cmap="YlGnBu")
            plt.xlabel('Prediction')
            plt.ylabel('Label')
            plt.title(key)
            i+= 1
        plt.show()
    out = np.array(accs).reshape(1, len(accs))
    return pd.DataFrame(out, columns=index)

# Class weights

In [10]:
wgt=None
clfs = [LogisticRegression(solver = 'lbfgs',class_weight=wgt),
        DecisionTreeClassifier(class_weight=wgt),
        RandomForestClassifier(max_depth=4,class_weight=wgt)]
clf_names = ['LR','DT','RF']
rsts = pd.DataFrame()
for model, name in zip(clfs,clf_names):
    start = time.time()
    print(name)
    model.fit(X_train, y_train)
    # use the model to make predictions with the test data
    y_pred = model.predict(X_test)
    end= time.time()
    # get the evaluation metrics
    result = get_accs(y_test.values, y_pred, end-start)
    result.index = [name + '-weight'] 
    rsts = rsts.append(result)
print(rsts.iloc[:,5:])

LR
DT
RF
           Overall Accuracy    G-mean   Avg_Pfm  Training Time
LR-weight          0.834478  0.025002  0.429740          5.861
DT-weight          0.701856  0.151801  0.426828          1.501
RF-weight          0.834399  0.003981  0.419190          6.371


## MLP

In [11]:
es = EarlyStopping(monitor='val_cohen_kappa',
                   verbose=1, patience=10, mode='max',
                   restore_best_weights=True)

In [12]:
def create_mlp():
    MLP = Sequential([Dense(10,activation='relu',
                           input_dim=X_train.shape[1]),
                      Dropout(0.5),
                      Dense(5, activation='softmax')])
    MLP.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[CohenKappa(num_classes=5,sparse_labels=True)])
    return MLP

In [16]:
# Add weights
weights = len(y) / (5 * np.bincount(y))
cls_wgt = dict(zip(np.arange(5), weights))
cls_wgt

{0: 0.23969289391339457,
 1: 1.8550043941021384,
 2: 4.199182139699381,
 3: 23.89572327044025,
 4: 112.07728613569321}

In [18]:
model = create_mlp()
start = time.time()
monitor = model.fit(X_train, y_train,
                    callbacks=[es],
                    class_weight=cls_wgt,
                    validation_data=(X_val, y_val.values),
                    verbose=0, epochs=50)
end = time.time()
# use the model to make predictions with the test data
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
# get the evaluation metrics
result = get_accs(y_test.values, y_pred, end-start)
result.index = ['MLP-Weights']
rsts = rsts.append(result)
print(rsts.iloc[:,5:])

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
             Overall Accuracy    G-mean   Avg_Pfm  Training Time
LR-weight            0.834478  0.025002  0.429740          5.861
DT-weight            0.701856  0.151801  0.426828          1.501
RF-weight            0.834399  0.003981  0.419190          6.371
MLP-Weights          0.834399  0.003981  0.419190         60.542
MLP-Weights          0.491512  0.370671  0.431092         84.820


In [15]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [25]:
def create_mlp(optimizer = 'adam', init='zeros', acti = 'relu'):
    MLP = Sequential([Dense(10,activation=acti,
                           input_dim=X_train.shape[1],
                           kernel_initializer=init),
                      Dropout(0.5),
                      Dense(5, activation='softmax')])
    MLP.compile(optimizer=optimizer,
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[CohenKappa(num_classes=5,sparse_labels=True)])
    return MLP

In [30]:
# create model
model = KerasClassifier(build_fn=create_mlp,verbose=1)
# grid search epochs, batch size and optimizer
optimizers = ['adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = [1, 2]
#batches = [5, 10, 20]
param_grid = dict(#optimizer=optimizers, 
                  epochs=epochs, 
                  #batch_size=batches, 
                  init=init)
scores={'BA':'balanced_accuracy','F1':'f1','AUC':'roc_auc','kappa':'cohen_kappa_score'}
grid = GridSearchCV(estimator=model, 
                    param_grid=param_grid,
                    scoring=scores,
                    cv=5,
                    refit='AUC')
grid_result = grid.fit(X_train, y_train,
                        callbacks=[es],
                        class_weight=cls_wgt,
                        validation_data=(X_val, y_val.values), 
                      )
grid_result.cv_results_

ValueError: 'cohen_kappa_score' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [32]:
import sklearn

In [33]:
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [None]:
clf = LogisticRegression(solver = 'lbfgs')
clf.fit(X_svm, y_svm)
y_pred2 = clf.predict(model.predict(X_test))
get_accs(y_test.values, y_pred2, 0)

Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Overall Accuracy,G-mean,Avg_Pfm,Training Time
0,0.495064,0.456055,0.20442,0.323899,0.588235,0.475747,0.388001,0.431874,0.0


In [35]:
from sklearn.ensemble import BaggingClassifier
start = time.time()
clf = BaggingClassifier(base_estimator=LogisticRegression(solver = 'lbfgs'),
                        n_estimators=10, random_state=0).fit(X_train, y_train)
end = time.time()
y_pred = clf.predict(X_test)
# get the evaluation metrics
result = get_accs(y_test.values, y_pred, end-start)
result.index = ['LR-HR-Bagging'] 
rsts = rsts.append(result)
print(rsts.iloc[:,5:])

               Overall Accuracy    G-mean   Avg_Pfm  Training Time
LR-HR                  0.505011  0.394077  0.449544         16.052
DT-HR                  0.432316  0.253554  0.342935          7.533
RF-HR                  0.572358  0.316632  0.444495         20.934
MLP-Weights            0.502085  0.375003  0.438544        679.522
LR-HR-Bagging          0.503443  0.393480  0.448461        158.434


In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_svm, y_svm)
y_pred2 = clf.predict(model.predict(X_test))
get_accs(y_test.values, y_pred2, 0)

Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Overall Accuracy,G-mean,Avg_Pfm,Training Time
0,0.559095,0.464111,0.122652,0.311321,0.588235,0.526043,0.357362,0.441702,0.0


In [36]:
def early_stops(metric_name):
    es = EarlyStopping(monitor='val_'+ metric_name,
                   verbose=1,
                   patience=10,
                   mode='max',
                   restore_best_weights=True)
    return es

# Ordinal multiclass


In [40]:
early_stop = {'accuracy':  tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
            #  'cohen_kappa': CohenKappa(num_classes=5, sparse_labels=True), 
              'f1_micro': F1Score(num_classes=5,average="micro",threshold=0.5, name='f1_micro'),
              'f1_micro_nt': F1Score(num_classes=5,average="micro", name='f1_micro_nt'),
              'f_beta2': FBetaScore(num_classes=5,average="micro",beta=2.0, name='f_beta2'),
               'f_beta05': FBetaScore(num_classes=5,average="micro",beta=0.5, name='f_beta05'),            
               }

In [43]:
def evaluation(model, monitor, time, name):
    # use the model to make predictions with the test data
    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis=1)

    # get the evaluation metrics
    result = get_accs(y_test.values, y_pred,time)
    result.index = [name]
    return result

In [47]:
def create_mlp(metric):
    MLP = Sequential([Dense(10,
                           activation='relu',
                           input_dim=X_train.shape[1],
                            bias_initializer=RandomNormal(mean=0.0, stddev=0.005, seed=None)
                           ),
                      Dropout(0.5),
                      Dense(5, activation='softmax')])
    MLP.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                #loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=[metric]
               )
    return MLP

In [48]:
for name, metric in early_stop.items():
    print(name)
    model = create_mlp(metric)
    start = time.time()
    monitor = model.fit(X_train, y_train,
                    callbacks=[early_stops(name)],
                    validation_data=(X_val, y_val.values),
                    verbose=0, epochs=50)
    end = time.time()
    rsts = rsts.append(evaluation(model, monitor, end - start,  'MLP-HR-'+ name))
    #evaluation(model, monitor, end - start,  'MLP '+ name)
print(rsts.iloc[:,-4:])

accuracy
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
f1_micro
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
f1_micro_nt
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
f_beta2
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
f_beta05
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
                    Overall Accuracy    G-mean   Avg_Pfm  Training Time
LR-HR                       0.505011  0.394077  0.449544         16.052
DT-HR                       0.432316  0.253554  0.342935          7.533
RF-HR                       0.572358  0.316632  0.444495         20.934
MLP-Weights                 0.502085  0.375003  0.438544        679.522
LR-HR-Bagging               0.503443  0.393480  0.448461        158.434
MLP-HR-accuracy             0.523161  0.370971  0.447066        415.634
MLP-HR-f1_micro             0

In [49]:
 early_stop = {'auc':       tf.keras.metrics.AUC(name='auc'),
     'accuracy':  tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
               'precision': tf.keras.metrics.Precision(name='precision'),
               'recall':    tf.keras.metrics.Recall(name='recall'),
               
               }

In [50]:
def create_mlp(metric):
    MLP = Sequential([Dense(10,
                           activation='relu',
                           input_dim=X_train.shape[1],
                            bias_initializer=RandomNormal(mean=0.0, stddev=0.005, seed=None)
                           ),
                      Dropout(0.5),
                      Dense(5, activation='softmax')])
    MLP.compile(optimizer='adam',
                #loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=[metric]
               )
    return MLP

In [None]:
#rsts = rsts.iloc[:6,:].copy()

In [51]:
for name, metric in early_stop.items():
    print(name)
    model = create_mlp(metric)
    start = time.time()
    monitor = model.fit(X_train, pd.get_dummies(y_train).values,
                    callbacks=[early_stops(name)],
                    validation_data=(X_val, pd.get_dummies(y_val).values),
                    verbose=0, epochs=50)
    end = time.time()
    rsts = rsts.append(evaluation(model, monitor, end - start,  'MLP 1H '+ name))
print(rsts.iloc[:,-4:])

auc
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
accuracy
Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
precision
Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping
recall
Restoring model weights from the end of the best epoch.
Epoch 00016: early stopping
                    Overall Accuracy    G-mean   Avg_Pfm  Training Time
LR-HR                       0.505011  0.394077  0.449544         16.052
DT-HR                       0.432316  0.253554  0.342935          7.533
RF-HR                       0.572358  0.316632  0.444495         20.934
MLP-Weights                 0.502085  0.375003  0.438544        679.522
LR-HR-Bagging               0.503443  0.393480  0.448461        158.434
MLP-HR-accuracy             0.523161  0.370971  0.447066        415.634
MLP-HR-f1_micro             0.469859  0.355995  0.412927        292.311
MLP-HR-f1_micro_nt          0.522045  0.373567  0.447806  

# Normal Resampling

In [None]:
Resamples = {'ROS':             RandomOverSampler(), 
             'SMOTE':           SMOTE(random_state=42),
             'BorderlineSMOTE': BorderlineSMOTE(),
             'RUS': RandomUnderSampler(), 
             'NearMiss': NearMiss()
             }
for key,value in Resamples.items():
    start = time.time()
    X_res, y_res = value.fit_resample(X_train, y_train)
    end = time.time()
    res_time = end - start
    print(key)
    print('Resampled dataset shape %s' % Counter(y_res))
    print('resampling time is {0:.2f} seconds'.format(res_time))
    
    #Logistic model
    LR = LogisticRegression(solver = 'lbfgs')
    start = time.time()
    LR.fit(X_res, y_res)
    end= time.time()
    # get the evaluation metrics
    # use the model to make predictions with the test data
    y_pred = LR.predict(X_test)
    result = get_accs(y_test.values,y_pred, tr_time= end-start)
    result['Resample time'] = res_time
    result.index = ['LR-' + key]
    rsts = rsts.append(result)
    
print(rsts.iloc[:,5:])

ROS
Resampled dataset shape Counter({1: 101446, 0: 101446, 2: 101446, 3: 101446, 4: 101446})
resampling time is 0.11 seconds
SMOTE
Resampled dataset shape Counter({1: 101446, 0: 101446, 2: 101446, 3: 101446, 4: 101446})
resampling time is 14.73 seconds
BorderlineSMOTE
Resampled dataset shape Counter({1: 101446, 0: 101446, 2: 101446, 3: 101446, 4: 101446})
resampling time is 315.54 seconds
RUS
Resampled dataset shape Counter({0: 217, 1: 217, 2: 217, 3: 217, 4: 217})
resampling time is 0.03 seconds
NearMiss
Resampled dataset shape Counter({0: 217, 1: 217, 2: 217, 3: 217, 4: 217})
resampling time is 2.04 seconds
                    Overall Accuracy    G-mean   Avg_Pfm  Training Time  \
LR-Weights                  0.487538  0.384926  0.436232          5.019   
DT-Weights                  0.716700  0.148694  0.432697          1.564   
RF-Weights                  0.567706  0.196124  0.381915          4.073   
MLP-Weights                 0.486353  0.346576  0.416465        113.833   
LR-ROS  

In [None]:
#rsts = rsts.iloc[:9,:]
#rsts

# Hybrid Resampling


In [None]:
y_dict = Counter(y_train)
y_dict

Counter({1: 13109, 0: 101446, 2: 5790, 3: 1018, 4: 217})

## Oversampling and then undersampling

In [None]:
ss = {}
for i in range(2,5):
    ss[i] = y_dict[1]
oses = ['ROS','SMOTE','BorderlineSMOTE']
uses = ['RUS','NearMiss']

In [None]:
for os_name in oses:
    for us_name in uses:
        print(os_name,'-',us_name)
        if os_name == 'ROS': 
            res = RandomOverSampler(sampling_strategy=ss)
        elif os_name == 'BorderlineSMOTE': 
            res = BorderlineSMOTE(sampling_strategy=ss)
        else: res = SMOTE(sampling_strategy=ss)
        start = time.time()
        X_res, y_res = res.fit_resample(X_train, y_train)

        if us_name == 'RUS': 
            res = RandomUnderSampler(sampling_strategy='majority')
        else: res = NearMiss(sampling_strategy='majority')  
        X_res, y_res = res.fit_resample(X_res, y_res)
        end = time.time()
        res_time = end-start

        print('Resampled dataset shape %s' % Counter(y_res))
        print('Resamling time %.2f sec' % (res_time))
        LR = LogisticRegression(solver = 'lbfgs')
        start = time.time()
        LR.fit(X_res, y_res)
        end= time.time()
        # get the evaluation metrics
        # use the model to make predictions with the test data
        y_pred = LR.predict(X_test)
        result = get_accs(y_test.values,y_pred, tr_time= end-start)
        result['Resample time'] = res_time
        result.index = ['LR-' + os_name + '-' + us_name]
        rsts = rsts.append(result)

print(rsts.iloc[:,5:])

ROS - RUS
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 0.13 sec
ROS - NearMiss
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 86.98 sec
SMOTE - RUS
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 2.64 sec
SMOTE - NearMiss
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 89.48 sec
BorderlineSMOTE - RUS
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 150.43 sec
BorderlineSMOTE - NearMiss
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 237.54 sec


Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Overall Accuracy,G-mean,Avg_Pfm,Training Time,Resample time
LR-Weights,0.514462,0.430664,0.160773,0.358491,0.661765,0.487538,0.384926,0.436232,5.019,
DT-Weights,0.835315,0.138184,0.087293,0.037736,0.191176,0.7167,0.148694,0.432697,1.564,
RF-Weights,0.628174,0.36792,0.00663,0.286164,0.661765,0.567706,0.196124,0.381915,4.073,
MLP-Weights,0.518311,0.416016,0.105525,0.339623,0.647059,0.486353,0.346576,0.416465,113.833,
LR-ROS,0.516134,0.436523,0.149724,0.349057,0.647059,0.488933,0.377034,0.432983,21.084,0.11453
LR-SMOTE,0.515156,0.419434,0.150829,0.36478,0.602941,0.48638,0.372459,0.429419,20.791,14.726981
LR-BorderlineSMOTE,0.572091,0.420166,0.227624,0.295597,0.470588,0.536807,0.376955,0.456881,22.433,315.539367
LR-RUS,0.443365,0.297607,0.262431,0.361635,0.661765,0.418739,0.383424,0.401081,0.095,0.032193
LR-NearMiss,0.012112,0.079346,0.08453,0.41195,0.808824,0.027583,0.122037,0.07481,0.094,2.035056
LR-ROS-RUS,0.513548,0.432861,0.149724,0.361635,0.632353,0.486459,0.376956,0.431707,2.617,0.133523


## Under sampling and then over sampling

In [None]:
for us_name in uses:
    for os_name in oses:
        print(us_name,'-',os_name)
        if us_name == 'RUS': 
            res = RandomUnderSampler(sampling_strategy={0: y_dict[1]})
        else: res = NearMiss(sampling_strategy={0: y_dict[1]})
        start = time.time()
        X_res, y_res = res.fit_resample(X_train, y_train)

        if os_name == 'ROS': 
            res = RandomOverSampler(sampling_strategy='not majority')
        elif os_name == 'BorderlineSMOTE': 
            res = BorderlineSMOTE(sampling_strategy='not majority')
        else: res = SMOTE(sampling_strategy='not majority')
        X_res, y_res = res.fit_resample(X_res, y_res)
        end = time.time()
        res_time = end-start
        print('Resampled dataset shape %s' % Counter(y_res))
        print('Resamling time %.2f sec' % (res_time))
        LR = LogisticRegression(solver = 'lbfgs',
                                #class_weight= 'balanced'
                                )
        start = time.time()
        LR.fit(X_res, y_res)
        end= time.time()
        # get the evaluation metrics
        # use the model to make predictions with the test data
        y_pred = LR.predict(X_test)
        result = get_accs(y_test.values,y_pred, tr_time= end-start)
        result['Resample time'] = res_time
        result.index = ['LR-' + us_name + '-' + os_name]
        rsts = rsts.append(result)

print(rsts.iloc[:,5:])

RUS - ROS
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 0.05 sec
RUS - SMOTE
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 2.62 sec
RUS - BorderlineSMOTE
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 24.61 sec
NearMiss - ROS
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 1.71 sec
NearMiss - SMOTE
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 4.27 sec
NearMiss - BorderlineSMOTE
Resampled dataset shape Counter({0: 13109, 1: 13109, 2: 13109, 3: 13109, 4: 13109})
Resamling time 25.52 sec


Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Overall Accuracy,G-mean,Avg_Pfm,Training Time,Resample time
LR-Weights,0.514462,0.430664,0.160773,0.358491,0.661765,0.487538,0.384926,0.436232,5.019,
DT-Weights,0.835315,0.138184,0.087293,0.037736,0.191176,0.7167,0.148694,0.432697,1.564,
RF-Weights,0.628174,0.36792,0.00663,0.286164,0.661765,0.567706,0.196124,0.381915,4.073,
MLP-Weights,0.518311,0.416016,0.105525,0.339623,0.647059,0.486353,0.346576,0.416465,113.833,
LR-ROS,0.516134,0.436523,0.149724,0.349057,0.647059,0.488933,0.377034,0.432983,21.084,0.11453
LR-SMOTE,0.515156,0.419434,0.150829,0.36478,0.602941,0.48638,0.372459,0.429419,20.791,14.726981
LR-BorderlineSMOTE,0.572091,0.420166,0.227624,0.295597,0.470588,0.536807,0.376955,0.456881,22.433,315.539367
LR-RUS,0.443365,0.297607,0.262431,0.361635,0.661765,0.418739,0.383424,0.401081,0.095,0.032193
LR-NearMiss,0.012112,0.079346,0.08453,0.41195,0.808824,0.027583,0.122037,0.07481,0.094,2.035056
LR-ROS-RUS,0.513548,0.432861,0.149724,0.361635,0.632353,0.486459,0.376956,0.431707,2.617,0.133523


### Refining result
NearMiss not only increased the computing time but also reduced the accuracy significantly
Apply 5-fold cross validation


In [None]:
# Border → RUS
bor2rus = pd.DataFrame()
for i in range(10):
    print(i)
    start = time.time()
    res = BorderlineSMOTE(sampling_strategy=ss)
    print('sampling #1 ...')
    X_res, y_res = res.fit_resample(X_train, y_train)
    res = RandomUnderSampler(sampling_strategy='majority')
    print('sampling #2 ...')
    X_res, y_res = res.fit_resample(X_res, y_res)
    end = time.time()
    res_time = end-start
    # training and prediction
    start = time.time()
    LR = LogisticRegression(solver = 'lbfgs',
                            #class_weight= 'balanced'
                            )
    start = time.time()
    print('training...')
    LR.fit(X_res, y_res)
    end= time.time()
    # get the evaluation metrics
    # use the model to make predictions with the test data
    y_pred = LR.predict(X_test)
    result = get_accs(y_test.values,y_pred, tr_time= end-start)
    bor2rus = bor2rus.append(result)
bor2rus

0
sampling #1 ...
sampling #2 ...
training...
1
sampling #1 ...
sampling #2 ...
training...
2
sampling #1 ...
sampling #2 ...
training...
3
sampling #1 ...
sampling #2 ...
training...
4
sampling #1 ...
sampling #2 ...
training...
5
sampling #1 ...
sampling #2 ...
training...
6
sampling #1 ...


In [None]:
bor2rus.describe().iloc[1:3,5:]

In [None]:
# RUS → Borderline
rus2bor = pd.DataFrame()
for i in range(10):
    print(i)
    start = time.time()
    res = RandomUnderSampler(sampling_strategy={0: y_dict[1]})
    print('sampling #1 ...')
    X_res, y_res = res.fit_resample(X_train, y_train)
    res = BorderlineSMOTE(sampling_strategy='not majority')
    print('sampling #2 ...')
    X_res, y_res = res.fit_resample(X_res, y_res)
    end = time.time()
    res_time = end-start
    # training and prediction
    start = time.time()
    LR = LogisticRegression(solver = 'lbfgs',
                            #class_weight= 'balanced'
                            )
    start = time.time()
    print('training...')
    LR.fit(X_res, y_res)
    end= time.time()
    # get the evaluation metrics
    # use the model to make predictions with the test data
    y_pred = LR.predict(X_test)
    result = get_accs(y_test.values,y_pred, tr_time= end-start)
    rus2bor = rus2bor.append(result)
rus2bor

In [None]:
print(rus2bor.describe().iloc[1:3,5:])
print(bor2rus.describe().iloc[1:3,5:])

In [None]:
y_dict

In [None]:
start = time.time()
res = RandomUnderSampler(sampling_strategy={0: y_dict[1]})
print('sampling #1 ...')
X_res, y_res = res.fit_resample(X_train, y_train)
res = BorderlineSMOTE(sampling_strategy='not majority')
print('sampling #2 ...')
X_res, y_res = res.fit_resample(X_res, y_res)
print(Counter(y_res))
end = time.time()
res_time = end-start
model = create_mlp()
start = time.time()
model.fit(X_res, y_res,
                    callbacks=[es],
                 #   class_weight = cls_wgt,
                    validation_data=(X_val, y_val.values),
                    verbose=1, epochs=50)
end = time.time()
# use the model to make predictions with the test data
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
# get the evaluation metrics
result = get_accs(y_test.values, y_pred, end-start)
result.index = ['MLP-RUS2BOR']
rsts = rsts.append(result)
print(rsts.iloc[:,5:])

In [None]:
rsts.to_csv('VCA_Resamplings.csv')