<a href="https://colab.research.google.com/github/duonghung86/Injury-severity-classification/blob/main/VCA_2_1_MLP_earlystopping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from psutil import virtual_memory,cpu_percent
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
print('Current system-wide CPU utilization %: ',cpu_percent())
#Remove all warning
import warnings
warnings.filterwarnings("ignore")

Your runtime has 270.0 gigabytes of available RAM

Current system-wide CPU utilization %:  33.3


In [2]:
# Basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import os
from collections import Counter
# Preprocessing
from sklearn.preprocessing import StandardScaler # Standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Machine learning algos
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Imblearn
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE, RandomOverSampler,BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler,NearMiss,EditedNearestNeighbours

# Grid search
from kerastuner.tuners import RandomSearch,Hyperband,BayesianOptimization
import kerastuner as kt
from tensorflow.keras.optimizers import Adam
# Tensorflow
import tensorflow as tf
print(tf.__version__)
from tensorflow import feature_column  # for data wrangling
from tensorflow.keras.losses import SparseCategoricalCrossentropy,CategoricalCrossentropy
from tensorflow.keras.layers import Dense,Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import SparseCategoricalAccuracy,CategoricalAccuracy
from tensorflow_addons.metrics import CohenKappa,F1Score


2.1.0


In [3]:
url = 'https://github.com/duonghung86/Injury-severity-classification/blob/main/Prepared%20Texas%202019.zip?raw=true' 
data_path = tf.keras.utils.get_file(origin=url, fname=url.split('/')[-1].split('?')[0], extract=True)
data_path = data_path.replace('%20',' ').replace('.zip','.csv')

In [4]:
# Load data
df = pd.read_csv(data_path)
print(df.shape)
df.head(3)

(949856, 19)


Unnamed: 0,Prsn_Injry_Sev,Prsn_Age,Prsn_Gndr,Wthr_Cond,Light_Cond,Surf_Cond,Veh_Body_Styl,Prsn_Rest,Prsn_Drg_Rslt,Harm_Evnt,Rural,Crash_Speed_Limit,Road_Algn,Veh_Mod_Year,Weekend,Crash_season,Part_of_day,Collsn_type,Collsn_name
0,0,26,MALE,CLEAR,DAYLIGHT,DRY,"PASSENGER CAR, 2-DOOR",SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,0,-1,"STRAIGHT, LEVEL",33,1,3,3,SAME DIRECTION,ONE STRAIGHT-ONE LEFT TURN
1,0,52,MALE,CLEAR,DAYLIGHT,DRY,"PASSENGER CAR, 2-DOOR",SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,0,-1,"STRAIGHT, LEVEL",19,1,3,3,SAME DIRECTION,ONE STRAIGHT-ONE LEFT TURN
2,0,27,FEMALE,CLEAR,DAYLIGHT,DRY,PICKUP,SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,1,-1,"CURVE, LEVEL",16,1,3,4,SAME DIRECTION,BOTH LEFT TURN


In [5]:
# Let's just use 80% of the total dataset
#df, _ = train_test_split(df, test_size=0.9,stratify = df['Prsn_Injry_Sev'])
df.shape

(949856, 19)

In [6]:
y = df['Prsn_Injry_Sev']
print('All target values:')
print(y.value_counts())
X = df.drop(columns=['Prsn_Injry_Sev'])

All target values:
0    792558
1    102409
2     45242
3      7951
4      1696
Name: Prsn_Injry_Sev, dtype: int64


In [7]:
# %% Data wrangling -------------
# Classify variable type
emb_vars, ind_vars, num_vars = [], [], []
for var in X.columns:
    if X[var].dtypes == 'O':
        if len(X[var].unique()) > 5:
            emb_vars.append(var)
        else:
            ind_vars.append(var)
    else:
        num_vars.append(var)
print('Numerical variables are ', num_vars)
print('Categorical variables that have at most 5 categories are ', ind_vars)
print('Categorical variables that have more than 5 categories are ', emb_vars)

# Create feature columns
feature_columns = []
# numeric cols
for header in num_vars:
    feature_columns.append(feature_column.numeric_column(header))
# bucketized cols
# age = feature_column.numeric_column('Prsn_Age')
# age_buckets = feature_column.bucketized_column(age, boundaries=[16, 22, 35, 55, 65])
# feature_columns.append(age_buckets)
# indicator_columns
for col_name in ind_vars:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, X[col_name].unique())
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)
# embedding columns
for col_name in emb_vars:
    emb_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, X[col_name].unique())
    col_embedding = feature_column.embedding_column(emb_column, dimension=5)
    feature_columns.append(col_embedding)

# Convert all setup into new dataset
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
X = feature_layer(dict(X)).numpy()
print('New shape of the input data set:',X.shape)

Numerical variables are  ['Prsn_Age', 'Rural', 'Crash_Speed_Limit', 'Veh_Mod_Year', 'Weekend', 'Crash_season', 'Part_of_day']
Categorical variables that have at most 5 categories are  ['Prsn_Gndr', 'Prsn_Drg_Rslt', 'Collsn_type']
Categorical variables that have more than 5 categories are  ['Wthr_Cond', 'Light_Cond', 'Surf_Cond', 'Veh_Body_Styl', 'Prsn_Rest', 'Harm_Evnt', 'Road_Algn', 'Collsn_name']
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
New shape of the input data set: (949856, 59)


In [8]:
# %% Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=48)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=48)

print('Training features shape:', X_train.shape)
print('Validation features shape:', X_val.shape)
print('Test features shape:', X_test.shape)

# %% standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

Training features shape: (607907, 59)
Validation features shape: (151977, 59)
Test features shape: (189972, 59)


# ALL mini functions



In [9]:
# Import Metrics
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score

# %% Function to compare the prediction and true labels
def get_accs(label, pred_proba, tr_time=0,index=None):
    prediction = pred_proba.argmax(axis=1)
    cm = confusion_matrix(label, prediction)
    length = cm.shape[0]
    num_cases = len(label)
    # global accuracy
    glb_acc = np.trace(cm) / len(label)
    ind_accs = cm / np.sum(cm, axis=1)[:, np.newaxis]
    accs = [ind_accs[i, i] for i in range(length)]
    cols = ['Class {}'.format(i) for i in range(length)]
    # Global accuracy
    accs.append(glb_acc)
    # AUC
    accs.append(roc_auc_score(label, pred_proba,multi_class='ovr'))
    # G-mean
    accs.append(geometric_mean_score(label, prediction, correction=0.001))
    # Average perf
    accs.append(np.mean(accs[-3:]))
    # Training time
    accs.append(np.round(tr_time,3))
    cols = cols + ['Accuracy','AUC','G-mean','Avg_Pfm','Training Time']

    out = np.array(accs).reshape(1, len(accs))
    return pd.DataFrame(out, columns=cols,index=[index])

# ML with class weight

# MLP functions


In [10]:
# Add weights
weights = len(y_train) / (5 * np.bincount(y_train))
cls_wgt = dict(zip(np.arange(5), weights))
cls_wgt

{0: 0.23969347661941065,
 1: 1.8550434079431195,
 2: 4.199122746425364,
 3: 23.891019846728238,
 4: 111.95340699815839}

In [11]:
def early_stops(metric_name):
    es = EarlyStopping(monitor='val_'+ metric_name,
                   verbose=1, patience=10, mode='max',
                   restore_best_weights=True)
    return es

In [12]:
# Constant
EPOCH = 50
BATCH_SIZE = 2048
VERBOSE = 0

In [13]:
METRICS = [SparseCategoricalAccuracy(name='accuracy'),
           CohenKappa(name='kappa',num_classes=5,sparse_labels=True),
           F1Score(name='f1_micro', num_classes=5,average="micro",threshold=0.5),
          ]
def create_mlp():
    MLP = Sequential([Dense(10,
                           activation='relu',
                           input_dim=X_train.shape[1],
                           ),
                      Dropout(0.5),
                      Dense(5, activation='softmax')])
    MLP.compile(optimizer='adam',
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=METRICS
               )
    return MLP

# Hybrid Resampling 

In [14]:
y_dict = Counter(y_train)

start = time.time()
res = RandomUnderSampler(random_state = 54, sampling_strategy={0: y_dict[1]})
print('under sampling ...')
X_res, y_res = res.fit_resample(X_train, y_train)

res = SMOTE(random_state = 34,sampling_strategy='not majority')
print(Counter(y_res))
print('over sampling #2 ...')
X_res, y_res = res.fit_resample(X_res, y_res)
end = time.time()
res_time = end-start
Counter(y_res),res_time

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...


(Counter({0: 65541, 1: 65541, 2: 65541, 3: 65541, 4: 65541}),
 70.37647199630737)

# MLP with Hybrid Sampling

In [15]:
rsts = pd.DataFrame()
for i in range(3):
    model = create_mlp()
    start = time.time()
    monitor = model.fit(X_res, y_res,
                        callbacks=[early_stops('accuracy')],
                        validation_data=(X_val,y_val),
                        batch_size=BATCH_SIZE,
                        verbose=VERBOSE, epochs=EPOCH
                       )
    end = time.time()
    # use the model to make predictions with the test data
    Y_pred = model.predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'MLP-W-'+str(i+1)))
print(rsts.iloc[:,5:])

Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
         Accuracy       AUC    G-mean   Avg_Pfm  Training Time
MLP-W-1  0.545780  0.631388  0.244335  0.473834          6.282
MLP-W-2  0.509422  0.702490  0.353039  0.521650         10.828
MLP-W-3  0.549086  0.652409  0.193501  0.464998          6.165


# Set up grid search

We will investigates the following parameters:

- Initial weights
- Activation function
- Number of nodes
- Dropout rate
- Early Stop
- Learning rate

# Keras tuner

In [22]:
def build_model(hp):
    hp_units = hp.Int('units', min_value=5, max_value=20, step=5)
    hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4]) 
    hp_dos = hp.Float('dropouts',min_value=0.2, max_value=0.3, step=0.1)
    hp_acts = hp.Choice('activation', values = ['relu','sigmoid','tanh','selu'])
    keins = ['uniform', 'normal', 'zeros', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
    hp_keins = hp.Choice('kernel_ini', values = keins) 
    model = Sequential([Dense(hp_units,
                           activation=hp_acts,
                           input_dim=X_train.shape[1],
                            kernel_initializer= hp_keins 
                           ),
                      Dropout(hp_dos),
                      Dense(5, activation='softmax')])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = hp_learning_rate),
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=METRICS
               )
    return model

In [17]:
metr = ['loss','accuracy','kappa','f1_micro']

In [18]:
MAX_EPOCHS = 30
FACTOR = 5

In [23]:
bps = pd.DataFrame()
obj = 'accuracy'
tuner = Hyperband(build_model,
                     objective = obj, 
                     max_epochs = MAX_EPOCHS,
                     factor = FACTOR,
                     directory = 'my_dir',
                     project_name = 'val_'+ obj+'_'+time.ctime())
tuner.search(X_res, y_res,
             epochs=MAX_EPOCHS,batch_size=2048,
             verbose=0,
             callbacks=[early_stops(obj)],
             validation_data=(X_val, y_val))
end = time.time()

print('Tuning time is %.2f' % (end-start))
print(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values)
bp = pd.Series(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values,name=obj)
bp = bp.append(pd.Series(end-start,index=['Tuning_time']))
bps = pd.concat((bps,bp),axis=1)
models = tuner.get_best_models(num_models=FACTOR)
for i in range(FACTOR):
    Y_pred = models[i].predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'MLP-'+obj+'-'+str(i+1)))
print(bps)
print(rsts.iloc[:,5:])

Restoring model weights from the end of the best epoch.
Epoch 00026: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00029: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00026: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00022: early stopping
INFO:tensorflow:Oracle triggered exit
Tuning time is 796.44
{'units': 20, 'learning_rate': 0.01, 'dropouts': 0.2, 'activation': 'sigmoid', 'kernel_ini': 'glorot_uniform', 'tuner/epochs': 30, 'tuner/initial_epoch': 6, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': 'b912598052bca92d2c05195ad1b231d5'}
                                                    0
units                                            

Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Accuracy,AUC,G-mean,Avg_Pfm,Training Time
MLP-W-1,0.63387,0.080851,0.1062,0.203145,0.787611,0.54578,0.631388,0.244335,0.473834,6.282
MLP-W-2,0.547542,0.403232,0.108189,0.312579,0.734513,0.509422,0.70249,0.353039,0.52165,10.828
MLP-W-3,0.616168,0.21619,0.212399,0.01195,0.80236,0.549086,0.652409,0.193501,0.464998,6.165
MLP-accuracy-1,0.527121,0.330925,0.226876,0.348428,0.702065,0.490483,0.708612,0.395534,0.531543,301.527
MLP-accuracy-2,0.533398,0.38214,0.166538,0.369182,0.696165,0.498531,0.714079,0.38739,0.533333,301.527
MLP-accuracy-3,0.563743,0.38756,0.121339,0.37673,0.693215,0.52234,0.716389,0.369883,0.536204,301.527
MLP-accuracy-4,0.536906,0.398399,0.128191,0.28239,0.755162,0.500763,0.707498,0.357596,0.521952,301.527
MLP-accuracy-5,0.558027,0.284591,0.239584,0.354717,0.710914,0.511949,0.712649,0.394827,0.539808,301.527
MLP-accuracy-1,0.514119,0.421785,0.153387,0.408176,0.634218,0.486309,0.718895,0.386372,0.530525,796.436
MLP-accuracy-2,0.496846,0.393321,0.216488,0.392453,0.616519,0.47167,0.714047,0.39997,0.528562,796.436


In [24]:
for i in range(FACTOR):
    Y_pred = models[i].predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'MLP-'+obj+'-'+str(i+1)))
print(bps)
print(rsts.iloc[:,5:])

                                                    0
units                                              20
learning_rate                                    0.01
dropouts                                          0.2
activation                                    sigmoid
kernel_ini                             glorot_uniform
tuner/epochs                                       30
tuner/initial_epoch                                 6
tuner/bracket                                       2
tuner/round                                         2
tuner/trial_id       b912598052bca92d2c05195ad1b231d5
Tuning_time                                   796.436
                Accuracy       AUC    G-mean   Avg_Pfm  Training Time
MLP-W-1         0.545780  0.631388  0.244335  0.473834          6.282
MLP-W-2         0.509422  0.702490  0.353039  0.521650         10.828
MLP-W-3         0.549086  0.652409  0.193501  0.464998          6.165
MLP-accuracy-1  0.490483  0.708612  0.395534  0.531543        301.527
ML

In [25]:
bestm = models[0]

In [30]:
def hyb_sam(random=12):
    start = time.time()
    res = RandomUnderSampler(random_state = random, sampling_strategy={0: y_dict[1]})
    print('under sampling ...')
    X_sam, y_sam = res.fit_resample(X_train, y_train)
    print(Counter(y_sam))
    print('over sampling #2 ...')
    res = SMOTE(random_state = random,sampling_strategy='not majority')
    X_sam, y_sam = res.fit_resample(X_sam, y_sam)
    end = time.time()
    res_time = end-start
    print('Resampling time is %.2f' % res_time)
    return X_sam, y_sam

In [31]:
X_res2, y_res2 = hyb_sam(32)

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 72.78


In [54]:
np.random.seed(4)
rs = np.random.choice(range(100), 5, replace=False)
rs

array([20, 10, 96, 16, 63])

In [55]:
rsts
for i in rs:
    bestm = models[0]
    X_res2, y_res2 = hyb_sam(i)
    bestm.fit(X_res2, y_res2,
                            callbacks=[early_stops('accuracy')],
                            validation_data=(X_val,y_val),
                            batch_size=BATCH_SIZE,
                            verbose=VERBOSE, epochs=EPOCH
                           )
    print('Retraining Done!')
    Y_pred = bestm.predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,obj+'-refit-'+str(i+1)))
rsts.iloc[:,5:]

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 72.07
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 70.41
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 73.28
Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 70.86
Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resamp

Unnamed: 0,Accuracy,AUC,G-mean,Avg_Pfm,Training Time
MLP-W-1,0.54578,0.631388,0.244335,0.473834,6.282
MLP-W-2,0.509422,0.70249,0.353039,0.52165,10.828
MLP-W-3,0.549086,0.652409,0.193501,0.464998,6.165
MLP-accuracy-1,0.490483,0.708612,0.395534,0.531543,301.527
MLP-accuracy-2,0.498531,0.714079,0.38739,0.533333,301.527
MLP-accuracy-3,0.52234,0.716389,0.369883,0.536204,301.527
MLP-accuracy-4,0.500763,0.707498,0.357596,0.521952,301.527
MLP-accuracy-5,0.511949,0.712649,0.394827,0.539808,301.527
MLP-accuracy-1,0.486309,0.718895,0.386372,0.530525,796.436
MLP-accuracy-2,0.47167,0.714047,0.39997,0.528562,796.436


In [56]:
rsts.to_csv('Extra fiting.csv')

In [29]:
Y_pred = bestm.predict(X_test)
print(get_accs(y_test.values,Y_pred,end-start,'MLP-'+obj+'-'+str(i+1)))

                 Class 0   Class 1   Class 2   Class 3   Class 4  Accuracy  \
MLP-accuracy-5  0.548274  0.349087  0.199912  0.394969  0.628319  0.509064   

                    AUC    G-mean  Avg_Pfm  Training Time  
MLP-accuracy-5  0.72071  0.394006  0.54126         73.087  
