<a href="https://colab.research.google.com/github/duonghung86/Injury-severity-classification/blob/main/VCA_2_1_MLP_earlystopping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
from psutil import virtual_memory,cpu_percent
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
print('Current system-wide CPU utilization %: ',cpu_percent())
#Remove all warning
import warnings
warnings.filterwarnings("ignore")

Your runtime has 270.0 gigabytes of available RAM

Current system-wide CPU utilization %:  6.8


In [31]:
# Basic packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
from collections import Counter
# Preprocessing
from sklearn.preprocessing import StandardScaler # Standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Machine learning algos
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Imblearn
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE, RandomOverSampler,BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler,NearMiss,EditedNearestNeighbours

# Grid search
from kerastuner.tuners import RandomSearch,Hyperband,BayesianOptimization
import kerastuner as kt
from tensorflow.keras.optimizers import Adam
# Tensorflow
import tensorflow as tf
print(tf.__version__)
from tensorflow import feature_column  # for data wrangling
from tensorflow.keras.losses import SparseCategoricalCrossentropy,CategoricalCrossentropy
from tensorflow.keras.layers import Dense,Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import SparseCategoricalAccuracy

2.1.0


In [32]:
url = 'https://github.com/duonghung86/Injury-severity-classification/blob/main/Prepared%20Texas%202019.zip?raw=true' 
data_path = tf.keras.utils.get_file(origin=url, fname=url.split('/')[-1].split('?')[0], extract=True)
data_path = data_path.replace('%20',' ').replace('.zip','.csv')

In [33]:
# Load data
df = pd.read_csv(data_path)
print(df.shape)
df.head(3)

(949856, 19)


Unnamed: 0,Prsn_Injry_Sev,Prsn_Age,Prsn_Gndr,Wthr_Cond,Light_Cond,Surf_Cond,Veh_Body_Styl,Prsn_Rest,Prsn_Drg_Rslt,Harm_Evnt,Rural,Crash_Speed_Limit,Road_Algn,Veh_Mod_Year,Weekend,Crash_season,Part_of_day,Collsn_type,Collsn_name
0,0,26,MALE,CLEAR,DAYLIGHT,DRY,"PASSENGER CAR, 2-DOOR",SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,0,-1,"STRAIGHT, LEVEL",33,1,3,3,SAME DIRECTION,ONE STRAIGHT-ONE LEFT TURN
1,0,52,MALE,CLEAR,DAYLIGHT,DRY,"PASSENGER CAR, 2-DOOR",SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,0,-1,"STRAIGHT, LEVEL",19,1,3,3,SAME DIRECTION,ONE STRAIGHT-ONE LEFT TURN
2,0,27,FEMALE,CLEAR,DAYLIGHT,DRY,PICKUP,SHOULDER & LAP BELT,Not Applicable,MOTOR VEHICLE IN TRANSPORT,1,-1,"CURVE, LEVEL",16,1,3,4,SAME DIRECTION,BOTH LEFT TURN


In [34]:
# Let's just use 80% of the total dataset
#df, _ = train_test_split(df, test_size=0.9,stratify = df['Prsn_Injry_Sev'])
df.shape

(949856, 19)

In [35]:
y = df['Prsn_Injry_Sev']
print('All target values:')
print(y.value_counts())
X = df.drop(columns=['Prsn_Injry_Sev'])

All target values:
0    792558
1    102409
2     45242
3      7951
4      1696
Name: Prsn_Injry_Sev, dtype: int64


In [36]:
# %% Data wrangling -------------
# Classify variable type
emb_vars, ind_vars, num_vars = [], [], []
for var in X.columns:
    if X[var].dtypes == 'O':
        if len(X[var].unique()) > 5:
            emb_vars.append(var)
        else:
            ind_vars.append(var)
    else:
        num_vars.append(var)
print('Numerical variables are ', num_vars)
print('Categorical variables that have at most 5 categories are ', ind_vars)
print('Categorical variables that have more than 5 categories are ', emb_vars)

# Create feature columns
feature_columns = []
# numeric cols
for header in num_vars:
    feature_columns.append(feature_column.numeric_column(header))
# bucketized cols
# age = feature_column.numeric_column('Prsn_Age')
# age_buckets = feature_column.bucketized_column(age, boundaries=[16, 22, 35, 55, 65])
# feature_columns.append(age_buckets)
# indicator_columns
for col_name in ind_vars:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, X[col_name].unique())
    indicator_column = feature_column.indicator_column(categorical_column)
    feature_columns.append(indicator_column)
# embedding columns
for col_name in emb_vars:
    emb_column = feature_column.categorical_column_with_vocabulary_list(
        col_name, X[col_name].unique())
    col_embedding = feature_column.embedding_column(emb_column, dimension=5)
    feature_columns.append(col_embedding)

# Convert all setup into new dataset
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
X = feature_layer(dict(X)).numpy()
print('New shape of the input data set:',X.shape)

Numerical variables are  ['Prsn_Age', 'Rural', 'Crash_Speed_Limit', 'Veh_Mod_Year', 'Weekend', 'Crash_season', 'Part_of_day']
Categorical variables that have at most 5 categories are  ['Prsn_Gndr', 'Prsn_Drg_Rslt', 'Collsn_type']
Categorical variables that have more than 5 categories are  ['Wthr_Cond', 'Light_Cond', 'Surf_Cond', 'Veh_Body_Styl', 'Prsn_Rest', 'Harm_Evnt', 'Road_Algn', 'Collsn_name']
New shape of the input data set: (949856, 59)


In [37]:
# %% Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=48)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=48)

print('Training features shape:', X_train.shape)
print('Validation features shape:', X_val.shape)
print('Test features shape:', X_test.shape)

# %% standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

Training features shape: (607907, 59)
Validation features shape: (151977, 59)
Test features shape: (189972, 59)


# ALL mini functions



In [38]:
# Import Metrics
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score

# %% Function to compare the prediction and true labels
def get_accs(label, pred_proba, tr_time=0,index=None):
    prediction = pred_proba.argmax(axis=1)
    cm = confusion_matrix(label, prediction)
    length = cm.shape[0]
    num_cases = len(label)
    # global accuracy
    glb_acc = np.trace(cm) / len(label)
    ind_accs = cm / np.sum(cm, axis=1)[:, np.newaxis]
    accs = [ind_accs[i, i] for i in range(length)]
    cols = ['Class {}'.format(i) for i in range(length)]
    # Global accuracy
    accs.append(glb_acc)
    # AUC
    accs.append(roc_auc_score(label, pred_proba,multi_class='ovr'))
    # G-mean
    accs.append(geometric_mean_score(label, prediction, correction=0.001))
    # Average perf
    accs.append(np.mean(accs[-3:]))
    # Training time
    accs.append(np.round(tr_time,3))
    cols = cols + ['Accuracy','AUC','G-mean','Avg_Pfm','Training Time']

    out = np.array(accs).reshape(1, len(accs))
    return pd.DataFrame(out, columns=cols,index=[index])

# ML with class weight

# MLP functions


In [39]:
# Add weights
weights = len(y_train) / (5 * np.bincount(y_train))
cls_wgt = dict(zip(np.arange(5), weights))
cls_wgt

{0: 0.23969347661941065,
 1: 1.8550434079431195,
 2: 4.199122746425364,
 3: 23.891019846728238,
 4: 111.95340699815839}

In [40]:
def early_stops(metric_name):
    es = EarlyStopping(monitor='val_'+ metric_name,
                   verbose=1, patience=10, mode='max',
                   restore_best_weights=True)
    return es

In [41]:
# Constant
EPOCH = 50
BATCH_SIZE = 2048
VERBOSE = 0

In [42]:
METRICS = ['accuracy']
def create_mlp():
    MLP = Sequential([Dense(10,
                           activation='relu',
                           input_dim=X_train.shape[1],
                           ),
                      Dropout(0.5),
                      Dense(5, activation='softmax')])
    MLP.compile(optimizer='adam',
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=METRICS
               )
    return MLP

# Hybrid Resampling 

In [43]:
y_dict = Counter(y_train)

def hyb_sam(random=54):
    start = time.time()
    res = RandomUnderSampler(random_state = random, sampling_strategy={0: y_dict[1]})
    print('under sampling ...')
    X_sam, y_sam = res.fit_resample(X_train, y_train)
    print(Counter(y_sam))
    print('over sampling #2 ...')
    res = SMOTE(random_state = random,sampling_strategy='not majority')
    X_sam, y_sam = res.fit_resample(X_sam, y_sam)
    end = time.time()
    res_time = end-start
    print('Resampling time is %.2f' % res_time)
    return X_sam, y_sam
X_res, y_res = hyb_sam()

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 64.31


# MLP with Hybrid Sampling

# Set up grid search

We will investigates the following parameters:

- Initial weights
- Activation function
- Number of nodes
- Dropout rate
- Early Stop
- Learning rate

# Keras tuner

In [44]:
def build_model(hp):
    hp_units = hp.Int('units', min_value=5, max_value=20, step=5)
    hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4]) 
    hp_dos = hp.Float('dropouts',min_value=0.2, max_value=0.3, step=0.1)
    hp_acts = hp.Choice('activation', values = ['relu','sigmoid','tanh','selu'])
    keins = ['uniform', 'normal', 'zeros', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
    hp_keins = hp.Choice('kernel_ini', values = keins) 
    model = Sequential([Dense(hp_units,
                           activation=hp_acts,
                           input_dim=X_train.shape[1],
                            kernel_initializer= hp_keins 
                           ),
                      Dropout(hp_dos),
                      Dense(5, activation='softmax')])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = hp_learning_rate),
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=METRICS
               )
    return model

In [45]:
MAX_EPOCHS = 30
FACTOR = 5
rsts = pd.DataFrame()
bps = pd.DataFrame()
obj = 'accuracy'

In [None]:
start = time.time()
tuner = Hyperband(build_model,
                    objective = obj, 
                    max_epochs = MAX_EPOCHS,
                    hyperband_iterations=3,
                    factor = FACTOR,
                    directory = 'my_dir',
                    project_name = 'val_'+ obj+'_'+time.ctime())
tuner.search(X_res, y_res,
             epochs=MAX_EPOCHS,batch_size=2048,
             verbose=0,
             callbacks=[early_stops(obj)],
             validation_data=(X_val, y_val))
end = time.time()

print('Tuning time is %.2f' % (end-start))
print(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values)
bp = pd.Series(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values,name=obj)
bp = bp.append(pd.Series(end-start,index=['Tuning_time']))
bps = pd.concat((bps,bp),axis=1)
models = tuner.get_best_models(num_models=FACTOR)
for i in range(FACTOR):
    Y_pred = models[i].predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'Best model-HB-'+str(i+1)))
print(bps)
print(rsts.iloc[:,5:])

Restoring model weights from the end of the best epoch.
Epoch 00021: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping


In [None]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
bestm = models[0]
bestm.save('saved_model/bestm')

In [None]:
# Load the best model
new_model = tf.keras.models.load_model('saved_model/bestm')

# Check its architecture
new_model.summary()

In [None]:
np.random.seed(6)
rs = np.random.choice(range(100), 10, replace=False)
rs

In [None]:
def hyb_sam2(random=48):
    start = time.time()
    res = RandomUnderSampler(random_state = random, sampling_strategy={0: y_dict[1]})
    print('under sampling ...')
    X_sam, y_sam = res.fit_resample(X_train, y_train)
    print(Counter(y_sam))
    print('over sampling #2 ...')
    res = RandomOverSampler(random_state = random,sampling_strategy='not majority')
    X_sam, y_sam = res.fit_resample(X_sam, y_sam)
    end = time.time()
    print(Counter(y_sam))
    res_time = end-start
    print('Resampling time is %.2f' % res_time)
    return X_sam, y_sam

In [None]:
refit_models = []
for i in rs:
    bestm = models[2]
    X_res2, y_res2 = hyb_sam(i)
    bestm.fit(X_res2, y_res2,
                            callbacks=[early_stops('accuracy')],
                            validation_data=(X_val,y_val),
                            batch_size=BATCH_SIZE,
                            verbose=VERBOSE, epochs=EPOCH
                           )
    print('Retraining Done!')
    refit_models.append(bestm)
    Y_pred = bestm.predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'HB-refit-'+str(i+1)))
rsts.iloc[:,5:]

In [None]:
Y_pred = 0
for i in range(5):
    print(i)
    Y_pred+= refit_models[i].predict(X_test)

rsts = rsts.append(get_accs(y_test.values,Y_pred/5,end-start,'HB-refit-ensemble'))
rsts.iloc[:,5:]

In [28]:
X_res, y_res = hyb_sam()
start = time.time()
tuner = BayesianOptimization(build_model,
                     objective = obj, 
                     max_trials = 5,
                     directory = 'my_dir',
                     project_name = 'val_'+ obj+'_'+time.ctime())
tuner.search(X_res, y_res,
             epochs=MAX_EPOCHS,batch_size=2048,
             verbose=0,
             callbacks=[early_stops(obj)],
             validation_data=(X_val, y_val))
end = time.time()

print('Tuning time is %.2f' % (end-start))
print(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values)
bp = pd.Series(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values,name=obj)
bp = bp.append(pd.Series(end-start,index=['Tuning_time']))
bps = pd.concat((bps,bp),axis=1)
models = tuner.get_best_models(num_models=FACTOR)
for i in range(FACTOR):
    Y_pred = models[i].predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'Best model-BO-'+str(i+1)))
print(bps)
print(rsts.iloc[:,5:])

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Resampling time is 62.99
Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00022: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Restoring model weights from the end of the best epoch.
Epoch 00012: early stopping
INFO:tensorflow:Oracle triggered exit
Tuning time is 43.56
{'units': 20, 'learning_rate': 0.001, 'dropouts': 0.2, 'activation': 'relu', 'kernel_ini': 'uniform'}
                                                    0        0
units                                              20       20
learning_rate                                    0.01    0.001
dropouts                                          0.2      0.2
activation                                  

In [None]:
refit_models = []
for i in rs:
    bestm = models[2]
    X_res2, y_res2 = hyb_sam(i)
    bestm.fit(X_res2, y_res2,
                            callbacks=[early_stops('accuracy')],
                            validation_data=(X_val,y_val),
                            batch_size=BATCH_SIZE,
                            verbose=VERBOSE, epochs=EPOCH
                           )
    print('Retraining Done!')
    refit_models.append(bestm)
    Y_pred = bestm.predict(X_test)
    rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'BO-refit-'+str(i+1)))
rsts.iloc[:,5:]

In [29]:
# Save the entire model as a SavedModel.
bestm = models[0]
bestm.save('saved_model/bestmBO')

INFO:tensorflow:Assets written to: saved_model/bestmBO/assets


# Refit with RUS2ROS

In [38]:
def hyb_sam2(random=48):
    start = time.time()
    res = RandomUnderSampler(random_state = random, sampling_strategy={0: y_dict[1]})
    print('under sampling ...')
    X_sam, y_sam = res.fit_resample(X_train, y_train)
    print(Counter(y_sam))
    print('over sampling #2 ...')
    res = RandomOverSampler(random_state = random,sampling_strategy='not majority')
    X_sam, y_sam = res.fit_resample(X_sam, y_sam)
    end = time.time()
    print(Counter(y_sam))
    res_time = end-start
    print('Resampling time is %.2f' % res_time)
    return X_sam, y_sam
X_res, y_res = hyb_sam2()

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Counter({0: 65541, 1: 65541, 2: 65541, 3: 65541, 4: 65541})
Resampling time is 0.27


In [39]:
refit_models = []
for i in rs:
    for j in range(3):
        bestm = models[j]
        X_res2, y_res2 = hyb_sam2(i)
        bestm.fit(X_res2, y_res2,
                                callbacks=[early_stops('accuracy')],
                                validation_data=(X_val,y_val),
                                batch_size=BATCH_SIZE,
                                verbose=VERBOSE, epochs=EPOCH
                               )
        print('Retraining Done!')
        refit_models.append(bestm)
        Y_pred = bestm.predict(X_test)
        rsts = rsts.append(get_accs(y_test.values,Y_pred,end-start,'Refit-'+str(j)+'-'+str(i+1)))
rsts.iloc[:,5:]

under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Counter({0: 65541, 1: 65541, 2: 65541, 3: 65541, 4: 65541})
Resampling time is 0.26
Restoring model weights from the end of the best epoch.
Epoch 00025: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Counter({0: 65541, 1: 65541, 2: 65541, 3: 65541, 4: 65541})
Resampling time is 0.27
Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Counter({0: 65541, 1: 65541, 2: 65541, 3: 65541, 4: 65541})
Resampling time is 0.27
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Retraining Done!
under sampling ...
Counter({0: 65541, 1: 65541, 2: 28954, 3: 5089, 4: 1086})
over sampling #2 ...
Counter({0: 65541, 1: 65541, 2: 65541, 3: 65541, 4: 6

Unnamed: 0,Accuracy,AUC,G-mean,Avg_Pfm,Training Time
Best models-1,0.464447,0.707905,0.392048,0.521467,1248.516
Best models-2,0.513207,0.708926,0.382362,0.534832,1248.516
Best models-3,0.512707,0.721654,0.39153,0.541964,1248.516
accuracy-refit-21,0.502664,0.711148,0.397822,0.537211,1248.516
accuracy-refit-11,0.490788,0.708562,0.395748,0.531699,1248.516
accuracy-refit-97,0.495573,0.709236,0.384634,0.529814,1248.516
accuracy-refit-17,0.50668,0.709383,0.397778,0.537947,1248.516
accuracy-refit-64,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-ensemble,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-18,0.512581,0.721357,0.400488,0.544809,1248.516


In [41]:
Y_pred = 0
for i in range(len(refit_models)):
    print(i)
    Y_pred+= refit_models[i].predict(X_test)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


ValueError: Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes

In [42]:
rsts = rsts.append(get_accs(y_test.values,Y_pred/len(refit_models),end-start,'Refit-ensemble'))
rsts.iloc[:,5:]

Unnamed: 0,Accuracy,AUC,G-mean,Avg_Pfm,Training Time
Best models-1,0.464447,0.707905,0.392048,0.521467,1248.516
Best models-2,0.513207,0.708926,0.382362,0.534832,1248.516
Best models-3,0.512707,0.721654,0.39153,0.541964,1248.516
accuracy-refit-21,0.502664,0.711148,0.397822,0.537211,1248.516
accuracy-refit-11,0.490788,0.708562,0.395748,0.531699,1248.516
accuracy-refit-97,0.495573,0.709236,0.384634,0.529814,1248.516
accuracy-refit-17,0.50668,0.709383,0.397778,0.537947,1248.516
accuracy-refit-64,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-ensemble,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-18,0.512581,0.721357,0.400488,0.544809,1248.516


## Refit with full data

In [47]:
refit_models = []
for i in range(3):
    bestm = models[i]
    bestm.fit(X_train, y_train,
                            callbacks=[early_stops('accuracy')],
                            class_weight=cls_wgt,
                            validation_data=(X_val,y_val),
                            batch_size=BATCH_SIZE,
                            verbose=VERBOSE, epochs=EPOCH
                           )
    print('Retraining Done!')
    refit_models.append(bestm)
    Y_pred = bestm.predict(X_test)
    print(get_accs(y_test.values,Y_pred,end-start,obj+'-refit-'+str(i+1)))

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Restoring model weights from the end of the best epoch.
Epoch 00013: early stopping
Retraining Done!
                   Class 0   Class 1   Class 2  Class 3   Class 4  Accuracy  \
accuracy-refit-1  0.531865  0.420516  0.109957  0.36478  0.684366  0.498637   

                       AUC    G-mean   Avg_Pfm  Training Time  
accuracy-refit-1  0.693861  0.361098  0.517865       1248.516  
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Restoring model weights from the end of the best epoch.
Epoch 00023: early stopping
Retraining Done!
                   Class 0   Class 1   Class 2   Class 3   Class 4  Accuracy  \
accuracy-refit-2  0.600844  0.261791  0.228644  0.310063  0.705015  0.544312   

                       AUC    G-mean   Avg_Pfm  Training Time  
accuracy-refit-2  0.691426  0.379407  0.538382       1248.516  
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Restoring model weights from the end of the best epoch.
Epoch 00016: ear

In [48]:
Y_pred = 0
for i in range(len(refit_models)):
    print(i)
    Y_pred+= refit_models[i].predict(X_test)

rsts = rsts.append(get_accs(y_test.values,Y_pred/len(refit_models),end-start,'Refit-ensemble-full1'))
rsts.iloc[:,5:]

0
1
2


Unnamed: 0,Accuracy,AUC,G-mean,Avg_Pfm,Training Time
Best models-1,0.464447,0.707905,0.392048,0.521467,1248.516
Best models-2,0.513207,0.708926,0.382362,0.534832,1248.516
Best models-3,0.512707,0.721654,0.39153,0.541964,1248.516
accuracy-refit-21,0.502664,0.711148,0.397822,0.537211,1248.516
accuracy-refit-11,0.490788,0.708562,0.395748,0.531699,1248.516
accuracy-refit-97,0.495573,0.709236,0.384634,0.529814,1248.516
accuracy-refit-17,0.50668,0.709383,0.397778,0.537947,1248.516
accuracy-refit-64,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-ensemble,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-18,0.512581,0.721357,0.400488,0.544809,1248.516


In [49]:
for i in range(3):
    bestm = models[i]
    bestm.fit(X_train, y_train,
                            callbacks=[early_stops('accuracy')],
                            validation_data=(X_val,y_val),
                            batch_size=BATCH_SIZE,
                            verbose=VERBOSE, epochs=EPOCH
                           )
    print('Retraining Done!')
    refit_models.append(bestm)
    Y_pred = bestm.predict(X_test)
    print(get_accs(y_test.values,Y_pred,end-start,obj+'-refit-'+str(i+1)))

Restoring model weights from the end of the best epoch.
Epoch 00019: early stopping
Retraining Done!
                   Class 0  Class 1  Class 2  Class 3   Class 4  Accuracy  \
accuracy-refit-1  0.999174      0.0      0.0      0.0  0.368732  0.834365   

                       AUC   G-mean   Avg_Pfm  Training Time  
accuracy-refit-1  0.634947  0.01298  0.494097       1248.516  
Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping
Retraining Done!
                   Class 0  Class 1  Class 2   Class 3   Class 4  Accuracy  \
accuracy-refit-2  0.998972      0.0      0.0  0.000629  0.415929  0.834286   

                       AUC    G-mean   Avg_Pfm  Training Time  
accuracy-refit-2  0.664361  0.012118  0.503588       1248.516  
Restoring model weights from the end of the best epoch.
Epoch 00032: early stopping
Retraining Done!
                   Class 0  Class 1  Class 2  Class 3   Class 4  Accuracy  \
accuracy-refit-3  0.999861      0.0      0.0      0.0 

In [50]:
Y_pred = 0
for i in range(len(refit_models)):
    print(i)
    Y_pred+= refit_models[i].predict(X_test)

rsts = rsts.append(get_accs(y_test.values,Y_pred/len(refit_models),end-start,'Refit-ensemble-full2'))
rsts.iloc[:,5:]

0
1
2
3
4
5


Unnamed: 0,Accuracy,AUC,G-mean,Avg_Pfm,Training Time
Best models-1,0.464447,0.707905,0.392048,0.521467,1248.516
Best models-2,0.513207,0.708926,0.382362,0.534832,1248.516
Best models-3,0.512707,0.721654,0.39153,0.541964,1248.516
accuracy-refit-21,0.502664,0.711148,0.397822,0.537211,1248.516
accuracy-refit-11,0.490788,0.708562,0.395748,0.531699,1248.516
accuracy-refit-97,0.495573,0.709236,0.384634,0.529814,1248.516
accuracy-refit-17,0.50668,0.709383,0.397778,0.537947,1248.516
accuracy-refit-64,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-ensemble,0.507527,0.709259,0.395648,0.537478,1248.516
accuracy-refit-18,0.512581,0.721357,0.400488,0.544809,1248.516


# check tuner for class weight model

In [51]:
start = time.time()
tuner = Hyperband(build_model,
                    objective = obj, 
                    max_epochs = MAX_EPOCHS,
                    hyperband_iterations=3,
                    factor = FACTOR,
                    directory = 'my_dir',
                    project_name = 'val_'+ obj+'_'+time.ctime())
tuner.search(X_train, y_train,
             epochs=MAX_EPOCHS,batch_size=2048,
             verbose=0,
             class_weight=cls_wgt,
             callbacks=[early_stops(obj)],
             validation_data=(X_val, y_val))
end = time.time()

print('Tuning time is %.2f' % (end-start))
print(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values)
bp = pd.Series(tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values,name=obj)
bp = bp.append(pd.Series(end-start,index=['Tuning_time']))
bps = pd.concat((bps,bp),axis=1)
models = tuner.get_best_models(num_models=FACTOR)
for i in range(FACTOR):
    Y_pred = models[i].predict(X_test)
    evaluation = get_accs(y_test.values,Y_pred,end-start,'Best models-Weight-'+str(i+1))
    print(evaluation)
    rsts = rsts.append(evaluation)
print(bps)
print(rsts.iloc[:,5:])

  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']
  ...
    to  
  ['...']


In [30]:
rsts.to_csv('Extra fiting.csv')