In [20]:
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Activation, Dropout, Flatten, Dense

from sklearn.model_selection import KFold
import tensorflow as tf

In [21]:
# Remove missing values
#
# Reads the raw credit file, drops every record that contains a '?' marker
# (missing value), tallies the class labels of the remaining records, and
# writes both the cleaned records and a short class-distribution summary.
kept_rows = []
cleaned_p_count = 0
cleaned_n_count = 0

with open('data/crx.data', 'r') as f:
    data = f.readlines()

for row in data:
    # Check for '?' value in each row (indicates missing)
    if '?' in row:
        continue
    kept_rows.append(row)

    # The class label is the last comma-separated field. Comparing that field
    # exactly (instead of searching the whole row for '+' / '-') keeps the
    # tally correct even if another field ever contains one of those characters.
    label = row.rsplit(',', 1)[-1].strip()
    if label == '+':
        cleaned_p_count += 1
    elif label == '-':
        cleaned_n_count += 1

# Join once at the end rather than growing a string inside the loop.
cleaned_data = "".join(kept_rows)
print(cleaned_data)

with open('./data/crx_clean.data.txt', 'w') as f:
    f.write(cleaned_data)

with open('./data/crx_clean.names.txt', 'w') as f:
    f.write("Class Distribution\n")
    f.write("+ Classes: %d\n" %cleaned_p_count)
    f.write("- Classes: %d\n" %cleaned_n_count)

b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
a,58.67,4.46,u,g,q,h,3.04,t,t,06,f,g,00043,560,+
a,24.50,0.5,u,g,q,h,1.5,t,f,0,f,g,00280,824,+
b,27.83,1.54,u,g,w,v,3.75,t,t,05,t,g,00100,3,+
b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
b,32.08,4,u,g,m,v,2.5,t,f,0,t,g,00360,0,+
b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,00164,31285,+
a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,00080,1349,+
b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,00180,314,+
b,42.50,4.915,y,p,w,v,3.165,t,f,0,t,g,00052,1442,+
b,22.08,0.83,u,g,c,h,2.165,f,f,0,t,g,00128,0,+
b,29.92,1.835,u,g,c,h,4.335,t,f,0,f,g,00260,200,+
a,38.25,6,u,g,k,v,1,t,f,0,t,g,00000,0,+
b,48.08,6.04,u,g,k,v,0.04,f,f,0,f,g,00000,2690,+
a,45.83,10.5,u,g,q,v,5,t,t,07,t,g,00000,0,+
b,36.67,4.415,y,p,k,v,0.25,t,t,10,t,g,00320,0,+
b,28.25,0.875,u,g,m,v,0.96,t,t,03,t,g,00396,0,+
a,23.25,5.875,u,g,q,v,3.17,t,t,10,f,g,00120,245,+
b,21.83,0.25,u,g,d,h,0.665,t,f,0,t,g,00000,0,+
a,19.17,8.585,u,g,cc,h,0.75,t,t,07,f,g,00096,0,+
b,25.00,11.25,u,g,c,v,2.5,t,t,17,f,g,00200,1208,+
b,23

In [22]:
def one_hot_encode_category(credit_data, label_col=15):
    """
    Splits 'category' columns into one-hot columns.

    A column is treated as categorical when it holds strings (object or
    string dtype). This relies on the data having been cleaned first: a
    numeric column that still contained '?' placeholders would have been
    read as strings and would be encoded as well.

    args
        credit_data: Dataframe
        label_col: label of the class column, which is never encoded
            (defaults to 15, the class column of the credit data)
    return
        credit_data: Dataframe with every categorical feature column
            replaced by its one-hot (dummy) columns
    """
    is_string = pd.api.types.is_string_dtype

    # Iterate over the real column labels. Using enumerate() positions as
    # labels only works when the labels happen to be the integers 0..n-1.
    cat_columns = []
    for col in credit_data.columns:
        if col == label_col:
            continue
        dtype = credit_data[col].dtype
        if dtype == 'object' or is_string(dtype):
            cat_columns.append(col)

    # get_dummies() one-hot encodes data
    return pd.get_dummies(credit_data, columns=cat_columns)

In [23]:
seed = 100

def import_data(url):
    """
    Load the cleaned credit data and prepare it for modelling.

    The file is read without a header row, the last column (the class
    attribute) is moved to the front, and the result is passed through
    one_hot_encode_category(). Progress information is printed along the way.

    args
        url: url string of CLEANED csv data
    returns
        credit_data: Dataframe
    """
    raw_frame = pd.read_csv(url, header=None, sep=',')

    # Rotate the column order by one so the class attribute comes first
    labels = list(raw_frame.columns)
    rotated = labels[-1:] + labels[:-1]
    class_first = raw_frame[rotated]
    print("Reordered Dataset: \n", class_first.head())

    credit_data = one_hot_encode_category(class_first)
    n_rows = len(credit_data)
    print("Dataset length: ", n_rows)
    print("Dataset shape: ", credit_data.shape)
    print("One-hot Dataset: \n", credit_data.head())
    return credit_data

In [24]:
#Read filtered data and feature setup and split to feed

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# Building Phase
data = import_data("data/crx_clean.data.txt")

# Column 0 holds the class label; every remaining column is a feature.
X = data.values[:, 1:]
Y = data.values[:, 0]

# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = seed)

#Float conversion
# The `np.float` alias was removed in NumPy 1.24; the builtin `float` is the
# same 64-bit type and works on every NumPy version.
X_train = X_train.astype(float)
X_test = X_test.astype(float)

# Map the class symbols to numbers: '+' -> 1, '-' -> 0
Y_train = np.where(Y_train=='+', 1, Y_train)
Y_train = np.where(Y_train=='-', 0, Y_train)
Y_train = Y_train.astype(float)

Y_test = np.where(Y_test=='+', 1, Y_test)
Y_test = np.where(Y_test=='-', 0, Y_test)
Y_test = Y_test.astype(float)

Reordered Dataset: 
   15 0      1      2  3  4  5  6     7  8  9   10 11 12   13   14
0  +  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  202    0
1  +  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g   43  560
2  +  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  280  824
3  +  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  100    3
4  +  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  120    0
Dataset length:  653
Dataset shape:  (653, 47)
One-hot Dataset: 
   15      1      2     7  10   13   14  0_a  0_b  3_l  ...  6_z  8_f  8_t  \
0  +  30.83  0.000  1.25   1  202    0    0    1    0  ...    0    0    1   
1  +  58.67  4.460  3.04   6   43  560    1    0    0  ...    0    0    1   
2  +  24.50  0.500  1.50   0  280  824    1    0    0  ...    0    0    1   
3  +  27.83  1.540  3.75   5  100    3    0    1    0  ...    0    0    1   
4  +  20.17  5.625  1.71   0  120    0    0    1    0  ...    0    0    1   

   9_f  9_t  11_f  11_t  12_g  12_p  12_s  
0    0    

In [25]:
# Merge inputs and targets
# Stack the train and test portions back together along the first axis
# (train rows first, then test rows).
inputs = np.concatenate([X_train, X_test])
targets = np.concatenate([Y_train, Y_test])

In [26]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """
    Recall of the given predictions: true positives / actual positives.

    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator to avoid dividing by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """
    Precision of the given predictions: true positives / predicted positives.

    Values are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator to avoid dividing by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    predicted_count = K.sum(predicted_mask)
    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_true, y_pred):
    """
    F1 score: harmonic mean of precision_m() and recall_m().

    K.epsilon() in the denominator keeps the result defined (zero) when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [27]:
def get_model(optimizerF, lossF):
    """
    Build and compile a fully connected binary classifier.

    Architecture: 46 inputs -> Dense(46, relu) -> Dense(20, relu)
    -> Dense(1, sigmoid). The model reports f1_m as its metric.

    args
        optimizerF: optimizer passed to compile()
        lossF: loss passed to compile()
    returns
        the compiled Sequential model
    """
    network = Sequential([
        Dense(46, input_dim=46, activation='relu'),
        Dense(20, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer=optimizerF, loss=lossF, metrics=[f1_m])
    return network

In [28]:
def evaluateModel(optimizerF, lossF, n_splits=5, epochs=50, batch_size=10, random_state=None):
    """
    Cross-validate the network built by get_model() and print the scores.

    For every fold a fresh model is built, trained on the fold's training
    rows of the module-level `inputs` / `targets`, and evaluated on the
    held-out rows. Per-fold loss and F1 are printed, followed by the mean
    (and standard deviation of the F1 score) over all folds.

    args
        optimizerF: optimizer handed to get_model()
        lossF: loss handed to get_model()
        n_splits: number of cross-validation folds
        epochs: training epochs per fold
        batch_size: training batch size
        random_state: seed for the fold shuffling; None uses the
            module-level `seed`
    returns
        None (results are printed)
    """
    print(f'**************************************{optimizerF}----{lossF}**************************************')

    # Seed the shuffle so every optimizer/loss combination is scored on the
    # same folds and reruns reproduce the same split. Unseeded, each call
    # drew a different random partition.
    if random_state is None:
        random_state = seed

    # Define per-fold score containers
    f1_per_fold = []
    loss_per_fold = []

    # Define the K-fold Cross Validator
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # K-fold Cross Validation model evaluation
    for train, test in kfold.split(inputs, targets):
        # Release Keras state left by the previous fold's model; many models
        # are built in a loop and would otherwise accumulate.
        K.clear_session()

        # Define the model architecture
        model = get_model(optimizerF, lossF)

        # Fit data to model
        model.fit(inputs[train], targets[train], batch_size=batch_size, epochs=epochs, verbose=0)

        # Generate generalization metrics: scores[0] is the loss and
        # scores[1] the compiled metric (f1_m).
        scores = model.evaluate(inputs[test], targets[test], verbose=0)
        f1_per_fold.append(scores[1])
        loss_per_fold.append(scores[0])

    # == Provide average scores ==
    print('------------------------------------------------------------------------')
    print('Score per fold')
    for fold_no, (fold_loss, fold_f1) in enumerate(zip(loss_per_fold, f1_per_fold), start=1):
        print('------------------------------------------------------------------------')
        print(f'> Fold {fold_no} - Loss: {fold_loss} - F1 score: {fold_f1}')

    print('------------------------------------------------------------------------')
    print('F1 scores for all folds:')
    print(f'> F1: {np.mean(f1_per_fold)} (+- {np.std(f1_per_fold)})')
    print(f'> Loss: {np.mean(loss_per_fold)}')
    print('------------------------------------------------------------------------')

In [29]:
# Search grid: every optimizer below is paired with every loss function.
optimzers = [
    'adam',
    'SGD',
    'RMSprop',
    'Adadelta',
    'Adagrad',
    'Adamax',
    'Nadam',
    'Ftrl',
]
lossFunctions = [
    'binary_crossentropy',
    'mean_squared_error',
]

In [31]:
# Evaluate every optimizer / loss pairing with the current get_model()
for opti in optimzers:
    for los in lossFunctions:
        evaluateModel(optimizerF=opti, lossF=los)

**************************************adam----binary_crossentropy**************************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.7380684614181519 - F1 score: 0.6606453657150269
------------------------------------------------------------------------
> Fold 2 - Loss: 0.3622402250766754 - F1 score: 0.8632726669311523
------------------------------------------------------------------------
> Fold 3 - Loss: 0.7761539816856384 - F1 score: 0.8000894784927368
------------------------------------------------------------------------
> Fold 4 - Loss: 0.5851089954376221 - F1 score: 0.8237780332565308
------------------------------------------------------------------------
> Fold 5 - Loss: 0.800238847732544 - F1 score: 0.5476856231689453
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.739

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.5178340673446655 - F1 score: 0.6711379885673523
------------------------------------------------------------------------
> Fold 2 - Loss: 0.7362622618675232 - F1 score: 0.6934472322463989
------------------------------------------------------------------------
> Fold 3 - Loss: 0.47649747133255005 - F1 score: 0.7414284944534302
------------------------------------------------------------------------
> Fold 4 - Loss: 1.31766939163208 - F1 score: 0.5101239681243896
------------------------------------------------------------------------
> Fold 5 - Loss: 0.5467340350151062 - F1 score: 0.5781061053276062
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.6388487577438354 (+- 0.08343372401339955)
> Loss: 0.718999445438385
---------------------------------------

In [32]:
def get_model(optimizerF, lossF):
    """
    Build and compile a fully connected binary classifier with three
    hidden layers.

    Architecture: 46 inputs -> Dense(46, relu) -> Dense(20, relu)
    -> Dense(10, relu) -> Dense(1, sigmoid). The model reports f1_m as
    its metric.

    args
        optimizerF: optimizer passed to compile()
        lossF: loss passed to compile()
    returns
        the compiled Sequential model
    """
    network = Sequential([
        Dense(46, input_dim=46, activation='relu'),
        Dense(20, activation='relu'),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer=optimizerF, loss=lossF, metrics=[f1_m])
    return network


# Re-run the full optimizer / loss grid against this architecture
for opti in optimzers:
    for los in lossFunctions:
        evaluateModel(optimizerF=opti, lossF=los)

**************************************adam----binary_crossentropy**************************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.4597422182559967 - F1 score: 0.8283277750015259
------------------------------------------------------------------------
> Fold 2 - Loss: 0.5009877681732178 - F1 score: 0.6318618059158325
------------------------------------------------------------------------
> Fold 3 - Loss: 0.5495796203613281 - F1 score: 0.7999998927116394
------------------------------------------------------------------------
> Fold 4 - Loss: 0.39907586574554443 - F1 score: 0.6696685552597046
------------------------------------------------------------------------
> Fold 5 - Loss: 0.9551959037780762 - F1 score: 0.5423809289932251
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.6

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.48175400495529175 - F1 score: 0.7713332772254944
------------------------------------------------------------------------
> Fold 2 - Loss: 0.6529777646064758 - F1 score: 0.6641644835472107
------------------------------------------------------------------------
> Fold 3 - Loss: 0.5858878493309021 - F1 score: 0.7445237040519714
------------------------------------------------------------------------
> Fold 4 - Loss: 0.6554355621337891 - F1 score: 0.40840578079223633
------------------------------------------------------------------------
> Fold 5 - Loss: 0.5871337056159973 - F1 score: 0.5007023811340332
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.6178259253501892 (+- 0.14095305441170192)
> Loss: 0.5926377773284912
-----------------------------------

In [34]:

def get_model(optimizerF, lossF):
    """
    Build and compile a fully connected binary classifier with a narrower
    second hidden layer.

    Architecture: 46 inputs -> Dense(46, relu) -> Dense(10, relu)
    -> Dense(1, sigmoid). The model reports f1_m as its metric.

    args
        optimizerF: optimizer passed to compile()
        lossF: loss passed to compile()
    returns
        the compiled Sequential model
    """
    network = Sequential([
        Dense(46, input_dim=46, activation='relu'),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer=optimizerF, loss=lossF, metrics=[f1_m])
    return network


# Re-run the full optimizer / loss grid against this architecture
for opti in optimzers:
    for los in lossFunctions:
        evaluateModel(optimizerF=opti, lossF=los)

**************************************adam----binary_crossentropy**************************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.5775033831596375 - F1 score: 0.6335237622261047
------------------------------------------------------------------------
> Fold 2 - Loss: 0.49306952953338623 - F1 score: 0.814073920249939
------------------------------------------------------------------------
> Fold 3 - Loss: 0.3714781403541565 - F1 score: 0.8959332704544067
------------------------------------------------------------------------
> Fold 4 - Loss: 1.4474138021469116 - F1 score: 0.6190057992935181
------------------------------------------------------------------------
> Fold 5 - Loss: 0.7515568733215332 - F1 score: 0.8480879068374634
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.76

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 1.2316051721572876 - F1 score: 0.4767506718635559
------------------------------------------------------------------------
> Fold 2 - Loss: 0.5735089182853699 - F1 score: 0.7539047002792358
------------------------------------------------------------------------
> Fold 3 - Loss: 0.6088120937347412 - F1 score: 0.6503904461860657
------------------------------------------------------------------------
> Fold 4 - Loss: 0.6559863090515137 - F1 score: 0.4659718871116638
------------------------------------------------------------------------
> Fold 5 - Loss: 1.1095212697982788 - F1 score: 0.5223754644393921
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.5738786339759827 (+- 0.11134965242115694)
> Loss: 0.8358867526054382
-------------------------------------

In [43]:
from keras.regularizers import l2,l1

def get_model(optimizerF, lossF, lamda=0.1):
    """
    Build and compile a binary classifier whose second hidden layer is
    L2-regularised.

    Architecture: 46 inputs -> Dense(46, relu) -> Dense(10, relu, with L2
    penalties on kernel and bias) -> Dense(1, sigmoid). The model reports
    f1_m as its metric.

    args
        optimizerF: optimizer passed to compile()
        lossF: loss passed to compile()
        lamda: L2 regularisation strength applied to both the kernel and
            the bias of the second hidden layer (default 0.1)
    returns
        the compiled Sequential model
    """
    model = Sequential()
    model.add(Dense(46, input_dim=46, activation='relu'))
    model.add(Dense(10, activation='relu', kernel_regularizer=l2(lamda), bias_regularizer=l2(lamda)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=lossF,optimizer=optimizerF,metrics=[f1_m])
    return model
evaluateModel('RMSprop','binary_crossentropy')

**************************************RMSprop----binary_crossentropy**************************************
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.6171974539756775 - F1 score: 0.8058371543884277
------------------------------------------------------------------------
> Fold 2 - Loss: 2.969980478286743 - F1 score: 0.6206064224243164
------------------------------------------------------------------------
> Fold 3 - Loss: 0.6620937585830688 - F1 score: 0.8579448461532593
------------------------------------------------------------------------
> Fold 4 - Loss: 1.1280773878097534 - F1 score: 0.6322553753852844
------------------------------------------------------------------------
> Fold 5 - Loss: 0.9970367550849915 - F1 score: 0.7748794555664062
------------------------------------------------------------------------
F1 scores for all folds:
> F1: 0.