In [1]:
import pickle
import os
import numpy as np

project_folder = "."

with open(os.path.join(project_folder,"data","train_test.pkl"), "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

In [2]:
type(X_test)

scipy.sparse.coo.coo_matrix

In [3]:
X_train_np = X_train.toarray()
X_test_np = (X_test.toarray()) 


y_train_np = np.array(y_train)
y_test_np  = np.array(y_test)

In [4]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
np_data_scaled = sc.fit_transform(X_train_np)
np_data_scaled_test = sc.transform(X_test_np)

In [5]:
### Imports 

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Dropout
from tensorflow.keras.utils import normalize, to_categorical
from sklearn.metrics import roc_curve, auc
from tensorflow.keras import optimizers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras import initializers
from tensorflow.keras.layers import LeakyReLU

In [6]:
model = Sequential()

model.add(tf.keras.Input(shape=(68,)))

model.add(Dense(500))
model.add(LeakyReLU())
model.add(Dropout(rate=0.2))

model.add(Dense(300))
model.add(LeakyReLU())

model.add(Dense(100))
model.add(LeakyReLU())

model.add(Dense(1,activation = 'sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 500)               34500     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 500)               0         
_________________________________________________________________
dropout (Dropout)            (None, 500)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               150300    
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               30100     
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 100)               0

In [7]:
# Compiling the ANN

earlyStopping = EarlyStopping(monitor='val_auc', mode='max', min_delta = 0.0001 , patience = 10,restore_best_weights=True)

callbacks_a = [earlyStopping]

adam = optimizers.Adam(lr = 0.0001)

model.compile(optimizer = optimizers.Adam(), loss = 'binary_crossentropy', metrics = [tf.keras.metrics.AUC(),tf.keras.metrics.Recall()])

In [8]:
# Fitting the ANN
model.fit(np_data_scaled, y_train_np, batch_size = 256, validation_data=(np_data_scaled_test,y_test_np), epochs = 1000 , callbacks = callbacks_a)

Train on 128000 samples, validate on 32000 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000


<tensorflow.python.keras.callbacks.History at 0x1a506e3dd0>

In [9]:
test = np_data_scaled_test[1]

In [10]:
test.shape

(68,)

In [11]:
model.predict(np_data_scaled_test)

array([[6.2316656e-05],
       [9.9841070e-01],
       [5.9143591e-01],
       ...,
       [1.8882751e-04],
       [3.0104220e-03],
       [3.3550262e-03]], dtype=float32)

In [12]:
from sklearn import metrics
import numpy as np
from tqdm import tqdm

probas = list(np.arange(0.01, 0.4, 0.01))

def calcCost(model, X, y, proba, fp = 10, fn = 500):
    y_hat_proba = model.predict(X,use_multiprocessing=True)
    y_hat = (y_hat_proba > proba).astype(int)
    mt = metrics.confusion_matrix(y, y_hat)
    acc = metrics.accuracy_score(y, y_hat)
    precision = metrics.precision_score(y, y_hat)
    return {
        "probability":proba,
        "accuracy": acc,
        "precision": precision,
        "fn":mt[1,0],
        "fn_cost":mt[1,0]*fn,
        "fp":mt[0,1],
        "fp_cost":mt[0,1]*fp
    }

cost = [ calcCost(model, np_data_scaled_test, y_test, proba) for proba in tqdm(probas) ]

100%|██████████| 39/39 [00:56<00:00,  1.45s/it]


In [13]:
import pandas as pd

cost_df = pd.DataFrame(cost)

cost_df["total_cost"] = cost_df.fn_cost + cost_df.fp_cost

cost_df.sort_values(by=["total_cost"], ascending=True).head()

Unnamed: 0,probability,accuracy,precision,fn,fn_cost,fp,fp_cost,total_cost
2,0.03,0.88925,0.786326,103,51500,3441,34410,85910
3,0.04,0.901937,0.807015,112,56000,3026,30260,86260
1,0.02,0.870125,0.75723,93,46500,4063,40630,87130
0,0.01,0.832875,0.706537,78,39000,5270,52700,91700
4,0.05,0.910094,0.821384,129,64500,2748,27480,91980


In [14]:
model.save('models/model_85.h5')