In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import numpy as np

from tensorflow.keras.models import Sequential
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn import metrics
from sklearn.utils import shuffle
from tqdm import tqdm

In [14]:
path = ""

# Number of balanced subsets to build. Kept at 14, the value the loop used
# originally and the number of checkpoints NeuralNet_pred loads by default.
N_SUBSETS = 14

# Read the data
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_data = pd.read_pickle(path + "train_task1.pkl")
test_data = pd.read_pickle(path + "test_task1.pkl")

label1 = train_data[train_data['label'] == 1] # small business owner
label0 = train_data[train_data['label'] == 0] # Non-small business owner

print("Number of small business owners: ", len(label1))
print("Number of Non-small business owners: ", len(label0))

# Undersample the majority class: every subset holds all label-1 rows plus a
# distinct, consecutive slice of label-0 rows of the same length.
chunk_size = len(label1)
train = []
for i in tqdm(range(N_SUBSETS)):
    start = i * chunk_size
    # The final subset takes every remaining label-0 row so none are dropped.
    stop = start + chunk_size if i < N_SUBSETS - 1 else None
    # .iloc makes the positional row slicing explicit.
    train.append(pd.concat([label1, label0.iloc[start:stop]]))


Number of small business owners:  47085
Number of Non-small business owners:  672914


100%|██████████| 14/14 [00:00<00:00, 40.21it/s]


In [15]:
# Display the column labels of the training frame.
train_data.columns

Index(['pay day login counts', 'pay day login with money transfer counts',
       'pay day login duration counts', 'c_week1', 'c_week2', 'c_week3',
       'c_week4', 'c_week5', 'c_week6', 'c_week7',
       ...
       'region_code_9', 'region_code_10', 'region_code_11', 'region_code_12',
       'region_code_13', 'region_code_14', 'region_code_15', 'region_code_16',
       'region_code_17', 'region_code_18'],
      dtype='object', length=140)

In [22]:
# Aggregate groups of activity columns into one total per row.
# Each entry maps a column-name marker to the name of the total it feeds.
# NOTE(review): markers are matched as substrings, exactly as before;
# presumably the source columns look like c_week1 / s_week1 / t_week1 —
# confirm, and tighten to str.startswith if that holds.
TOTAL_FEATURES = {
    "c_": "total logins",
    "s_": "total logins with transfer",
    "t_": "total logins duration",
}

for marker, total_name in TOTAL_FEATURES.items():
    # Resolve the source columns once from the training frame and apply the
    # same list to both frames so train and test get identical features.
    source_cols = [col for col in train_data.columns if marker in col]
    # "total logins" was previously added to train_data only, which left the
    # two frames with different column sets; every total now goes to both.
    for frame in (train_data, test_data):
        frame[total_name] = 0
        for col in source_cols:
            frame[total_name] += frame[col]

print(train_data.shape)
# Keep only training rows with a non-zero login total. The .copy() detaches
# the result so re-running this cell assigns into a real frame, not a slice.
train_data = train_data[train_data["total logins"] != 0].copy()
print(train_data.shape)

(716730, 143)
(716730, 143)


In [23]:
# Pairwise correlation (DataFrame.corr with its default method) between the
# three aggregated totals and the label.
train_data[["total logins", "total logins with transfer", "total logins duration", "label"]].corr()

Unnamed: 0,total logins,total logins with transfer,total logins duration,label
total logins,1.0,0.713689,0.751361,0.217239
total logins with transfer,0.713689,1.0,0.661368,0.207604
total logins duration,0.751361,0.661368,1.0,0.270116
label,0.217239,0.207604,0.270116,1.0


In [3]:
def Find_Optimal_Cutoff(target, predicted):
    """Pick the ROC threshold where the true-positive rate is closest to 1 - FPR.

    Parameters
    ----------
    target : array-like
        Ground-truth labels, one per observation.

    predicted : array-like
        Predicted scores for the same observations.

    Returns
    -------
    list
        Single-element list holding the threshold whose ``tpr - (1 - fpr)``
        has the smallest absolute value.
    """
    false_pos, true_pos, cutoffs = metrics.roc_curve(target, predicted)

    # tpr - (1 - fpr) crosses zero where sensitivity equals specificity.
    balance = pd.Series(true_pos - (1 - false_pos))
    nearest = balance.abs().argsort()[:1]

    return list(pd.Series(cutoffs).iloc[nearest])


In [11]:
# Neural Network
def NeuralNet_pred(train_data, test_data,
                   checkpoint_dir="models_with_payday_info_dropout",
                   n_models=14):
    """Evaluate an ensemble of saved checkpoints on ``test_data``.

    A StandardScaler is fit on the features of ``train_data`` and applied to
    the features of ``test_data``. The network is rebuilt, the weights of each
    checkpoint ``{checkpoint_dir}/model1`` .. ``model{n_models}`` are loaded in
    turn, and the per-checkpoint predictions are averaged. PR AUC, ROC AUC,
    the chosen threshold, accuracy, confusion matrix and classification
    report are printed.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame with a 'label' column; only its feature columns are used, to
        fit the scaler and to size the network input.
    test_data : pandas.DataFrame
        Frame with a 'label' column; scored by the ensemble.
    checkpoint_dir : str, optional
        Directory holding the checkpoints (default is the original path).
    n_models : int, optional
        Number of checkpoints to average (default 14, as originally).

    Returns
    -------
    tuple
        ``(y_test, preds)``: the test labels and the element-wise mean of the
        ``model.predict`` outputs over all checkpoints.

    Raises
    ------
    ValueError
        If ``n_models`` is smaller than 1.
    """
    if n_models < 1:
        raise ValueError("n_models must be at least 1")

    # Split the data. The training labels are not needed for inference, and
    # the former shuffle is dropped: row order does not change the fitted
    # mean/variance beyond floating-point rounding.
    X_train = train_data.drop(['label'], axis=1)
    X_test = test_data.drop(['label'], axis=1)
    y_test = test_data['label']

    # Scale the data
    # NOTE(review): the same scaler (fit on the train_data passed in) is used
    # for every checkpoint — confirm each checkpoint was trained with it.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Create the model
    # NOTE(review): the recorded run logged "Two checkpoint references
    # resolved to different objects" while loading, and the checkpoint
    # directory name mentions dropout although no Dropout layer is built
    # here — confirm this architecture matches the one that was trained.
    model = Sequential()
    model.add(Dense(128, input_dim=X_train.shape[1], activation='selu'))
    model.add(Dense(64, activation='gelu'))
    model.add(Dense(32, activation='selu'))
    model.add(Dense(16, activation='gelu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    LEARNING_RATE = 0.0001
    WEIGHT_DECAY = 0.0001

    optimizer = tfa.optimizers.AdamW(
        learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.AUC(name="PR AUC", curve='PR')],
    )

    # Average the predictions of every saved checkpoint.
    preds = []
    for i in tqdm(range(1, n_models + 1)):
        model.load_weights(f"{checkpoint_dir}/model{i}")
        # Predict the test data
        preds.append(model.predict(X_test))

    preds = np.mean(preds, axis=0)

    print(f"PR AUC: {average_precision_score(y_test, preds)}")
    print(f"ROC AUC: {roc_auc_score(y_test, preds)}")

    # Compute the cutoff once and reuse it (it used to be computed twice).
    # NOTE(review): the threshold is tuned on the test labels themselves, so
    # the thresholded metrics below are optimistic — prefer a validation set.
    cutoff = Find_Optimal_Cutoff(y_test, preds)
    print("Threshold: ", cutoff)
    y_pred = np.where(preds > cutoff[0], 1, 0)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Confusion Matrix: {confusion_matrix(y_test, y_pred)}")
    print(f"Classification Report: {classification_report(y_test, y_pred)}")

    return y_test, preds

In [12]:
# Score the saved checkpoints on test_data; train[0] is passed as train_data.
# NOTE(review): this cell's execution count (In [12]) precedes that of the
# cell building `train` (In [14]) — confirm it works under Restart & Run All.
y_test, preds = NeuralNet_pred(train[0], test_data)

  0%|          | 0/14 [00:00<?, ?it/s]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


  7%|▋         | 1/14 [00:03<00:40,  3.10s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 14%|█▍        | 2/14 [00:06<00:36,  3.06s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 21%|██▏       | 3/14 [00:09<00:34,  3.15s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 29%|██▊       | 4/14 [00:12<00:30,  3.05s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 36%|███▌      | 5/14 [00:15<00:27,  3.03s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 43%|████▎     | 6/14 [00:18<00:23,  2.99s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 50%|█████     | 7/14 [00:21<00:21,  3.05s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 57%|█████▋    | 8/14 [00:24<00:17,  2.98s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 64%|██████▍   | 9/14 [00:27<00:14,  3.00s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 71%|███████▏  | 10/14 [00:30<00:11,  2.98s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 79%|███████▊  | 11/14 [00:33<00:08,  2.95s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 86%|████████▌ | 12/14 [00:36<00:05,  2.98s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


 93%|█████████▎| 13/14 [00:39<00:03,  3.01s/it]


Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCB347E280> and <keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB84ECA0> and <keras.layers.core.dense.Dense object at 0x000001DCBB8617C0>).

Two checkpoint references resolved to different objects (<keras.layers.core.dense.Dense object at 0x000001DCBB8617C0> and <keras.layers.core.dense.Dense object at 0x000001DCBB857310>).


100%|██████████| 14/14 [00:42<00:00,  3.01s/it]


PR AUC: 0.3029021474111073
ROC AUC: 0.8152537580784494
Threshold:  [5.839669463073828e-24]
Accuracy: 0.7383407707403657
Confusion Matrix: [[55206 19563]
 [ 1370  3862]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.74      0.84     74769
           1       0.16      0.74      0.27      5232

    accuracy                           0.74     80001
   macro avg       0.57      0.74      0.56     80001
weighted avg       0.92      0.74      0.80     80001



In [18]:
# Preview the first rows of the first subset stored in `train`.
train[0].head()

Unnamed: 0,c_week1,c_week2,c_week3,c_week4,c_week5,c_week6,c_week7,c_week8,c_week9,c_week10,...,region_code_9,region_code_10,region_code_11,region_code_12,region_code_13,region_code_14,region_code_15,region_code_16,region_code_17,region_code_18
36,0.714286,0.714286,0.714286,1.571429,2.571429,0.285714,0.571429,1.714286,1.857143,1.285714,...,0,0,0,0,0,0,0,0,0,0
49,0.428571,1.428571,0.0,0.142857,0.285714,0.285714,0.428571,0.571429,0.0,0.285714,...,0,0,0,0,0,0,0,0,0,0
90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.428571,...,0,0,0,0,0,0,0,0,0,0
111,1.571429,1.285714,0.714286,1.571429,1.285714,1.142857,2.0,1.0,2.714286,1.857143,...,0,0,0,0,0,0,0,0,0,0
134,5.142857,4.857143,2.714286,2.285714,2.142857,2.0,3.0,1.857143,2.714286,4.0,...,0,0,0,0,0,0,0,0,0,0
