In [None]:
# Put the directory three levels above the working directory on the import path
# (presumably the repository root — confirm). The only import that would rely on
# it, `modules.machine_learning_utils` in the next cell, is currently commented out.
import sys
sys.path.append('../../../')

In [16]:
import pandas as pd
import numpy as np
import streamlit as st

# from modules import machine_learning_utils as mlu

from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Remove pandas' display truncation: None lifts the cap on the number of rows and
# columns rendered. Display only small slices (head/sample) while this is active,
# otherwise whole tables are written into the notebook output.
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [17]:
def partitioning(df,train_dim, val_test_dim,target):
    """
    Shuffle ``df`` and cut it into consecutive train / validation / test parts.

    The rows are shuffled once with a fixed seed (``random_state=2``) and then
    sliced positionally, so repeated calls on the same frame give the same split.
    The training shape and the class proportions of ``target`` in each part are
    reported through ``st.write``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain the ``target`` column.
    train_dim : float
        Fraction of rows assigned to the training set.
    val_test_dim : float
        Fraction of rows assigned to the validation set. The test set receives
        every remaining row, i.e. a fraction of ``1 - train_dim - val_test_dim``.
    target : str
        Name of the label column whose class proportions are reported.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``; each part keeps the original index labels.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions add up to more than 1.
    """
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected because of
    # floating point rounding.
    if train_dim < 0 or val_test_dim < 0 or train_dim + val_test_dim > 1 + 1e-9:
        raise ValueError(
            "train_dim and val_test_dim must be non-negative and sum to at most 1"
        )

    n_rows = len(df)
    train_end = int(train_dim * n_rows)
    val_end = int((train_dim + val_test_dim) * n_rows)

    # Positional slicing instead of np.split: np.split on a DataFrame goes through
    # DataFrame.swapaxes, which pandas has deprecated. The cut points are the same.
    shuffled = df.sample(frac=1, random_state=2)
    train = shuffled.iloc[:train_end]
    val = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]

    st.write("Train shape: ",train.shape)
    st.write("Train %: \n",train[target].value_counts(normalize=True))
    st.write("Val %: \n",val[target].value_counts(normalize=True))
    st.write("Test %: \n",test[target].value_counts(normalize=True))
    return train, val, test

In [18]:
def auc_score(y_trues, y_preds):
    """
    Mean ROC AUC over paired collections of true labels and predicted scores.

    ``y_trues[i]`` is scored against ``y_preds[i]`` with ``roc_auc_score`` and
    the per-pair scores are averaged. With a single pair the result is exactly
    that pair's AUC. (The previous loop returned only the last pair's score and
    silently discarded the others.)

    Parameters
    ----------
    y_trues : sequence of array-like
        One collection of ground-truth labels per evaluation set.
    y_preds : sequence of array-like
        One collection of predicted scores per evaluation set, in the same order.

    Returns
    -------
    float
        Average of the per-pair ROC AUC values.

    Raises
    ------
    ValueError
        If no pairs are supplied or the two sequences differ in length.
    """
    y_trues = list(y_trues)
    y_preds = list(y_preds)
    if len(y_trues) != len(y_preds):
        raise ValueError("y_trues and y_preds must contain the same number of sets")
    if not y_trues:
        raise ValueError("at least one (y_true, y_pred) pair is required")

    scores = [roc_auc_score(y_true, y_pred) for y_true, y_pred in zip(y_trues, y_preds)]
    return sum(scores) / len(scores)

def adversarial_validation(val,test,drop_cols):
    """
    Check whether validation and test rows can be told apart by a classifier.

    Validation rows are labelled 1.0 and test rows 0.0, the two sets are stacked
    and shuffled, and a CatBoost classifier is trained on 70% of the stacked rows
    to predict the origin label. The ROC AUC on the remaining 30% is reported via
    ``st.write``: an AUC of at most 0.6 is reported as "no distribution shift",
    anything higher as a prompt to inspect the features.

    Parameters
    ----------
    val : pandas.DataFrame
        Validation set. It is copied, not modified.
    test : pandas.DataFrame
        Test set with the same columns. It is copied, not modified.
    drop_cols : list of str
        Columns removed before modelling.

    Returns
    -------
    The return value of ``st.write``; the AUC is reported through Streamlit and
    is not returned to the caller.

    Notes
    -----
    Numeric columns are cast to float and imputed with the column median; every
    other column is treated as categorical: missing values become the string
    ``'missing'`` and the values are cast to ``str``.
    """
    aval = val.copy()
    atest = test.copy()
    # 1 - define target: 1.0 marks validation rows, 0.0 marks test rows
    aval['y'] = 1.0
    atest['y'] = 0.0
    # 2 - create dataframe (pd.concat: DataFrame.append was removed in pandas 2.0)
    ad = pd.concat([aval, atest]).sample(frac=1,random_state=2)
    # 3 - drop unuseful columns
    ad = ad.drop(columns=drop_cols)
    # 4 - separate the label first, so column positions refer to the feature matrix only
    y = ad['y'].values
    features = ad.drop(columns=['y'])
    # 5 - define format and imputation, recording the positions of categorical columns
    categorical_features_indices = []
    for position, col in enumerate(features.columns):
        if pd.api.types.is_numeric_dtype(features[col]):
            features[col] = features[col].astype(float)
            features[col] = features[col].fillna(features[col].median())
        else:
            # Going through object dtype lets fillna accept the new 'missing' value
            # for any non-numeric dtype (object, string, category, datetime).
            features[col] = features[col].astype(object).fillna('missing').astype(str)
            categorical_features_indices.append(position)
    X = features.values
    # 6 - train test split
    adv_X_train, adv_X_test, adv_y_train, adv_y_test = train_test_split(X, y , test_size = 0.30 , random_state = 2)
    train_data = Pool(data=adv_X_train,label=adv_y_train,cat_features=categorical_features_indices)
    test_data = Pool(data=adv_X_test,label=adv_y_test,cat_features=categorical_features_indices)
    # 7 - model training; the overfitting detector stops 50 iterations after the best AUC
    params = {'iterations': 1000,'eval_metric': 'AUC','od_type': 'Iter','od_wait': 50}
    model = CatBoostClassifier(**params)
    # NOTE(review): the hold-out pool doubles as the early-stopping eval_set, so the
    # AUC computed below may be slightly optimistic — confirm this is acceptable.
    _ = model.fit(train_data, eval_set=test_data, plot=False, verbose=False)
    # 8 - model evaluation
    auc = auc_score([test_data.get_label()],[model.predict_proba(test_data)[:,1]])
    if auc <= 0.6:
        return st.write("No distribution shift, OK! AUC is: ",auc)
    else:
        return st.write("Check features importance (to be added) and rerun. AUC is: ",auc)

In [19]:
def class_imbalance(train,target):
    """
    Balance a training set by randomly undersampling the most frequent class.

    The most frequent class of ``target`` is downsampled (seeded with
    ``random_state=2``) to the size of the second most frequent class; both
    groups are then stacked and shuffled. Rows belonging to any further class,
    or with a missing target, are not part of the result. The resulting class
    counts and shape are reported through ``st.write``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing the ``target`` column.
    target : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with equally many rows of the two most frequent classes.

    Raises
    ------
    ValueError
        If ``target`` holds fewer than two distinct classes.
    """
    # value_counts sorts by frequency, so position 0 is the majority class.
    counts = train[target].value_counts(normalize=False)
    if len(counts) < 2:
        raise ValueError("target must contain at least two classes to rebalance")
    majority_label, minority_label = counts.index[0], counts.index[1]

    df_under = train[train[target] == minority_label]
    df_over = train[train[target] == majority_label]
    # An explicit row count avoids the float rounding of frac=len(under)/len(over).
    df_over = df_over.sample(n=len(df_under), random_state=2)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    final_train = pd.concat([df_over, df_under]).sample(frac=1,random_state=2)
    st.write(final_train[target].value_counts(normalize=False))
    st.write("Train dataset shape: ",final_train.shape)
    return final_train

# 1.0 Data retrieval

In [31]:
# Load the input dataset from a pickle file. The path is relative to the working
# directory and, unlike the commented-out export paths in the final cell, carries
# no '../../../' prefix — confirm which location is correct.
# Unpickling can execute arbitrary code: only load files from a trusted source.
df = pd.read_pickle("data_lake/output/df_under.pkl")

In [1]:
# Peek at two rows; seeded so the preview is the same on every run.
df.sample(2, random_state=2)

In [22]:
# Dataset dimensions as (rows, columns).
df.shape

(144233, 41)

# 2.0 Data Partitioning

In [23]:
# Absolute class counts of the target column, to see the imbalance before splitting.
df['isFraud'].value_counts()

0    132915
1     11318
Name: isFraud, dtype: int64

## 2.1 Train, validation, test split

In [24]:
# Split fractions. They were not defined anywhere in the notebook, so a fresh
# "Restart & Run All" stopped here with a NameError. The values reproduce the
# recorded outputs: 100,963 of 144,233 rows in train -> 0.70, and the recorded
# class proportions match validation and test sets of 21,635 rows each -> 0.15
# (TODO: confirm against the original run).
train_dim = 0.70      # fraction of rows used for training
val_test_dim = 0.15   # fraction used for validation; the remaining rows form the test set
train, val, test = partitioning(df,train_dim, val_test_dim,'isFraud')

Train shape:  (100963, 41)
Train %: 
 0    0.920496
1    0.079504
Name: isFraud, dtype: float64
Val %: 
 0    0.923319
1    0.076681
Name: isFraud, dtype: float64
Test %: 
 0    0.924567
1    0.075433
Name: isFraud, dtype: float64


## 2.2 Adversarial validation between val and test set

In [25]:
# Columns removed before fitting the adversarial classifier
# (presumably pure identifiers with no predictive meaning — confirm).
drop_cols = ['TransactionID','customer_id']

In [26]:
# Train a classifier to tell validation rows from test rows; the reported AUC
# is compared against 0.6 inside the function to flag a distribution shift.
adversarial_validation(val,test,drop_cols)

No distribution shift, OK! AUC is:  0.5046975542178611


## 2.3 Class imbalance

In [27]:
# Rebalance the training set only; validation and test keep their original class mix.
final_train = class_imbalance(train,'isFraud')

0    8027
1    8027
Name: isFraud, dtype: int64
Train dataset shape:  (16054, 41)


# 3.0 Output

In [28]:
# final_train.to_pickle('../../../data_lake/output/train.pkl')
# val.to_pickle('../../../data_lake/output/val.pkl')
# test.to_pickle('../../../data_lake/output/test.pkl')