In [1]:
# Injected parameters
# NOTE(review): the header above suggests these values are supplied by a
# parameterised runner (e.g. papermill) -- confirm. Both names are assigned
# again further down the notebook, right before partitioning() is called.
train_dim = 0.7  # share of rows passed to partitioning() for the training set
val_test_dim = 0.15  # share of rows passed to partitioning() for the validation set


In [2]:
import sys
sys.path.append('../../../')

In [3]:
import pandas as pd
import numpy as np
import streamlit as st

from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Lift pandas' display truncation: every row and every column of a displayed
# frame is rendered. Only display small slices (sample/head) with these set.
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [4]:
def partitioning(df,train_dim, val_test_dim,target):
    """
    Shuffle ``df`` and cut it into train, validation and test partitions.

    The rows are shuffled with a fixed seed (``random_state=2``) and then
    sliced by position: the first ``train_dim`` share becomes the training
    set, the next ``val_test_dim`` share the validation set, and every
    remaining row the test set. The split is not stratified; the class
    proportions of each partition are reported through ``st.write`` so the
    balance can be checked by eye.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; it is not modified.
    train_dim : float
        Fraction of rows assigned to the training set.
    val_test_dim : float
        Fraction of rows assigned to the validation set. The test set gets
        whatever is left, i.e. ``1 - train_dim - val_test_dim``.
    target : str
        Name of the label column used for the class-proportion report.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, disjoint and together covering all rows.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Small tolerance so that sums such as 0.85 + 0.15 are not rejected
    # because of floating point rounding.
    if train_dim < 0 or val_test_dim < 0 or train_dim + val_test_dim > 1 + 1e-9:
        raise ValueError(
            "train_dim and val_test_dim must be non-negative and sum to at most 1, "
            "got train_dim={!r}, val_test_dim={!r}".format(train_dim, val_test_dim)
        )

    shuffled = df.sample(frac=1,random_state=2)
    n_rows = len(shuffled)
    train_end = int(train_dim*n_rows)
    val_end = int((train_dim + val_test_dim)*n_rows)

    # Positional slicing instead of np.split: np.split on a DataFrame goes
    # through the deprecated DataFrame.swapaxes and is not guaranteed to
    # return DataFrames.
    train = shuffled.iloc[:train_end]
    val = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]

    st.write("Train shape: ",train.shape)
    st.write("Train %: \n",train[target].value_counts(normalize=True))
    st.write("Val %: \n",val[target].value_counts(normalize=True))
    st.write("Test %: \n",test[target].value_counts(normalize=True))
    return train, val, test

In [5]:
def auc_score(y_trues, y_preds):
    """
    Compute the ROC AUC for paired label/score arrays and return their mean.

    Each element of ``y_trues`` is scored against the element of ``y_preds``
    at the same position with ``roc_auc_score``. With a single pair (the way
    this notebook calls it) the result is simply that pair's AUC.

    Parameters
    ----------
    y_trues : sequence of array-like
        One array of true binary labels per evaluation set.
    y_preds : sequence of array-like
        One array of predicted scores per evaluation set, aligned with
        ``y_trues``.

    Returns
    -------
    float
        Mean ROC AUC over all pairs.

    Raises
    ------
    ValueError
        If the two sequences are empty or differ in length.
    """
    if len(y_trues) != len(y_preds):
        raise ValueError(
            "y_trues and y_preds must have the same length, got {} and {}".format(
                len(y_trues), len(y_preds)
            )
        )
    if len(y_trues) == 0:
        raise ValueError("auc_score needs at least one (y_true, y_pred) pair")

    # Keep every per-pair score; the previous loop overwrote the result on
    # each iteration and returned only the last one.
    aucs = [roc_auc_score(y_true, y_pred) for y_true, y_pred in zip(y_trues, y_preds)]
    return sum(aucs) / len(aucs)

def adversarial_validation(val,test,drop_cols,auc_threshold=0.6):
    """
    Check whether the validation and test sets can be told apart by a model.

    Rows of ``val`` are labelled 1.0 and rows of ``test`` 0.0 in a helper
    column ``y``; a CatBoost classifier is trained to predict that label on
    70% of the combined, shuffled rows and scored on the remaining 30%. A
    hold-out AUC at or below ``auc_threshold`` is reported as "no
    distribution shift".

    Parameters
    ----------
    val : pandas.DataFrame
        Validation partition; it is copied, not modified.
    test : pandas.DataFrame
        Test partition; it is copied, not modified.
    drop_cols : list of str
        Columns removed before training (e.g. identifiers).
    auc_threshold : float, default 0.6
        Highest AUC still reported as "no distribution shift".

    Returns
    -------
    None
        The value returned by ``st.write``; the verdict and the AUC are only
        written out, not returned.

    Raises
    ------
    KeyError
        If a name in ``drop_cols`` is not a column of the combined frame.
    """
    # 1 - define target: origin of each row (1.0 = validation, 0.0 = test)
    aval = val.copy()
    atest = test.copy()
    aval['y'] = 1.0
    atest['y'] = 0.0
    # 2 - create dataframe (pd.concat: DataFrame.append no longer exists in pandas >= 2.0)
    ad = pd.concat([aval, atest]).sample(frac=1,random_state=2)
    # 3 - drop unuseful columns
    ad = ad.drop(columns=drop_cols)
    # 4 - define format and imputation
    for col in ad.columns:
        if pd.api.types.is_numeric_dtype(ad[col]):
            # numeric (and boolean) columns: cast to float, impute the median
            ad[col] = ad[col].astype(float)
            ad[col] = ad[col].fillna(ad[col].median())
        else:
            # everything else is treated as categorical text; going through
            # object first lets category-typed columns accept the 'missing' label
            as_object = ad[col].astype(object)
            ad[col] = as_object.where(as_object.notna(), 'missing').astype(str)
    # 5 - model preparation
    y = ad['y'].values
    X = ad.drop(columns=['y'])
    # indices are taken from the feature frame itself so they cannot be
    # shifted by the position of the label column
    categorical_features_indices = np.where(X.dtypes != np.float64)[0].tolist()
    # 6 - train test split
    adv_X_train, adv_X_test, adv_y_train, adv_y_test = train_test_split(X, y , test_size = 0.30 , random_state = 2)
    train_data = Pool(data=adv_X_train,label=adv_y_train,cat_features=categorical_features_indices)
    test_data = Pool(data=adv_X_test,label=adv_y_test,cat_features=categorical_features_indices)
    # 7 - model training (stops early after 50 iterations without AUC improvement on the eval set)
    params = {'iterations': 1000,'eval_metric': 'AUC','od_type': 'Iter','od_wait': 50}
    model = CatBoostClassifier(**params)
    _ = model.fit(train_data, eval_set=test_data, plot=False, verbose=False)
    # 8 - model evaluation
    auc = auc_score([test_data.get_label()],[model.predict_proba(test_data)[:,1]])
    if auc <= auc_threshold:
        return st.write("No distribution shift, OK! AUC is: ",auc)
    else:
        return st.write("Check features importance (to be added) and rerun. AUC is: ",auc)

In [6]:
def class_imbalance(train,target):
    """
    Balance the training set by randomly downsampling the most frequent class.

    The most frequent class of ``target`` is sampled (seed ``random_state=2``)
    down to the size of the second most frequent class; the two groups are
    then concatenated and shuffled. Rows belonging to any further class are
    not part of the result. The resulting class counts and shape are
    reported through ``st.write``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training partition; it is not modified.
    target : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame holding equally many rows of the two most frequent
        classes.

    Raises
    ------
    ValueError
        If ``target`` holds fewer than two distinct classes.
    KeyError
        If ``target`` is not a column of ``train``.
    """
    # value_counts sorts by frequency, so position 0 is the majority class
    counts = train[target].value_counts(normalize=False)
    if len(counts) < 2:
        raise ValueError(
            "class_imbalance needs at least two classes in column {!r}, found {}".format(
                target, len(counts)
            )
        )
    majority_label = counts.index[0]
    minority_label = counts.index[1]

    df_minority = train[train[target]==minority_label]
    df_majority = train[train[target]==majority_label]
    # exact row count instead of a float fraction: no dependence on rounding
    df_majority = df_majority.sample(n=len(df_minority),random_state=2)
    # pd.concat: DataFrame.append no longer exists in pandas >= 2.0
    final_train = pd.concat([df_majority, df_minority]).sample(frac=1,random_state=2)
    st.write(final_train[target].value_counts(normalize=False))
    st.write("Train dataset shape: ",final_train.shape)
    return final_train

# 1.0 Data retrieval

In [7]:
# Load the input frame from a pickle file (path is relative to the working directory).
# Unpickling executes code embedded in the file: only load pickles from a trusted source.
df = pd.read_pickle("data_lake/output/df_under.pkl")

In [8]:
df.sample(2)

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist2,P_emaildomain,R_emaildomain,M4,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,max_c,max_d,customer_id,num_transaction_per_time,multi_transaction_per_time
42551,3088866,0,30.0,H,9500,321.0,150.0,visa,226.0,debit,264.0,87.0,,gmail.com,gmail.com,,,,,,,,,Found,Found,Android 7.0,chrome 63.0 for android,32.0,1280x720,match_status:2,T,F,T,T,mobile,LG-K428 Build/NRD90U,1.0,100.708336,19861,1,0
114438,3426766,0,100.0,H,5293,321.0,150.0,visa,226.0,debit,123.0,87.0,,gmail.com,gmail.com,,,,,,,,,Found,Found,Android 8.0.0,samsung browser 7.0,32.0,2220x1081,match_status:2,T,F,T,T,mobile,SAMSUNG SM-N950U Build/R16NW,3.0,0.916666,174,1,0


In [9]:
df.shape

(144233, 41)

# 2.0 Data Partitioning

In [10]:
df['isFraud'].value_counts()

0    132915
1     11318
Name: isFraud, dtype: int64

## 2.1 Train, validation, test split

In [11]:
# NOTE(review): these assignments replace the values set in the
# "Injected parameters" cell at the top of the notebook, so any value injected
# there never reaches partitioning(). Confirm which cell is meant to win.
train_dim = 0.7
val_test_dim = 0.15

In [12]:
# Shuffle df and slice it into train / validation / test; 'isFraud' is the label
# column used for the per-partition class-proportion report.
train, val, test = partitioning(df,train_dim, val_test_dim,'isFraud')

## 2.2 Adversarial validation between val and test set

In [13]:
# Columns removed before the adversarial model is trained
# (presumably row/customer identifiers -- confirm).
drop_cols = ['TransactionID','customer_id']

In [14]:
# Train a classifier to tell validation rows from test rows; the verdict and
# the AUC are written out via st.write inside the function.
adversarial_validation(val,test,drop_cols)

## 2.3 Class imbalance

In [15]:
# Downsample the majority class of the training partition only; val and test
# are not passed through class_imbalance here.
final_train = class_imbalance(train,'isFraud')

# 3.0 Output

In [16]:
# Persist the balanced training set and the validation set as pickles
# (paths are relative to the working directory).
# NOTE(review): the test partition is not written in the lines visible here --
# confirm it is saved elsewhere if later steps need it.
final_train.to_pickle('app/datalake/train.pkl')
val.to_pickle('app/datalake/val.pkl')