In [64]:
# Requires catboost; if it is missing from the kernel's environment, install it once with: %pip install catboost

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use plotly's 'iframe' renderer as the default for plotly figures.
pio.renderers.default='iframe'

# Remove pandas' display caps: with None, displaying a frame prints every row
# and column, so preview large frames with .head()/.sample() rather than bare names.
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

# 1.0 Data retrieval

In [2]:
# Load the input frame from the data-lake output folder.
# NOTE(review): the file name suggests an already under-sampled frame -- confirm upstream.
# NOTE(security): unpickling can execute arbitrary code; only read pickles from a trusted source.
df = pd.read_pickle("../../data_lake/output/df_under.pkl")

In [3]:
# Preview two random rows; the seed keeps the preview stable across Restart & Run All.
df.sample(2, random_state=2)

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist2,P_emaildomain,R_emaildomain,M4,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,max_c,max_d,customer_id,num_transactio_per_time,multi_transaction_per_time
108819,3395213,0,15.709,C,17255,555.0,117.0,mastercard,195.0,credit,,,2013.0,anonymous.com,anonymous.com,M2,,,,,,,,Found,Found,,chrome 65.0,,,,F,F,T,F,desktop,Windows,28.0,569.0,19572,1,0
33308,3072050,0,300.0,R,5957,520.0,150.0,american express,190.0,credit,264.0,87.0,,anonymous.com,anonymous.com,,,,,,,,,Found,Found,Windows 10,chrome 63.0,24.0,1920x1080,match_status:2,T,F,T,F,desktop,Windows,1.0,177.875,7567,1,0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(144233, 41)

# 2.0 Data Partitioning

In [5]:
# Absolute class counts of the target before any splitting.
df['isFraud'].value_counts()

0    132915
1     11318
Name: isFraud, dtype: int64

## 2.1 Train, validation, test split

In [21]:
# Shuffle once with a fixed seed, then cut the shuffled frame at 70% and 85%
# to obtain a 70/15/15 train/validation/test split (random, not stratified).
# Explicit iloc slices replace np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes; the resulting rows are the same.
shuffled = df.sample(frac=1, random_state=2)
train_end = int(.7 * len(df))
val_end = int(.85 * len(df))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [22]:
# (rows, columns) of the training split.
train.shape

(100963, 41)

In [23]:
# Class proportions in the training split (the split is random, not stratified, so check them).
train.isFraud.value_counts(normalize=True)

0    0.920496
1    0.079504
Name: isFraud, dtype: float64

In [24]:
# Class proportions in the validation split.
val.isFraud.value_counts(normalize=True)

0    0.923319
1    0.076681
Name: isFraud, dtype: float64

In [25]:
# Class proportions in the test split.
test.isFraud.value_counts(normalize=True)

0    0.924567
1    0.075433
Name: isFraud, dtype: float64

## 2.2 Adversarial validation between val and test set

In [51]:
# Work on copies so the adversarial-validation steps never modify val/test themselves.
aval, atest = val.copy(), test.copy()

In [52]:
# Adversarial target: 1.0 marks rows from the validation split, 0.0 rows from the test split.
aval = aval.assign(y=1.0)
atest = atest.assign(y=0.0)

In [53]:
# Stack the labelled validation and test rows, then shuffle with a fixed seed.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
ad = pd.concat([aval, atest]).sample(frac=1,random_state=2)

In [54]:
# Columns removed from the adversarial frame before training.
# NOTE(review): presumably dropped because they are identifiers rather than features -- confirm.
c_drop = ['TransactionID','customer_id']

In [55]:
# Drop the columns listed in c_drop.
# NOTE: not idempotent -- re-running this cell once the columns are gone raises KeyError.
ad = ad.drop(columns=c_drop)

In [59]:
# Impute every column of the adversarial frame in place:
#   * object columns  -> missing values become the literal 'missing', then cast to str
#   * all other dtypes -> cast to float, missing values become the column median
# (the label column 'y' goes through the numeric branch too; it has no missing values).
for col in ad.columns:
    column = ad[col]
    if column.dtypes == 'object':
        ad[col] = column.fillna('missing').astype(str)
    else:
        as_float = column.astype(float)
        ad[col] = as_float.fillna(as_float.median())

In [66]:
# Feature matrix: every column except the adversarial label, as a NumPy array.
X = ad.drop(columns=['y']).to_numpy()
# Target vector: 1.0 = validation row, 0.0 = test row.
y = ad['y'].to_numpy()

In [68]:
# Positions of the non-float (string) columns, which CatBoost must treat as categorical.
# Computed on the feature frame (label 'y' removed) so the positions are guaranteed
# to line up with the columns of X; computing them on `ad` only works while 'y'
# happens to be the last column.
categorical_features_indices = np.where(ad.drop(columns=['y']).dtypes != np.float64)[0]

In [69]:
# Hold out 30% of the combined validation+test rows to score the adversarial
# classifier; the fixed random_state keeps the split reproducible.
adv_X_train, adv_X_test, adv_y_train, adv_y_test = train_test_split(X, y , test_size = 0.30 , random_state = 2)

In [70]:
# Wrap both adversarial partitions in CatBoost Pools that share the same
# categorical-column positions.
pool_kwargs = {'cat_features': categorical_features_indices}
train_data = Pool(data=adv_X_train, label=adv_y_train, **pool_kwargs)
test_data = Pool(data=adv_X_test, label=adv_y_test, **pool_kwargs)

In [71]:
# Settings for the adversarial classifier: up to 1000 boosting iterations,
# AUC as the evaluation metric, and the iteration-based overfitting detector
# (od_type / od_wait -- see the CatBoost parameter reference).
params = dict(
    iterations=1000,
    eval_metric='AUC',
    od_type='Iter',
    od_wait=50,
)
model = CatBoostClassifier(**params)

In [75]:
# Train on the adversarial training pool and evaluate on the held-out pool,
# with plotting and per-iteration logging switched off. The return value is
# bound to `_` so the cell displays nothing.
_ = model.fit(train_data, eval_set=test_data, plot=False, verbose=False)

In [76]:
def auc_score(y_trues, y_preds):
    """Return the mean ROC AUC over paired label/score collections.

    Parameters
    ----------
    y_trues : indexable of array-like
        True binary labels; ``y_trues[i]`` belongs to the i-th entry of ``y_preds``.
    y_preds : iterable of array-like
        Predicted scores or probabilities, one entry per evaluation set.

    Returns
    -------
    float
        Average of the per-pair ROC AUC values. For a single pair this is
        exactly that pair's AUC.

    Raises
    ------
    ValueError
        If ``y_preds`` is empty, since there is nothing to score.

    Notes
    -----
    The earlier implementation overwrote the score on every iteration and
    returned only the last pair's AUC; it also failed with an unbound local
    variable on empty input.
    """
    scores = [roc_auc_score(y_trues[i], y_pred) for i, y_pred in enumerate(y_preds)]
    if not scores:
        raise ValueError("auc_score needs at least one (y_true, y_pred) pair")
    return float(sum(scores) / len(scores))

In [77]:
# Score the held-out adversarial pool (test_data here is that pool, not the `test` split).
# Column 1 of predict_proba is used as the score for label 1.0, i.e. "row comes from the
# validation split". The single label/score pair is wrapped in lists because auc_score
# takes collections of pairs.
auc = auc_score([test_data.get_label()],[model.predict_proba(test_data)[:,1]])

In [78]:
# Show the adversarial AUC; a value near 0.5 means the classifier cannot separate the two sets.
auc

0.5046975542178611

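An AUC of about 0.50 means the classifier cannot tell validation rows from test rows any better than chance, so there is no detectable distribution shift between the two sets.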

## 2.3 Class imbalance

In [94]:
# Size of the training split before re-balancing.
train.shape

(100963, 41)

In [95]:
# Absolute class counts in the training split before re-balancing.
train['isFraud'].value_counts(normalize=False)

0    92936
1     8027
Name: isFraud, dtype: int64

In [96]:
# value_counts sorts by frequency (descending), so position 0 is the majority
# class and position 1 the minority class.
class_counts = train['isFraud'].value_counts(normalize=False)
majority_label, minority_label = class_counts.index[0], class_counts.index[1]

# df_under holds the under-represented (minority) rows, df_over the over-represented (majority) rows.
df_under = train[train['isFraud'] == minority_label]
df_over = train[train['isFraud'] == majority_label]

In [97]:
# Randomly down-sample the majority rows to exactly the minority-class size (fixed seed).
df_over = df_over.sample(n=len(df_under), random_state=2)

In [98]:
# Combine the down-sampled majority rows with all minority rows and shuffle
# with a fixed seed to obtain the balanced training set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
final_train = pd.concat([df_over, df_under]).sample(frac=1,random_state=2)

In [99]:
# Verify that both classes now have the same number of rows.
final_train['isFraud'].value_counts(normalize=False)

1    8027
0    8027
Name: isFraud, dtype: int64

# 3.0 Output

In [100]:
# Persist the balanced training set and the untouched validation/test splits.
for frame, path in (
    (final_train, '../../data_lake/output/train.pkl'),
    (val, '../../data_lake/output/val.pkl'),
    (test, '../../data_lake/output/test.pkl'),
):
    frame.to_pickle(path)