In [35]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
import json

In [2]:
def create_col_name(base_str, start_int, end_int):
    """Build a list of sequentially numbered column names.

    Each name is ``base_str`` followed by an integer, for every integer from
    ``start_int`` through ``end_int`` (both ends inclusive). For example,
    ``create_col_name('card', 1, 3)`` gives ``['card1', 'card2', 'card3']``.
    The result is empty when ``end_int`` is smaller than ``start_int``.
    """
    names = []
    for idx in range(start_int, end_int + 1):
        names.append(f'{base_str}{idx}')
    return names

In [3]:
# Columns treated as categorical features.
cat_cols = [
    'ProductCD',
    *create_col_name('card', 1, 6),     # card1 .. card6
    'addr1', 'addr2',
    'P_emaildomain', 'R_emaildomain',
    *create_col_name('M', 1, 9),        # M1 .. M9
    'DeviceType', 'DeviceInfo',
    *create_col_name('id_', 12, 38),    # id_12 .. id_38
]

# Key column(s) used to join the transaction and identity tables.
id_cols = ['TransactionID']

# Name of the target column.
dep_var = 'isFraud'

In [4]:
# dtype mapping for pd.read_csv: every categorical and ID column is read as str.
type_map = dict.fromkeys(cat_cols + id_cols, str)

In [5]:
# Load the four raw CSV files from data/, applying the str dtypes in type_map.
table_names = [
    'train_identity',
    'train_transaction',
    'test_identity',
    'test_transaction',
]
tables = list(map(
    lambda fname: pd.read_csv(f'data/{fname}.csv', dtype=type_map, low_memory=False),
    table_names,
))
# Unpack in the same order as table_names.
df_train_id, df_train_trans, df_test_id, df_test_trans = tables

In [6]:
# (rows, columns) of each raw table, in load order.
tuple(frame.shape for frame in (df_train_id, df_train_trans, df_test_id, df_test_trans))

((144233, 41), (590540, 394), (141907, 41), (506691, 393))

In [8]:
# Preview the first five identity rows (head() defaults to n=5).
df_train_id.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [9]:
# Preview the first five transaction rows (head() defaults to n=5).
df_train_trans.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def join_df(left, right, left_on, right_on=None, suffix='_y'):
    """Left-join ``right`` onto ``left`` and return the merged frame.

    Parameters
    ----------
    left, right : pandas.DataFrame
        Frames to join; every row of ``left`` is kept.
    left_on : label or list of labels
        Join key(s) in ``left``.
    right_on : label or list of labels, optional
        Join key(s) in ``right``; falls back to ``left_on`` when omitted.
    suffix : str, default '_y'
        Appended to overlapping non-key column names coming from ``right``;
        the ``left`` column names are left untouched.
    """
    right_keys = left_on if right_on is None else right_on
    merged = left.merge(
        right,
        how='left',
        left_on=left_on,
        right_on=right_keys,
        suffixes=("", suffix),
    )
    return merged

In [11]:
# Attach the identity columns to each training transaction (left join on the ID key).
df_train = join_df(left=df_train_trans, right=df_train_id, left_on=id_cols)

In [19]:
# Sanity-check the join: the row count must match the transaction table, and
# the column count must be the two tables' columns minus the one shared key.
assert df_train.shape[0] == df_train_trans.shape[0], (
    'train join changed the number of rows'
)
assert df_train.shape[1] == df_train_trans.shape[1] + df_train_id.shape[1] - 1, (
    'train join produced an unexpected number of columns'
)

In [20]:
# Everything that is not categorical, an ID, or the target is a numeric feature.
# Build the exclusion set once rather than re-concatenating lists per column.
excluded_cols = set(cat_cols) | set(id_cols) | {dep_var}
numeric_cols = [col for col in df_train.columns.tolist() if col not in excluded_cols]

In [22]:
# Attach the identity columns to each test transaction (left join on the ID key).
df_test = join_df(left=df_test_trans, right=df_test_id, left_on=id_cols)

In [23]:
# Sanity-check the join: the row count must match the transaction table, and
# the column count must be the two tables' columns minus the one shared key.
assert df_test.shape[0] == df_test_trans.shape[0], (
    'test join changed the number of rows'
)
assert df_test.shape[1] == df_test_trans.shape[1] + df_test_id.shape[1] - 1, (
    'test join produced an unexpected number of columns'
)

In [25]:
# Write the joined training table to disk without the row index.
df_train.to_csv(path_or_buf='data/train.csv', index=False)

In [26]:
# Write the joined test table to disk without the row index.
df_test.to_csv(path_or_buf='data/test.csv', index=False)

In [36]:
# Record which columns are categorical, continuous, and the target.
features = {
    'cat': cat_cols,
    'cont': numeric_cols,
    'dep_var': dep_var,
}
# Save the feature lists as JSON. Create the output folder first, because
# open() raises FileNotFoundError when the directory does not exist.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/features.txt', 'w') as outfile:
    json.dump(features, outfile)

## Save a small stratified sample to use for quick testing of our code when writing our model

In [27]:
def stratifiedSample(df, n_samples, splitter, random_state=42):
    """Draw a stratified random sample of rows from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n_samples : int or float
        Forwarded to ``StratifiedShuffleSplit`` as ``test_size``, which
        determines how many rows are drawn.
    splitter : str
        Name of the column used as the stratification label.
    random_state : int, default 42
        Seed for the shuffle. The default matches the value that used to be
        hard-coded, so existing calls return the same sample.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df``, with all columns (``splitter`` included).
    """
    y = df[splitter]
    splits = StratifiedShuffleSplit(n_splits=1, test_size=n_samples, random_state=random_state)

    # The split is computed from ``y`` alone, so pass a cheap placeholder for X
    # instead of copying every feature column of a potentially large frame.
    placeholder_X = np.zeros(len(df))

    # n_splits=1, so take the single (train, test) pair directly.
    _, test_index = next(splits.split(placeholder_X, y))
    return df.iloc[test_index]

In [38]:
# Stratify on the target column and request 10,000 rows.
df_train_sample = stratifiedSample(df=df_train, n_samples=10000, splitter=dep_var)

In [39]:
# Non-null counts per column for each target class in the sample.
df_train_sample.groupby(by=[dep_var]).count()

Unnamed: 0_level_0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9650,9650,9650,9650,9650,9519,9624,9624,9571,9624,...,2113,1200,1141,1202,2119,2119,2119,2119,2116,1815
1,350,350,350,350,350,341,349,349,348,349,...,199,60,58,60,199,199,199,199,199,143


In [40]:
# Write the stratified sample to disk without the row index.
df_train_sample.to_csv(path_or_buf='data/train-sample.csv', index=False)