# 1. Import libraries

In [1]:
# Suppress every Python warning for the rest of the session so cell output stays clean.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries used below (pandas, scikit-learn, matplotlib) are hidden as well.
import warnings
warnings.filterwarnings("ignore")
# Decision-tree estimators; `tree.DecisionTreeClassifier` is used to bin numerical columns.
from sklearn import tree

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases removed the old names, so pick whichever this install provides.
_white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(_white_style)
# Default figure size (width, height) in inches for every plot in the notebook.
plt.rcParams['figure.figsize'] = 12,4

# 2. Import dataset
* 01_clean is imported 
* There are 15 independent variables

In [3]:
# Read the cleaned dataset produced by the previous step.
df = pd.read_csv('../01 raw data/01_clean.csv')

# Force the instrument-age column to numeric; values that cannot be parsed become NaN.
df = df.assign(
    paymentInstrumentAgeInAccount=pd.to_numeric(df['paymentInstrumentAgeInAccount'], errors='coerce')
)
print(df.shape)

(199927, 17)


# 3. categorical
* there are 9 categorical variables 
* each category is replaced with the mean of the `Fraud` target within that category

In [4]:
def cat(df, x, y, z, out_dir='../03 rolledup data/'):
    """Replace categorical column ``x`` with the per-category mean of ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing columns ``x`` and ``y``. It is not modified.
    x : str
        Name of the categorical column to encode.
    y : str
        Name of the target column that is averaged within each category.
    z : str
        Name of the new column holding the per-category mean of ``y``.
    out_dir : str, optional
        Directory that receives the lookup table ``<x>_b.csv`` (columns
        ``x``, ``z`` and ``count``). Defaults to the notebook's roll-up folder.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``x`` removed and ``z`` appended as the last column.
        Rows keep their input order; the index is reset by the merge.
    """
    import os  # local import so the function does not depend on another cell

    # Stringify the labels, then tag the rows that were null in the input.
    # Filling *after* astype(str) would be a no-op, because by then nulls have
    # already turned into ordinary strings such as 'nan'.
    labels = df[x].astype(str).where(df[x].notna(), 'MISSING')

    # Work on a new frame so the caller's DataFrame is left untouched.
    keyed = df.assign(**{x: labels})

    # One row per category: mean of the target and number of observations.
    rates = keyed.groupby(x).agg({y: ['mean', 'count']}).reset_index()
    rates.columns = [x, z, 'count']
    rates.to_csv(os.path.join(out_dir, x + '_b.csv'), index=False)

    # Only the rate is merged back; 'count' stays in the lookup file, which also
    # avoids a name clash if the input already has a column called 'count'.
    encoded = pd.merge(keyed, rates[[x, z]], how='left', on=x)
    return encoded.drop(x, axis=1)

In [5]:
# Encode each categorical column in turn: every pass drops the raw column
# and appends its 'b_<name>' replacement.
categorical_cols = [
    'accountCountry', 'isUserRegistered', 'isProxyIP', 'transactionCurrencyCode',
    'cardType', 'cvvVerifyResult', 'paymentInstrumentType', 'ipState', 'ipCountryCode',
]
for col in categorical_cols:
    df = cat(df, col, 'Fraud', 'b_' + col)

print(df.shape)

(199927, 17)


# 4. numerical
* there are 6 numerical variables
* decision tree is used to bin the variable
* max depth = 5 and min sample size = 0.01% of observations (20)

In [6]:
def num(df, x, y, z, max_depth=5, min_samples_leaf=20):
    """Bin numerical column ``x`` with a one-feature decision tree fitted on ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing columns ``x`` and ``y``. It is not modified.
    x : str
        Name of the numerical column to bin.
    y : str
        Name of the target column the tree is fitted against.
    z : str
        Name of the new column holding the bin label: the tree leaf id for
        rows where ``x`` is present, the string ``'MISSING'`` where it is null.
    max_depth : int, optional
        Maximum depth of the binning tree (default 5).
    min_samples_leaf : int, optional
        Minimum number of rows per leaf (default 20).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``x`` removed and ``z`` appended. Rows where ``x`` was
        null come first, followed by the remaining rows.
    """
    observed = df[x].notnull()

    # Explicit copies: adding a column to a filtered slice otherwise triggers
    # pandas' SettingWithCopyWarning.
    missing = df[~observed].copy()
    missing[z] = 'MISSING'

    present = df[observed].copy()
    if present.empty:
        # Nothing to fit a tree on; every row is already labelled 'MISSING'.
        return missing.drop(x, axis=1)

    dtree = tree.DecisionTreeClassifier(
        random_state=42, max_depth=max_depth, min_samples_leaf=min_samples_leaf
    )
    dtree.fit(present[[x]], present[y])
    # apply() returns the index of the leaf each row lands in; that id is the bin.
    present[z] = dtree.apply(present[[x]])

    binned = pd.concat([missing, present])
    return binned.drop(x, axis=1)

In [7]:
# For each numerical column: bin it with a decision tree (column 'c_<name>'),
# then replace the bin label with its mean Fraud rate (column 'b_<name>').
# The unused placeholder frame `n` from the original cell has been removed.
for i in ['localWeekday','localHour','accountAge','transactionAmountUSD','numPaymentRejects1dPerUser',
          'paymentInstrumentAgeInAccount']:
    df = num(df, i, 'Fraud', 'c_'+i)
    df = cat(df, 'c_'+i, 'Fraud', 'b_'+i)

print(df.shape)

(199927, 17)


# 5. Export dataset
* 9 - categorical variables 
* 6 - numerical variables
* 2 - extra variables (Id, Fraud)

In [8]:
# Write the encoded frame next to the raw data, without the index column.
encoded_path = '../01 raw data/02_cat_num.csv'
df.to_csv(encoded_path, index=False)
df.shape

(199927, 17)