In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )
from sklearn.ensemble import ExtraTreesClassifier
%matplotlib inline

In [2]:
# Load IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload

# Mode 2: re-import modules automatically before each cell execution.
%autoreload 2

In [3]:
%%javascript
// Turn off automatic output scrolling so long cell outputs are always shown
// in full instead of being collapsed into a scroll box.
// `IPython` is a global supplied by the notebook front end; not every front
// end exposes it, so guard the lookup and make this cell a harmless no-op
// (rather than a ReferenceError) when it is missing.
if (typeof IPython !== 'undefined' && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function (lines) {
        // Always answer "do not scroll", whatever the output length.
        return false;
    };
}

<IPython.core.display.Javascript object>

In [4]:
# Remove pandas' display truncation: None means "no limit" for both the
# number of columns and the number of rows rendered.
# NOTE(review): with no row limit, displaying a very large frame in full can
# be extremely slow; keep displayed frames small (e.g. via .head()).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Synthetic Fraud data

## Load data

In [5]:
# Read the synthetic fraud data set from its CSV file into a DataFrame.
fraud_csv_path = 'data/PS_20174392719_1491204439457_log.csv'
fraud_data = pd.read_csv(fraud_csv_path)

## Prepare data

In [6]:
# Summarise the frame: index range, column names with dtypes, memory usage.
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [7]:
# Preview the first rows to get a feel for the raw values.
fraud_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [13]:
# String-valued columns that need an integer representation.
categoricals = ['type', 'nameOrig', 'nameDest']


def _label_encode(column):
    """Return integer codes for ``column`` from a freshly fitted LabelEncoder."""
    return LabelEncoder().fit_transform(column)


# Store each encoding next to its source column as "<column>_enc".
for source_name in categoricals:
    fraud_data[source_name + '_enc'] = _label_encode(fraud_data[source_name])

In [8]:
# List the column names of the frame.
# NOTE(review): the recorded output below has no "*_enc" columns — this cell's
# execution count (8) is lower than the encoding cell's (13), so it was run
# before the encoding; re-run the notebook top to bottom to refresh it.
fraud_data.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [14]:
# Target column, kept as a one-element list so that selecting it yields a
# single-column DataFrame rather than a Series.
label_col = ['isFraud']

# Model inputs: the numeric columns plus the label-encoded categoricals.
train_cols = [
    'step',
    'type_enc',
    'amount',
    'nameOrig_enc',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'nameDest_enc',
]

In [15]:
# Chronological hold-out split: the first 90% of rows (in file order) are used
# for training, the remaining 10% for validation.
TRAIN_FRACTION = 0.9

# Integer row count of the training part. np.round alone returns a float,
# which is not a valid positional boundary.
train_length = int(np.round(len(fraud_data.index) * TRAIN_FRACTION))

# Positional, half-open slices: row `train_length` belongs to the validation
# part only. Label-based `.loc[:n]` / `.loc[n:]` slices both include row n,
# which would put that row into the training AND the validation set.
train_rows = fraud_data.iloc[:train_length]
valid_rows = fraud_data.iloc[train_length:]

train_X, train_y = train_rows[train_cols], train_rows[label_col]
valid_X, valid_y = valid_rows[train_cols], valid_rows[label_col]

# Sanity check: the two parts are disjoint and together cover every row.
assert len(train_X) + len(valid_X) == len(fraud_data.index)

Let's see how many fraud cases we've got in our training set.

In [16]:
# isFraud is a 0/1 flag, so its sum is the number of fraud cases in training.
train_y.sum()

isFraud    4449
dtype: int64

Huh. This is tough. The majority of fraud cases are in the latter half of the dataset, and if we want to train the model realistically, we have to take the timeline into account (train on earlier transactions, validate on later ones). Split this way, the model won't have seen even half of the cases it will eventually be exposed to.  
NB: This is somewhat overconservative, if not paranoid, thinking. As far as I know, the timeline covers one month, so the concentration of late cases could just be Christmas or something, rather than a huge development of new techniques (or it could be a zero-day exploit or something, who knows). Then again, this is supposed to be simulated data with no special events.

## Run Boruta on training set to see what happens

In [17]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Fixed seed so the (stochastic) feature selection gives the same result on
# every Restart & Run All.
BORUTA_SEED = 42

### initialize Boruta
# Shallow forest (max_depth=5) trained on all cores (n_jobs=-1).
# NOTE(review): isFraud is a binary label but the estimator is a regressor —
# confirm this is intended; a classifier may be the better fit.
forest = RandomForestRegressor(
    n_jobs=-1,
    max_depth=5,
    random_state=BORUTA_SEED,
)

boruta = BorutaPy(
    estimator=forest,
    n_estimators='auto',  # let Boruta choose the number of trees
    max_iter=100,  # number of trials to perform
    random_state=BORUTA_SEED,
)

### fit Boruta (it accepts np.array, not pd.DataFrame)
# np.ravel flattens the single-column label frame into a 1-D vector.
boruta.fit(train_X.values, np.ravel(train_y))

In [18]:
# Translate Boruta's boolean masks back into column names:
#   support_      -> green area (kept in the recorded run below)
#   support_weak_ -> blue area (empty in the recorded run below)
feature_names = train_X.columns
confirmed_mask = boruta.support_
tentative_mask = boruta.support_weak_

green_area = feature_names[confirmed_mask].to_list()
blue_area = feature_names[tentative_mask].to_list()

print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

features in the green area: ['step', 'type_enc', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
features in the blue area: []


WOOHOOO!  
Okay, so this means that, according to Boruta, we don't need the encoded name columns (`nameOrig_enc`, `nameDest_enc`) at all. THAT makes life a busload simpler. Let's give the model a shot with the reduced set of columns and a binary encoding of the `type` categories.