In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: f'{x:.3f}')
%matplotlib inline
print(os.getcwd())

/kaggle/input/ieee-fraud-detection


In [110]:
# Load files
if not os.getcwd().endswith('detection'):
    os.chdir('../input/ieee-fraud-detection')
train_transaction = pd.read_csv('train_transaction.csv')
test_transaction = pd.read_csv('test_transaction.csv')
train_identity = pd.read_csv('train_identity.csv')
test_identity = pd.read_csv('test_identity.csv')

In [111]:
print('Data size'.center(50, '-'))
print(f'train set:transaction{train_transaction.shape},identity{train_identity.shape}')
print(f'test set:transaction{test_transaction.shape},identity{test_identity.shape}')

--------------------Data size---------------------
train set:transaction(590540, 394),identity(144233, 41)
test set:transaction(506691, 393),identity(141907, 41)


In [132]:
# Concat data
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

transaction_cols = [col for col in train_transaction.columns if col != 'isFraud']
identity_cols = train_identity.columns

# transaction = pd.concat([train[transaction_cols], test[transaction_cols]]).reset_index(drop=True)
identity = pd.concat([train[identity_cols], test[identity_cols]]).reset_index(drop=True)

n_train = len(train)
n_test = len(test)

In [None]:
msno.matrix(identity)

In [133]:
# stats
total = identity.isnull().count()
count = identity.count()
missing_cnts = identity.isnull().sum()
missing_rate = identity.isnull().mean()
mode = identity.fillna('missing').apply(lambda x: x.mode()[0])
mode_pct = identity.fillna('missing').apply(lambda x: sum(x == x.mode()[0]) / len(identity))
stats = pd.concat([total, count, missing_cnts, missing_rate, mode, mode_pct], axis=1,
                  keys=['total', 'count', 'missing_cnts', 'missing_rate', 'mode', 'mode_pct'])
stats

Unnamed: 0,total,count,missing_cnts,missing_rate,mode,mode_pct
TransactionID,1097231,1097231,0,0.0,2987000,0.0
id_01,1097231,286140,811091,0.739,missing,0.739
id_02,1097231,277848,819383,0.747,missing,0.747
id_03,1097231,132805,964426,0.879,missing,0.879
id_04,1097231,132805,964426,0.879,missing,0.879
id_05,1097231,271615,825616,0.752,missing,0.752
id_06,1097231,271615,825616,0.752,missing,0.752
id_07,1097231,10214,1087017,0.991,missing,0.991
id_08,1097231,10214,1087017,0.991,missing,0.991
id_09,1097231,149264,947967,0.864,missing,0.864


In [134]:
# Delete columns which missing_rate > 0.95 or mode_pct > 0.95 in identity.
thresh = 0.95
del_stats_cols = stats[(stats['missing_rate'] > thresh) | (stats['mode_pct'] > thresh)].index.tolist()
identity_cols = list(set(identity_cols) - set(del_stats_cols))
identity = identity[identity_cols]
print('Deleted %d columns:' % len(del_stats_cols), del_stats_cols)
identity.head()

Deleted 9 columns: ['id_07', 'id_08', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27']


Unnamed: 0,id_15,id_03,id_01,id_29,id_12,id_30,id_38,id_16,id_20,id_32,id_02,id_06,id_04,id_37,id_18,id_17,id_28,TransactionID,id_10,id_34,id_35,id_19,id_09,id_05,id_31,id_33,DeviceInfo,id_11,id_14,id_36,DeviceType,id_13
0,,,,,,,,,,,,,,,,,,2987000,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,2987001,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,2987002,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,2987003,,,,,,,,,,,,,,
4,New,,0.0,NotFound,NotFound,Android 7.0,T,NotFound,144.0,32.0,70787.0,,,T,,166.0,New,2987004,,match_status:2,T,542.0,,,samsung browser 6.2,2220x1080,SAMSUNG SM-G892A Build/NRD90M,100.0,-480.0,F,mobile,


In [135]:
identity['id_12'] = identity['id_12'].fillna('NotFound').map({'NotFound': 0, 'Found': 1})
identity['id_16'] = identity['id_16'].fillna('NotFound').map({'NotFound': 0, 'Found': 1})
identity['id_28'] = identity['id_28'].fillna('-1').map({'-1': -1, 'New': 0, 'Found': 1})
identity['id_29'] = identity['id_29'].fillna('NotFound').map({'NotFound': 0, 'Found': 1})
identity['id_35'] = identity['id_35'].fillna('-1').map({'-1': -1, 'T': 0, 'F': 1})
identity['id_36'] = identity['id_36'].fillna('-1').map({'-1': -1, 'T': 0, 'F': 1})
identity['id_37'] = identity['id_37'].fillna('-1').map({'-1': -1, 'T': 0, 'F': 1})
identity['id_38'] = identity['id_38'].fillna('-1').map({'-1': -1, 'T': 0, 'F': 1})
identity['id_30'] = identity['id_30'].fillna('missing').map(lambda x: str(x).split(' ')[0]).replace('other', 'missing').replace('func', 'missing')


def browser(x):
    for i in ['samsung','safari','chrome','edge','firefox','ie','other','opera','aol','silk','waterfox','nokia','puffin','cyberfox',
              'zte','palemoon','maxthon','line','lg','iron','blu','seamonkey','m4','comodo','lanix','chromium','inco','mozila','cherry',
              'icedragon','google','facebook','mobile','android','windows','generic', 'missing']:
        if i in x:
            return i
        else:
            return 'missing'
identity['id_31'] = identity['id_31'].fillna('missing').map(lambda x: browser(x.lower()))
identity['id_33'] = identity['id_33'].fillna('0x0').map(lambda x: int(x.split('x')[0])*int(x.split('x')[1]))
identity.drop('DeviceInfo', axis=1, inplace=True)

identity['id_01'].fillna(5, inplace=True)
identity['id_02'].fillna(500, inplace=True)
identity['id_03'].fillna(10, inplace=True)
identity['id_04'].fillna(0, inplace=True)
identity['id_05'].fillna(100, inplace=True)
identity['id_06'].fillna(10, inplace=True)
identity['id_09'].fillna(100, inplace=True)
identity['id_10'].fillna(100, inplace=True)
identity['id_11'].fillna(90, inplace=True)
identity['id_13'].fillna(10, inplace=True)
identity['id_14'].fillna(-1000, inplace=True)
identity['id_15'].fillna('missing', inplace=True)
identity['id_17'].fillna(300, inplace=True)
identity['id_18'].fillna(1, inplace=True)
identity['id_19'].fillna(1000, inplace=True)
identity['id_20'].fillna(50, inplace=True)
identity['id_32'].fillna(0, inplace=True)
identity['id_34'].fillna('missing', inplace=True)
identity['DeviceType'].fillna('missing', inplace=True)
print('max missing rate:', identity.isnull().sum().max())

max missing rate: 0


In [140]:
identity_cat = identity.select_dtypes(include='object')
identity_dummy = pd.get_dummies(identity_cat)
identity = pd.concat([identity, identity_cat], axis=1)
identity.drop(identity_cat.columns.tolist(), axis=1, inplace=True)
print(identity.dtypes.value_counts())

float64    16
int64      10
object     10
dtype: int64


In [None]:
X = identity.iloc[:n_train,1:]
y = train['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
RF = RandomForestClassifier(n_estimators=200, n_jobs=-1)
RF.fit(X_train, y_train)

y_pred = RF.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_pred)
print('ks:', max(tpr - fpr))
print('auc:', auc(fpr, tpr))