In [1]:
import os
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [2]:
SEED = 31
N_ESTIMATORS = 2000
TARGET = 'isFraud'
VALIDATION_PERCENT = 0.01
SCORING = 'roc_auc'

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(SEED)

In [4]:
file_folder = '../input/ieee-fraud-detection-preprocess'
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
print(f'train={train.shape}, test={test.shape}')

train=(590540, 375), test=(506691, 374)


In [5]:
excludes = {TARGET}
for i in range(1, 340):
    excludes.add(f'V{i}')


cols = set(train.columns.values) - excludes
PREDICTORS = list(cols)
print(f'{len(PREDICTORS)} predictors={PREDICTORS}')

194 predictors=['TransactionAmt_to_M4_mean', 'card3', '_pc_5', 'id_34', 'dist1_to_id_23_std', 'dist1_to_M2_mean', '_pc_18', 'D10', 'card6', 'M2', 'dist1_to_M4_std', 'dist1_to_id_38_std', 'dist1_to_DeviceType_std', 'C2', '_pc_16', 'TransactionAmt_to_M1_std', 'dist1_to_DeviceType_mean', 'dist1_to_id_36_mean', 'id_20', 'id_33', '_pc_8', 'dist1_to_M1_std', 'dist1_to_id_32_std', 'TransactionAmt_to_id_32_std', 'dist1_to_id_32_mean', 'dist1_to_M4_mean', 'TransactionAmt_to_id_27_std', 'C3', 'TransactionAmt_to_id_27_mean', 'dist1_to_M8_std', 'TransactionAmt_to_M3_std', 'dist1_to_card4_std', 'TransactionAmt_to_id_32_mean', 'TransactionAmt_to_id_37_mean', 'TransactionAmt_to_id_23_std', 'id_12', '_pc_0', 'TransactionAmt_to_M1_mean', 'dist1_to_M1_mean', 'TransactionAmt_to_id_15_std', 'dist1_to_M5_std', 'dist1_to_M9_std', 'TransactionAmt_to_card6_std', '_pc_12', 'C5', 'TransactionAmt_to_id_16_std', 'TransactionAmt_to_id_37_std', 'dist1_to_id_37_std', 'ProductCD', 'dist1_to_id_27_mean', 'TransactionA

In [6]:
val_size = int(VALIDATION_PERCENT * len(train))
train_size = len(train) - val_size
train_ind = [-1] * train_size
val_ind = [0] * val_size
ps = PredefinedSplit(test_fold=np.concatenate((train_ind, val_ind)))

In [7]:
%%time
y_train = train[TARGET]
x_train = train[PREDICTORS]
model = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features='log2')
pipe = Pipeline([('model', model)])
param_grid = {
    'model__max_depth': [8],
    'model__min_samples_leaf': [10]
}
cv = GridSearchCV(pipe, cv=ps, param_grid=param_grid, scoring=SCORING)
cv.fit(x_train, y_train)
print('best_params_={}\nbest_score_={}'.format(repr(cv.best_params_), repr(cv.best_score_)))

best_params_={'model__max_depth': 8, 'model__min_samples_leaf': 10}
best_score_=0.8736061644030151
CPU times: user 1h 58min 16s, sys: 2.28 s, total: 1h 58min 18s
Wall time: 1h 58min 18s


In [8]:
x_test = test[PREDICTORS]
sub = pd.read_csv(f'../input/ieee-fraud-detection/sample_submission.csv')
sub[TARGET] = cv.predict_proba(x_test)[:,1]
sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.013238
1,3663550,0.012682
2,3663551,0.023269
3,3663552,0.023917
4,3663553,0.042436


In [9]:
sub.to_csv('submission.csv', index=False)
print(os.listdir("."))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


['__output__.json', '__notebook__.ipynb', 'submission.csv']
