In [35]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

In [45]:
%%time
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Wall time: 46.2 s


In [46]:
test_id = test['TransactionID']

In [47]:
useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [48]:
train = train.drop(['Unnamed: 0'], axis = 1)
test = test.drop(['Unnamed: 0'], axis = 1)

In [49]:
cols_to_drop = [col for col in train.columns if col not in useful_features]
print(cols_to_drop)

['TransactionID', 'isFraud', 'TransactionDT', 'dist2', 'C3', 'D7', 'M1', 'V1', 'V2', 'V14', 'V15', 'V16', 'V18', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V31', 'V32', 'V39', 'V41', 'V42', 'V43', 'V50', 'V55', 'V57', 'V65', 'V66', 'V67', 'V68', 'V77', 'V79', 'V86', 'V88', 'V89', 'V98', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V129', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V141', 'V142', 'V144', 'V148', 'V153', 'V155', 'V157', 'V168', 'V174', 'V179', 'V181', 'V183', 'V185', 'V186', 'V190', 'V191', 'V192', 'V193', 'V194', 'V196', 'V198', 'V199', 'V211', 'V218', 'V230', 'V232', 'V235', 'V236', 'V237', 'V240', 'V241', 'V248', 'V250', 'V252', 'V254', 'V255', 'V260', 'V269', 'V281', 'V284', 'V286', 'V290', 'V293', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V305', 'V309', 'V311', 'V316', 'V

In [50]:
cols_to_drop.remove('isFraud')
cols_to_drop.remove('TransactionID')
cols_to_drop.remove('TransactionDT')
print(cols_to_drop)

['dist2', 'C3', 'D7', 'M1', 'V1', 'V2', 'V14', 'V15', 'V16', 'V18', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V31', 'V32', 'V39', 'V41', 'V42', 'V43', 'V50', 'V55', 'V57', 'V65', 'V66', 'V67', 'V68', 'V77', 'V79', 'V86', 'V88', 'V89', 'V98', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V129', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V141', 'V142', 'V144', 'V148', 'V153', 'V155', 'V157', 'V168', 'V174', 'V179', 'V181', 'V183', 'V185', 'V186', 'V190', 'V191', 'V192', 'V193', 'V194', 'V196', 'V198', 'V199', 'V211', 'V218', 'V230', 'V232', 'V235', 'V236', 'V237', 'V240', 'V241', 'V248', 'V250', 'V252', 'V254', 'V255', 'V260', 'V269', 'V281', 'V284', 'V286', 'V290', 'V293', 'V295', 'V296', 'V297', 'V298', 'V299', 'V300', 'V301', 'V302', 'V305', 'V309', 'V311', 'V316', 'V318', 'V319', 'V320', 'V321', 'V325', 'V327',

In [51]:
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)
print(len(cols_to_drop)," columns are dropped!")

152  columns are dropped!


In [52]:
# New feature - decimal part of the transaction amount
train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)

# Count encoding for card1 feature. 
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection
train['card1_count_full'] = train['card1'].map(pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False))
test['card1_count_full'] = test['card1'].map(pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False))

# https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature
train['Transaction_day_of_week'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)
test['Transaction_day_of_week'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)
train['Transaction_hour'] = np.floor(train['TransactionDT'] / 3600) % 24
test['Transaction_hour'] = np.floor(test['TransactionDT'] / 3600) % 24

# Some arbitrary features interaction
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    le = LabelEncoder()
    le.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
    train[feature] = le.transform(list(train[feature].astype(str).values))
    test[feature] = le.transform(list(test[feature].astype(str).values))
    
for feature in ['id_34', 'id_36']:
    if feature in useful_features:
        # Count encoded for both train and test
        train[feature + '_count_full'] = train[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))
        test[feature + '_count_full'] = test[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))
        
for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
    if feature in useful_features:
        # Count encoded separately for train and test
        train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
        test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

In [53]:
for col in tqdm_notebook(train.columns):
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))  

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=301.0), HTML(value='')))




In [10]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']
test = test.sort_values('TransactionDT').drop(['TransactionDT', 'TransactionID'], axis=1)

del train
gc.collect()

48

In [44]:
gc.collect()

159

In [11]:
X.shape, test.shape

((590540, 298), (506691, 298))

In [12]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
         }

In [13]:
folds = TimeSeriesSplit(n_splits=5)

aucs = list()
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

training_start_time = time()
for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    start_time = time()
    print('Training on fold {}'.format(fold + 1))
    
    trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
    clf = lgb.train(params, trn_data, 10000, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds=500)
    
    feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance()
    aucs.append(clf.best_score['valid_1']['auc'])
    
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
print('-' * 30)
print('Training has finished.')
print('Total training time is {}'.format(str(datetime.timedelta(seconds=time() - training_start_time))))
print('Mean AUC:', np.mean(aucs))
print('-' * 30)

Training on fold 1
Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 1	valid_1's auc: 0.90225
Early stopping, best iteration is:
[1124]	training's auc: 1	valid_1's auc: 0.90262
Fold 1 finished in 0:35:44.422453
Training on fold 2
Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.99997	valid_1's auc: 0.923195
Early stopping, best iteration is:
[843]	training's auc: 0.999877	valid_1's auc: 0.923343
Fold 2 finished in 0:48:29.009611
Training on fold 3
Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.999506	valid_1's auc: 0.913462
Early stopping, best iteration is:
[976]	training's auc: 0.999439	valid_1's auc: 0.913534
Fold 3 finished in 0:08:30.193499
Training on fold 4
Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.997989	valid_1's auc: 0.933244
Early stopping, best iteration is:
[1116]	training's auc: 0.998694	valid_1's auc: 0.933489
Fold 

In [16]:
# clf right now is the last model, trained with 80% of data and validated with 20%
best_iter = clf.best_iteration

In [17]:
clf = lgb.LGBMClassifier(**params, num_boost_round=best_iter)
clf.fit(X, y)

LGBMClassifier(bagging_fraction=0.4181193142567742, bagging_seed=11,
               feature_fraction=0.3797454081646243,
               learning_rate=0.006883242363721497, metric='auc',
               min_child_weight=0.03454472573214212, min_data_in_leaf=106,
               num_boost_round=1344, num_leaves=491, objective='binary',
               random_state=47, reg_alpha=0.3899927210061127,
               reg_lambda=0.6485237330340494, verbosity=-1)

In [32]:
sub = pd.DataFrame()
sub['TransactionID'] = test_id

In [54]:
test.shape

(506691, 300)

In [55]:
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,card2__dist1,card1__card5,card2__id_20,card5__P_emaildomain,addr1__card1,id_36_count_full,id_01_count_dist,id_31_count_dist,id_33_count_dist,id_36_count_dist
0,3663549,18403224,31.95,4,10409,111.0,150.0,4,226.0,2,...,802,482,811,1375,5275,267353,4552,3636,436020,133287
1,3663550,18403263,49.0,4,4272,111.0,150.0,4,226.0,2,...,1609,12585,721,1361,26707,267353,12000,3636,1089,133287
2,3663551,18403310,171.0,4,4476,574.0,150.0,4,226.0,2,...,29448,12800,22981,1378,46662,10609,82579,683,436020,3690
3,3663552,18403310,285.0,4,10989,360.0,150.0,4,166.0,2,...,15520,1163,12085,730,14266,267353,4552,3636,436020,133287
4,3663553,18403317,67.94,4,18018,452.0,150.0,2,117.0,2,...,19848,9503,16357,142,20532,267353,270,3636,436020,133287


In [57]:
test = test.drop(['TransactionDT', 'TransactionID'], axis=1)

In [58]:
sub['isFraud'] = clf.predict_proba(test)[:, 1]

In [59]:
sub.to_csv('ImpFeatures.csv', index=False)

In [60]:
Score: 0.939961