In [1]:
# install library 
# ! pip install datatable

# faster read
# import datatable as dt

# library 
import numpy as np 
import pandas as pd 
import itertools
from scipy import interp
import os
import time
import datetime
import gc
import json
from numba import jit
from itertools import product
from tqdm import tqdm_notebook

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Data processing, metrics and modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from bayes_opt import BayesianOptimization
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model

# ML
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, CatBoostClassifier

# options
pd.set_option('display.max_columns', 500)

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64 in that order;
    float columns against float16 and float32, falling back to float64. The bounds
    are inclusive, so a column whose extreme value sits exactly on a dtype limit
    (e.g. a maximum of 127) still gets the narrow dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: the choice is driven by range only; float16 keeps few significant digits.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (every comparison is then False)
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

### read dataset

In [3]:
# Load the four raw tables plus the sample submission
train_transaction = pd.read_csv('../input/train_transaction.csv')
test_transaction = pd.read_csv('../input/test_transaction.csv')
train_identity = pd.read_csv('../input/train_identity.csv')
test_identity = pd.read_csv('../input/test_identity.csv')
sub = pd.read_csv('../input/sample_submission.csv')

# Left-join identity onto transactions (every transaction row is kept),
# then shrink the numeric dtypes of each merged frame
train = reduce_mem_usage(
    train_transaction.merge(train_identity, on='TransactionID', how='left')
)
test = reduce_mem_usage(
    test_transaction.merge(test_identity, on='TransactionID', how='left')
)

# Column labels of the merged train frame followed by those of the identity table
base_columns = [*train.columns, *train_identity.columns]

FileNotFoundError: File b'../input/train_transaction.csv' does not exist

In [4]:
# Only the merged frames are used from here on; drop the references to the raw tables
del train_transaction, train_identity, test_transaction, test_identity

print("Train shape : {}".format(train.shape))
print("Test shape  : {}".format(test.shape))

NameError: name 'train_transaction' is not defined

In [5]:
# sampling 

# train = train.sample(5000)
# test = test.sample(5000)

### FE : missing value

In [6]:
# nulls1: number of missing cells in each row
for frame in (train, test):
    frame['nulls1'] = frame.isna().sum(axis=1)

NameError: name 'train' is not defined

### FE : time of day

In [7]:
# Re-import the module: the imports cell rebinds the name with `from datetime import datetime`
import datetime

START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')


def add_time_of_day_features(df):
    """Add ``dow``, ``hour`` and ``day`` columns derived from ``TransactionDT``.

    ``TransactionDT`` is treated as a number of seconds elapsed since ``startdate``.
    The timestamps are built once with a vectorised timedelta conversion instead of
    a per-row ``apply`` repeated for every output column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``TransactionDT`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the three new columns.
    """
    timestamps = pd.Timestamp(startdate) + pd.to_timedelta(df['TransactionDT'], unit='s')
    df['dow'] = timestamps.dt.dayofweek
    df['hour'] = timestamps.dt.hour
    df['day'] = timestamps.dt.day
    return df


train = add_time_of_day_features(train)
test = add_time_of_day_features(test)

NameError: name 'train' is not defined

> ### FE : id_31

In [8]:
def setbrowser(df):
    """Set ``lastest_browser`` to 1 on rows whose ``id_31`` is one of the listed browser releases.

    Rows with any other ``id_31`` value (including missing) are left untouched.
    ``df`` is modified in place and also returned.
    """
    flagged_releases = [
        "samsung browser 7.0",
        "opera 53.0",
        "mobile safari 10.0",
        "google search application 49.0",
        "firefox 60.0",
        "edge 17.0",
        "chrome 69.0",
        "chrome 67.0 for android",
        "chrome 63.0 for android",
        "chrome 63.0 for ios",
        "chrome 64.0",
        "chrome 64.0 for android",
        "chrome 64.0 for ios",
        "chrome 65.0",
        "chrome 65.0 for android",
        "chrome 65.0 for ios",
        "chrome 66.0",
        "chrome 66.0 for android",
        "chrome 66.0 for ios",
    ]
    is_flagged = df["id_31"].isin(flagged_releases)
    df.loc[is_flagged, 'lastest_browser'] = 1
    return df

# Start every row at 0.0, then let setbrowser() raise the flag where id_31 matches
train["lastest_browser"] = np.zeros(len(train))
test["lastest_browser"] = np.zeros(len(test))
train, test = setbrowser(train), setbrowser(test)

NameError: name 'train' is not defined

### FE : email

In [9]:
def fe_email(df):
    """Add e-mail domain features built from ``P_emaildomain`` and ``R_emaildomain``.

    New columns, in creation order:
      * ``P_email`` / ``R_email``       - domain equals 'xmail.com'
      * ``P_isproton`` / ``R_isproton`` - domain equals 'protonmail.com'
      * ``email_check``                 - 1 when both domains are equal
      * ``email_check_nan_all``         - 1 when both domains are missing
      * ``email_check_nan_any``         - 1 when at least one domain is missing
      * ``email_match_not_nan``         - 1 when the domains are equal and P is not missing
      * ``{P,R}_emaildomain_bin``       - provider group looked up in ``emails``
      * ``{P,R}_emaildomain_suffix``    - last dot-separated piece, 'us' if it is in ``us_emails``
      * ``{P,R}_emaildomain_prefix``    - first dot-separated piece

    Missing domains go through ``str()`` first, so their suffix and prefix are 'nan'.
    ``df`` is modified in place and also returned.
    """
    emails = {
        'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum',
        'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
        'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
        'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft',
        'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
        'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other',
        'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other',
        'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft',
        'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
        'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other',
        'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other',
        'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft',
        'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
        'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att',
        'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft',
        'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink',
        'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
        'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo',
        'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
        'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple',
    }
    us_emails = ['gmail', 'net', 'edu']

    def _suffix(domain):
        # Last dot-separated piece, folded into 'us' when it is one of us_emails
        last_piece = str(domain).split('.')[-1]
        return 'us' if last_piece in us_emails else last_piece

    def _prefix(domain):
        return str(domain).split('.')[0]

    sides = ('P', 'R')

    for side in sides:
        df[side + '_email'] = (df[side + '_emaildomain'] == 'xmail.com')
    for side in sides:
        df[side + '_isproton'] = (df[side + '_emaildomain'] == 'protonmail.com')

    same_domain = df['P_emaildomain'] == df['R_emaildomain']
    p_missing = df['P_emaildomain'].isna()
    r_missing = df['R_emaildomain'].isna()

    df['email_check'] = np.where(same_domain, 1, 0)
    df['email_check_nan_all'] = np.where(p_missing & r_missing, 1, 0)
    df['email_check_nan_any'] = np.where(p_missing | r_missing, 1, 0)
    df['email_match_not_nan'] = np.where(same_domain & ~p_missing, 1, 0)

    for side in sides:
        domain = df[side + '_emaildomain']
        df[side + '_emaildomain_bin'] = domain.map(emails)
        df[side + '_emaildomain_suffix'] = domain.map(_suffix)
        df[side + '_emaildomain_prefix'] = domain.map(_prefix)

    return df

# Add the e-mail domain features to both frames
train, test = fe_email(train), fe_email(test)

NameError: name 'train' is not defined

### FE : card 

In [10]:
# card1 values seen 10 times or fewer in train are treated as noise:
# they are replaced by NaN in both train and test
card1_counts = train['card1'].value_counts()
valid_card = list(card1_counts[card1_counts > 10].index)

for frame in (train, test):
    frame['card1'] = np.where(frame['card1'].isin(valid_card), frame['card1'], np.nan)

NameError: name 'train' is not defined

In [11]:
# card3 / card5: fold infrequent values into a single "Others" bucket.
# Counts are taken per frame, so train and test each decide independently which values are rare.
for col, min_count in (('card3', 200), ('card5', 300)):
    for frame in (train, test):
        counts = frame[col].value_counts()
        rare_values = counts[counts < min_count].index
        frame.loc[frame[col].isin(rare_values), col] = "Others"

NameError: name 'train' is not defined

In [12]:
# Build a rough client identifier from the card columns (uid),
# then a finer one that also includes the address columns (uid2)
def _join_as_str(frame, columns):
    """Concatenate the given columns of ``frame`` as strings, separated by '_'."""
    joined = frame[columns[0]].astype(str)
    for name in columns[1:]:
        joined = joined + '_' + frame[name].astype(str)
    return joined


for frame in (train, test):
    frame['uid'] = _join_as_str(frame, ['card1', 'card2', 'card3', 'card4'])
    frame['uid2'] = _join_as_str(frame, ['uid', 'addr1', 'addr2'])

NameError: name 'train' is not defined

In [13]:
# Count encoding: how often each value occurs in train and test pooled together
# (missing values are counted as a value of their own)
for feature in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6','uid','uid2']:
    pooled_counts = pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False)
    for frame in (train, test):
        frame[feature + '_count_full'] = frame[feature].map(pooled_counts)

NameError: name 'train' is not defined

In [14]:
# Anomaly search in geo information: compare bank "address" and client address.
# card3/card5 -> bank country and name? (author's guess)
# addr2       -> client geo position (country)
# Common combinations -> normal transactions; rare combinations -> possible anomaly
for frame in (train, test):
    frame['bank_type'] = frame['card3'].astype(str)+'_'+frame['card5'].astype(str)
    frame['address_match'] = frame['bank_type'].astype(str)+'_'+frame['addr2'].astype(str)

for col in ['address_match','bank_type']:
    pooled = pd.concat([train[col], test[col]])
    # Keys containing 'nan' (a missing component) are left out of the frequency table,
    # so rows carrying them map to NaN below
    pooled = pooled[~pooled.str.contains('nan')]
    fq_encode = pooled.value_counts().to_dict()
    for frame in (train, test):
        frame[col] = frame[col].map(fq_encode)

# Ratio of the (bank, client country) frequency to the bank frequency
for frame in (train, test):
    frame['address_match'] = frame['address_match']/frame['bank_type']

NameError: name 'train' is not defined

### FE : D9 (hour)

In [15]:
# local_hour: D9 scaled to hours minus the hour-of-day implied by TransactionDT;
# local_hour_dist: that difference divided by dist2
for frame in (train, test):
    d9_hours = frame['D9']*24
    dt_hour_of_day = (frame['TransactionDT']/(60*60))%24
    frame['local_hour'] = d9_hours - dt_hour_of_day
    frame['local_hour_dist'] = frame['local_hour']/frame['dist2']

NameError: name 'train' is not defined

### FE : M1 ~ M9 (binary encoding, except M4)

In [16]:
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

for frame in (train, test):
    m_flags = frame[i_cols]

    # M_sum: number of flags that are set in the row.
    # Assumes a set flag is stored as 'T' (raw CSV) or as 1 (already binarised) - TODO confirm.
    # Summing the raw columns directly does not count anything meaningful for string flags.
    frame['M_sum'] = m_flags.isin(['T', 1]).sum(axis=1).astype(np.int8)

    # M_na: number of flags that are missing in the row
    frame['M_na'] = m_flags.isna().sum(axis=1).astype(np.int8)

    # M_type: all flags concatenated into one pattern string, e.g. '_T_F_nan_...'.
    # Each step appends to the running value; assigning only the current column
    # would leave just '_' + M9 at the end of the loop.
    frame['M_type'] = ''
    for col in i_cols:
        frame['M_type'] = frame['M_type']+'_'+frame[col].astype(str)

NameError: name 'train' is not defined

### FE : addr 

In [17]:
# Pooled train+test occurrence counts for the two address columns (NaN counted too)
for addr_col in ('addr1', 'addr2'):
    pooled_counts = pd.concat([train[addr_col], test[addr_col]], ignore_index=True).value_counts(dropna=False)
    for frame in (train, test):
        frame[addr_col + '_count_full'] = frame[addr_col].map(pooled_counts)

NameError: name 'train' is not defined

### FE : device 

In [18]:
def id_split(df):
    """Split the device / OS / browser / screen strings into separate columns.

    Columns added (source -> first piece, second piece):
      * ``DeviceInfo`` split on '/' -> ``device_name``, ``device_version``
      * ``id_30`` split on ' '      -> ``OS_id_30``, ``version_id_30``
      * ``id_31`` split on ' '      -> ``browser_id_31``, ``version_id_31``
      * ``id_33`` split on 'x'      -> ``screen_width``, ``screen_height``

    A value that has no second piece yields NaN for it. This also holds when no
    value in the whole column contains the separator, a case in which indexing the
    expanded split result by position would raise ``KeyError``.

    ``device_name`` is then normalised with the substring rules below, applied in
    order, and every name occurring fewer than 200 times in ``df`` becomes "Others".
    A constant ``had_id`` column set to 1 is added.

    ``df`` is modified in place and also returned.
    """
    split_specs = [
        ('DeviceInfo', '/', 'device_name', 'device_version'),
        ('id_30', ' ', 'OS_id_30', 'version_id_30'),
        ('id_31', ' ', 'browser_id_31', 'version_id_31'),
        ('id_33', 'x', 'screen_width', 'screen_height'),
    ]
    for source, separator, first_col, second_col in split_specs:
        pieces = df[source].str.split(separator)  # split once, reuse for both pieces
        df[first_col] = pieces.str[0]
        df[second_col] = pieces.str[1]            # NaN when there is no second piece

    #df['id_34'] = df['id_34'].str.split(':', expand=True)[1]
    #df['id_23'] = df['id_23'].str.split(':', expand=True)[1]

    # (substring, replacement) pairs; order matters because a later rule sees the
    # names already rewritten by earlier ones
    brand_rules = [
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    ]
    for pattern, brand in brand_rules:
        df.loc[df['device_name'].str.contains(pattern, na=False), 'device_name'] = brand

    name_counts = df['device_name'].value_counts()
    rare_names = name_counts[name_counts < 200].index
    df.loc[df['device_name'].isin(rare_names), 'device_name'] = "Others"
    df['had_id'] = 1
    gc.collect()

    return df

# Add the device / OS / browser / screen columns to both frames
train, test = id_split(train), id_split(test)

NameError: name 'train' is not defined

### FE : TransactionAmt

In [19]:
# Natural log of the transaction amount
for frame in (train, test):
    frame['TransactionAmt_log'] = np.log(frame['TransactionAmt'])

NameError: name 'train' is not defined

In [20]:
# Fractional part of the amount, scaled by 1000 and truncated to an integer
for frame in (train, test):
    amount = frame['TransactionAmt']
    whole_part = amount.astype(int)
    frame['TransactionAmt_decimal'] = ((amount - whole_part) * 1000).astype(int)

NameError: name 'train' is not defined

In [21]:
# Amounts occurring more than 10 times in train.
# This rebinds `valid_card`; the list is not read by the statements below.
amount_counts = train['TransactionAmt'].value_counts()
valid_card = list(amount_counts[amount_counts > 10].index)

# TransactionAmt_check: 1 when the row's amount also appears in the other frame, else 0
seen_in_test = train['TransactionAmt'].isin(test['TransactionAmt'])
seen_in_train = test['TransactionAmt'].isin(train['TransactionAmt'])
train['TransactionAmt_check'] = np.where(seen_in_test, 1, 0)
test['TransactionAmt_check']  = np.where(seen_in_train, 1, 0)

NameError: name 'train' is not defined

In [22]:
# Raw TransactionAmt has many unique values and is noisy for the model (author's note),
# so add per-group mean / std / sum of the amount, computed over train and test pooled.
i_cols = ['card1','card2','card3','card5','uid','uid2']

for col in i_cols:
    pooled = pd.concat([train[[col, 'TransactionAmt']], test[[col,'TransactionAmt']]])
    amounts_by_group = pooled.groupby([col])['TransactionAmt']
    for agg_type in ['mean', 'std' ,'sum']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        # group value -> aggregate; values absent from the lookup map to NaN
        lookup = amounts_by_group.agg(agg_type).to_dict()
        train[new_col_name] = train[col].map(lookup)
        test[new_col_name]  = test[col].map(lookup)

NameError: name 'train' is not defined

### FE : two features label encoding

In [23]:
# Each name is '<first>__<second>': the two source columns are joined as strings
# and the combination is label-encoded with one encoder fitted on train + test
pair_features = [
    'id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain',
    'P_emaildomain__C2', 'card2__dist1', 'card1__card5', 'card2__id_20',
    'card5__P_emaildomain', 'addr1__card1',
]
for feature in pair_features:
    f1, f2 = feature.split('__')
    for frame in (train, test):
        frame[feature] = frame[f1].astype(str) + '_' + frame[f2].astype(str)

    train_labels = list(train[feature].astype(str).values)
    test_labels = list(test[feature].astype(str).values)

    le = preprocessing.LabelEncoder()
    le.fit(train_labels + test_labels)
    train[feature] = le.transform(train_labels)
    test[feature] = le.transform(test_labels)

NameError: name 'train' is not defined

### FE : count encoded separately for train and test

In [24]:
# Count encoding computed inside each frame separately (NaN counted as a value)
for feature in ['id_01', 'id_31', 'id_33', 'id_35']:
    for frame in (train, test):
        own_counts = frame[feature].value_counts(dropna=False)
        frame[feature + '_count_dist'] = frame[feature].map(own_counts)

NameError: name 'train' is not defined

### FE : Freq encoding

In [25]:
# Columns to frequency-encode; the id_* list skips the numbers that are not encoded here
encoded_id_numbers = list(range(1, 12)) + [13, 14] + list(range(17, 23)) + [24, 25, 26] + list(range(30, 34))
i_cols = (
    ['card1','card2','card3','card5']
    + ['C%d' % n for n in range(1, 15)]
    + ['D%d' % n for n in range(1, 10)]
    + ['addr1','addr2']
    + ['dist1','dist2']
    + ['P_emaildomain', 'R_emaildomain']
    + ['id_%02d' % n for n in encoded_id_numbers]
    + ['DeviceInfo']
)

for col in i_cols:
    # value -> number of occurrences in train and test pooled; missing values are
    # not counted, so they map to NaN
    fq_encode = pd.concat([train[col], test[col]]).value_counts().to_dict()
    for frame in (train, test):
        frame[col+'_fq_enc'] = frame[col].map(fq_encode)

NameError: name 'train' is not defined

### FE : ProductCD and M4 Target mean

In [26]:
# Target-mean encoding: mean of isFraud per category, computed on train only
# and mapped onto both frames
for col in ['ProductCD','M4']:
    target_col = col+'_target_mean'
    temp_dict = train.groupby([col])['isFraud'].mean().to_dict()
    for frame in (train, test):
        frame[target_col] = frame[col].map(temp_dict)

NameError: name 'train' is not defined

### FE : feature aggregation 

In [27]:
def add_group_aggregates(train_df, test_df, group_cols, value_col, agg_types=('mean', 'std', 'sum')):
    """Add per-group aggregates of ``value_col`` to both frames, in place.

    For every column in ``group_cols`` and every aggregate in ``agg_types``, the
    aggregate of ``value_col`` is computed over train and test pooled together and
    written to a column named ``<group>_<value_col>_<agg>``. Rows whose group value
    has no aggregate (e.g. a missing key) receive NaN.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to extend; both must contain ``value_col`` and all ``group_cols``.
    group_cols : list of str
        Columns to group by, one at a time.
    value_col : str
        Column being aggregated.
    agg_types : sequence of str
        Aggregation names accepted by ``groupby(...).agg``.
    """
    for group_col in group_cols:
        # The pooled frame depends only on the group column, so build it once
        pooled = pd.concat([train_df[[group_col, value_col]], test_df[[group_col, value_col]]])
        values_by_group = pooled.groupby([group_col])[value_col]
        for agg_type in agg_types:
            new_col_name = group_col+'_'+value_col+'_'+agg_type
            lookup = values_by_group.agg(agg_type).to_dict()
            train_df[new_col_name] = train_df[group_col].map(lookup)
            test_df[new_col_name]  = test_df[group_col].map(lookup)


# The <col>_TransactionAmt_{mean,std,sum} columns for the card / uid keys are already
# built in the "FE : TransactionAmt" section above from the same inputs, so they are
# not recomputed here. Only the D15 aggregates are new.
i_cols = ['card1','card2','card3','card5','uid','uid2']
add_group_aggregates(train, test, i_cols, 'D15')

i_cols = ['addr1','addr2']
add_group_aggregates(train, test, i_cols, 'D15')

NameError: name 'train' is not defined

### FE : replace missing values

In [28]:
# fill in mean for floats
# for c in train.columns:
#     if train[c].dtype=='float16' or  train[c].dtype=='float32' or  train[c].dtype=='float64':
#         train[c].fillna(train[c].mean())
#         train[c].fillna(train[c].mean())

# fill in -999 for categoricals
# train = train.fillna(-999)
# test = test.fillna(-999)

### FE : character feature encoding 

In [29]:
# Label-encode every column that is of object dtype in train.
# Missing values get their own label; the encoder is fitted on train + test together
# so both frames share one code table, and the result is stored as a categorical.
object_columns = [name for name in train.columns if train[name].dtype == 'O']

for col in object_columns:
    print(col)
    for frame in (train, test):
        frame[col] = frame[col].fillna('unseen_before_label').astype(str)

    le = LabelEncoder()
    le.fit(list(train[col])+list(test[col]))

    for frame in (train, test):
        frame[col] = le.transform(frame[col])
        frame[col] = frame[col].astype('category')

NameError: name 'train' is not defined

### FE : drop features 

In [30]:
# Columns flagged in either frame by any of three rules:
#   * more than 90% of the values are missing
#   * a single value (NaN included) covers more than 90% of the rows
#   * at most one distinct non-missing value
many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]
big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

flagged_cols = set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols + one_value_cols_test)
# Never drop the target. discard() is a no-op when isFraud was not flagged,
# whereas list.remove() would raise ValueError in that case.
flagged_cols.discard('isFraud')
# Sorted so the list is the same from run to run (set iteration order is not)
cols_to_drop = sorted(flagged_cols)

train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

print(">> num of cols_to_drop :",len(cols_to_drop))

NameError: name 'train' is not defined

### model

In [31]:
# Number of columns left in train after the drop step
print(">> num of columns :", train.shape[1])

NameError: name 'train' is not defined

In [32]:
# Columns excluded from the test feature matrix
test_rm_cols = [
    'TransactionID','TransactionDT', # These columns are pure noise right now
    'uid','uid2',                    # Our new client uID -> very noisy data
    'bank_type',                     # Victims bank could differ by time
]

# Train drops the same columns plus the target, which must not be a feature
train_rm_cols = test_rm_cols[:2] + ['isFraud'] + test_rm_cols[2:]

# Order train by transaction time once and take features and target from that frame
train_by_time = train.sort_values('TransactionDT')
X = train_by_time.drop(train_rm_cols, axis=1)
y = train_by_time['isFraud']
X_test = test.drop(test_rm_cols, axis=1)

NameError: name 'train' is not defined

In [33]:
# by https://www.kaggle.com/dimartinot
def clean_inf_nan(df):
    """Return ``df`` with every positive or negative infinity replaced by NaN.

    The input frame is not modified; ``DataFrame.replace`` returns a new object.
    """
    infinities = [np.inf, -np.inf]
    return df.replace(to_replace=infinities, value=np.nan)

# Turn infinite values into NaN in both feature matrices
X, X_test = clean_inf_nan(X), clean_inf_nan(X_test)

NameError: name 'X' is not defined

In [34]:
n_fold = 5
# folds = TimeSeriesSplit(n_splits=n_fold)
folds = KFold(n_splits=n_fold)  # contiguous, unshuffled folds

# Out-of-fold predictions for train, averaged fold predictions for test, per-fold AUC
oof, predictions = np.zeros(X.shape[0]), np.zeros(X_test.shape[0])
cv_auc = []

NameError: name 'X' is not defined

In [35]:
%%time 

@jit
def fast_auc(y_true, y_prob):
    """
    Fast ROC AUC: https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013

    Labels are ordered by ascending score; each label is multiplied by the number
    of zero labels seen so far (itself included) and the products are summed. The
    sum is divided by (zero labels) * (all labels - zero labels).
    """
    ranked_labels = np.asarray(y_true)[np.argsort(y_prob)]
    total = len(ranked_labels)
    negatives_seen = 0
    area = 0
    for position in range(total):
        label = ranked_labels[position]
        negatives_seen += (1 - label)
        area += label * negatives_seen
    area /= (negatives_seen * (total - negatives_seen))
    return area

def eval_auc(y_true, y_pred):
    """
    Fast auc eval function for lgb.

    Works with both calling conventions:

    * ``eval_auc(y_true, y_pred)`` with two array-likes;
    * ``feval(preds, eval_data)`` as used by ``lgb.train``, where the first argument
      holds the predictions and the second is a dataset object exposing
      ``get_label()``. The labels are then read from the dataset, so the dataset
      object is never passed to ``fast_auc`` as if it were the scores.

    Returns the tuple ``('auc', value, True)``; the last item marks the metric as
    higher-is-better.
    """
    if hasattr(y_pred, 'get_label'):
        # (preds, eval_data) call: swap into (labels, scores)
        y_true, y_pred = y_pred.get_label(), y_true
    return 'auc', fast_auc(y_true, y_pred), True

# LightGBM training parameters passed to lgb.train
params = dict(
    boosting_type='gbdt',
    objective="binary",
    metric="auc",
    num_leaves=256,
    min_child_samples=79,
    max_depth=15,
    learning_rate=0.02,
    subsample_freq=3,
    subsample=0.9,
    bagging_seed=11,
    verbosity=-1,
    reg_alpha=0.3,
    reg_lambda=0.3,
    colsample_bytree=0.9,
)

# Cross-validated training: one model per fold, out-of-fold predictions for train,
# test predictions averaged over the folds.
# The splitter only needs the number of rows, so the frame is passed directly;
# `X.values` would first build one array out of all the mixed-dtype columns.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print("Fold {}".format(fold_))
    train_df, y_train_df = X.iloc[trn_idx], y.iloc[trn_idx]
    valid_df, y_valid_df = X.iloc[val_idx], y.iloc[val_idx]

    trn_data = lgb.Dataset(train_df, label=y_train_df)
    val_data = lgb.Dataset(valid_df, label=y_valid_df)

    # Up to 5000 rounds, logging every 200, early stopping with a patience of 200
    clf = lgb.train(params,
                    trn_data,
                    5000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=200,
                    feval = eval_auc,
                    early_stopping_rounds=200)

    pred = clf.predict(valid_df)
    oof[val_idx] = pred

    # Score the fold once and reuse the value for the log and the CV list
    fold_auc = roc_auc_score(y_valid_df, pred)
    cv_auc.append(fold_auc)
    print( "  auc = ", fold_auc )

    predictions += clf.predict(X_test) / n_fold

NameError: name 'X' is not defined

In [36]:
# Mean of the per-fold validation AUC values (label typo "AUV" corrected to "AUC")
print(">> AUC(CV) :",np.mean(cv_auc))

NameError: name 'cv_auc' is not defined

In [37]:
# Put the fold-averaged test predictions into the sample submission and save it
# without the index column
sub['isFraud'] = predictions
sub.to_csv(path_or_buf='submission.csv', index=False)

NameError: name 'predictions' is not defined