<h2>Summary of project</h2>
<br/>
<a href="https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection" target="_blank">Link to Kaggle Competition</a>

## Common imports

In [5]:
import pandas as pd
import numpy as np
import gc
import os, sys
import time
from sklearn.metrics import roc_auc_score

## Aisimplekit helpers

In [8]:
## Development convenience: put the directory two levels up at the front of
## the import path, then check that the package can be imported. When it cannot,
## print the error and an installation hint.
try:
    sys.path.insert(0, "../..")
    import aisimplekit
except ModuleNotFoundError as import_error:
    for message in (
        "[err] {err}".format(err=import_error),
        "Try: `pip install aisimplekit`",
    ):
        print(message)

In [9]:
from aisimplekit.features.stats import *
from aisimplekit.models.lgb import lgb_train_cv
from aisimplekit.models.xgb import xgb_train_cv

## Custom notebook-specific helpers

In [10]:
def load_datasets(frm, to, debug=True, test_ofst=0, dtypes=None):
    """Read a slice of the training CSV and the test CSV, then stack them.

    The files are read from the module-level ``TRAIN_PATH`` and ``TEST_PATH``.

    Args:
        frm: Number of the first training data row to keep (row 0 is the CSV
            header; data rows ``1 .. frm-1`` are skipped).
        to: ``to - frm`` training rows are read.
        debug: Any value different from 0 reads only 100000 test rows, after
            skipping ``test_ofst`` data rows. 0 reads the whole test file and
            ignores ``test_ofst``.
        test_ofst: Number of leading test data rows to skip (debug mode only).
        dtypes: Optional ``{column: dtype}`` mapping forwarded to
            ``pd.read_csv``. ``None`` lets pandas infer the types.

    Returns:
        Tuple ``(df_all, test_df, len_train, sub)`` where ``df_all`` is the
        training rows followed by the test rows, ``test_df`` is the test frame
        as read, ``len_train`` is the number of training rows in ``df_all`` and
        ``sub`` is a frame holding only the integer ``click_id`` column of the
        test set.
    """
    feature_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

    print('Loading train: #%d' % (to-frm))
    df_train = pd.read_csv(
        TRAIN_PATH,
        parse_dates=['click_time'],
        skiprows=range(1, frm),
        nrows=to-frm,
        dtype=dtypes,
        usecols=feature_cols + ['is_attributed'],
    )

    test_kwargs = dict(
        parse_dates=['click_time'],
        dtype=dtypes,
        usecols=feature_cols + ['click_id'],
    )
    if debug != 0:
        nrows = 100000
        print('Loading test: #%d' % nrows)
        test_df = pd.read_csv(
            TEST_PATH, skiprows=range(1, 1+test_ofst), nrows=nrows, **test_kwargs
        )
    else:
        print('Loading test: all. Param test_ofst ignored.')
        test_df = pd.read_csv(TEST_PATH, **test_kwargs)

    # Keep the test identifiers aside for the submission file.
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    len_train = len(df_train)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # The original row labels are kept on purpose, so labels repeat between
    # the train and test parts: use positional access on the result.
    df_train = pd.concat([df_train, test_df], sort=False)
    return (df_train, test_df, len_train, sub)

In [11]:
def extract_stats_features(df_train, predictors=None):
    """Add per-IP count features and extend the list of predictor columns.

    Three columns are computed with the ``do_countuniq`` / ``do_count``
    helpers: ``ip_uniq_app_count``, ``ip_uniq_chan_count`` and
    ``ip_app_count``. Of these, only ``ip_app_count`` is part of the
    optional-feature list below, so the two unique-count columns are created
    but not added to the predictors.

    Args:
        df_train: Frame holding at least the ``ip``, ``app`` and ``channel``
            columns.
        predictors: Predictor names to start from. Defaults to
            ``['app', 'device', 'os', 'channel', 'hour', 'day']``. The list is
            copied, so neither the caller's list nor the default is modified.

    Returns:
        Tuple ``(df_train, predictors)``: the frame returned by the helpers and
        the extended predictor list. Names already present are not added twice.
    """
    if predictors is None:
        predictors = ['app', 'device', 'os', 'channel', 'hour', 'day']
    else:
        predictors = list(predictors)

    print('Extracting stats features...')
    # NOTE(review): the unique counts are requested as 'uint8'; how the helper
    # handles counts above 255 is not visible from here - confirm.
    df_train = do_countuniq(df_train, ['ip'], 'app', 'ip_uniq_app_count', 'uint8', show_max=True)
    df_train = do_countuniq(df_train, ['ip'], 'channel', 'ip_uniq_chan_count', 'uint8', show_max=True)
    df_train = do_count(df_train, ['ip', 'app'], 'ip_app_count', show_max=True)
    gc.collect()

    optional_features = (
        'ip_tcount',
        'ip_app_count',
        'ip_app_os_count',
        'ip_tchan_count',
        'ip_app_os_var',
        'ip_app_channel_var_day',
        'ip_app_channel_mean_hour',
    )
    # Same order as before: 'X*' columns, then 'Z*' columns, then whichever
    # optional features exist in the frame.
    candidates = [col for col in df_train.columns if col.startswith('X')]
    candidates += [col for col in df_train.columns if col.startswith('Z')]
    candidates += [name for name in optional_features if name in df_train.columns]

    for col in candidates:
        if col not in predictors:
            predictors.append(col)

    return df_train, predictors

In [12]:
def _next_click_delta(df, group_cols, epochtime, n_buckets=2**26, sentinel=3000000000):
    """Time until the next row falling in the same hashed ``group_cols`` bucket.

    Rows are walked from last to first. For each row the result is the epoch
    of the most recently visited row of the same bucket minus the row's own
    epoch; a row with no later row in its bucket gets ``sentinel - epoch``.

    Args:
        df: Frame providing the ``group_cols`` columns.
        group_cols: Columns whose string values, joined with ``"_"``, form the
            grouping key.
        epochtime: Integer array of per-row epoch seconds, in the row order
            of ``df``.
        n_buckets: Size of the hash table the keys are folded into.
        sentinel: Initial "next click" epoch stored in every bucket.

    Returns:
        ``float32`` numpy array with one value per row, in row order.
    """
    key = df[group_cols[0]].astype(str)
    for col in group_cols[1:]:
        key = key + "_" + df[col].astype(str)
    # Built-in str hashes are salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so which distinct keys share a bucket can
    # differ from run to run.
    buckets = (key.apply(hash) % n_buckets).values

    last_seen = np.full(n_buckets, sentinel, dtype=np.uint32)
    gaps = []
    for bucket, t in zip(buckets[::-1], epochtime[::-1]):
        gaps.append(last_seen[bucket] - t)
        last_seen[bucket] = t
    gaps.reverse()
    return np.asarray(gaps, dtype=np.int64).astype('float32')


def extract_timeserie_features(df_train, predictors=None):
    """Add the ``nextClick`` and ``nextClick_sameChan`` features.

    ``nextClick`` groups rows by (ip, app, device, os) and
    ``nextClick_sameChan`` by (ip, channel, device, os); see
    ``_next_click_delta`` for the exact definition. An ``epochtime`` helper
    column (``click_time`` as integer seconds) is also added and left in the
    frame. The columns are written into ``df_train`` itself.

    The features are assigned by position. Assigning a freshly indexed
    ``pd.Series`` would align on index labels instead, and a frame built by
    stacking train and test rows repeats labels, so rows would get the values
    of other rows.

    Args:
        df_train: Frame with ``ip``, ``app``, ``device``, ``os``, ``channel``
            and a datetime ``click_time`` column. Rows are processed in their
            current order.
        predictors: Predictor names to start from. Defaults to
            ``['app', 'device', 'os', 'channel', 'hour', 'day']``. The list is
            copied, so neither the caller's list nor the default is modified.

    Returns:
        Tuple ``(df_train, predictors)`` with the two feature names appended
        to the predictor list when not already present.
    """
    if predictors is None:
        predictors = ['app', 'device', 'os', 'channel', 'hour', 'day']
    else:
        predictors = list(predictors)

    ## Timeseries features
    print('Extracting Timserie features...')

    # NOTE(review): the division by 10 ** 9 assumes click_time has nanosecond
    # resolution (datetime64[ns]) - confirm for the pandas version in use.
    df_train['epochtime'] = df_train['click_time'].astype(np.int64) // 10 ** 9
    epochtime = df_train['epochtime'].values

    print('[1/2] Extracting nextClick')
    df_train['nextClick'] = _next_click_delta(
        df_train, ['ip', 'app', 'device', 'os'], epochtime
    )
    if 'nextClick' not in predictors:
        predictors.append('nextClick')

    print('[2/2] Extracting nextClick_sameChan')
    df_train['nextClick_sameChan'] = _next_click_delta(
        df_train, ['ip', 'channel', 'device', 'os'], epochtime
    )
    if 'nextClick_sameChan' not in predictors:
        predictors.append('nextClick_sameChan')

    return df_train, predictors

## Parameters

In [13]:
## Input files. The active pair points at a local download; the commented
## pair is the location used when the data is stored by the Kaggle client.
#TRAIN_PATH = ".kaggle/competitions/talkingdata-adtracking-fraud-detection/train.csv"
#TEST_PATH = ".kaggle/competitions/talkingdata-adtracking-fraud-detection/test.csv"
TRAIN_PATH = "../../data/td-frauddetection-001/train.csv"
TEST_PATH = "../../data/td-frauddetection-001/test.csv"

## Column dtypes handed to pd.read_csv.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
}

## Run mode:
##   0 -> large training chunk, whole test file
##   1 -> training chunk plus a separately loaded validation slice
##   anything else (2 here) -> small chunk, validation carved out of it
debug = 2
limit_features = False

nrows = 184903891 - 1
df_val = None
test_ofst = 0

if debug == 0:
    ## No cross-validation, all test data
    frm = 21500000  # day 2/4
    nchunk = 40000000
    val_size = 5000000
elif debug == 1:
    ## With cross-validation
    frm = 85000000  # day-1
    nchunk = 5000000
    val_size = 2000000
    frm_val = 144903891
else:
    frm = 144903891
    nchunk = 1000000  #2000000
    val_size = int(0.33*nchunk)

## Train data boundaries (fraction corresponding to nchunk size).
to = frm + nchunk

## Main Loop

In [14]:
%%time
############################################################################################################
#        LOADING DATA
############################################################################################################
## Training slice [frm, to) followed by the test rows, in one frame.
(df_train, test_df, len_train, sub) = load_datasets(
    frm, to, debug=debug,
    test_ofst=test_ofst, dtypes=dtypes
)
## The first len_train rows of df_train are the training rows.
assert len_train == len(df_train) - len(test_df)

if debug == 1:
    ## Cross-validation mode: read a separate validation slice starting at
    ## frm_val and stack it after the test rows.
    ## (The banner used str.format on a '%d' placeholder, which printed a
    ## literal '#%d'; %-formatting fills it in.)
    print('************ Cross-validation: Loading data (#%d samp) ************' % val_size)
    df_val = pd.read_csv(
        TRAIN_PATH,
        parse_dates=['click_time'],
        skiprows=range(1, frm_val),
        nrows=val_size,
        dtype=dtypes,
        usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
    )
    ## DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df_train = pd.concat([df_train, df_val], sort=False)

Loading train: #1000000
Loading test: #100000
CPU times: user 52.9 s, sys: 3.42 s, total: 56.3 s
Wall time: 58.4 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [15]:
%%time
############################################################################################################
#        FEATURE EXTRACTION (1/2: base stats)
############################################################################################################
## Calendar parts of the click timestamp, stored as small unsigned integers.
for time_part in ('hour', 'day', 'minute'):
    df_train[time_part] = getattr(
        pd.to_datetime(df_train.click_time).dt, time_part
    ).astype('uint8')

## Both lists start from the same raw columns; the feature extractors extend
## the predictors.
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day']
categorical = list(predictors)

df_train, predictors = extract_stats_features(df_train, predictors=predictors)

Extracting stats features...
Counting unique  app  by  ['ip'] ...
ip_uniq_app_count max value =  54
Counting unique  channel  by  ['ip'] ...
ip_uniq_chan_count max value =  93
Aggregating by  ['ip', 'app'] ...
ip_app_count max value =  1142
CPU times: user 2.99 s, sys: 111 ms, total: 3.1 s
Wall time: 3.14 s


In [16]:
%%time
############################################################################################################
#        FEATURE EXTRACTION (2/2: time-series stats)
############################################################################################################
## Time-to-next-click features (adds nextClick and nextClick_sameChan).
print('[Feature: next click period]')
df_train, predictors = extract_timeserie_features(df_train, predictors)

## Minute of the click, recomputed on the frame returned above.
df_train['minute'] = pd.to_datetime(df_train['click_time']).dt.minute.astype('uint8')
prev_len = df_train.shape[0]

[Feature: next click period]
Extracting Timserie features...
[1/2] Extracting nextClick
[2/2] Extracting nextClick_sameChan
CPU times: user 13.7 s, sys: 776 ms, total: 14.5 s
Wall time: 14.5 s


In [17]:
############################################################################################################
#        FEATURE SELECTION
############################################################################################################
## Optional hand-picked feature subset. Some names in this list ('X3', 'X0',
## 'nextClickPeriod') are not created anywhere in this notebook, so the list is
## filtered against the columns that exist; this avoids a KeyError when the
## models later select df_train[predictors].
if limit_features is True:
    wanted_predictors = ['app','channel', 'X3', 'X0', 'nextClick',
                         'os', 'nextClickPeriod', 'device', 'hour','day',
                         'nextClick_sameChan', 'ip_app_count']
    missing_predictors = [col for col in wanted_predictors if col not in df_train.columns]
    if missing_predictors:
        print('[warn] Ignoring predictors missing from df_train: %s' % missing_predictors)
    predictors = [col for col in wanted_predictors if col in df_train.columns]
    categorical = ['app','channel','os','device','hour','day']

In [18]:
%%time
############################################################################################################
#        TRAINING (1/3: Prepare data for LGB)
############################################################################################################
## Remove the raw timestamp and the epoch helper column.
df_train.drop(['click_time', 'epochtime'], axis=1, inplace=True)

## Store the count features as 16-bit unsigned integers.
for count_col in ('ip_uniq_app_count', 'ip_app_count', 'ip_uniq_chan_count'):
    df_train[count_col] = df_train[count_col].astype('uint16')

## LightGBM learning parameters
params = {
    'learning_rate': 0.05,
    # 'is_unbalance' is not set: scale_pos_weight is used instead.
    'num_leaves': 15,           # 2^max_depth - 1
    'max_depth': 4,             # -1 means no limit
    'min_child_samples': 100,   # minimum number of samples in a leaf (min_data_in_leaf)
    'max_bin': 100,             # number of buckets used to bin feature values
    'subsample': 0.7,           # fraction of training rows sampled
    'subsample_freq': 1,        # subsampling frequency; <= 0 disables it
    'colsample_bytree': 0.9,    # fraction of columns sampled for each tree
    'min_child_weight': 0,      # minimum sum of instance weight (hessian) in a leaf
    'scale_pos_weight': 50,     # the training data is extremely unbalanced
}

## Label column
target = 'is_attributed'

## Columns treated as categorical
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']

CPU times: user 110 ms, sys: 115 µs, total: 110 ms
Wall time: 109 ms


In [19]:
############################################################################################################
#        TRAINING (2/3: Train/Test/CV Split)
############################################################################################################
## Row layout of df_train at this point:
##   debug == 1 : [train rows | test rows | validation rows]
##   otherwise  : [train rows | test rows]
## The test slice is bounded by the number of test rows that were loaded.
## Without the upper bound, the validation rows stacked at the end in
## debug == 1 would land in test_df and the submission frame would no longer
## match the predictions in length.
n_test = len(test_df)
test_df = df_train[len_train:len_train + n_test]

if debug == 1:
    ## Validation rows were loaded separately and stacked last.
    df_val = df_train[-val_size:]
    df_train = df_train[:len_train]
else:
    ## Carve the validation rows out of the end of the training rows.
    df_val = df_train[(len_train-val_size):len_train]
    df_train = df_train[:(len_train-val_size)]

print(df_train.shape, df_val.shape, test_df.shape)
print('Predictors:  %s' % predictors)
print('Categorical: %s' % categorical)

(670000, 15) (330000, 15) (100000, 15)
Predictors:  ['app', 'device', 'os', 'channel', 'hour', 'day', 'ip_app_count', 'nextClick', 'nextClick_sameChan']
Categorical: ['app', 'device', 'os', 'channel', 'hour', 'day']


In [20]:
############################################################################################################
#        TRAINING (3/3: Train LGB Model)
############################################################################################################

### LGB Model

In [29]:
## Fit LightGBM with early stopping on the validation rows.
bst, best_iteration, eval_score = lgb_train_cv(
    params, df_train, df_val, predictors, target,
    objective='binary',
    metrics='auc',
    num_boost_round=1000,
    early_stopping_rounds=30,
    categorical_features=categorical,
    verbose_eval=True,
)

## Validation predictions at the best iteration, and their ROC AUC.
pred_val_1 = bst.predict(df_val[predictors], num_iteration=best_iteration)
score = roc_auc_score(df_val[target], pred_val_1)
print(score)

Preparing validation datasets




Training until validation scores don't improve for 30 rounds.
[10]	train's auc: 0.871709	valid's auc: 0.836244
[20]	train's auc: 0.874509	valid's auc: 0.818293
[30]	train's auc: 0.923822	valid's auc: 0.878125
Early stopping, best iteration is:
[1]	train's auc: 0.942618	valid's auc: 0.923391

Model Report
bst1.best_iteration:  1
auc: 0.9233913046726969
0.923391304672697


### XGB Model

In [30]:
import xgboost as xgb

## Empty dict: rely on the library defaults.
xgb_params = {}

model = xgb_train_cv(
    xgb_params, df_train, df_val, predictors, target,
    objective='binary:logistic',
    num_boost_round=200,
    early_stopping_rounds=25,
    verbose_eval=5,
)

## Score the booster on the validation rows.
dvalid = xgb.DMatrix(df_val[predictors], df_val[target])
pred_val_2 = model.predict(dvalid, ntree_limit=model.best_ntree_limit)

## ROC AUC on validation
score = roc_auc_score(df_val[target], pred_val_2)
print(score)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	train-auc:0.920691	valid-auc:0.904783
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 25 rounds.
[5]	train-auc:0.945002	valid-auc:0.932226
[10]	train-auc:0.960655	valid-auc:0.95107
[15]	train-auc:0.965505	valid-auc:0.956018
[20]	train-auc:0.968309	valid-auc:0.958279
[25]	train-auc:0.971974	valid-auc:0.95978
[30]	train-auc:0.975527	valid-auc:0.960439
[35]	train-auc:0.977717	valid-auc:0.961915
[40]	train-auc:0.979526	valid-auc:0.962997
[45]	train-auc:0.981462	valid-auc:0.962769
[50]	train-auc:0.983056	valid-auc:0.963067
[55]	train-auc:0.984027	valid-auc:0.963087
[60]	train-auc:0.984808	valid-auc:0.963358
[65]	train-auc:0.985794	valid-auc:0.963702
[70]	train-auc:0.987043	valid-auc:0.963581
[75]	train-auc:0.98786	valid-auc:0.963285
[80]	train-auc:0.988488	valid-auc:0.962862
[85]	train-auc:0.988777	valid-auc:0.962575
[90]	train-auc:0.989291	valid-auc:0.962745
[95]	train-auc:0.98965	valid-auc:0.962823
Sto

### Catboost

In [23]:
import catboost as cb
import sklearn

## Positions, within `predictors`, of the columns CatBoost treats as
## categorical. They are looked up by name so they stay correct if the order of
## `predictors` changes; with the default predictor order this is [0, 1, 2, 3].
cat_features_index = [
    predictors.index(col)
    for col in ('app', 'device', 'os', 'channel')
    if col in predictors
]

def auc(m, train, test):
    """Return (train AUC, test AUC) for a fitted classifier `m`.

    NOTE(review): this helper reads `y_train` and `y_test` from the enclosing
    scope, and neither name is defined in this notebook. It is not called
    anywhere here; define both label vectors before using it.
    """
    return (sklearn.metrics.roc_auc_score(y_train, m.predict_proba(train)[:,1]),
                sklearn.metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))

model_catb = cb.CatBoostClassifier(
    eval_metric="AUC", depth=7, iterations=200,
    l2_leaf_reg=4,
    learning_rate=0.25,
    od_type='Iter', od_wait=20,
    one_hot_max_size=50
)

## No eval_set is passed to fit here.
model_catb.fit(df_train[predictors], df_train[target], cat_features=cat_features_index)

## Probability of the positive class on the validation rows, and its ROC AUC.
pred_val_3 = model_catb.predict_proba(df_val[predictors])[:,1]
score = roc_auc_score(df_val[target], pred_val_3)
print(score)

0:	total: 955ms	remaining: 3m 10s
1:	total: 1.87s	remaining: 3m 5s
2:	total: 2.85s	remaining: 3m 7s
3:	total: 3.56s	remaining: 2m 54s
4:	total: 4.38s	remaining: 2m 50s
5:	total: 5.13s	remaining: 2m 46s
6:	total: 5.82s	remaining: 2m 40s
7:	total: 6.54s	remaining: 2m 36s
8:	total: 7.31s	remaining: 2m 35s
9:	total: 7.97s	remaining: 2m 31s
10:	total: 8.71s	remaining: 2m 29s
11:	total: 9.39s	remaining: 2m 27s
12:	total: 10s	remaining: 2m 24s
13:	total: 10.8s	remaining: 2m 23s
14:	total: 11.7s	remaining: 2m 24s
15:	total: 12.6s	remaining: 2m 24s
16:	total: 13.3s	remaining: 2m 23s
17:	total: 14.1s	remaining: 2m 22s
18:	total: 15s	remaining: 2m 22s
19:	total: 15.8s	remaining: 2m 22s
20:	total: 16.5s	remaining: 2m 20s
21:	total: 17s	remaining: 2m 17s
22:	total: 17.9s	remaining: 2m 17s
23:	total: 18.8s	remaining: 2m 17s
24:	total: 19.5s	remaining: 2m 16s
25:	total: 20.2s	remaining: 2m 15s
26:	total: 20.9s	remaining: 2m 14s
27:	total: 21.6s	remaining: 2m 12s
28:	total: 22.3s	remaining: 2m 11s
29:

<catboost.core.CatBoostClassifier at 0x7fd4450ec240>

## Stacking Models: final predictions

### on validation data

In [34]:
## Blend weights for (LightGBM, XGBoost, CatBoost).
w1, w2, w3 = 1/5., 2/5., 2/5.

## Weighted average of the three validation predictions.
pred_stacked_cv = w1 * pred_val_1 + w2 * pred_val_2 + w3 * pred_val_3

## ROC AUC of the blend on validation
score = roc_auc_score(df_val[target], pred_stacked_cv)
print(score)

0.9646512755430705


### on test data

In [38]:
## LightGBM prediction at its best iteration.
pred_1 = bst.predict(test_df[predictors], num_iteration=best_iteration)

## XGBoost prediction, limited to its best number of trees.
dtest = xgb.DMatrix(test_df[predictors])
pred_2 = model.predict(dtest, ntree_limit=model.best_ntree_limit)

## CatBoost probability of the positive class.
pred_3 = model_catb.predict_proba(test_df[predictors])[:,1]

## Same weighted average as on the validation rows.
pred_stacked_test = w1 * pred_1 + w2 * pred_2 + w3 * pred_3

In [39]:
############################################################################################################
#        PREDICTIONS on TEST data.
############################################################################################################
## Output file name for each run mode (keyed by `debug`).
fnames = dict([
    (0, 'final-output.csv'),
    (1, 'out-dev-big.csv'),
    (2, 'out-dev-small.csv'),
])
fname = fnames[debug]
print('Saving output to: {fname}'.format(fname=fname))

## Attach the blended predictions to the click ids and write the CSV.
sub[target] = pred_stacked_test
sub.to_csv(fname, float_format='%.9f', index=False)

Saving output to: out-dev-small.csv


In [40]:
!ls 

catboost_info  out-dev-small.csv  td-frauddetection-001.ipynb
