## Summary of project
<a href="https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection)" target="_blank">L</a>

## Common imports

In [1]:
import pandas as pd
import numpy as np
import gc
import os, sys
import time

In [2]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

## Aisimplekit helpers

In [4]:
try:
    ## Occasionally (dev purpose only)
    sys.path.insert(0, "../..")
    import aisimplekit
except ModuleNotFoundError as err:
    print("""[err] {err}""".format(err=err))
    print("""Try: `pip install aisimplekit`""")

In [19]:
from aisimplekit.features.stats import *
from aisimplekit.models.lgb import lgb_train_cv
from aisimplekit.models.xgb import xgb_train_cv

## Custom notebook-specific helpers

In [6]:
def load_datasets(frm, to, debug=True, test_ofst=0, dtypes={}):
    """ """
    print('Loading train: #%d' % (to-frm))
    df_train = pd.read_csv(TRAIN_PATH, parse_dates=['click_time'], skiprows=range(1,frm), nrows=to-frm, dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
    if debug != 0:
        nrows = 100000
        print('Loading test: #%d' % nrows)
        test_df = pd.read_csv(TEST_PATH, skiprows=range(1,1+test_ofst), nrows=nrows, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])        
    elif debug == 0:
        print('Loading test: all. Param test_ofst ignored.')
        test_df = pd.read_csv(TEST_PATH, parse_dates=['click_time'], dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])
    else: raise

        # copy reference field
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')
    len_train = len(df_train)
    df_train = df_train.append(test_df)
    return (df_train, test_df, len_train, sub)

In [7]:
def extract_stats_features(df_train, predictors=['app', 'device', 'os', 'channel', 'hour', 'day']):
    """ """
    print('Extracting stats features...')
    df_train = do_countuniq( df_train, ['ip'], 'app', 'ip_uniq_app_count', 'uint8', show_max=True )
    df_train = do_countuniq( df_train, ['ip'], 'channel', 'ip_uniq_chan_count', 'uint8', show_max=True )
    df_train = do_count( df_train, ['ip', 'app'], 'ip_app_count', show_max=True )
    gc.collect()

    predictors.extend([col for col in df_train.columns if col.startswith('X')])
    predictors.extend([col for col in df_train.columns if col.startswith('Z')])

    if 'ip_tcount' in df_train.columns:
        predictors.extend(['ip_tcount'])
    if 'ip_app_count' in df_train.columns:
        predictors.extend(['ip_app_count'])
    if 'ip_app_os_count' in df_train.columns:
        predictors.extend(['ip_app_os_count'])
    if 'ip_tchan_count' in df_train.columns:
        predictors.extend(['ip_tchan_count'])
    if 'ip_app_os_var' in df_train.columns:
        predictors.extend(['ip_app_os_var'])
    if 'ip_app_channel_var_day' in df_train.columns:
        predictors.extend(['ip_app_channel_var_day'])
    if 'ip_app_channel_mean_hour' in df_train.columns:
        predictors.extend(['ip_app_channel_mean_hour'])

    return df_train, predictors

In [8]:
def extract_timeserie_features(df_train, predictors=['app', 'device', 'os', 'channel', 'hour', 'day']):
    """ """
    ## Timeseries features
    print('Extracting Timserie features...')

    print('[1/2] Extracting nextClick')
    D=2**26
    df_train['category'] = (df_train['ip'].astype(str) + "_" + df_train['app'].astype(str) + "_" + df_train['device'].astype(str) \
        + "_" + df_train['os'].astype(str)).apply(hash) % D
    click_buffer= np.full(D, 3000000000, dtype=np.uint32)

    df_train['epochtime'] = df_train['click_time'].astype(np.int64) // 10 ** 9
    next_clicks= []
    for category, t in zip(reversed(df_train['category'].values), reversed(df_train['epochtime'].values)):
        next_clicks.append(click_buffer[category]-t)
        click_buffer[category]= t
    del(click_buffer)
    qq = list(reversed(next_clicks))

    df_train.drop(['category'], axis=1, inplace=True)
    df_train['nextClick'] = pd.Series(qq).astype('float32')
    predictors.append('nextClick')

    print('[2/2] Extracting nextClick_sameChan')
    D=2**26
    df_train['category'] = (df_train['ip'].astype(str) + "_" + df_train['channel'].astype(str) + "_" + df_train['device'].astype(str) \
        + "_" + df_train['os'].astype(str)).apply(hash) % D
    click_buffer= np.full(D, 3000000000, dtype=np.uint32)

    next_clicks= []
    for category, t in zip(reversed(df_train['category'].values), reversed(df_train['epochtime'].values)):
        next_clicks.append(click_buffer[category]-t)
        click_buffer[category]= t
    del(click_buffer)
    qq = list(reversed(next_clicks))

    df_train.drop(['category'], axis=1, inplace=True)
    df_train['nextClick_sameChan'] = pd.Series(qq).astype('float32')
    predictors.append('nextClick_sameChan')

    return df_train, predictors

## Parameters

In [10]:
## Data stored in Kaggle
#TRAIN_PATH = ".kaggle/competitions/talkingdata-adtracking-fraud-detection/train.csv"
#TEST_PATH = ".kaggle/competitions/talkingdata-adtracking-fraud-detection/test.csv"

## Data downloaded locally
TRAIN_PATH = "../../data/td-frauddetection-001/train.csv"
TEST_PATH = "../../data/td-frauddetection-001/test.csv"

dtypes = {
    'ip': 'uint32', 'app': 'uint16', 'device': 'uint16',
    'os': 'uint16', 'channel': 'uint16', 'is_attributed': 'uint8',
    'click_id': 'uint32',
}

debug = 2
limit_features = False

nrows=184903891-1
frm = 144903891
nchunk = 1000000 #2000000
val_size=int(0.33*nchunk); # debug == 2
df_val = None

if debug == 0:
    ## No cross-validation, all test data
    nchunk = 40000000; val_size = 5000000
    frm = 21500000 # day 2/4
elif debug == 1:
    ## With cross-validation
    nchunk = 5000000; frm = 85000000 # day-1
    val_size = 2000000; frm_val = 144903891

## Train data boundaries (fraction corresponding to nchunk size).
to = frm + nchunk
test_ofst = 0

## Main Loop

In [11]:
%%time
############################################################################################################
#        LOADING DATA
############################################################################################################
(df_train, test_df, len_train, sub) = load_datasets(
    frm, to, debug=debug,
    test_ofst=test_ofst, dtypes=dtypes
)

if debug == 1:
    print('************ Cross-validation: Loading data (#%d samp) ************'.format(val_size))
    len_train = len(df_train) - len(test_df)
#    dtypes = { 'ip': 'uint32', 'app': 'uint16', 'device': 'uint16', 'os': 'uint16', 'channel': 'uint16', 'is_attributed' : 'uint8', 'click_id': 'uint32'}
    df_val = pd.read_csv(TRAIN_PATH, parse_dates=['click_time'], skiprows=range(1,frm_val), nrows=val_size, dtype=dtypes,
                         usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
    df_train = df_train.append(df_val)

Loading train: #1000000
Loading test: #100000


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [12]:
%%time
############################################################################################################
#        FEATURE EXTRACTION (1/2: base stats)
############################################################################################################
df_train['hour'] = pd.to_datetime(df_train.click_time).dt.hour.astype('uint8')
df_train['day'] = pd.to_datetime(df_train.click_time).dt.day.astype('uint8')
df_train['minute'] = pd.to_datetime(df_train.click_time).dt.minute.astype('uint8')

categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
predictors = ['app', 'device', 'os', 'channel', 'hour', 'day']

df_train, predictors = extract_stats_features(df_train, predictors=predictors)

Extracting stats features...
Counting unique  app  by  ['ip'] ...
ip_uniq_app_count max value =  54
Counting unique  channel  by  ['ip'] ...
ip_uniq_chan_count max value =  93
Aggregating by  ['ip', 'app'] ...
ip_app_count max value =  1142
CPU times: user 3.02 s, sys: 96.2 ms, total: 3.11 s
Wall time: 3.12 s


In [13]:
%%time
############################################################################################################
#        FEATURE EXTRACTION (1/2: timserie stats)
############################################################################################################
print('[Feature: next click period]')
(df_train, predictors) = extract_timeserie_features(df_train, predictors)

df_train['minute'] = pd.to_datetime(df_train.click_time).dt.minute.astype('uint8')
prev_len = len(df_train)

[Feature: next click period]
Extracting Timserie features...
[1/2] Extracting nextClick
[2/2] Extracting nextClick_sameChan


NameError: name 'df_train_2' is not defined

In [15]:
############################################################################################################
#        FEATURE SELECTION
############################################################################################################
if limit_features is True:
    predictors = ['app','channel', 'X3', 'X0', 'nextClick',
                  'os', 'nextClickPeriod', 'device', 'hour','day',
                  'nextClick_sameChan', 'ip_app_count']
    categorical = ['app','channel','os','device','hour','day']

In [23]:
%%time
############################################################################################################
#        TRAINING (1/3: Prepare data for LGB)
############################################################################################################
## Drop unnecessary columns
df_train.drop(['click_time'], axis=1, inplace=True)
df_train.drop(['epochtime'], axis=1, inplace=True)

## Convert types
df_train['ip_uniq_app_count'] = df_train['ip_uniq_app_count'].astype('uint16')
df_train['ip_app_count'] = df_train['ip_app_count'].astype('uint16')
df_train['ip_uniq_chan_count'] = df_train['ip_uniq_chan_count'].astype('uint16')

## Learning Parmeters: LGB
params = {
    'learning_rate': 0.05,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 15,  # 2^max_depth - 1
    'max_depth': 4,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucbketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight': 50, # because training data is extremely unbalanced 
}

## Target columns
target = 'is_attributed'

## Categorical columns
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']

KeyError: "['click_time'] not found in axis"

In [18]:
############################################################################################################
#        TRAINING (2/3: Train/Test/CV Split)
############################################################################################################
test_df = df_train[len_train:]

if debug == 1:
    df_val = df_train[-val_size:]
    df_train = df_train[:len_train]
else:
    df_val = df_train[(len_train-val_size):len_train]
    df_train = df_train[:(len_train-val_size)]

print(df_train.shape, df_val.shape, test_df.shape)
print('Predictors:  %s' % predictors)
print('Categorical: %s' % categorical)

(670000, 15) (330000, 15) (100000, 15)
Predictors:  ['app', 'device', 'os', 'channel', 'hour', 'day', 'ip_app_count', 'nextClick', 'nextClick_sameChan']
Categorical: ['app', 'device', 'os', 'channel', 'hour', 'day']


In [51]:
############################################################################################################
#        TRAINING (3/3: Train LGB Model)
############################################################################################################

### LGB Model

In [36]:
(bst, best_iteration, eval_score) = lgb_train_cv(
    params, df_train, df_val, predictors, target, 
    objective='binary', 
    metrics='auc',
    early_stopping_rounds=30,
    verbose_eval=True, 
    num_boost_round=1000,
    categorical_features=categorical
)

Preparing validation datasets
Training until validation scores don't improve for 30 rounds.
[10]	train's auc: 0.84027	valid's auc: 0.819636
[20]	train's auc: 0.895856	valid's auc: 0.848605
[30]	train's auc: 0.887147	valid's auc: 0.919368
Early stopping, best iteration is:
[1]	train's auc: 0.942482	valid's auc: 0.923254

Model Report
bst1.best_iteration:  1
auc: 0.9232537071703216


### XGB Model

In [50]:
xgb_params = {} # use default ones

model = xgb_train_cv(
    xgb_params, df_train, df_val, predictors, target,
    objective='binary:logistic',
    early_stopping_rounds=25,
    num_boost_round=200,
    verbose_eval=5
)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[0]	train-auc:0.920701	valid-auc:0.904953
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 25 rounds.
[5]	train-auc:0.944551	valid-auc:0.932233
[10]	train-auc:0.960622	valid-auc:0.951792
[15]	train-auc:0.966551	valid-auc:0.95693
[20]	train-auc:0.968442	valid-auc:0.958544
[25]	train-auc:0.971583	valid-auc:0.959531
[30]	train-auc:0.974489	valid-auc:0.960434
[35]	train-auc:0.977473	valid-auc:0.961272
[40]	train-auc:0.979203	valid-auc:0.961788
[45]	train-auc:0.981173	valid-auc:0.961983
[50]	train-auc:0.983022	valid-auc:0.961731
[55]	train-auc:0.984093	valid-auc:0.9627
[60]	train-auc:0.984602	valid-auc:0.962681
[65]	train-auc:0.985627	valid-auc:0.962492
[70]	train-auc:0.986707	valid-auc:0.96227
[75]	train-auc:0.98752	valid-auc:0.96242
Stopping. Best iteration:
[52]	train-auc:0.983734	valid-auc:0.962895



## Measure XGB Performance on cross-validation data

In [41]:
## validation data
import xgboost as xgb
dvalid = xgb.DMatrix(df_val[predictors], df_val.is_attributed)

## predictions on val
pred = model.predict(dvalid, ntree_limit=model.best_ntree_limit)

## Roc score
from sklearn.metrics import roc_auc_score
score = roc_auc_score(df_val.is_attributed, pred)
print(score)

0.962895298413186


In [46]:
############################################################################################################
#        PREDICTIONS on TEST data.
############################################################################################################
pred = bst.predict(test_df[predictors], num_iteration=best_iteration)

fnames = {
    0: 'final-output.csv',
    1: 'out-dev-big.csv',
    2: 'out-dev-small.csv'
}

fname = fnames[debug]
print('Saving output to: {fname}'.format(fname=fname))
sub['is_attributed'] = pred
sub.to_csv(fname,index=False,float_format='%.9f')

Saving output to: out-dev-small.csv


In [49]:
!ls 

td-frauddetection-001.ipynb
