In [1]:
#Best single model up to now LB: 0.9820

import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc
from sklearn import metrics

path = '../tdata/'
start_time = time.time()

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }
ctdtypes = {
        'ip_app_channel_var_day'    : np.float32,
        'qty'                       : 'uint32',
        'ip_app_count'              : 'uint32',
        'ip_app_os_count'           : 'uint32',
        'qty_var'                   : np.float32,
        'ip_app_os_var'             : np.float32,
        'ip_app_channel_mean_hour'  : np.float32
        }

validation = False
if validation:
    add_ = 'val'
    ntrees = 2000
    early_stop = 100
    test_usecols = ['ip','app','device','os', 'channel', 'click_time', 'is_attributed']
    val_size = 0
else:
    ntrees = 1600
    val_size = 22222
    early_stop = ntrees
    add_ = ''
    test_usecols = ['ip','app','device','os', 'channel', 'click_time', 'click_id']

print('[{}] Load Train'.format(time.time() - start_time))
train_df = pd.read_csv(path+"train%s.csv"%(add_), dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

print('[{}] Load Test'.format(time.time() - start_time))
test_df = pd.read_csv(path+"test%s.csv"%(add_), dtype=dtypes, usecols=test_usecols)

print('[{}] Load Features'.format(time.time() - start_time))
feattrnapp = pd.read_csv(path+'lead_lag_trn_ip_device_os_app%s.gz'%(add_), compression = 'gzip')
feattstapp = pd.read_csv(path+'lead_lag_tst_ip_device_os_app%s.gz'%(add_), compression = 'gzip')
feattrnspl = pd.read_csv(path+'lead_split_sec_trn_ip_device_os_app%s.gz'%(add_), compression = 'gzip').astype(np.float32)
feattstspl = pd.read_csv(path+'lead_split_sec_tst_ip_device_os_app%s.gz'%(add_), compression = 'gzip').astype(np.float32)
feattrnchl = pd.read_csv(path+'lead_lag_trn_ip_device_os_channel%s.gz'%(add_), compression = 'gzip')
feattstchl = pd.read_csv(path+'lead_lag_tst_ip_device_os_channel%s.gz'%(add_), compression = 'gzip')
feattrnos  = pd.read_csv(path+'lead_lag_trn_ip_device_os%s.gz'%(add_), compression = 'gzip')
feattstos  = pd.read_csv(path+'lead_lag_tst_ip_device_os%s.gz'%(add_), compression = 'gzip')
# feattrncum = pd.read_csv(path+'cum_min_trn_ip_device_os_app%s.gz'%(add_), compression = 'gzip')
# feattstcum = pd.read_csv(path+'cum_min_tst_ip_device_os_app%s.gz'%(add_), compression = 'gzip')
feattrnld2 = pd.read_csv(path+'lead2_trn_ip_device_os_app%s.gz'%(add_), compression = 'gzip')
feattstld2 = pd.read_csv(path+'lead2_tst_ip_device_os_app%s.gz'%(add_), compression = 'gzip')
feattrnnext  = pd.read_csv(path+'next_trn_ip_device_os%s.gz'%(add_), compression = 'gzip').astype(np.int8)
feattstnext  = pd.read_csv(path+'next_tst_ip_device_os%s.gz'%(add_), compression = 'gzip').astype(np.int8)
feattrnprev  = pd.read_csv(path+'prevdayipchlqtytrn%s.gz'%(add_), compression = 'gzip')#.astype(np.int32)
feattstprev  = pd.read_csv(path+'prevdayipchlqtytst%s.gz'%(add_), compression = 'gzip')#.astype(np.int32)
feattstprev.fillna(-1, inplace = True)
feattrnprev = feattrnprev.astype(np.int32)
feattstprev = feattstprev.astype(np.int32)

featentip  = pd.read_csv(path+'entropyip.gz', compression = 'gzip')
featentip.iloc[:,1:] = featentip.iloc[:,1:].astype(np.float32)
featentip.iloc[:,0] = featentip.iloc[:,0].astype('uint32')
featentiphr  = pd.read_csv(path+'entropyiphr.gz', compression = 'gzip')
featentiphr.iloc[:,2:] = featentiphr.iloc[:,2:].astype(np.float32)
featentiphr.iloc[:,[0,1]] = featentiphr.iloc[:,[0,1]].astype('uint32')
featentiphr.rename(columns={'click_hr': 'hour'}, inplace = True)

print('[{}] Finished Loading Features, start concatenate'.format(time.time() - start_time))
def sumfeat(df):
    dfsum = df.iloc[:,0] + df.iloc[:,1]
    dfsum[df.iloc[:,0]<0] = -1
    dfsum[df.iloc[:,1]<0] = -2
    dfsum[(df.iloc[:,1]>1000) & (df.iloc[:,0]>1000)] = -3
    dfsum[(df.iloc[:,1]<0) & (df.iloc[:,0]<0)] = -4
    return dfsum

feattstapp.columns = feattrnapp.columns = [i+'_app' for i in feattrnapp.columns.tolist()]
feattstchl.columns = feattrnchl.columns = [i+'_chl' for i in feattrnchl.columns.tolist()]
feattstos.columns  = feattrnos.columns  = [i+'_os' for i in feattrnos.columns.tolist()]

feattrn = pd.concat([feattrnchl, feattrnos, feattrnapp], axis=1)
feattst = pd.concat([feattstchl, feattstos, feattstapp], axis=1)
feattrn['click_sec_lsum_os'] = sumfeat(feattrnos)
feattrn['click_sec_lsum_chl'] = sumfeat(feattrnchl)
feattst['click_sec_lsum_os'] = sumfeat(feattstos)
feattst['click_sec_lsum_chl'] = sumfeat(feattstchl)
feattst[['click_sec_lead_chl', 'click_sec_lead_app']].head(300)
feattrn['click_sec_lead_sameappchl'] = \
        (feattrn['click_sec_lead_chl']==feattrn['click_sec_lead_app']).astype('int8')
feattst['click_sec_lead_sameappchl'] = \
        (feattst['click_sec_lead_chl']==feattst['click_sec_lead_app']).astype('int8')

del feattrnchl, feattrnos, feattrnapp
del feattstchl, feattstos, feattstapp
import gc
gc.collect()
#feattrn.hist()
#pd.crosstab(feattrn['click_sec_lag_sameappchl'],train_df['is_attributed'])#feattst.hist()

clip_val = 3600*9
feattrn = feattrn.clip(-clip_val, clip_val).astype(np.int32)
feattst = feattst.clip(-clip_val, clip_val).astype(np.int32)
feattrn = pd.concat([feattrn, feattrnld2, feattrnspl], axis=1)
feattst = pd.concat([feattst, feattstld2, feattstspl], axis=1)
del feattrnld2, feattrnspl
del feattstld2, feattstspl
gc.collect()
#feattrn.hist()
#feattst.hist()
print(train_df.shape)
print(test_df.shape)


print('[{}] Concat Train/Test'.format(time.time() - start_time))
train_df = pd.concat([train_df, feattrn, feattrnnext, feattrnprev], axis=1)
test_df  = pd.concat([test_df , feattst, feattstnext, feattstprev], axis=1)
del feattrn, feattst, feattrnnext, feattstnext, feattrnprev, feattstprev
gc.collect()


print(train_df.shape)
print(test_df.shape)

len_train = len(train_df)
train_df=train_df.append(test_df)
del test_df
gc.collect()

print('[{}] Time prep'.format(time.time() - start_time))
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
train_df['minute'] = pd.to_datetime(train_df.click_time).dt.minute.astype('uint8')
gc.collect()


print('[{}] group by...unique app per ip/dev/os'.format(time.time() - start_time))
gp = train_df[['device', 'ip', 'os', 'app']].groupby(by=['device', 'ip', 'os'])[['app']].nunique().reset_index().rename(index=str, columns={'app': 'unique_app_ipdevos'})
print('merge...')
train_df = train_df.merge(gp[['device', 'ip', 'os', 'unique_app_ipdevos']], on=['device', 'ip', 'os'], how='left')
del gp
gc.collect()
train_df.rename(columns={'unique_app_ipdevos_x': 'unique_app_ipdevos'}, inplace = True)

print('[{}] group by...count app per ip/dev/os/min'.format(time.time() - start_time))
gp = train_df[['device', 'ip', 'os', 'app', 'minute']].groupby(by=['device', 'ip', 'os', 'minute'])[['app']].count().reset_index().rename(index=str, columns={'app': 'unique_app_ipdevosmin'})
print('merge...')
train_df = train_df.merge(gp[['device', 'ip', 'os', 'minute', 'unique_app_ipdevosmin']], on=['device', 'ip', 'os', 'minute'], how='left')
del gp
gc.collect()

print('[{}] group by...count channel per ip/day/hr/chl'.format(time.time() - start_time))
gp = train_df[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'qty_chl'})
print('merge...')
train_df = train_df.merge(gp, on=['ip','day','hour'], how='left')
del gp
gc.collect()

print('[{}] group by...count channel per ip/app/chl'.format(time.time() - start_time))
gp = train_df[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
train_df = train_df.merge(gp, on=['ip','app'], how='left')
del gp
gc.collect()

print('[{}] group by...count channel per ip/app/os/chl'.format(time.time() - start_time))
gp = train_df[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
train_df = train_df.merge(gp, on=['ip','app', 'os'], how='left')
del gp
gc.collect()

print('[{}] Add entropy'.format(time.time() - start_time))
train_df = train_df.merge(featentip, on=['ip'], how='left')
train_df = train_df.merge(featentiphr, on=['ip', 'hour'], how='left')
#print train_df.head()
#print featentip.head()
del featentip, featentiphr
gc.collect()

print('[{}] Data types'.format(time.time() - start_time))
train_df['qty'] = train_df['qty'].astype('uint16')
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')
train_df['click_sec_lead_shift2'] = train_df['click_sec_lead_shift2'].astype('int32')
train_df['channel_app'] = train_df['channel'] + 500*train_df['app']

print( train_df.head() )
print( train_df.tail() )




[0.000328063964844] Load Train
(184903890, 7)
(18790469, 7)
[562.577267885] Concat Train/Test
(184903890, 25)
(18790469, 25)
[601.141028881] Time prep
[736.695734978] group by...unique app per ip/dev/os
merge...
[1313.60971498] group by...count app per ip/dev/os/min
merge...
[1548.47443604] group by...unique app per ip/day/hr/chl
merge...
[1715.09707093] group by...unique app per ip/app/chl
[1901.06852984] group by...unique app per ip/app/os/chl
[2150.5323689] Add entropy
[2367.18158102] Data types
   app  channel  click_id  click_sec_lag_app  click_sec_lag_chl  \
0    3      379       NaN                 -1                 -1   
1    3      379       NaN                 -1                 -1   
2    3      379       NaN                 -1                 -1   
3   14      478       NaN                 -1                 -1   
4    3      379       NaN                 -1                 -1   

   click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
0                -1           

In [3]:
train_df.iloc[:5,:20].head()

Unnamed: 0,app,channel,click_id,click_sec_lag_app,click_sec_lag_chl,click_sec_lag_os,click_sec_lead_app,click_sec_lead_chl,click_sec_lead_os,click_sec_lead_sameappchl,click_sec_lead_shift2,click_sec_lead_split_sec,click_sec_lsum_chl,click_sec_lsum_os,click_time,device,ip,is_attributed,os,prevday_qty
0,3,379,,-1,-1,-1,5340,5444,5307,0,5340,5340.212891,-2,-2,2017-11-06 14:32:21,1,83230,0.0,13,-1
1,3,379,,-1,-1,-1,5547,30591,5239,0,5662,5547.390625,-2,-2,2017-11-06 14:33:34,1,17357,0.0,19,-1
2,3,379,,-1,-1,-1,5925,6005,5205,0,6043,5925.76123,-2,-2,2017-11-06 14:34:12,1,35810,0.0,13,-1
3,14,478,,-1,-1,-1,5110,5679,5108,0,5224,5110.51123,-2,-2,2017-11-06 14:34:52,1,45745,0.0,13,-1
4,3,379,,-1,-1,-1,32400,32400,27338,0,33782,33756.210938,-2,-2,2017-11-06 14:35:08,1,161007,0.0,13,-1


In [4]:
train_df['click_time'] = pd.to_datetime( train_df['click_time'] )
gc.collect()
print("Done!")

Done!


In [8]:
print('[{}] group by...unique channel per ip/app/os/chl'.format(time.time() - start_time))
gp = train_df[['day','channel', 'hour', 'app']].groupby(by=['day','channel', 'hour'])[['app']].nunique().reset_index().rename(index=str, columns={'app': 'day_chl_hour_count'})
train_df['day_chl_hour_count'] = pd.merge( train_df[['day','channel', 'hour']], gp, on=['day','channel', 'hour'], how='left')['day_chl_hour_count']
del gp
gc.collect()
print(train_df.head())

[3305.94732499] group by...unique channel per ip/app/os/chl
   app  channel  click_id  click_sec_lag_app  click_sec_lag_chl  \
0    3      379       NaN                 -1                 -1   
1    3      379       NaN                 -1                 -1   
2    3      379       NaN                 -1                 -1   
3   14      478       NaN                 -1                 -1   
4    3      379       NaN                 -1                 -1   

   click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
0                -1                5340                5444   
1                -1                5547               30591   
2                -1                5925                6005   
3                -1                5110                5679   
4                -1               32400               32400   

   click_sec_lead_os  click_sec_lead_sameappchl         ...          \
0               5307                          0         ...           
1               

In [9]:
print('[{}] group by...count per app/day/hour'.format(time.time() - start_time))
gp = train_df[['app','day', 'hour', 'minute']].groupby(by=['app','day', 'hour'])[['minute']].count().reset_index().rename(index=str, columns={'minute': 'app_day_hour_count'})
train_df['app_day_hour_count'] = pd.merge( train_df[['app','day', 'hour']], gp, on=['app','day', 'hour'], how='left')['app_day_hour_count']
del gp
gc.collect()
print(train_df.tail())

[3578.84036708] group by...count per app/day/hour
           app  channel    click_id  click_sec_lag_app  click_sec_lag_chl  \
203694354    9      127  18790464.0               1305                 -1   
203694355   23      153  18790465.0              32400              32400   
203694356   18      265  18790467.0                  0                  0   
203694357   27      122  18790466.0                 -1              32400   
203694358   12      265  18790468.0                564               4380   

           click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
203694354                 0                 144                 144   
203694355                 4                  -1                  -1   
203694356                 0                  -1                 649   
203694357                 1                  -1                3099   
203694358               564                   0                   0   

           click_sec_lead_os  click_sec_lead_sameappchl     

In [12]:
print('[{}] group by...count per channel/app/day/hour/minute'.format(time.time() - start_time))
gp = train_df[['channel','app', 'day','hour','minute','click_id']].groupby(by=['channel','app', 'day','hour','minute'])[['click_id']].count().reset_index().rename(index=str, columns={'click_id': 'chl_app_day_hour_min_count'})
train_df['chl_app_day_hour_min_count'] = pd.merge( train_df[['channel','app', 'day','hour','minute']], gp, on=['channel','app', 'day','hour','minute'], how='left')['chl_app_day_hour_min_count']
del gp
gc.collect()
print(train_df.tail())

[3944.59300804] group by...count per channel/app/day/hour/minute
           app  channel    click_id  click_sec_lag_app  click_sec_lag_chl  \
203694354    9      127  18790464.0               1305                 -1   
203694355   23      153  18790465.0              32400              32400   
203694356   18      265  18790467.0                  0                  0   
203694357   27      122  18790466.0                 -1              32400   
203694358   12      265  18790468.0                564               4380   

           click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
203694354                 0                 144                 144   
203694355                 4                  -1                  -1   
203694356                 0                  -1                 649   
203694357                 1                  -1                3099   
203694358               564                   0                   0   

           click_sec_lead_os  click_sec_lead_

In [16]:
print('[{}] group by...count per channel/app'.format(time.time() - start_time))
gp = train_df[['app','channel']].groupby(by=['app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'app_count'})
train_df['app_count'] = pd.merge( train_df[['app']], gp, on=['app'], how='left')['app_count']
del gp
gc.collect()
print(train_df.tail())

[7305.67420197] group by...count per channel/app
           app  channel    click_id  click_sec_lag_app  click_sec_lag_chl  \
203694354    9      127  18790464.0               1305                 -1   
203694355   23      153  18790465.0              32400              32400   
203694356   18      265  18790467.0                  0                  0   
203694357   27      122  18790466.0                 -1              32400   
203694358   12      265  18790468.0                564               4380   

           click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
203694354                 0                 144                 144   
203694355                 4                  -1                  -1   
203694356                 0                  -1                 649   
203694357                 1                  -1                3099   
203694358               564                   0                   0   

           click_sec_lead_os  click_sec_lead_sameappchl    ..

In [18]:
print('[{}] group by...mode per ip/hour'.format(time.time() - start_time))
gp = train_df[['ip','hour','channel']].groupby(by=['ip','hour'])['channel'].agg(lambda x:x.value_counts().index[0]).reset_index().rename(index=str, columns={'channel': 'mode_channel_ip_hour'})
train_df['mode_channel_ip_hour'] = pd.merge( train_df[['ip','hour']], gp, on=['ip','hour'], how='left')['mode_channel_ip_hour']
del gp
gc.collect()
print(train_df.tail())

[7678.01019096] group by...mode per ip/hour
           app  channel    click_id  click_sec_lag_app  click_sec_lag_chl  \
203694354    9      127  18790464.0               1305                 -1   
203694355   23      153  18790465.0              32400              32400   
203694356   18      265  18790467.0                  0                  0   
203694357   27      122  18790466.0                 -1              32400   
203694358   12      265  18790468.0                564               4380   

           click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
203694354                 0                 144                 144   
203694355                 4                  -1                  -1   
203694356                 0                  -1                 649   
203694357                 1                  -1                3099   
203694358               564                   0                   0   

           click_sec_lead_os  click_sec_lead_sameappchl          .

In [20]:
# print('[{}] group by...mode per ip/hour'.format(time.time() - start_time))
# gp = train_df[['ip','device','hour','minute','app']].groupby(by=['ip','device','hour','minute'])['app'].agg(lambda x:x.value_counts().index[0]).reset_index().rename(index=str, columns={'channel': 'mode_ip_device_hour_min'})
# train_df['mode_ip_device_hour_min'] = pd.merge( train_df[['ip','device','hour','minute']], gp, on=['ip','device','hour','minute'], how='left')['mode_ip_device_hour_min']
# del gp
# gc.collect()
# print(train_df.tail())

In [21]:
train_df['minday'] = train_df['hour']*60 + train_df['minute']
print( train_df.head() )


   app  channel  click_id  click_sec_lag_app  click_sec_lag_chl  \
0    3      379       NaN                 -1                 -1   
1    3      379       NaN                 -1                 -1   
2    3      379       NaN                 -1                 -1   
3   14      478       NaN                 -1                 -1   
4    3      379       NaN                 -1                 -1   

   click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
0                -1                5340                5444   
1                -1                5547               30591   
2                -1                5925                6005   
3                -1                5110                5679   
4                -1               32400               32400   

   click_sec_lead_os  click_sec_lead_sameappchl   ...    ip_click_min_entropy  \
0               5307                          0   ...                4.088781   
1               5239                          0   ...   

0

In [29]:
%%time

ds1 = pd.to_datetime( '2017-11-07 16:00:00' )
px = np.where( (train_df['click_time']<=ds1) & (train_df['is_attributed'].notnull())  )[0]
print(len(px))
py = np.where( (train_df['click_time']>ds1)  )[0]
print(len(py))
gc.collect()

train_df['ll_target1'] = np.NaN
#gp = train_df[['channel','app','minday','is_attributed']].iloc[px].groupby(by=['channel','app','minday'])['is_attributed'].mean().reset_index().rename(index=str, columns={'is_attributed': 'll_target1'})               
print(gp.head(5))
train_df['ll_target1'].iloc[py] = pd.merge( train_df[['channel','app','minday']].iloc[py], gp, on=['channel','app','minday'], how='left')['ll_target1']
del gp
gc.collect()


59710594
59710594
    channel  app  minday  ll_target1
0         0  177       1         0.0
1         0  177       2         0.0
2         0  177       3         0.0
3         0  177       4         0.0
4         0  177       6         0.0
5         0  177       7         0.0
6         0  177      10         0.0
7         0  177      13         0.0
8         0  177      16         0.0
9         0  177      18         0.0
10        0  177      21         0.0
11        0  177      22         0.0
12        0  177      27         0.0
13        0  177      28         0.0
14        0  177      29         0.0
15        0  177      30         0.0
16        0  177      31         0.0
17        0  177      32         0.0
18        0  177      33         0.0
19        0  177      38         0.0
20        0  177      41         0.0
21        0  177      42         0.0
22        0  177      43         0.0
23        0  177      46         0.0
24        0  177      49         0.0
25        0  177    

In [30]:
%%time

ds1 = pd.to_datetime( '2017-11-08 16:00:00' )
px = np.where( (train_df['click_time']<=ds1) & (train_df['is_attributed'].notnull())  )[0]
print(len(px))
py = np.where( (train_df['click_time']>ds1)  )[0]
print(len(py))
gc.collect()

gp = train_df[['channel','app','minday','is_attributed']].iloc[px].groupby(by=['channel','app','minday'])['is_attributed'].mean().reset_index().rename(index=str, columns={'is_attributed': 'll_target1'})               
print(gp.head(5))
train_df['ll_target1'].iloc[py] = pd.merge( train_df[['channel','app','minday']].iloc[py], gp, on=['channel','app','minday'], how='left')['ll_target1']
del gp
gc.collect()

122071523
81622836
   channel  app  minday  ll_target1
0        0  177       1         0.0
1        0  177       2         0.0
2        0  177       3         0.0
3        0  177       4         0.0
4        0  177       5         0.0
CPU times: user 50.2 s, sys: 13.4 s, total: 1min 3s
Wall time: 1min 3s


In [31]:
%%time

ds1 = pd.to_datetime( '2017-11-09 16:00:00' )
px = np.where( (train_df['click_time']<=ds1) & (train_df['is_attributed'].notnull())  )[0]
print(len(px))
py = np.where( (train_df['click_time']>ds1)  )[0]
print(len(py))
gc.collect()

gp = train_df[['channel','app','minday','is_attributed']].iloc[px].groupby(by=['channel','app','minday'])['is_attributed'].mean().reset_index().rename(index=str, columns={'is_attributed': 'll_target1'})               
print(gp.head(5))
train_df['ll_target1'].iloc[py] = pd.merge( train_df[['channel','app','minday']].iloc[py], gp, on=['channel','app','minday'], how='left')['ll_target1']
del gp
gc.collect()
print( train_df.tail() )

184903890
18790469
   channel  app  minday  ll_target1
0        0  177       0         0.0
1        0  177       1         0.0
2        0  177       2         0.0
3        0  177       3         0.0
4        0  177       4         0.0
           app  channel    click_id  click_sec_lag_app  click_sec_lag_chl  \
203694354    9      127  18790464.0               1305                 -1   
203694355   23      153  18790465.0              32400              32400   
203694356   18      265  18790467.0                  0                  0   
203694357   27      122  18790466.0                 -1              32400   
203694358   12      265  18790468.0                564               4380   

           click_sec_lag_os  click_sec_lead_app  click_sec_lead_chl  \
203694354                 0                 144                 144   
203694355                 4                  -1                  -1   
203694356                 0                  -1                 649   
203694357         

In [40]:
train_df.columns

Index([u'app', u'channel', u'click_id', u'click_sec_lag_app',
       u'click_sec_lag_chl', u'click_sec_lag_os', u'click_sec_lead_app',
       u'click_sec_lead_chl', u'click_sec_lead_os',
       u'click_sec_lead_sameappchl', u'click_sec_lead_shift2',
       u'click_sec_lead_split_sec', u'click_sec_lsum_chl',
       u'click_sec_lsum_os', u'click_time', u'device', u'ip', u'is_attributed',
       u'os', u'prevday_qty', u'prevhour_qty', u'qty', u'same_next_app',
       u'same_next_chl', u'same_prev_app', u'same_prev_chl', u'hour', u'day',
       u'minute', u'unique_app_ipdevos', u'unique_app_ipdevosmin', u'qty_chl',
       u'ip_app_count', u'ip_app_os_count', u'ip_device_entropy',
       u'ip_os_entropy', u'ip_app_entropy', u'ip_channel_entropy',
       u'ip_click_hr_entropy', u'ip_click_min_entropy', u'iphr_device_entropy',
       u'iphr_click_min_entropy', u'channel_app', u'day_chl_hour_count',
       u'app_day_hour_count', u'chl_app_day_hour_min_count', u'app_count',
       u'mode_channe

In [41]:
test_df  = train_df[len_train:]
train_df = train_df[:len_train]
gc.collect()

# Remove device 3032
print(train_df.shape)
print('Device 3032 shape : %s'%(train_df[train_df['device']==3032].shape[0]))
train_df = train_df[train_df['device']!=3032]
gc.collect()
print(train_df.shape)

(184903890, 50)
Device 3032 shape : 692891
(184210999, 50)


In [42]:
print('[{}] Get common train and test'.format(time.time() - start_time))
for col in ['app', 'channel', 'channel_app', 'os', 'device','mode_channel_ip_hour']:  
    print('Get common to train and test : %s'%(col))
    common = pd.Series(list(set(train_df[col]) & set(test_df[col])))
    train_df.loc[~train_df[col].isin(common), col ] = 65535
    test_df.loc [ ~test_df[col].isin(common), col ] = 65535
    del common
    gc.collect()

print('[{}] Data split complete'.format(time.time() - start_time))
print("train size: ", len(train_df))
print("test size : ", len(test_df))

[17761.426497] Get common train and test
Get common to train and test : app
Get common to train and test : channel
Get common to train and test : channel_app
Get common to train and test : os
Get common to train and test : device
Get common to train and test : mode_channel_ip_hour
[18171.9869289] Data split complete
('train size: ', 184210999)
('test size : ', 18790469)


In [43]:
#To save RAM

train_df.loc[train_df['is_attributed'].isnull(), 'is_attributed' ] = 0
train_df['is_attributed'] = train_df['is_attributed'].astype(np.int8)

for col in ['app', 'channel', 'channel_app', 'os', 'device']:
    train_df[col] = train_df[col].astype(np.int16)
    test_df [col] = test_df[col].astype(np.int16)
gc.collect()

for col in ['hour','unique_app_ipdevos']:
    train_df[col] = train_df[col].astype(np.int8)
    test_df [col] = test_df[col].astype(np.int8)
gc.collect()

for col in ['unique_app_ipdevosmin']:
    train_df[col] = train_df[col].astype(np.int16)
    test_df [col] = test_df[col].astype(np.int16)
gc.collect()

train_df.drop(['click_id','day','minute'], axis = 1, inplace = True)
gc.collect()
test_df.drop(['click_time','is_attributed','day','minute'], axis = 1, inplace = True)
gc.collect()

print(train_df.dtypes)
print(test_df.dtypes)

app                                    int16
channel                                int16
click_sec_lag_app                      int32
click_sec_lag_chl                      int32
click_sec_lag_os                       int32
click_sec_lead_app                     int32
click_sec_lead_chl                     int32
click_sec_lead_os                      int32
click_sec_lead_sameappchl              int32
click_sec_lead_shift2                  int32
click_sec_lead_split_sec             float32
click_sec_lsum_chl                     int32
click_sec_lsum_os                      int32
click_time                    datetime64[ns]
device                                 int16
ip                                    uint32
is_attributed                           int8
os                                     int16
prevday_qty                            int32
prevhour_qty                           int32
qty                                   uint16
same_next_app                           int8
same_next_

In [60]:
lead_cols = [col for col in train_df.columns if 'lead_' in col]
lead_cols += [col for col in train_df.columns if 'lag_' in col]
lead_cols += [col for col in train_df.columns if 'next_' in col]
lead_cols += [col for col in train_df.columns if 'entropy' in col]
lead_cols += [col for col in train_df.columns if 'qty' in col]
#lead_cols += [col for col in train_df.columns if 'mode' in col]
lead_cols += ['ip', 'app','device','os', 'channel', 'hour', 'ip_app_count', 'ip_app_os_count', 'unique_app_ipdevosmin', 'app_count', 'chl_app_day_hour_min_count', 'app_day_hour_count', 'day_chl_hour_count' ,'minday','ll_target1','mode_channel_ip_hour']
lead_cols = list(set(lead_cols))

#target = 'is_attributed'
predictors =  lead_cols
categorical = [ 'app','device','os', 'channel', 'hour','mode_channel_ip_hour'] #'channel_app',
print(50*'*')
print(predictors)
print(50*'*')
print(categorical)
print(50*'*')

# if not validation:
#     train_df.drop(['click_id'], axis = 1, inplace = True)
#     val_df.drop(['click_id'], axis = 1, inplace = True)
#     sub = pd.DataFrame()
#     sub['click_id'] = test_df['click_id'].astype('int')
# else:
#     val_df = test_df.sample(frac=0.025, replace=False, random_state=0)
#     gc.collect()
  
print('[{}] Drop features complete'.format(time.time() - start_time))
print("train size: ", len(train_df))
#print("valid size: ", len(val_df))
print("test size : ", len(test_df))

print('[{}]'.format(time.time() - start_time))

**************************************************
['ip', 'app', 'unique_app_ipdevosmin', 'chl_app_day_hour_min_count', 'qty', 'click_sec_lag_chl', 'prevday_qty', 'app_day_hour_count', 'ip_app_count', 'click_sec_lag_os', 'same_next_app', 'channel', 'click_sec_lead_chl', 'qty_chl', 'click_sec_lead_os', 'app_count', 'ip_click_hr_entropy', 'same_next_chl', 'click_sec_lead_sameappchl', 'll_target1', 'ip_os_entropy', 'minday', 'ip_channel_entropy', 'device', 'iphr_click_min_entropy', 'ip_device_entropy', 'ip_app_os_count', 'click_sec_lead_split_sec', 'iphr_device_entropy', 'hour', 'day_chl_hour_count', 'click_sec_lead_shift2', 'ip_app_entropy', 'mode_channel_ip_hour', 'ip_click_min_entropy', 'click_sec_lag_app', 'prevhour_qty', 'click_sec_lead_app', 'os']
**************************************************
['app', 'device', 'os', 'channel', 'hour', 'mode_channel_ip_hour']
**************************************************
[19052.5993791] Drop features complete
('train size: ', 184210999)
('t

In [61]:
ds0 = pd.to_datetime( '2017-11-08 16:00:00' )
#px = np.where( (train_df['click_time']<ds0) & (train_df['hour'].isin( [4,5,9,10,13,14] ))  )[0]
px = np.where( (train_df['click_time']<ds0) )[0]
print(len(px))

ds0 = pd.to_datetime( '2017-11-09 04:00:00' )
ds1 = pd.to_datetime( '2017-11-09 16:00:00' )
py = np.where( (train_df['click_time']>=ds0) & (train_df['click_time']<=ds1) & (train_df['hour'].isin( [4,5,9,10,13,14] )) )[0]
print(len(py))
gc.collect()


121377910
20895641


0

In [63]:
print("Loading LigthGBM Datasets Train...")
xgtrain = lgb.Dataset(train_df[predictors].values[px], label=train_df['is_attributed'].values[px],
                      feature_name=predictors,
                      categorical_feature=categorical,
                      free_raw_data=False
                      )
gc.collect()
print("Loading LigthGBM Datasets Valid...")
xgvalid = lgb.Dataset(train_df[predictors].values[py], label=train_df['is_attributed'].values[py],
                      feature_name=predictors,
                      categorical_feature=categorical,
                      free_raw_data=False
                      )
gc.collect()
print('Done!')

Loading LigthGBM Datasets...
Done!


In [64]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.05,
    #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
    'num_leaves': 15,  # we should let it be smaller than 2^(max_depth)
    'max_depth': 6,  # -1 means no limit
    #'min_child_samples': 10,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 255,  # Number of bucketed bin for feature values
    'subsample': 0.90,  # Subsample ratio of the training instance.
    #'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.50,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    #'subsample_for_bin': 200000,  # Number of samples for constructing bin
    'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
    'reg_alpha': 0,  # L1 regularization term on weights
    'reg_lambda': 0,  # L2 regularization term on weights
    'nthread': 19,
    'verbose': 1,
    'metric':'auc',
    'scale_pos_weight':99.0 # because training data is extremely unbalanced 
}
evals_results = {}
bst = lgb.train(params, 
                 xgtrain, 
                 valid_sets=[xgvalid], 
                 valid_names=['valid'], 
                 evals_result=evals_results, 
                 num_boost_round=2000,
                 early_stopping_rounds=100,
                 verbose_eval=10, 
                 )

n_estimators = bst.best_iteration
print("\nModel Report")
print("n_estimators : ", n_estimators)
print(evals_results['valid']['auc'][n_estimators-1])




Training until validation scores don't improve for 100 rounds.
[10]	valid's auc: 0.965688
[20]	valid's auc: 0.968553
[30]	valid's auc: 0.969721
[40]	valid's auc: 0.970922
[50]	valid's auc: 0.97262
[60]	valid's auc: 0.974519
[70]	valid's auc: 0.975731
[80]	valid's auc: 0.976427
[90]	valid's auc: 0.97722
[100]	valid's auc: 0.97814
[110]	valid's auc: 0.978858
[120]	valid's auc: 0.979451
[130]	valid's auc: 0.979906
[140]	valid's auc: 0.980415
[150]	valid's auc: 0.980735
[160]	valid's auc: 0.981036
[170]	valid's auc: 0.981239
[180]	valid's auc: 0.98148
[190]	valid's auc: 0.98167
[200]	valid's auc: 0.981858
[210]	valid's auc: 0.982029
[220]	valid's auc: 0.982165
[230]	valid's auc: 0.982301
[240]	valid's auc: 0.98241
[250]	valid's auc: 0.982491
[260]	valid's auc: 0.982562
[270]	valid's auc: 0.98265
[280]	valid's auc: 0.98272
[290]	valid's auc: 0.982787
[300]	valid's auc: 0.982861
[310]	valid's auc: 0.98294
[320]	valid's auc: 0.983014
[330]	valid's auc: 0.983068
[340]	valid's auc: 0.983119
[35

In [65]:
gc.collect()
imp = pd.DataFrame([(a,b) for (a,b) in zip(bst.feature_name(), bst.feature_importance())], columns = ['feat', 'imp'])
imp = imp.sort_values('imp', ascending = False).reset_index(drop=True)
print(imp)

                          feat   imp
0                      channel  2922
1                          app  2303
2         mode_channel_ip_hour  1806
3                           os  1799
4                         hour  1273
5     click_sec_lead_split_sec   874
6         ip_click_min_entropy   451
7                ip_os_entropy   442
8            ip_device_entropy   438
9           click_sec_lead_app   432
10       click_sec_lead_shift2   376
11       unique_app_ipdevosmin   369
12              ip_app_entropy   365
13          ip_channel_entropy   357
14                   app_count   355
15         ip_click_hr_entropy   352
16                ip_app_count   320
17                         qty   297
18                     qty_chl   283
19          app_day_hour_count   282
20      iphr_click_min_entropy   249
21             ip_app_os_count   247
22           click_sec_lag_app   223
23                          ip   221
24                      device   193
25                prevhour_qty   173
2

In [66]:
print("Predicting...")
sub = pd.DataFrame()
sub[ 'click_id' ] = test_df['click_id']
sub[ 'click_id' ] = sub[ 'click_id' ].astype( np.int32 )
sub['is_attributed'] = bst.predict( test_df[predictors].values )
print(sub.head())
print("writing...")
sub.to_csv(path + '../sub/sub_giba_lgb0406.csv.gz',index=False, compression = 'gzip')
print("done...")
print(sub.info())


Predicting...
           click_id  is_attributed
184903890         0       0.038044
184903891         1       0.004008
184903892         2       0.000189
184903893         3       0.010869
184903894         4       0.005509
writing...
done...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18790469 entries, 184903890 to 203694358
Data columns (total 2 columns):
click_id         int32
is_attributed    float64
dtypes: float64(1), int32(1)
memory usage: 358.4 MB
None


In [67]:
#kaggle competitions submit -c talkingdata-adtracking-fraud-detection -f sub_giba_lgb0405B.csv.gz -m "lightgbm giba test partial CV: 0.9842"
print('Done')

Done


# Train FULL

In [68]:
del xgtrain, xgvalid
gc.collect()

ds0 = pd.to_datetime( '2017-11-09 16:00:00' )
px = np.where( (train_df['click_time']<ds0) & (train_df['is_attributed'].notnull())  )[0]
print(len(px))

ds0 = pd.to_datetime( '2017-11-09 04:00:00' )
ds1 = pd.to_datetime( '2017-11-09 16:00:00' )
py = np.where( (train_df['click_time']>=ds0) & (train_df['click_time']<=ds1) & (train_df['hour'].isin( [4] )) )[0]
print(len(py))
gc.collect()

print("Loading LigthGBM Datasets FULL...")
xgtrain = lgb.Dataset(train_df[predictors].values[px], label=train_df['is_attributed'].values[px],
                      feature_name=predictors,
                      categorical_feature=categorical,
                      free_raw_data=False
                      )
gc.collect()
print("Loading LigthGBM Datasets Validation...")
xgvalid = lgb.Dataset(train_df[predictors].values[py], label=train_df['is_attributed'].values[py],
                      feature_name=predictors,
                      categorical_feature=categorical,
                      free_raw_data=False
                      )
gc.collect()

print('Train FULL!!')
evals_results = {}
bstFULL = lgb.train(params, 
                 xgtrain, 
                 valid_sets=[xgvalid], 
                 valid_names=['valid'], 
                 evals_result=evals_results, 
                 num_boost_round=n_estimators+150,
                 verbose_eval=10, 
                 )

gc.collect()
imp = pd.DataFrame([(a,b) for (a,b) in zip(bstFULL.feature_name(), bstFULL.feature_importance())], columns = ['feat', 'imp'])
imp = imp.sort_values('imp', ascending = False).reset_index(drop=True)
print(imp)

print("Predicting...")
sub = pd.DataFrame()
sub[ 'click_id' ] = test_df['click_id']
sub[ 'click_id' ] = sub[ 'click_id' ].astype( np.int32 )
sub['is_attributed'] = bstFULL.predict( test_df[predictors].values )
print(sub.head())
print("writing...")
sub.to_csv(path + '../sub/sub_giba_lgb0406_FULL.csv.gz',index=False, compression = 'gzip')
print("done...")
print(sub.info())

184210552
4032691
Loading LigthGBM Datasets FULL...
Loading LigthGBM Datasets Validation...
Train FULL!!
[10]	valid's auc: 0.965051
[20]	valid's auc: 0.968417
[30]	valid's auc: 0.969142
[40]	valid's auc: 0.97009
[50]	valid's auc: 0.971263
[60]	valid's auc: 0.973104
[70]	valid's auc: 0.974055
[80]	valid's auc: 0.974642
[90]	valid's auc: 0.975467
[100]	valid's auc: 0.976418
[110]	valid's auc: 0.976997
[120]	valid's auc: 0.97743
[130]	valid's auc: 0.97782
[140]	valid's auc: 0.978335
[150]	valid's auc: 0.978572
[160]	valid's auc: 0.978818
[170]	valid's auc: 0.979085
[180]	valid's auc: 0.979318
[190]	valid's auc: 0.979515
[200]	valid's auc: 0.979729
[210]	valid's auc: 0.979947
[220]	valid's auc: 0.980077
[230]	valid's auc: 0.980216
[240]	valid's auc: 0.980371
[250]	valid's auc: 0.98048
[260]	valid's auc: 0.980598
[270]	valid's auc: 0.980752
[280]	valid's auc: 0.980867
[290]	valid's auc: 0.980983
[300]	valid's auc: 0.981082
[310]	valid's auc: 0.981186
[320]	valid's auc: 0.98127
[330]	valid's

In [69]:
#kaggle competitions submit -c talkingdata-adtracking-fraud-detection -f sub_giba_lgb0406_FULL.csv.gz -m "added some feature but CV decreased to 0.98411"
print('Done')

Done
