In [1]:
import pandas as pd
import sys
import numpy as np
import gc
#gc.collect()

In [2]:
# Explicit unsigned-integer dtype for each CSV column; handed to pd.read_csv
# through its `dtype=` argument when the train and test files are loaded.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [3]:
# Load the test file, applying the per-column dtypes declared in `dtypes`.
test_df = pd.read_csv(filepath_or_buffer='test.csv', dtype=dtypes)

In [4]:
# Load only the tail of the training file: file lines 1 through 140003890 are
# skipped (line 0, the header row, is kept) and only the listed columns are read.
train_df = pd.read_csv(
    "train.csv",
    skiprows=range(1, 140003891),
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
    dtype=dtypes,
)

In [5]:
# Preview the first five rows of the loaded training slice.
train_df.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,17663,26,1,19,266,2017-11-09 02:33:37,0
1,8720,11,1,47,325,2017-11-09 02:33:37,0
2,118229,2,1,53,477,2017-11-09 02:33:37,0
3,81606,2,1,19,237,2017-11-09 02:33:37,0
4,43068,3,1,13,280,2017-11-09 02:33:37,0


In [6]:
def get_date(x):
    """Return the text that precedes the first space in ``str(x).strip()``.

    For a timestamp such as ``'2017-11-09 02:33:37'`` this is the date part.
    When the stripped text holds no space, the whole stripped text is returned.
    """
    leading, _sep, _rest = str(x).strip().partition(' ')
    return leading
def get_time(x):
    """Return the second space-separated token of ``str(x).strip()``.

    For a timestamp such as ``'2017-11-09 02:33:37'`` this is the time part.
    Returns ``None`` when the stripped text contains no space.
    """
    tokens = str(x).strip().split(' ')
    return tokens[1] if len(tokens) > 1 else None
def get_hour(x):
    """Return the text that precedes the first ':' in ``str(x).strip()``.

    For a clock string such as ``'04:00:00'`` this is the hour field, still as
    text.  When there is no ':' the whole stripped text is returned.
    """
    return str(x).strip().partition(':')[0]

# Step 1: pull the clock portion out of each click timestamp.
train_df['time'] = train_df['click_time'].apply(get_time)
test_df['time'] = test_df['click_time'].apply(get_time)

# Step 2: reduce the clock portion to its hour field (still text at this point).
train_df['hour'] = train_df['time'].apply(get_hour)
test_df['hour'] = test_df['time'].apply(get_hour)

In [7]:
# Preview the first five test rows, now carrying the derived time/hour columns.
test_df.head(n=5)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,time,hour
0,0,5744,9,1,3,107,2017-11-10 04:00:00,04:00:00,4
1,1,119901,9,1,3,466,2017-11-10 04:00:00,04:00:00,4
2,2,72287,21,1,19,128,2017-11-10 04:00:00,04:00:00,4
3,3,78477,15,1,13,111,2017-11-10 04:00:00,04:00:00,4
4,4,123080,12,1,13,328,2017-11-10 04:00:00,04:00:00,4


In [8]:
# The extracted hour is text; convert it to integers in both frames.
train_df['hour'] = train_df['hour'].astype(int)
test_df['hour'] = test_df['hour'].astype(int)

In [9]:
# The raw timestamp and the intermediate clock string are no longer needed
# once the integer hour has been derived.
train_df = train_df.drop(columns=['click_time', 'time'])
test_df = test_df.drop(columns=['click_time', 'time'])

In [10]:
# Compare which hours of the day occur in the training slice vs. the test set.
(train_df['hour'].unique(), test_df['hour'].unique())

(array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 array([ 4,  5,  6,  9, 10, 11, 13, 14, 15]))

In [11]:
def reg_target_encoding(train, test, col, target='is_attributed'):
    """Add a count-encoding column for ``col`` to ``train`` and ``test`` in place.

    Despite the function name, no target statistic is computed: for every
    distinct value of ``col`` the encoding is the number of training rows that
    hold that value and have a non-null ``target``.  The result is written to
    a new column named ``'<col>_count_enc'`` on both frames.

    Inputs:
       train: training dataframe; must contain ``col`` and ``target``.
       test: dataframe encoded with the counts learned from ``train``;
             must contain ``col``.
       col: name of the categorical column to encode.
       target: name of the column whose non-null entries are counted.

    Returns:
       None.  Both dataframes are modified in place.
    """
    new_col_name = '%s_count_enc' % col

    # Number of training rows observed for each category of `col`.
    counts = train.groupby(col)[target].count()
    train[new_col_name] = train[col].map(counts)
    test[new_col_name] = test[col].map(counts)

    # A value with no entry in `counts` was seen zero times in training.
    # (Filling with the total training row count instead would make an unseen
    # category look like the most frequent one.)
    unseen_count = 0
    train[new_col_name] = train[new_col_name].fillna(unseen_count)
    test[new_col_name] = test[new_col_name].fillna(unseen_count)

# Add a '<name>_count_enc' column to both frames for every raw feature.
for col_name in ("ip", "app", "device", "os", "hour", "channel"):
    reg_target_encoding(train_df, test_df, col=col_name)

In [12]:
import catboost as cb
from  sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [13]:
def auc2(m, train, valid, train_labels=None, valid_labels=None):
    """Return ``(train_auc, valid_auc)`` for a fitted binary classifier.

    Inputs:
       m: fitted model exposing ``predict_proba``; column 1 of its output is
          used as the score.
       train: feature matrix scored against the training labels.
       valid: feature matrix scored against the validation labels.
       train_labels: labels matching ``train``.  When omitted, the
          notebook-level ``y_train`` is used.
       valid_labels: labels matching ``valid``.  When omitted, the
          notebook-level ``y_valid`` is used.

    Returns:
       Tuple ``(ROC AUC on train, ROC AUC on valid)``.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working; passing labels explicitly removes that hidden dependency.
    if train_labels is None:
        train_labels = y_train
    if valid_labels is None:
        valid_labels = y_valid
    return (metrics.roc_auc_score(train_labels, m.predict_proba(train)[:, 1]),
            metrics.roc_auc_score(valid_labels, m.predict_proba(valid)[:, 1]))

In [14]:
# Keep the label aside, then remove it from the feature frame.
target = train_df['is_attributed']
train_df = train_df.drop(columns=['is_attributed'])

In [15]:
# Preview the feature frame after count encoding and label removal.
train_df.head(n=5)

Unnamed: 0,ip,app,device,os,channel,hour,ip_count_enc,app_count_enc,device_count_enc,os_count_enc,hour_count_enc,channel_count_enc
0,17663,26,1,19,266,2,646,862847,42060465,10554622,1353113,232636
1,8720,11,1,47,325,2,2639,980339,42060465,379300,1353113,90673
2,118229,2,1,53,477,2,19318,5141746,42060465,322374,1353113,1712062
3,81606,2,1,19,237,2,5163,5141746,42060465,10554622,1353113,752819
4,43068,3,1,13,280,2,672,7361205,42060465,9533113,1353113,2686011


In [16]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    target,
    test_size=0.20,
    random_state=10,
)

In [17]:
# Run a full garbage-collection pass; the cell displays gc.collect()'s return value.
gc.collect(generation=2)

3617

In [31]:
# Baseline LightGBM binary classifier: 100 trees at learning rate 0.1, with
# row subsampling 0.5 and column subsampling 0.8.
gbm = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=900,
    max_depth=50,
    subsample=0.5,
    colsample_bytree=0.8,
    verbose=0,
    metric=['auc'],
)

In [32]:
# Fit while tracking AUC on the validation split, stopping once it has not
# improved for 30 rounds; then report (train AUC, validation AUC).
gbm.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    eval_metric=['auc'],
    early_stopping_rounds=30,
)
print(auc2(gbm, X_train, X_valid))

[1]	valid_0's auc: 0.913754
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.931723
[3]	valid_0's auc: 0.93265
[4]	valid_0's auc: 0.948923
[5]	valid_0's auc: 0.949002
[6]	valid_0's auc: 0.949502
[7]	valid_0's auc: 0.949431
[8]	valid_0's auc: 0.949349
[9]	valid_0's auc: 0.951723
[10]	valid_0's auc: 0.957192
[11]	valid_0's auc: 0.958208
[12]	valid_0's auc: 0.958458
[13]	valid_0's auc: 0.960792
[14]	valid_0's auc: 0.960729
[15]	valid_0's auc: 0.960831
[16]	valid_0's auc: 0.962457
[17]	valid_0's auc: 0.962163
[18]	valid_0's auc: 0.961725
[19]	valid_0's auc: 0.961844
[20]	valid_0's auc: 0.961952
[21]	valid_0's auc: 0.962567
[22]	valid_0's auc: 0.962891
[23]	valid_0's auc: 0.963371
[24]	valid_0's auc: 0.963142
[25]	valid_0's auc: 0.962686
[26]	valid_0's auc: 0.962656
[27]	valid_0's auc: 0.963067
[28]	valid_0's auc: 0.96294
[29]	valid_0's auc: 0.963095
[30]	valid_0's auc: 0.962616
[31]	valid_0's auc: 0.962722
[32]	valid_0's auc: 0.962965
[33]	valid_0's auc: 0

In [35]:
# Sweep max_depth x num_leaves at learning_rate=0.1 with 100 trees.
# Each setting prints its (max_depth, num_leaves) pair, then (train AUC, validation AUC).
for i, j in [(depth, leaves) for depth in [25, 50, 75] for leaves in [300, 600, 900]]:
    print(i, j)
    gbm = lgb.LGBMClassifier(
        objective='binary',
        n_estimators=100,
        learning_rate=0.1,
        num_leaves=j,
        max_depth=i,
        subsample=0.5,
        colsample_bytree=0.8,
        verbose=0,
        metric=['auc'],
    )
    gbm.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        eval_metric=['auc'],
        early_stopping_rounds=30,
        verbose=False,
    )
    print(auc2(gbm, X_train, X_valid))
    # Free memory held by the previous model before the next fit.
    gc.collect()

25 300
(0.97219128728157622, 0.97099741674819073)
25 600
(0.97428683404129257, 0.97126156972559174)
25 900
(0.97520747206799463, 0.97138676496171039)
50 300
(0.97230628330996671, 0.97078942382977007)
50 600
(0.97357541551707338, 0.97116213793153605)
50 900
(0.974272304917434, 0.97138079909231489)
75 300
(0.97230628330996671, 0.97078942382977007)
75 600
(0.97357541551707338, 0.97116213793153605)
75 900
(0.974272304917434, 0.97138079909231489)


In [18]:
# Same max_depth x num_leaves sweep, now at learning_rate=0.05 with up to 500 trees.
# Each setting prints its (max_depth, num_leaves) pair, then (train AUC, validation AUC).
for i, j in [(depth, leaves) for depth in [25, 50, 75] for leaves in [300, 600, 900]]:
    print(i, j)
    gbm = lgb.LGBMClassifier(
        objective='binary',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=j,
        max_depth=i,
        subsample=0.5,
        colsample_bytree=0.8,
        verbose=0,
        metric=['auc'],
    )
    gbm.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        eval_metric=['auc'],
        early_stopping_rounds=30,
        verbose=False,
    )
    print(auc2(gbm, X_train, X_valid))
    # Free memory held by the previous model before the next fit.
    gc.collect()

25 300
(0.97512251714066966, 0.97167133345265477)
25 600
(0.97889727301825336, 0.97203896819164581)
25 900
(0.98133398772933489, 0.97200966207588968)
50 300
(0.97562134821686974, 0.97167686319984203)
50 600
(0.9781174961966872, 0.97189511107457205)
50 900
(0.98124340738035964, 0.97201208202151124)
75 300
(0.97562134821686974, 0.97167686319984203)
75 600
(0.9781174961966872, 0.97189511107457205)
75 900
(0.98124340738035964, 0.97201208202151124)


In [17]:
# Narrower sweep: max_depth in {30, 40} x num_leaves in {700, 800, 1200},
# with learning_rate=0.05, up to 500 trees and subsample=0.5.
for i, j in [(depth, leaves) for depth in [30, 40] for leaves in [700, 800, 1200]]:
    print(i, j)
    gbm = lgb.LGBMClassifier(
        objective='binary',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=j,
        max_depth=i,
        subsample=0.5,
        colsample_bytree=0.8,
        verbose=0,
        metric=['auc'],
    )
    gbm.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        eval_metric=['auc'],
        early_stopping_rounds=30,
        verbose=False,
    )
    print(auc2(gbm, X_train, X_valid))
    # Free memory held by the previous model before the next fit.
    gc.collect()

30 700
(0.97924549105476122, 0.97194032653405782)
30 800
(0.98014291745150639, 0.97203107188673288)
30 1200
(0.98313946312614198, 0.97192286221862501)
40 700
(0.98002332271213299, 0.97193994423260155)
40 800
(0.980300313219993, 0.97196410789146404)
40 1200
(0.98398935900696682, 0.97205214950948116)


In [18]:
# Narrow sweep repeated with subsample=0.8: max_depth in {30, 40} x
# num_leaves in {700, 800, 900}, learning_rate=0.05, up to 500 trees.
for i, j in [(depth, leaves) for depth in [30, 40] for leaves in [700, 800, 900]]:
    print(i, j)
    gbm = lgb.LGBMClassifier(
        objective='binary',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=j,
        max_depth=i,
        subsample=0.8,
        colsample_bytree=0.8,
        verbose=0,
        metric=['auc'],
    )
    gbm.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        eval_metric=['auc'],
        early_stopping_rounds=30,
        verbose=False,
    )
    print(auc2(gbm, X_train, X_valid))
    # Free memory held by the previous model before the next fit.
    gc.collect()

30 700
(0.98039442396397158, 0.97261187982082953)
30 800
(0.98148847607323664, 0.97262645556028693)
30 900
(0.98157770666953315, 0.97260847042490495)
40 700
(0.9796326821165563, 0.97256072224761758)
40 800
(0.98107946047183903, 0.97254173727585758)
40 900
(0.98268582163846885, 0.97256992932308861)


In [None]:
# Narrow sweep repeated with subsample=0.6: max_depth in {30, 40} x
# num_leaves in {700, 800, 900}, learning_rate=0.05, up to 500 trees.
for i, j in [(depth, leaves) for depth in [30, 40] for leaves in [700, 800, 900]]:
    print(i, j)
    gbm = lgb.LGBMClassifier(
        objective='binary',
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=j,
        max_depth=i,
        subsample=0.6,
        colsample_bytree=0.8,
        verbose=0,
        metric=['auc'],
    )
    gbm.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        eval_metric=['auc'],
        early_stopping_rounds=30,
        verbose=False,
    )
    print(auc2(gbm, X_train, X_valid))
    # Free memory held by the previous model before the next fit.
    gc.collect()

30 700
(0.97935095413750395, 0.97216998097951357)
30 800


In [19]:
# Configure (but do not run) a 3-fold grid search scored by ROC AUC.
# Only max_depth actually varies; every other parameter has a single candidate.
lg = lgb.LGBMClassifier(silent=True)
param_dist = dict(
    max_depth=[25, 50, 75],
    subsample=[0.5],
    colsample_bytree=[0.8],
    num_leaves=[600],
    learning_rate=[0.1],
)
grid_search = GridSearchCV(
    estimator=lg,
    param_grid=param_dist,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=5,
)

In [None]:
# NOTE(review): looks like a leftover scratch print — confirm whether this cell can be removed.
print('sda')