# LightGBM: CV

In [17]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV  # Perforing grid search
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [9]:
train_data = pd.read_csv('Train300.csv')



In [10]:
train_y = train_data.pop('click').values.astype('int')   # target variable
train_data.pop('id')
col = train_data.columns
train_x = train_data[col]
train_x = train_x.apply(lambda x: pd.factorize(x)[0])
train_x = train_x.astype('category')

train_x.columns

Index(['hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
       'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
       'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16',
       'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

In [23]:
parameters = {
    'learning_rate': [0.01,0.05,0.1],
    'num_leaves': [250,300],
}

gbm = lgb.LGBMClassifier(objective='binary', 
                          n_jobs=-1, 
                          #is_unbalance=True, 
                          #num_threads=8, 
                          #two_round=True,
                          bagging_fraction=0.9,
                          bagging_freq=8,
                          metric = 'binary_logloss',
                          verbose = 0,
                          boosting_type='gbdt',
                          feature_fraction=0.8,
                          #learning_rate=0.1,
                          #min_child_samples=10,
                          #min_child_weight=5,
                          #min_data_in_leaf=20,
                          #min_split_gain=0.0,
                          #n_estimators=10,
                          #num_leaves=80,
                          #reg_alpha=0.0,
                          #reg_lambda=0.0,
                          #subsample=1.0,
                          lambda_l1= 0.6,
                          lambda_l2= 0,
                          cat_smooth= 35,
                          max_dept=24
                        )
gsearch = GridSearchCV(gbm, param_grid=parameters, scoring='neg_log_loss', cv=3)
gsearch.fit(X_train, y_train)
print('best parameter:{0}'.format(gsearch.best_params_))
print('best logloss:{0}'.format(-gsearch.best_score_))
print(-gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])

best parameter:{'learning_rate': 0.05, 'num_leaves': 300}
best logloss:0.394124476119134
[0.39416251 0.39412448 0.39509614 0.39532593 0.39868554 0.39957227]
[{'learning_rate': 0.05, 'num_leaves': 250}, {'learning_rate': 0.05, 'num_leaves': 300}, {'learning_rate': 0.1, 'num_leaves': 250}, {'learning_rate': 0.1, 'num_leaves': 300}, {'learning_rate': 0.2, 'num_leaves': 250}, {'learning_rate': 0.2, 'num_leaves': 300}]


In [24]:
preds=gsearch.predict_proba(X_test)[:, 1]

In [25]:
print("Logloss: %0.3f"%(log_loss(y_test, preds)))

Logloss: 0.391


# result on test set

In [26]:

test_data = pd.read_csv('finaltest.csv')


In [32]:
train_data.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,6,1005,0,85f751fd,c4e18dd6,50e219e0,396df801,2347f47a,0f2161f8,a99f214a,...,1,0,15704,320,50,1722,0,35,-1,79
1,21,1005,0,85f751fd,c4e18dd6,50e219e0,03528b27,2347f47a,8ded1f7a,2e68d00c,...,1,0,15705,320,50,1722,0,35,-1,79
2,20,1005,0,85f751fd,c4e18dd6,50e219e0,66f5e02e,6f7ca2ba,0f2161f8,2e610fe8,...,1,0,22815,320,50,2647,2,35,100148,23
3,11,1005,1,e4d8dd7b,a17bde68,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,19950,320,50,1800,3,167,100075,23
4,8,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15702,320,50,1722,0,35,-1,79


In [34]:
train_x=train_data

In [37]:
train_x.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,6,1005,0,85f751fd,c4e18dd6,50e219e0,396df801,2347f47a,0f2161f8,a99f214a,...,1,0,15704,320,50,1722,0,35,-1,79
1,21,1005,0,85f751fd,c4e18dd6,50e219e0,03528b27,2347f47a,8ded1f7a,2e68d00c,...,1,0,15705,320,50,1722,0,35,-1,79
2,20,1005,0,85f751fd,c4e18dd6,50e219e0,66f5e02e,6f7ca2ba,0f2161f8,2e610fe8,...,1,0,22815,320,50,2647,2,35,100148,23
3,11,1005,1,e4d8dd7b,a17bde68,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,19950,320,50,1800,3,167,100075,23
4,8,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15702,320,50,1722,0,35,-1,79


In [33]:
test_data.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,3295858251275419735,9,1005,1,85f751fd,c4e18dd6,50e219e0,1dc72b4d,2347f47a,0f2161f8,...,1,0,8334,300,50,761,3,175,100075,23
1,12281702837842634283,20,1007,0,85f751fd,c4e18dd6,50e219e0,8311368f,1dc9b529,0f2161f8,...,1,2,24303,320,50,2788,3,295,100194,240
2,4638380339534007785,9,1005,0,4e7614cf,c1aa3c04,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,24165,320,50,2776,0,35,-1,79
3,17039804736879076347,1,1002,0,48c42b43,de0f0f82,50e219e0,ecad2386,7801e8d9,07d7df22,...,0,0,23438,320,50,2684,2,1327,-1,52
4,5753064066292192109,19,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,24084,320,50,2761,2,35,100148,13


In [35]:
test_id = test_data.pop('id').values

In [36]:
test_data.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,9,1005,1,85f751fd,c4e18dd6,50e219e0,1dc72b4d,2347f47a,0f2161f8,cd915ca3,...,1,0,8334,300,50,761,3,175,100075,23
1,20,1007,0,85f751fd,c4e18dd6,50e219e0,8311368f,1dc9b529,0f2161f8,ec0aff16,...,1,2,24303,320,50,2788,3,295,100194,240
2,9,1005,0,4e7614cf,c1aa3c04,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,24165,320,50,2776,0,35,-1,79
3,1,1002,0,48c42b43,de0f0f82,50e219e0,ecad2386,7801e8d9,07d7df22,dfb9b781,...,0,0,23438,320,50,2684,2,1327,-1,52
4,19,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,24084,320,50,2761,2,35,100148,13


In [38]:
frames = [train_x,test_data]
result = pd.concat(frames)

In [39]:
result.shape

(16015341, 22)

In [40]:
result1 = result.apply(lambda x: pd.factorize(x)[0])
result1 = result1.astype('category')
result1.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,1,1,...,0,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,2,1,0,2,...,0,0,2,0,0,1,1,0,1,1
3,3,0,1,1,1,1,3,2,2,0,...,0,0,3,0,0,2,2,1,2,1
4,4,0,0,2,2,2,3,2,2,0,...,0,0,4,0,0,0,0,0,0,0


In [41]:
X_train = result1.iloc[0:3000000]

In [42]:
X_test = result1.iloc[3000000:]

In [43]:
X_test.shape

(13015341, 22)

In [44]:
gbm = lgb.LGBMClassifier(objective='binary', 
                          n_jobs=-1, 
                          #is_unbalance=True, 
                          #num_threads=8, 
                          #two_round=True,
                          bagging_fraction=0.9,
                          bagging_freq=8,
                          metric = 'binary_logloss',
                          verbose = 0,
                          boosting_type='gbdt',
                          feature_fraction=0.8,
                          learning_rate=0.05,
                          #min_child_samples=10,
                          #min_child_weight=5,
                          #min_data_in_leaf=20,
                          #min_split_gain=0.0,
                          #n_estimators=10,
                          num_leaves=300,
                          #reg_alpha=0.0,
                          #reg_lambda=0.0,
                          #subsample=1.0,
                          lambda_l1= 0.6,
                          lambda_l2= 0,
                          cat_smooth= 35,
                          max_dept=24)
gbm.fit(X_train, train_y)
preds=gbm.predict_proba(X_train)[:, 1]

In [45]:
# check the logloss on trainset
print("Logloss: %0.3f"%(log_loss(train_y, preds)))

Logloss: 0.381


In [46]:
realpreds=gbm.predict_proba(X_test)[:, 1]

In [47]:
submit = pd.read_csv('ProjectSubmission-TeamX.csv')

In [48]:
submit.head()

Unnamed: 0,id,P(click)
0,3295858251275419735,0.5
1,12281702837842634283,0.5
2,4638380339534007785,0.5
3,17039804736879076347,0.5
4,5753064066292192109,0.5


In [49]:
test_id

array([ 3295858251275419735, 12281702837842634283,  4638380339534007785,
       ...,  4549134116722204000,  2114008866308662546,
        7842936548778884133], dtype=uint64)

In [50]:
submit.iloc[:,1]=realpreds

In [51]:
submit.head()

Unnamed: 0,id,P(click)
0,3295858251275419735,0.080547
1,12281702837842634283,0.093069
2,4638380339534007785,0.033475
3,17039804736879076347,0.086293
4,5753064066292192109,0.284962


In [52]:
submit.to_csv(r'jiutale.csv',index=False,sep=',')