In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('/root/tempfile/train_final121702.csv')
test_df = pd.read_csv('/root/tempfile/test_final121702.csv')

In [3]:
print(train_df.shape, test_df.shape)

(199710, 481) (123623, 480)


In [4]:
feature_col = [x for x in train_df.columns if x not in ['card_id','target']]
testindex = test_df.card_id

y_train = train_df.target.values.astype('float')
x_train = train_df[feature_col].values.astype('float')
x_test = test_df[feature_col].values.astype('float')

del train_df, test_df

In [5]:
str_label = [
    'auth_city_id_cal_mode',
    'auth_merchant_category_id_cal_mode', 
    'auth_merchant_id_cal_mode',
    'feature_1',
    'feature_2',
    'feature_3', 
    'new_city_id_cal_mode', 
    'new_merchant_category_id_cal_mode',
    'new_merchant_id_cal_mode', 
    'unauth_city_id_cal_mode', 
    'unauth_merchant_category_id_cal_mode', 
    'unauth_merchant_id_cal_mode'
]


In [6]:
lgb_model = lgb.LGBMRegressor(objective='regression',
                              min_data_in_leaf=20,
                              num_leaves=50,
                              learning_rate=0.1, 
                              max_depth=8,
                              bagging_fraction = 0.8,
                              feature_fraction = 0.8,
                              num_threads=4)

In [12]:
params_test1={
    'max_depth': range(9,11,1),
    'num_leaves': range(120, 180, 20),
    'min_data_in_leaf': range(100,300,20)
}
gsearch1 = GridSearchCV(estimator=lgb_model, 
                        param_grid=params_test1,
                        scoring='neg_mean_squared_error', 
                        cv=5,
                        verbose=1, 
                        n_jobs=2)

gs1 = gsearch1.fit(x_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 14.8min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 61.8min
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed: 95.9min finished


In [13]:
print(gs1.best_params_, gs1.best_score_)

{'max_depth': 10, 'min_data_in_leaf': 280, 'num_leaves': 120} -2.4260136648237998


In [14]:
params_test2={
    'max_depth': range(8,11,1),
    'num_leaves': range(90, 210, 20),
    'min_data_in_leaf': range(190,300,20)
}
gsearch2 = GridSearchCV(estimator=lgb_model, 
                        param_grid=params_test2,
                        scoring='neg_mean_squared_error', 
                        cv=5,
                        verbose=1, 
                        n_jobs=2)

gs2 = gsearch2.fit(x_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 12.5min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 52.7min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed: 129.3min
[Parallel(n_jobs=2)]: Done 540 out of 540 | elapsed: 159.1min finished


In [15]:
print(gs2.best_params_, gs2.best_score_)

{'max_depth': 9, 'min_data_in_leaf': 290, 'num_leaves': 110} -2.424251742856599


In [26]:
gs2.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_min_data_in_leaf', 'param_num_leaves', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [67]:
params_test3={
    'max_depth': [9],
    'num_leaves': range(90, 131, 5),
    'min_data_in_leaf': range(270,341, 5)
}
gsearch3 = GridSearchCV(estimator=lgb_model, 
                        param_grid=params_test3,
                        scoring='neg_mean_squared_error', 
                        cv=5,
                        verbose=1, 
                        n_jobs=2)

gs3 = gsearch3.fit(x_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 13.3min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 56.7min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed: 128.3min
[Parallel(n_jobs=2)]: Done 675 out of 675 | elapsed: 193.1min finished


In [68]:
print(gs3.best_params_, gs3.best_score_)

{'max_depth': 9, 'min_data_in_leaf': 290, 'num_leaves': 110} -2.424251742856599


In [77]:
gs3.cv_results_['mean_test_score']

array([-2.42537944, -2.42672743, -2.4263556 , -2.42703581, -2.42750174,
       -2.42543779, -2.42741712, -2.42802904, -2.42821529, -2.42774088,
       -2.42666973, -2.42715992, -2.42688314, -2.4259147 , -2.42806232,
       -2.42785071, -2.42683725, -2.42534773, -2.42614322, -2.4274158 ,
       -2.42455651, -2.42648201, -2.42721139, -2.42685238, -2.42788002,
       -2.42818259, -2.42771749, -2.4268317 , -2.42647517, -2.4267938 ,
       -2.42445935, -2.42762039, -2.42742672, -2.42842136, -2.42897762,
       -2.42875827, -2.42694511, -2.42750008, -2.42717252, -2.42688905,
       -2.42425174, -2.42870704, -2.42843688, -2.42849466, -2.42627763,
       -2.42657916, -2.42796088, -2.42616613, -2.42509586, -2.42582779,
       -2.42539525, -2.42835499, -2.42610689, -2.42551411, -2.42641857,
       -2.42681655, -2.42661236, -2.42653385, -2.42722264, -2.42699507,
       -2.42981406, -2.42745019, -2.4261826 , -2.42572372, -2.42620338,
       -2.42685872, -2.42567469, -2.42596692, -2.42818838, -2.42

In [7]:
params_test4={
    'max_depth': [9],
    'num_leaves': range(93, 98, 1),
    'min_data_in_leaf': range(333, 338, 1)
}
gsearch4 = GridSearchCV(estimator=lgb_model, 
                        param_grid=params_test4,
                        scoring='neg_mean_squared_error', 
                        cv=5,
                        verbose=1, 
                        n_jobs=2)

gs4 = gsearch4.fit(x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 12.9min
[Parallel(n_jobs=2)]: Done 125 out of 125 | elapsed: 35.0min finished


In [8]:
print(gs4.best_params_, gs4.best_score_)

{'max_depth': 9, 'min_data_in_leaf': 337, 'num_leaves': 95} -2.423977833190758


In [9]:
gs4.cv_results_['mean_test_score']

array([-2.42674247, -2.4255113 , -2.42752359, -2.42559661, -2.42664298,
       -2.42449182, -2.42698194, -2.42506807, -2.42561108, -2.42598433,
       -2.42600035, -2.42618394, -2.42432601, -2.42694565, -2.42599039,
       -2.42603374, -2.4268669 , -2.42536598, -2.4270285 , -2.42654753,
       -2.42624308, -2.42518768, -2.42397783, -2.42716676, -2.42547101])

In [8]:
params_test5 = {
    'max_depth': [9],
    'num_leaves': [95],
    'min_data_in_leaf': [337],
    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],  
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
}
gsearch5 = GridSearchCV(estimator=lgb_model, 
                        param_grid=params_test5,
                        scoring='neg_mean_squared_error', 
                        cv=5,
                        verbose=1, 
                        n_jobs=2)

gs5 = gsearch5.fit(x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 11.8min
[Parallel(n_jobs=2)]: Done 125 out of 125 | elapsed: 32.4min finished


In [9]:
print(gs5.best_params_, gs5.best_score_)

{'bagging_fraction': 0.6, 'feature_fraction': 0.8, 'max_depth': 9, 'min_data_in_leaf': 337, 'num_leaves': 95} -2.423977833190758
