In [10]:
# coding: utf-8
"""
train lightgbm model
"""
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import lightgbm as lgb

import gc
import pickle
from datetime import datetime
from scipy import sparse
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

file_stamp = datetime.now().strftime('%m%d%H%M')

root = '/kaggle/competitions/avito-demand-prediction/'

def load_matrix(name):
    return pickle.load(open(root + 'features/{}.pkl'.format(name), 'rb'))

In [2]:
tr_index = load_matrix('tr_index')
te_index = load_matrix('te_index')

X = load_matrix('X_20180519')
y = load_matrix('y')
X_te = load_matrix('X_20180519_te')
feature_names = load_matrix('lgb_feature_20180519')

In [3]:
def cut_feature(feature_importance_):
    global feature_names
    feature_weak_ = [x[0] for x in feature_importance_ if x[1]==0.0]
    feature_index = [i for i, c in enumerate(feature_names) if c not in feature_weak_]
    _feature_names = [c for i, c in enumerate(feature_names) if c not in feature_weak_]
    _X    = sparse.lil_matrix(X[:,feature_index])
    _X_te = sparse.lil_matrix(X_te[:,feature_index])
    print('cut feature {}%'.format(100*len(feature_weak_)/len(feature_importance_)))
    return _X, _X_te, _feature_names

In [4]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [5]:
num_boost_round = 20000
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'num_leaves': 15,
    'max_bin': 256,
    'feature_fraction': 0.6,
    'min_child_samples': 10,
    'min_child_weight': 150,
    'min_split_gain': 0,
    'subsample': 0.9,
    'drop_rate': 0.1,
    'max_drop': 50,
    'verbosity': 0,
    'seed': 228,
}
n_folds = 5
kfold = StratifiedKFold(n_splits=n_folds, shuffle=False, random_state=218)

In [6]:
# feature selection

In [7]:
val_split_idx = round(X.shape[0]*.9)
X_tr, X_va, y_tr, y_va = X[:val_split_idx], X[val_split_idx:], y[:val_split_idx], y[val_split_idx:]
print(X_tr.shape, X_va.shape, y_tr.shape[0], y_va.shape[0])
dtrain = lgb.Dataset(X_tr, y_tr, feature_name=feature_names)
dvalid = lgb.Dataset(X_va, y_va, feature_name=feature_names, reference=dtrain)

params['learning_rate'] = 0.2
params['feature_fraction'] = 0.66
gbm = lgb.train(params, dtrain, num_boost_round,
                valid_sets=(dtrain, dvalid),
                valid_names=['train', 'valid'],
                verbose_eval=100,
                early_stopping_rounds=50)

(1353082, 60084) (150342, 60084) 1353082 150342
(1353082, 60084) (150342, 60084) 1353082 150342
Training until validation scores don't improve for 50 rounds.
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.224345	valid's rmse: 0.222749
[100]	train's rmse: 0.224345	valid's rmse: 0.222749
[200]	train's rmse: 0.222316	valid's rmse: 0.221139
[200]	train's rmse: 0.222316	valid's rmse: 0.221139
[300]	train's rmse: 0.221014	valid's rmse: 0.220264
[300]	train's rmse: 0.221014	valid's rmse: 0.220264
[400]	train's rmse: 0.220002	valid's rmse: 0.219635
[400]	train's rmse: 0.220002	valid's rmse: 0.219635
[500]	train's rmse: 0.219259	valid's rmse: 0.219371
[500]	train's rmse: 0.219259	valid's rmse: 0.219371
[600]	train's rmse: 0.218509	valid's rmse: 0.219026
[600]	train's rmse: 0.218509	valid's rmse: 0.219026
[700]	train's rmse: 0.217875	valid's rmse: 0.21876
[700]	train's rmse: 0.217875	valid's rmse: 0.21876
[800]	train's rmse: 0.217308	valid's rmse: 0.218611
[8

NameError: name 'y_hat' is not defined

NameError: name 'y_hat' is not defined

In [8]:
y_hat = gbm.predict(X_va)
score = rmse(y_va, y_hat)
print('rmse:', score)
feature_importance_ = zip(feature_names, gbm.feature_importance())
feature_importance_ = sorted(feature_importance_, key=lambda x: x[1], reverse=True)
print('\n'.join(('%s: %.2f' % x) for x in feature_importance_[:100]))

rmse: 0.21786545260568901
price: 1034.00
image_top_1_cluster: 406.00
avg_days_up_user: 394.00
n_user_items: 356.00
n_image_top_1_price_avg: 280.00
avg_times_up_user: 255.00
item_seq_number: 245.00
param_1_cluster: 218.00
city_cluster: 217.00
c2v_3: 199.00
c2v_1: 194.00
r_desc_pun: 193.00
c2v_5: 185.00
r_titl_des: 182.00
image_top_1: 180.00
c2v_8: 178.00
c2v_9: 174.00
r_desc_dig: 169.00
c2v_0: 165.00
c2v_4: 164.00
c2v_2: 160.00
r_desc_cap: 157.00
n_desc_len: 155.00
r_desc_spa: 153.00
c2v_7: 152.00
r_desc_wds: 146.00
r_titl_spa: 140.00
n_titl_len: 134.00
c2v_6: 134.00
n_char_params: 132.00
r_titl_wds: 124.00
r_titl_cap: 117.00
city_uv: 115.00
item_seq_number_uv: 112.00
n_desc_dig: 105.00
n_text_wds: 99.00
n_region_price_avg: 98.00
city_upv: 98.00
region_upv: 97.00
n_desc_pun: 93.00
n_category_name_price_avg: 90.00
n_desc_spa: 87.00
r_titl_dig: 85.00
category_name_upv: 83.00
region_factor: 82.00
category_name_uv: 76.00
n_desc_cap: 76.00
image_top_1_uv: 73.00
r_titl_pun: 68.00
n_desc_wds: 

In [12]:
cv_pr = gbm.predict(X_te, num_iteration=gbm.best_iteration)
pr_hat = pd.Series(np.clip(cv_pr, 0, 1), name='deal_probability', index=te_index)

pr_hat_file = root+'results/lgb_base_pr_{}_{}.csv'.format(file_stamp, score)
pr_hat.to_csv(pr_hat_file, header=True)

# cut feature then train

In [13]:
X, X_te, feature_names = cut_feature(feature_importance_)

cut feature 91.29385526928966%
cut feature 91.29385526928966%


In [14]:
categorical = list(filter(lambda c: c.endswith('_cluster'),feature_names))

In [15]:
n_models = 8
x_score = []
final_cv_tr = np.zeros(len(y))
final_cv_pr = np.zeros(len(te_index))

In [16]:
params['learning_rate'] = 0.067
# params['feature_fraction'] = 0.86

for s in range(n_models):
    cv_tr = np.zeros(len(y))
    cv_pr = np.zeros(len(te_index))
    
    print(20 * '*' + 'lgb[{}]'.format(s) + 21 * '*')
    kf = kfold.split(X, round(y))
    best_trees = []
    fold_scores = []
    
    for i, (tr, va) in enumerate(kf):
        print(20 * '/' + 'lgb[{}]'.format(s), 'cv[{}]'.format(i) + 15 * '/')
        params['seed'] = s * i
        
        X_tr, X_va, y_tr, y_va = X[tr, :], X[va, :], y[tr], y[va]
        print(X_tr.shape, X_va.shape)
        dtrain = lgb.Dataset(X_tr, y_tr, feature_name=feature_names)
        dvalid = lgb.Dataset(X_va, y_va, feature_name=feature_names, reference=dtrain)
        gbm = lgb.train(params, dtrain, num_boost_round,
                        valid_sets=(dtrain, dvalid),
                        valid_names=['train', 'valid'],
                        verbose_eval=100,
                        early_stopping_rounds=50)
        best_trees.append(gbm.best_iteration)
        cv_pr += gbm.predict(X_te, num_iteration=gbm.best_iteration)
        cv_tr[va] += gbm.predict(X_va)
        score = rmse(y_va, cv_tr[va])
        print('lgb[{}]'.format(s), 'cv[{}]'.format(i), 'rmse:', score)
        fold_scores.append(score)
        feature_importance_ = zip(feature_names, gbm.feature_importance())
        feature_importance_ = sorted(feature_importance_, key=lambda x: x[1], reverse=True)
        pickle.dump(feature_importance_, open(root + 'results/lgb_20180519_{}_{}_feature_importance.pkl'.format(s, i), 'wb'))
        print('lgb[{}]'.format(s), 'cv[{}]'.format(i), 'importance features')
        print('\n'.join(('%s: %.2f' % x) for x in feature_importance_[:100]))
        print(50 * '/')
        
    cv_pr /= n_folds
    final_cv_tr += cv_tr
    final_cv_pr += cv_pr

    fold_score = rmse(y, cv_tr)
    x_score.append(fold_score)
    print('lgb[{}]'.format(s), 'fold score:', fold_score)
    print('lgb[{}]'.format(s), 'mean score:', rmse(y, cv_tr) / (s + 1.0), s + 1)
    print('lgb[{}]'.format(s), fold_scores)
    print('lgb[{}]'.format(s), best_trees, np.mean(best_trees))

********************lgb[0]*********************
********************lgb[0]*********************
////////////////////lgb[0] cv[0]///////////////
////////////////////lgb[0] cv[0]///////////////
(1202739, 5231) (300685, 5231)
(1202739, 5231) (300685, 5231)
Training until validation scores don't improve for 50 rounds.
Training until validation scores don't improve for 50 rounds.
[100]	train's rmse: 0.227233	valid's rmse: 0.228275
[100]	train's rmse: 0.227233	valid's rmse: 0.228275
[200]	train's rmse: 0.225078	valid's rmse: 0.226322
[200]	train's rmse: 0.225078	valid's rmse: 0.226322
[300]	train's rmse: 0.223846	valid's rmse: 0.225278
[300]	train's rmse: 0.223846	valid's rmse: 0.225278
[400]	train's rmse: 0.22298	valid's rmse: 0.224583
[400]	train's rmse: 0.22298	valid's rmse: 0.224583
[500]	train's rmse: 0.22232	valid's rmse: 0.22409
[500]	train's rmse: 0.22232	valid's rmse: 0.22409
[600]	train's rmse: 0.221737	valid's rmse: 0.223685
[600]	train's rmse: 0.221737	valid's rmse: 0.223685
[700

KeyboardInterrupt: 

KeyboardInterrupt: 

In [None]:
print(50 * '-')
print('Fininsh CV')

print('\n'.join(['lgm[{}] score:{}'.format(*x) for x in enumerate(fold_scores)]))

final_avg_score = round(np.mean(x_score), 6)
final_std_score = round(np.std(x_score), 6)
print('avg score:%.6f' % final_avg_score)
print('std score:%.6f' % final_std_score)

pr_hat = pd.Series(np.clip(final_cv_pr / float(n_models), 0, 1), name='deal_probability', index=te_index)
pr_hat_file = root+'results/lgb_{}_pr_avg_{}_{}.csv'.format(n_models, file_stamp, final_avg_score)
pr_hat.to_csv(pr_hat_file, header=True)

tr_hat = pd.Series(np.clip(final_cv_tr / float(n_models), 0, 1), name='deal_probability', index=tr_index)
tr_hat_file = root+'results/lgb_{}_tr_avg_{}_{}.csv'.format(n_models, file_stamp, final_avg_score)
tr_hat.to_csv(tr_hat_file, header=True)

print('\n'.join([pr_hat_file, tr_hat_file]))