In [1]:
import os
import pickle
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb

In [2]:
# Directory prefixes, relative to the notebook's working directory.
# Each keeps its trailing slash because file names are appended with `+`.
feature_path = './feature/'  # the feature table is read from here
data_path = './data/'        # the pickled models are written here
cache_path = './cache/'      # not referenced by the other visible cells

In [3]:
# Values that select which precomputed feature file gets loaded.
start_date = '2018-03-24'
days = 446  # range of days to compute feature
hours = 1   # not referenced by the other visible cells

# The file name embeds the start date and the day range.
feature_file_name = 'data_feat_{}_{}days.hdf'.format(start_date, days)
data_feat_url = feature_path + feature_file_name

In [4]:
# 'w' is the HDF store key of the table, not a file mode.
data_feat = pd.read_hdf(data_feat_url, key='w')

In [5]:
# Sanity check: (rows, columns) of the loaded feature table.
data_feat.shape

(749280, 296)

In [6]:
# Smallest and largest value of the 'time' column, i.e. the span covered by the table.
data_feat['time'].min(), data_feat['time'].max()

('2017-01-03 00:00:00', '2018-03-25 23:00:00')

In [7]:
# Names of the columns holding at least one null value; an empty Index means there are none.
data_feat.columns[data_feat.isnull().any()]

Index([], dtype='object')

In [8]:
# Time-based hold-out: rows strictly before the cut-off are used for training,
# rows at or after it for evaluation. The comparison is made against a string
# literal in the same 'YYYY-MM-DD HH:MM:SS' layout as the 'time' column.
eval_start_time = '2018-03-05 00:00:00'
train_feat = data_feat.loc[data_feat['time'] < eval_start_time]
eval_feat = data_feat.loc[data_feat['time'] >= eval_start_time]

In [9]:
# (rows, columns) of the training split and of the evaluation split.
train_feat.shape, eval_feat.shape

((714840, 296), (34440, 296))

In [10]:
# Identifier/time columns and the three target columns are not model inputs;
# every remaining column is a predictor, kept in its original column order.
non_feature_columns = ('station_id', 'time', 'date', 'PM2.5', 'PM10', 'O3')
predictors = [
    column
    for column in train_feat.columns
    if column not in non_feature_columns
]

In [11]:
# predictors

In [12]:
# LightGBM training parameters shared by all three pollutant models.
# Several keys use LightGBM's short aliases:
#   application -> objective, sub_feature -> feature_fraction,
#   min_data -> min_data_in_leaf, min_hessian -> min_sum_hessian_in_leaf.
# A plain 'regression' objective and the 'map' metric were tried/considered
# earlier and are intentionally not set here.
params = dict(
    learning_rate=0.01,
    boosting_type='gbdt',
    application='mape',
    metric='mape',
    sub_feature=0.7,
    num_leaves=60,
    min_data=100,
    min_hessian=1,
    verbose=-1,
)

In [13]:
# Fitted boosters keyed by target name; filled in by the training loop.
model_dict = {}


# Forward transforms (raw target -> value the model is trained on).

def f1(x):
    """PM2.5 forward transform: natural log of (x + 1)."""
    shifted = x + 1
    return np.log(shifted)


def f2(x):
    """PM10 forward transform: natural log of (x + 1)."""
    shifted = x + 1
    return np.log(shifted)


def f3(x):
    """O3 forward transform: natural log of (x + 100)."""
    shifted = x + 100
    return np.log(shifted)


# Inverse transforms (model output -> raw target scale).

def f4(x):
    """Inverse of f1: exp(x) - 1."""
    grown = np.exp(x)
    return grown - 1


def f5(x):
    """Inverse of f2: exp(x) - 1."""
    grown = np.exp(x)
    return grown - 1


def f6(x):
    """Inverse of f3: exp(x) - 100."""
    grown = np.exp(x)
    return grown - 100


# Per-target lookup tables; decode[k] undoes encode[k].
encode = {
    'PM2.5': f1,
    'PM10': f2,
    'O3': f3,
}
decode = {
    'PM2.5': f4,
    'PM10': f5,
    'O3': f6,
}

In [14]:
def smape_error(preds, train_data):
    """Custom LightGBM evaluation metric: symmetric mean absolute percentage error.

    Computes ``2 * mean(|pred - label| / (|pred| + |label|))``. The metric is
    evaluated on whatever scale the dataset labels are stored in; the training
    loop in this notebook passes transformed (``encode``-d) labels, so the
    reported value is SMAPE in that transformed space.

    Parameters
    ----------
    preds : array-like
        Predicted values, one per row of ``train_data``.
    train_data : object with ``get_label()``
        Dataset being evaluated; only ``get_label()`` is called on it.

    Returns
    -------
    tuple
        ``('error', value, False)`` -- metric name, metric value, and the
        "higher is better" flag (False: lower is better).
    """
    labels = np.asarray(train_data.get_label(), dtype=float)
    preds = np.asarray(preds, dtype=float)

    abs_diff = np.fabs(preds - labels)
    # Use magnitudes in the denominator so pairs with opposite signs (or both
    # negative) cannot shrink it, flip its sign, or cancel it out.
    denom = np.fabs(preds) + np.fabs(labels)

    if denom.size == 0:
        # Nothing to score; avoid the NaN (and warning) from a mean over no values.
        return 'error', 0.0, False

    # A pair that is exactly (0, 0) is a perfect prediction: count it as zero
    # error instead of letting 0/0 turn the whole mean into NaN.
    ratio = np.divide(abs_diff, denom, out=np.zeros_like(denom), where=denom != 0)
    return 'error', 2 * np.mean(ratio), False

In [15]:
# for label in ['PM2.5']:
# Fit one booster per target column and keep it in model_dict under that name.
for label in ['PM2.5','PM10','O3']:
    # Only rows whose target is strictly positive are used, on both splits.
    train_rows = train_feat[train_feat[label] > 0]
    eval_rows = eval_feat[eval_feat[label] > 0]

    # Targets are handed to LightGBM in their transformed (encoded) form.
    to_model_space = encode[label]
    lgb_train = lgb.Dataset(train_rows[predictors], to_model_space(train_rows[label]))
    lgb_eval = lgb.Dataset(eval_rows[predictors], to_model_space(eval_rows[label]))

    # Up to 5000 rounds, logging every 100 and stopping once the validation
    # scores have not improved for 100 consecutive rounds.
    gbm = lgb.train(
        params,
        train_set=lgb_train,
        num_boost_round=5000,
        valid_sets=lgb_eval,
        feval=smape_error,
        verbose_eval=100,
        early_stopping_rounds=100,
    )
    # Feature importances can be inspected with:
    #     pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    model_dict[label] = gbm

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's mape: 0.174125	valid_0's error: 0.180236
[200]	valid_0's mape: 0.15579	valid_0's error: 0.157095
[300]	valid_0's mape: 0.144426	valid_0's error: 0.143635
[400]	valid_0's mape: 0.137835	valid_0's error: 0.136118
[500]	valid_0's mape: 0.134882	valid_0's error: 0.132785
[600]	valid_0's mape: 0.132958	valid_0's error: 0.130724
[700]	valid_0's mape: 0.131607	valid_0's error: 0.129472
[800]	valid_0's mape: 0.130312	valid_0's error: 0.128171
[900]	valid_0's mape: 0.129197	valid_0's error: 0.127113
[1000]	valid_0's mape: 0.128195	valid_0's error: 0.126148
[1100]	valid_0's mape: 0.127549	valid_0's error: 0.1255
[1200]	valid_0's mape: 0.127079	valid_0's error: 0.125032
[1300]	valid_0's mape: 0.126661	valid_0's error: 0.124575
[1400]	valid_0's mape: 0.126247	valid_0's error: 0.124167
[1500]	valid_0's mape: 0.12589	valid_0's error: 0.123814
[1600]	valid_0's mape: 0.12555	valid_0's error: 0.123466
[1700]	valid_0's mape: 

In [16]:
# Persist the fitted boosters together with the exact predictor list they were
# trained on, as a single (model_dict, predictors) tuple.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; write-only binary mode is enough because nothing is read back.
with open(data_path + 'lightgbm.model', 'wb') as model_file:
    pickle.dump((model_dict, predictors), model_file)