In [1]:
import lightgbm as lgb
import numpy as np
import pickle
import time
from sklearn.model_selection import train_test_split

In [2]:
def save_pkl(file_dir, data):
    """Serialize ``data`` to the file at ``file_dir`` using pickle protocol 4.

    Args:
        file_dir: Path of the file to write; it is opened in binary write mode.
        data: Any picklable object.

    Raises:
        Whatever ``open`` or ``pickle.dump`` raises. The handle is closed in
        either case because the write happens inside a ``with`` block.
    """
    with open(file_dir, "wb") as f:
        pickle.dump(data, f, protocol=4)
    
def read_pkl(file_dir):
    """Load and return the object pickled in the file at ``file_dir``.

    Args:
        file_dir: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The unpickled object.

    The handle is closed as soon as loading finishes (or fails).
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on files
    # produced by a trusted source (e.g. this notebook's own save_pkl).
    with open(file_dir, "rb") as f:
        return pickle.load(f)

In [3]:
# read the full-size London arrays from disk: the feature matrix first,
# then one target array per pollutant (NO2, PM10, PM2.5)
X, Y_NO2, Y_PM10, Y_PM25 = [
    np.load(npy_path)
    for npy_path in (
        "London/X_all.npy",
        "London/Y_NO2_all.npy",
        "London/Y_PM10_all.npy",
        "London/Y_PM25_all.npy",
    )
]

In [9]:
# lightgbm model parameters
# model_param: short-named tuning knobs; 'tree' is passed to lgb.train as num_boost_round
model_param = {'lr': 0.02, 'depth': 10, 'tree': 25000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# params: the dictionary handed to lgb.train, built from model_param at the time this cell runs
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    # an ordered list instead of a set: the iteration order of a set of strings
    # is not stable across interpreter sessions, so the metric order would vary
    # from one kernel to the next
    'metric': ['l2', 'l1'],
    'max_depth': model_param['depth'],
    'num_leaves': model_param['leaf'],
    'min_data_in_leaf': 20,
    'learning_rate': model_param['lr'],
    'feature_fraction': 1,
    'bagging_fraction': model_param['sample'],
    'bagging_freq': 1,
    'bagging_seed': model_param['seed'],
    'verbose': 0
}

# calculate the symmetric mean absolute percentage error (SMAPE)
def get_score(pred, valid_y_exp):
    """Return the SMAPE between predictions and expected values.

    Computes ``mean(2 * |pred - actual| / (|pred| + |actual|))``.

    Args:
        pred: Predicted values (array-like).
        valid_y_exp: Expected values (array-like), broadcast against ``pred``.

    Returns:
        The SMAPE as a NumPy float, in the range [0, 2].

    For strictly positive inputs the result is the same as dividing by the
    plain sum. Absolute values in the denominator keep the score finite and
    bounded when a value is negative, and an element where both values are
    exactly zero contributes 0 instead of turning the whole mean into NaN.
    """
    pred = np.asarray(pred, dtype=float)
    valid_y_exp = np.asarray(valid_y_exp, dtype=float)
    abs_err = np.abs(pred - valid_y_exp)
    denom = np.abs(pred) + np.abs(valid_y_exp)
    # denom == 0 only when both values are 0, i.e. a perfect prediction -> term is 0
    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    return np.mean(ratio * 2)

# train and test the input X Y with the lightgbm model
def train_test(model_param, params, train_X, test_X, train_Y, test_Y, save_dir):
    """Train a LightGBM model, score it on the test split, and pickle it.

    Args:
        model_param: Dict of short-named settings. 'tree' (required) is the
            maximum number of boosting rounds. 'lr', 'depth', 'leaf', 'sample'
            and 'seed', when present, override 'learning_rate', 'max_depth',
            'num_leaves', 'bagging_fraction' and 'bagging_seed' in ``params``,
            so that reassigning ``model_param`` between runs actually changes
            the training configuration.
        params: Base LightGBM parameter dict; it is copied, never mutated.
        train_X, train_Y: Training features and targets.
        test_X, test_Y: Held-out features and targets. They are used both as
            the early-stopping validation set and for the returned score.
        save_dir: File path the trained booster is pickled to via ``save_pkl``.

    Returns:
        The SMAPE (``get_score``) of the booster's predictions on ``test_X``.
    """
    time_start = time.time()

    # Keep the LightGBM params in sync with model_param: without this, only
    # 'tree' would be honoured and a stale params dict would decide the rest.
    key_map = {
        'learning_rate': 'lr',
        'max_depth': 'depth',
        'num_leaves': 'leaf',
        'bagging_fraction': 'sample',
        'bagging_seed': 'seed',
    }
    run_params = dict(params)
    for lgb_key, short_key in key_map.items():
        if short_key in model_param:
            run_params[lgb_key] = model_param[short_key]

    lgb_train = lgb.Dataset(train_X, train_Y)
    lgb_eval = lgb.Dataset(test_X, test_Y, reference=lgb_train)
    # NOTE(review): newer LightGBM releases replace the early_stopping_rounds
    # keyword with callbacks=[lgb.early_stopping(20)] - confirm against the
    # installed version before upgrading.
    gbm = lgb.train(run_params,
                    lgb_train,
                    num_boost_round=model_param['tree'],
                    valid_sets=lgb_eval,
                    early_stopping_rounds=20)

    test_predict = gbm.predict(test_X)
    score = get_score(test_predict, test_Y)
    save_pkl(save_dir, gbm)
    print('running time: ', time.time() - time_start)
    return score

In [6]:
# PM 2.5

In [7]:
# hold out 20% of (X, Y_PM25) as the test part, with a fixed random_state
train_X_PM25, test_X_PM25, train_Y_PM25, test_Y_PM25 = train_test_split(
    X,
    Y_PM25,
    test_size=0.2,
    random_state=11,
)
# show the shape of each of the four resulting arrays
print(*(arr.shape for arr in (train_X_PM25, test_X_PM25, train_Y_PM25, test_Y_PM25)))

(5394355, 125) (1348589, 125) (5394355,) (1348589,)


In [9]:
# commented-out settings kept for reference:
# model_param = {'lr': 0.02, 'depth': 10, 'tree': 3000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# fit on the PM2.5 split, pickle the booster, and print the score train_test returns
score = train_test(
    model_param,
    params,
    train_X=train_X_PM25,
    test_X=test_X_PM25,
    train_Y=train_Y_PM25,
    test_Y=test_Y_PM25,
    save_dir="London/lightgbm_PM25.model",
)
print('score: ', score)

[1]	valid_0's l2: 122.694	valid_0's l1: 7.49303
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 120.125	valid_0's l1: 7.42147
[3]	valid_0's l2: 117.671	valid_0's l1: 7.35241
[4]	valid_0's l2: 115.314	valid_0's l1: 7.28522
[5]	valid_0's l2: 113.028	valid_0's l1: 7.21917
[6]	valid_0's l2: 110.84	valid_0's l1: 7.1554
[7]	valid_0's l2: 108.731	valid_0's l1: 7.09318
[8]	valid_0's l2: 106.716	valid_0's l1: 7.03299
[9]	valid_0's l2: 104.759	valid_0's l1: 6.97358
[10]	valid_0's l2: 102.887	valid_0's l1: 6.9164
[11]	valid_0's l2: 101.093	valid_0's l1: 6.86096
[12]	valid_0's l2: 99.3348	valid_0's l1: 6.80621
[13]	valid_0's l2: 97.6785	valid_0's l1: 6.75405
[14]	valid_0's l2: 96.0789	valid_0's l1: 6.70281
[15]	valid_0's l2: 94.5111	valid_0's l1: 6.65217
[16]	valid_0's l2: 93.0184	valid_0's l1: 6.60381
[17]	valid_0's l2: 91.5517	valid_0's l1: 6.55544
[18]	valid_0's l2: 90.156	valid_0's l1: 6.50923
[19]	valid_0's l2: 88.8309	valid_0's l1: 6.46508
[20]	valid_0's l2: 8

In [8]:
# commented-out settings kept for reference:
# model_param = {'lr': 0.02, 'depth': 10, 'tree': 25000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# fit on the PM2.5 split and pickle the booster to London/lightgbm_PM25.model
score = train_test(
    model_param,
    params,
    train_X=train_X_PM25,
    test_X=test_X_PM25,
    train_Y=train_Y_PM25,
    test_Y=test_Y_PM25,
    save_dir="London/lightgbm_PM25.model",
)
# last save version
print('score: ', score)

[1]	valid_0's l1: 7.49303	valid_0's l2: 122.694
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 7.42147	valid_0's l2: 120.125
[3]	valid_0's l1: 7.35241	valid_0's l2: 117.671
[4]	valid_0's l1: 7.28522	valid_0's l2: 115.314
[5]	valid_0's l1: 7.21917	valid_0's l2: 113.028
[6]	valid_0's l1: 7.1554	valid_0's l2: 110.84
[7]	valid_0's l1: 7.09318	valid_0's l2: 108.731
[8]	valid_0's l1: 7.03299	valid_0's l2: 106.716
[9]	valid_0's l1: 6.97358	valid_0's l2: 104.759
[10]	valid_0's l1: 6.9164	valid_0's l2: 102.887
[11]	valid_0's l1: 6.86096	valid_0's l2: 101.093
[12]	valid_0's l1: 6.80621	valid_0's l2: 99.3348
[13]	valid_0's l1: 6.75405	valid_0's l2: 97.6785
[14]	valid_0's l1: 6.70281	valid_0's l2: 96.0789
[15]	valid_0's l1: 6.65217	valid_0's l2: 94.5111
[16]	valid_0's l1: 6.60381	valid_0's l2: 93.0184
[17]	valid_0's l1: 6.55544	valid_0's l2: 91.5517
[18]	valid_0's l1: 6.50923	valid_0's l2: 90.156
[19]	valid_0's l1: 6.46508	valid_0's l2: 88.8309
[20]	valid_0's l1: 6

In [8]:
# commented-out settings kept for reference:
# model_param = {'lr': 0.02, 'depth': 10, 'tree': 3000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# another PM2.5 training run; the booster is pickled to the same PM25 model file
score = train_test(
    model_param,
    params,
    train_X=train_X_PM25,
    test_X=test_X_PM25,
    train_Y=train_Y_PM25,
    test_Y=test_Y_PM25,
    save_dir="London/lightgbm_PM25.model",
)
print('score: ', score)

[1]	valid_0's l2: 122.688	valid_0's l1: 7.49272
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 120.127	valid_0's l1: 7.42145
[3]	valid_0's l2: 117.644	valid_0's l1: 7.35117
[4]	valid_0's l2: 115.261	valid_0's l1: 7.28301
[5]	valid_0's l2: 112.981	valid_0's l1: 7.21725
[6]	valid_0's l2: 110.785	valid_0's l1: 7.15282
[7]	valid_0's l2: 108.671	valid_0's l1: 7.09031
[8]	valid_0's l2: 106.634	valid_0's l1: 7.02906
[9]	valid_0's l2: 104.669	valid_0's l1: 6.96967
[10]	valid_0's l2: 102.784	valid_0's l1: 6.91196
[11]	valid_0's l2: 100.945	valid_0's l1: 6.85479
[12]	valid_0's l2: 99.1697	valid_0's l1: 6.7988
[13]	valid_0's l2: 97.4887	valid_0's l1: 6.74569
[14]	valid_0's l2: 95.8562	valid_0's l1: 6.69398
[15]	valid_0's l2: 94.275	valid_0's l1: 6.64283
[16]	valid_0's l2: 92.7451	valid_0's l1: 6.59304
[17]	valid_0's l2: 91.2991	valid_0's l1: 6.54555
[18]	valid_0's l2: 89.9028	valid_0's l1: 6.4996
[19]	valid_0's l2: 88.5351	valid_0's l1: 6.45359
[20]	valid_0's l2: 

In [None]:
# commented-out settings kept for reference:
# model_param = {'lr': 0.02, 'depth': 10, 'tree': 25000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# PM2.5 training run; prints the score returned by train_test
score = train_test(
    model_param,
    params,
    train_X=train_X_PM25,
    test_X=test_X_PM25,
    train_Y=train_Y_PM25,
    test_Y=test_Y_PM25,
    save_dir="London/lightgbm_PM25.model",
)
print('score: ', score)

[1]	valid_0's l2: 122.688	valid_0's l1: 7.49272
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 120.127	valid_0's l1: 7.42145
[3]	valid_0's l2: 117.644	valid_0's l1: 7.35117
[4]	valid_0's l2: 115.261	valid_0's l1: 7.28301
[5]	valid_0's l2: 112.981	valid_0's l1: 7.21725
[6]	valid_0's l2: 110.785	valid_0's l1: 7.15282
[7]	valid_0's l2: 108.671	valid_0's l1: 7.09031
[8]	valid_0's l2: 106.634	valid_0's l1: 7.02906
[9]	valid_0's l2: 104.669	valid_0's l1: 6.96967
[10]	valid_0's l2: 102.784	valid_0's l1: 6.91196
[11]	valid_0's l2: 100.945	valid_0's l1: 6.85479
[12]	valid_0's l2: 99.1697	valid_0's l1: 6.7988
[13]	valid_0's l2: 97.4887	valid_0's l1: 6.74569
[14]	valid_0's l2: 95.8562	valid_0's l1: 6.69398
[15]	valid_0's l2: 94.275	valid_0's l1: 6.64283
[16]	valid_0's l2: 92.7451	valid_0's l1: 6.59304
[17]	valid_0's l2: 91.2991	valid_0's l1: 6.54555
[18]	valid_0's l2: 89.9028	valid_0's l1: 6.4996
[19]	valid_0's l2: 88.5351	valid_0's l1: 6.45359
[20]	valid_0's l2: 

In [None]:
# PM 10

In [5]:
# hold out 20% of (X, Y_PM10) as the test part, with a fixed random_state
train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10 = train_test_split(
    X,
    Y_PM10,
    test_size=0.2,
    random_state=11,
)
# show the shape of each of the four resulting arrays
print(*(arr.shape for arr in (train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10)))

(5394355, 125) (1348589, 125) (5394355,) (1348589,)


In [6]:
# fit and evaluate the lightgbm model on the PM10 split;
# the trained booster is pickled to London/lightgbm_PM10.model
# settings noted for this run:
# 48+ features and model_param = {'lr': 0.1, 'depth': 10, 'tree': 3000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
score = train_test(
    model_param,
    params,
    train_X=train_X_PM10,
    test_X=test_X_PM10,
    train_Y=train_Y_PM10,
    test_Y=test_Y_PM10,
    save_dir="London/lightgbm_PM10.model",
)
print('score: ', score)

[1]	valid_0's l2: 184.355	valid_0's l1: 9.55967
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 170.661	valid_0's l1: 9.18332
[3]	valid_0's l2: 159.476	valid_0's l1: 8.86273
[4]	valid_0's l2: 150.027	valid_0's l1: 8.58226
[5]	valid_0's l2: 142.117	valid_0's l1: 8.34112
[6]	valid_0's l2: 135.604	valid_0's l1: 8.1343
[7]	valid_0's l2: 130.103	valid_0's l1: 7.95431
[8]	valid_0's l2: 125.413	valid_0's l1: 7.797
[9]	valid_0's l2: 121.459	valid_0's l1: 7.66219
[10]	valid_0's l2: 118.091	valid_0's l1: 7.54366
[11]	valid_0's l2: 115.27	valid_0's l1: 7.4433
[12]	valid_0's l2: 112.833	valid_0's l1: 7.35592
[13]	valid_0's l2: 110.771	valid_0's l1: 7.27784
[14]	valid_0's l2: 108.847	valid_0's l1: 7.20705
[15]	valid_0's l2: 107.29	valid_0's l1: 7.14647
[16]	valid_0's l2: 105.96	valid_0's l1: 7.09455
[17]	valid_0's l2: 104.655	valid_0's l1: 7.04189
[18]	valid_0's l2: 103.534	valid_0's l1: 6.99881
[19]	valid_0's l2: 102.551	valid_0's l1: 6.95854
[20]	valid_0's l2: 101.

In [6]:
# redo the 80/20 split of (X, Y_PM10) with the same random_state
train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10 = train_test_split(
    X,
    Y_PM10,
    test_size=0.2,
    random_state=11,
)
# show the shape of each of the four resulting arrays
print(*(arr.shape for arr in (train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10)))

(5394355, 109) (1348589, 109) (5394355,) (1348589,)


In [12]:
# fit and evaluate the lightgbm model on the PM10 split;
# the trained booster is pickled to London/lightgbm_PM10.model
model_param = {
    'lr': 0.01,
    'depth': 10,
    'tree': 3000,
    'leaf': 400,
    'sample': 0.9,
    'seed': 3,
}
score = train_test(
    model_param,
    params,
    train_X=train_X_PM10,
    test_X=test_X_PM10,
    train_Y=train_Y_PM10,
    test_Y=test_Y_PM10,
    save_dir="London/lightgbm_PM10.model",
)
print('score: ', score)

[1]	valid_0's l2: 199.411	valid_0's l1: 9.9548
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 197.688	valid_0's l1: 9.91053
[3]	valid_0's l2: 195.995	valid_0's l1: 9.86684
[4]	valid_0's l2: 194.343	valid_0's l1: 9.82393
[5]	valid_0's l2: 192.716	valid_0's l1: 9.78136
[6]	valid_0's l2: 191.129	valid_0's l1: 9.73981
[7]	valid_0's l2: 189.561	valid_0's l1: 9.69836
[8]	valid_0's l2: 188.023	valid_0's l1: 9.6577
[9]	valid_0's l2: 186.519	valid_0's l1: 9.61753
[10]	valid_0's l2: 185.057	valid_0's l1: 9.57834
[11]	valid_0's l2: 183.606	valid_0's l1: 9.53939
[12]	valid_0's l2: 182.177	valid_0's l1: 9.50087
[13]	valid_0's l2: 180.786	valid_0's l1: 9.4632
[14]	valid_0's l2: 179.424	valid_0's l1: 9.42602
[15]	valid_0's l2: 178.088	valid_0's l1: 9.38946
[16]	valid_0's l2: 176.77	valid_0's l1: 9.35326
[17]	valid_0's l2: 175.487	valid_0's l1: 9.31776
[18]	valid_0's l2: 174.209	valid_0's l1: 9.28232
[19]	valid_0's l2: 172.97	valid_0's l1: 9.24785
[20]	valid_0's l2: 17

In [14]:
# PM10 run with lr 0.02 and at most 3000 boosting rounds
model_param = {
    'lr': 0.02,
    'depth': 10,
    'tree': 3000,
    'leaf': 400,
    'sample': 0.9,
    'seed': 3,
}
score = train_test(
    model_param,
    params,
    train_X=train_X_PM10,
    test_X=test_X_PM10,
    train_Y=train_Y_PM10,
    test_Y=test_Y_PM10,
    save_dir="London/lightgbm_PM10.model",
)
print('score: ', score)

[1]	valid_0's l2: 197.667	valid_0's l1: 9.91
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 194.31	valid_0's l1: 9.82278
[3]	valid_0's l2: 191.075	valid_0's l1: 9.73837
[4]	valid_0's l2: 187.954	valid_0's l1: 9.65588
[5]	valid_0's l2: 184.967	valid_0's l1: 9.57618
[6]	valid_0's l2: 182.092	valid_0's l1: 9.49842
[7]	valid_0's l2: 179.32	valid_0's l1: 9.42324
[8]	valid_0's l2: 176.636	valid_0's l1: 9.34946
[9]	valid_0's l2: 174.063	valid_0's l1: 9.27808
[10]	valid_0's l2: 171.597	valid_0's l1: 9.20929
[11]	valid_0's l2: 169.219	valid_0's l1: 9.14222
[12]	valid_0's l2: 166.884	valid_0's l1: 9.07616
[13]	valid_0's l2: 164.679	valid_0's l1: 9.01326
[14]	valid_0's l2: 162.546	valid_0's l1: 8.95203
[15]	valid_0's l2: 160.493	valid_0's l1: 8.89243
[16]	valid_0's l2: 158.51	valid_0's l1: 8.83453
[17]	valid_0's l2: 156.592	valid_0's l1: 8.77814
[18]	valid_0's l2: 154.707	valid_0's l1: 8.72258
[19]	valid_0's l2: 152.904	valid_0's l1: 8.66885
[20]	valid_0's l2: 151

In [15]:
# settings of the last saved PM10 model, with at most 10000 boosting rounds.
# 'lr' is recorded as 0.02: the original note on this cell stated that the
# learning rate actually in effect for this run was 0.02, not the 0.01 that
# had been written in the dict.
model_param = {'lr': 0.02, 'depth': 10, 'tree': 10000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
score = train_test(model_param, params, train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10, "London/lightgbm_PM10.model")
# last save version
print('score: ', score)

[1]	valid_0's l2: 197.667	valid_0's l1: 9.91
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 194.31	valid_0's l1: 9.82278
[3]	valid_0's l2: 191.075	valid_0's l1: 9.73837
[4]	valid_0's l2: 187.954	valid_0's l1: 9.65588
[5]	valid_0's l2: 184.967	valid_0's l1: 9.57618
[6]	valid_0's l2: 182.092	valid_0's l1: 9.49842
[7]	valid_0's l2: 179.32	valid_0's l1: 9.42324
[8]	valid_0's l2: 176.636	valid_0's l1: 9.34946
[9]	valid_0's l2: 174.063	valid_0's l1: 9.27808
[10]	valid_0's l2: 171.597	valid_0's l1: 9.20929
[11]	valid_0's l2: 169.219	valid_0's l1: 9.14222
[12]	valid_0's l2: 166.884	valid_0's l1: 9.07616
[13]	valid_0's l2: 164.679	valid_0's l1: 9.01326
[14]	valid_0's l2: 162.546	valid_0's l1: 8.95203
[15]	valid_0's l2: 160.493	valid_0's l1: 8.89243
[16]	valid_0's l2: 158.51	valid_0's l1: 8.83453
[17]	valid_0's l2: 156.592	valid_0's l1: 8.77814
[18]	valid_0's l2: 154.707	valid_0's l1: 8.72258
[19]	valid_0's l2: 152.904	valid_0's l1: 8.66885
[20]	valid_0's l2: 151

In [12]:
# PM10_Concentration = train_X_PM10[:,1]
# s = get_score(PM10_Concentration.reshape((PM10_Concentration.shape[0], 1)), train_Y_PM10)
# print(s)

# PM10_Concentration = test_X_PM10[:,1]
# s = get_score(PM10_Concentration.reshape((PM10_Concentration.shape[0], 1)), test_Y_PM10)
# print(s)

In [None]:
# NO2

In [7]:
# hold out 20% of (X, Y_NO2) as the test part, with a fixed random_state
train_X_NO2, test_X_NO2, train_Y_NO2, test_Y_NO2 = train_test_split(
    X,
    Y_NO2,
    test_size=0.2,
    random_state=11,
)
# show the shape of each of the four resulting arrays
print(*(arr.shape for arr in (train_X_NO2, test_X_NO2, train_Y_NO2, test_Y_NO2)))

(5394355, 125) (1348589, 125) (5394355,) (1348589,)


In [8]:
# settings noted for this run:
# New model_param = {'lr': 0.02, 'depth': 10, 'tree': 3000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# fit on the NO2 split, pickle the booster to London/lightgbm_NO2.model, print the score
score = train_test(
    model_param,
    params,
    train_X=train_X_NO2,
    test_X=test_X_NO2,
    train_Y=train_Y_NO2,
    test_Y=test_Y_NO2,
    save_dir="London/lightgbm_NO2.model",
)
print('score: ', score)

[1]	valid_0's l1: 21.4217	valid_0's l2: 755.348
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 21.2343	valid_0's l2: 741.479
[3]	valid_0's l1: 21.0535	valid_0's l2: 728.205
[4]	valid_0's l1: 20.8772	valid_0's l2: 715.384
[5]	valid_0's l1: 20.706	valid_0's l2: 703.048
[6]	valid_0's l1: 20.5395	valid_0's l2: 691.165
[7]	valid_0's l1: 20.3779	valid_0's l2: 679.759
[8]	valid_0's l1: 20.2219	valid_0's l2: 668.815
[9]	valid_0's l1: 20.0699	valid_0's l2: 658.263
[10]	valid_0's l1: 19.9227	valid_0's l2: 648.124
[11]	valid_0's l1: 19.7787	valid_0's l2: 638.331
[12]	valid_0's l1: 19.6386	valid_0's l2: 628.87
[13]	valid_0's l1: 19.5023	valid_0's l2: 619.738
[14]	valid_0's l1: 19.3701	valid_0's l2: 610.996
[15]	valid_0's l1: 19.2387	valid_0's l2: 602.343
[16]	valid_0's l1: 19.1112	valid_0's l2: 594.047
[17]	valid_0's l1: 18.9896	valid_0's l2: 586.19
[18]	valid_0's l1: 18.8711	valid_0's l2: 578.62
[19]	valid_0's l1: 18.7528	valid_0's l2: 571.129
[20]	valid_0's l1: 1

In [None]:
# PM 2.5

In [5]:
# TODO: optimize model parameters

# settings noted for this run:
# 48+ features and model_param = {'lr': 0.1, 'depth': 10, 'tree': 10000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
# PM2.5 run whose booster is pickled to a separate file (lightgbm_PM25_0.model)
score = train_test(
    model_param,
    params,
    train_X=train_X_PM25,
    test_X=test_X_PM25,
    train_Y=train_Y_PM25,
    test_Y=test_Y_PM25,
    save_dir="London/lightgbm_PM25_0.model",
)
print('score: ', score)

[1]	valid_0's l1: 7.21258	valid_0's l2: 112.715
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 6.90126	valid_0's l2: 102.287
[3]	valid_0's l1: 6.63338	valid_0's l2: 93.7384
[4]	valid_0's l1: 6.40057	valid_0's l2: 86.6782
[5]	valid_0's l1: 6.20101	valid_0's l2: 80.8691
[6]	valid_0's l1: 6.02859	valid_0's l2: 76.0309
[7]	valid_0's l1: 5.87512	valid_0's l2: 71.9276
[8]	valid_0's l1: 5.74434	valid_0's l2: 68.5576
[9]	valid_0's l1: 5.63215	valid_0's l2: 65.7095
[10]	valid_0's l1: 5.53556	valid_0's l2: 63.3085
[11]	valid_0's l1: 5.45028	valid_0's l2: 61.2468
[12]	valid_0's l1: 5.3757	valid_0's l2: 59.546
[13]	valid_0's l1: 5.31005	valid_0's l2: 58.0514
[14]	valid_0's l1: 5.25473	valid_0's l2: 56.8046
[15]	valid_0's l1: 5.20222	valid_0's l2: 55.6921
[16]	valid_0's l1: 5.15652	valid_0's l2: 54.6952
[17]	valid_0's l1: 5.11527	valid_0's l2: 53.8015
[18]	valid_0's l1: 5.07782	valid_0's l2: 53.0044
[19]	valid_0's l1: 5.04383	valid_0's l2: 52.2907
[20]	valid_0's l1: