In [1]:
import lightgbm as lgb
import numpy as np
import pickle
import time
from sklearn.model_selection import train_test_split

In [2]:
def save_pkl(file_dir, data):
    """Serialize ``data`` to the file at ``file_dir`` using pickle protocol 4.

    Args:
        file_dir: Path of the file to write. It is opened in binary write
            mode, so an existing file is overwritten.
        data: Any picklable object.

    Raises:
        OSError: If the file cannot be opened for writing.
        Exception: Whatever ``pickle.dump`` raises for an unpicklable object.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the manual open()/close() pair did not guarantee.
    with open(file_dir, "wb") as f:
        pickle.dump(data, f, protocol=4)
    
def read_pkl(file_dir):
    """Load and return the object pickled in the file at ``file_dir``.

    Args:
        file_dir: Path of a pickle file; opened in binary read mode.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code while loading.
    # Only call this on files you produced yourself or otherwise trust.
    # The context manager closes the handle, which was previously leaked.
    with open(file_dir, "rb") as f:
        return pickle.load(f)

In [11]:
# LightGBM model parameters.
# `model_param` holds the hand-tuned knobs; `params` is the dict passed to
# lgb.train and pulls its tunable values from `model_param`.
model_param = {'lr': 0.01, 'depth': 10, 'tree': 3000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    # A list (instead of a set) keeps the metric order identical on every run;
    # with a set the order depends on string hashing and can differ between
    # kernels, which reorders the columns of the training log.
    'metric': ['l2', 'l1'],
    'max_depth': model_param['depth'],
    'num_leaves': model_param['leaf'],
    'min_data_in_leaf': 20,
    'learning_rate': model_param['lr'],
    'feature_fraction': 1,
    'bagging_fraction': model_param['sample'],
    'bagging_freq': 1,
    'bagging_seed': model_param['seed'],
    'verbose': 0
}

# calculate the symmetric mean absolute percentage error (SMAPE)
def get_score(pred, valid_y_exp):
    """Return the SMAPE between predictions and ground truth.

    Each element contributes ``2 * |pred - y| / (|pred| + |y|)``; an element
    where both values are zero contributes 0. The result is the mean over all
    elements and lies in ``[0, 2]``.

    Args:
        pred: Array-like of predicted values.
        valid_y_exp: Array-like of true values, broadcast-compatible with
            ``pred``.

    Returns:
        The mean SMAPE as a float.
    """
    pred = np.asarray(pred, dtype=float)
    valid_y_exp = np.asarray(valid_y_exp, dtype=float)

    abs_err = np.abs(pred - valid_y_exp)
    # Absolute values in the denominator: without them a negative reading
    # could make a term negative or cancel the denominator to zero.
    denom = np.abs(pred) + np.abs(valid_y_exp)
    # Positions with a zero denominator (both values are 0) keep the 0 from `out`.
    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    return np.mean(ratio * 2)

# train and test the input X Y with the lightgbm model
def train_test(model_param, params, train_X, test_X, train_Y, test_Y, save_dir):
    """Fit a LightGBM model, score it on the held-out split and pickle it.

    Args:
        model_param: Dict whose ``'tree'`` entry gives the maximum number of
            boosting rounds.
        params: LightGBM parameter dict passed to ``lgb.train``.
        train_X, train_Y: Training features and targets.
        test_X, test_Y: Held-out features and targets. They serve both as the
            early-stopping validation set and as the data the score is
            computed on, so the reported score is not from an untouched set.
        save_dir: File path the trained booster is pickled to.

    Returns:
        The SMAPE (see ``get_score``) of the predictions on ``test_X``.
    """
    time_start = time.time()
    lgb_train = lgb.Dataset(train_X, train_Y)
    lgb_eval = lgb.Dataset(test_X, test_Y, reference=lgb_train)
    # Early stopping is requested through the callback: the
    # `early_stopping_rounds=` keyword of lgb.train was removed in
    # LightGBM 4.0, while the callback is accepted by older releases too.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=model_param['tree'],
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=20)])

    test_predict = gbm.predict(test_X)
    score = get_score(test_predict, test_Y)
    save_pkl(save_dir, gbm)
    print('running time: ', time.time() - time_start)
    return score

In [4]:
# X = read_pkl("London/X.pkl")
# Y_PM25 = read_pkl("London/Y_PM25.pkl")
# Y_PM10 = read_pkl("London/Y_PM10.pkl")
# Y_NO2 = read_pkl("London/Y_NO2.pkl")

In [4]:
# X = np.load("London/X.npy")
# Y_NO2 = np.load("London/Y_NO2.npy")
# Y_PM10 = np.load("London/Y_PM10.npy")
# Y_PM25 = np.load("London/Y_PM25.npy")

In [5]:
# Load the full-size dataset: the feature matrix first, then one target
# vector per pollutant.
X = np.load("London/X_all.npy")
Y_NO2, Y_PM10, Y_PM25 = [
    np.load(target_path)
    for target_path in (
        "London/Y_NO2_all.npy",
        "London/Y_PM10_all.npy",
        "London/Y_PM25_all.npy",
    )
]

In [11]:
# Print the first 48 entries of column 0 of X, followed by the first 48
# entries of the PM2.5 target vector, for a quick visual comparison.
# NOTE(review): going by the variable name, column 0 is presumably a PM2.5
# concentration feature (not temperature) — confirm against the layout of X.
PM25_Concentration = X[:,0]
# s = get_score(PM25_Concentration.reshape((PM25_Concentration.shape[0], 1)), Y_PM25.reshape((Y_PM25.shape[0], 1)))
print(PM25_Concentration[:48])
print(Y_PM25[:48])

[30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8
 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8
 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8 30.8
 30.8 30.8 30.8 30.8 30.8 30.8]
[22.9 18.4 19.5 22.1 22.8 19.6 15.6 18.1 14.5 10.9 10.5  8.9  6.9  5.3
  5.4  3.1  1.5  5.2  3.1  6.6  4.2  4.9  3.7  2.6  0.2  3.3  3.9  4.8
  2.3  1.7 -1.   1.9  5.9  7.1  7.2  4.6  3.   6.3  6.1  7.6  6.8  8.9
  9.6 12.5 11.8 12.3 16.  14.3]


In [5]:
# Hold out 20% of the rows (fixed random_state for repeatability) to evaluate
# the PM2.5 model, then show the shape of each resulting array.
(train_X_PM25, test_X_PM25,
 train_Y_PM25, test_Y_PM25) = train_test_split(X, Y_PM25, test_size=0.2, random_state=11)
print(*(part.shape for part in (train_X_PM25, test_X_PM25, train_Y_PM25, test_Y_PM25)))

(5394355, 109) (1348589, 109) (5394355,) (1348589,)


In [6]:
# Fit the PM2.5 model on the training split and score it on the held-out
# split; train_test pickles the trained booster to the given path.
score = train_test(
    model_param, params,
    train_X_PM25, test_X_PM25, train_Y_PM25, test_Y_PM25,
    save_dir="London/lightgbm_PM25.model",
)
print('score: ', score)

[1]	valid_0's l2: 112.715	valid_0's l1: 7.21258
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 102.287	valid_0's l1: 6.90126
[3]	valid_0's l2: 93.7384	valid_0's l1: 6.63338
[4]	valid_0's l2: 86.6782	valid_0's l1: 6.40057
[5]	valid_0's l2: 80.8691	valid_0's l1: 6.20101
[6]	valid_0's l2: 76.0309	valid_0's l1: 6.02859
[7]	valid_0's l2: 71.9276	valid_0's l1: 5.87512
[8]	valid_0's l2: 68.5576	valid_0's l1: 5.74434
[9]	valid_0's l2: 65.7095	valid_0's l1: 5.63215
[10]	valid_0's l2: 63.3085	valid_0's l1: 5.53556
[11]	valid_0's l2: 61.2468	valid_0's l1: 5.45028
[12]	valid_0's l2: 59.546	valid_0's l1: 5.3757
[13]	valid_0's l2: 58.0514	valid_0's l1: 5.31005
[14]	valid_0's l2: 56.8046	valid_0's l1: 5.25473
[15]	valid_0's l2: 55.6921	valid_0's l1: 5.20222
[16]	valid_0's l2: 54.6952	valid_0's l1: 5.15652
[17]	valid_0's l2: 53.8015	valid_0's l1: 5.11527
[18]	valid_0's l2: 53.0044	valid_0's l1: 5.07782
[19]	valid_0's l2: 52.2907	valid_0's l1: 5.04383
[20]	valid_0's l2:

In [5]:
# Hold out 20% of the rows (fixed random_state for repeatability) to evaluate
# the PM10 model, then show the shape of each resulting array.
(train_X_PM10, test_X_PM10,
 train_Y_PM10, test_Y_PM10) = train_test_split(X, Y_PM10, test_size=0.2, random_state=11)
print(*(part.shape for part in (train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10)))

(5394355, 125) (1348589, 125) (5394355,) (1348589,)


In [6]:
# Fit the PM10 model on the training split and score it on the held-out
# split; train_test pickles the trained booster to the given path.
score = train_test(
    model_param, params,
    train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10,
    save_dir="London/lightgbm_PM10.model",
)
print('score: ', score)

[1]	valid_0's l2: 184.355	valid_0's l1: 9.55967
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 170.661	valid_0's l1: 9.18332
[3]	valid_0's l2: 159.476	valid_0's l1: 8.86273
[4]	valid_0's l2: 150.027	valid_0's l1: 8.58226
[5]	valid_0's l2: 142.117	valid_0's l1: 8.34112
[6]	valid_0's l2: 135.604	valid_0's l1: 8.1343
[7]	valid_0's l2: 130.103	valid_0's l1: 7.95431
[8]	valid_0's l2: 125.413	valid_0's l1: 7.797
[9]	valid_0's l2: 121.459	valid_0's l1: 7.66219
[10]	valid_0's l2: 118.091	valid_0's l1: 7.54366
[11]	valid_0's l2: 115.27	valid_0's l1: 7.4433
[12]	valid_0's l2: 112.833	valid_0's l1: 7.35592
[13]	valid_0's l2: 110.771	valid_0's l1: 7.27784
[14]	valid_0's l2: 108.847	valid_0's l1: 7.20705
[15]	valid_0's l2: 107.29	valid_0's l1: 7.14647
[16]	valid_0's l2: 105.96	valid_0's l1: 7.09455
[17]	valid_0's l2: 104.655	valid_0's l1: 7.04189
[18]	valid_0's l2: 103.534	valid_0's l1: 6.99881
[19]	valid_0's l2: 102.551	valid_0's l1: 6.95854
[20]	valid_0's l2: 101.

In [6]:
# Hold out 20% of the rows (fixed random_state for repeatability) to evaluate
# the PM10 model, then show the shape of each resulting array.
# NOTE(review): this repeats the PM10 split of an earlier cell with the same
# arguments; consider keeping only one of the two cells.
(train_X_PM10, test_X_PM10,
 train_Y_PM10, test_Y_PM10) = train_test_split(X, Y_PM10, test_size=0.2, random_state=11)
print(*(part.shape for part in (train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10)))

(5394355, 109) (1348589, 109) (5394355,) (1348589,)


In [None]:
# Fit the PM10 model on the training split and score it on the held-out
# split; train_test pickles the trained booster to the given path.
# NOTE(review): this writes to the same file as the earlier PM10 training
# cell, so running both overwrites the first model.
score = train_test(
    model_param, params,
    train_X_PM10, test_X_PM10, train_Y_PM10, test_Y_PM10,
    save_dir="London/lightgbm_PM10.model",
)
print('score: ', score)

[1]	valid_0's l2: 199.411	valid_0's l1: 9.9548
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l2: 197.688	valid_0's l1: 9.91053
[3]	valid_0's l2: 195.995	valid_0's l1: 9.86684
[4]	valid_0's l2: 194.343	valid_0's l1: 9.82393
[5]	valid_0's l2: 192.716	valid_0's l1: 9.78136
[6]	valid_0's l2: 191.129	valid_0's l1: 9.73981
[7]	valid_0's l2: 189.561	valid_0's l1: 9.69836
[8]	valid_0's l2: 188.023	valid_0's l1: 9.6577
[9]	valid_0's l2: 186.519	valid_0's l1: 9.61753
[10]	valid_0's l2: 185.057	valid_0's l1: 9.57834
[11]	valid_0's l2: 183.606	valid_0's l1: 9.53939
[12]	valid_0's l2: 182.177	valid_0's l1: 9.50087
[13]	valid_0's l2: 180.786	valid_0's l1: 9.4632
[14]	valid_0's l2: 179.424	valid_0's l1: 9.42602
[15]	valid_0's l2: 178.088	valid_0's l1: 9.38946
[16]	valid_0's l2: 176.77	valid_0's l1: 9.35326
[17]	valid_0's l2: 175.487	valid_0's l1: 9.31776
[18]	valid_0's l2: 174.209	valid_0's l1: 9.28232
[19]	valid_0's l2: 172.97	valid_0's l1: 9.24785
[20]	valid_0's l2: 17

In [12]:
# PM10_Concentration = train_X_PM10[:,1]
# s = get_score(PM10_Concentration.reshape((PM10_Concentration.shape[0], 1)), train_Y_PM10)
# print(s)

# PM10_Concentration = test_X_PM10[:,1]
# s = get_score(PM10_Concentration.reshape((PM10_Concentration.shape[0], 1)), test_Y_PM10)
# print(s)

In [5]:
# Hold out 20% of the rows (fixed random_state for repeatability) to evaluate
# the NO2 model, then show the shape of each resulting array.
(train_X_NO2, test_X_NO2,
 train_Y_NO2, test_Y_NO2) = train_test_split(X, Y_NO2, test_size=0.2, random_state=11)
print(*(part.shape for part in (train_X_NO2, test_X_NO2, train_Y_NO2, test_Y_NO2)))

(5394355, 109) (1348589, 109) (5394355,) (1348589,)


In [6]:
# Fit the NO2 model on the training split and score it on the held-out
# split; train_test pickles the trained booster to the given path.
score = train_test(
    model_param, params,
    train_X_NO2, test_X_NO2, train_Y_NO2, test_Y_NO2,
    save_dir="London/lightgbm_NO2.model",
)
print('score: ', score)

[1]	valid_0's l1: 20.836	valid_0's l2: 712.993
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 20.1666	valid_0's l2: 666.337
[3]	valid_0's l1: 19.5853	valid_0's l2: 627.481
[4]	valid_0's l1: 19.0807	valid_0's l2: 595.034
[5]	valid_0's l1: 18.6404	valid_0's l2: 567.594
[6]	valid_0's l1: 18.2554	valid_0's l2: 544.505
[7]	valid_0's l1: 17.9195	valid_0's l2: 525.042
[8]	valid_0's l1: 17.621	valid_0's l2: 508.126
[9]	valid_0's l1: 17.3611	valid_0's l2: 493.895
[10]	valid_0's l1: 17.1302	valid_0's l2: 481.743
[11]	valid_0's l1: 16.9221	valid_0's l2: 470.874
[12]	valid_0's l1: 16.7355	valid_0's l2: 461.3
[13]	valid_0's l1: 16.5642	valid_0's l2: 452.9
[14]	valid_0's l1: 16.404	valid_0's l2: 445.026
[15]	valid_0's l1: 16.2663	valid_0's l2: 438.439
[16]	valid_0's l1: 16.1303	valid_0's l2: 432.003
[17]	valid_0's l1: 16.0157	valid_0's l2: 426.868
[18]	valid_0's l1: 15.9051	valid_0's l2: 421.759
[19]	valid_0's l1: 15.8028	valid_0's l2: 417.007
[20]	valid_0's l1: 15.7

In [5]:
# TODO: optimize model parameters
# This cell rebinds `model_param` and `params` with a larger learning rate
# (0.1) and a higher cap on boosting rounds (10000).

model_param = {'lr': 0.1, 'depth': 10, 'tree': 10000, 'leaf': 400, 'sample': 0.9, 'seed': 3}
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    # A list (instead of a set) keeps the metric order identical on every run;
    # with a set the order depends on string hashing and can differ between
    # kernels, which reorders the columns of the training log.
    'metric': ['l2', 'l1'],
    'max_depth': model_param['depth'],
    'num_leaves': model_param['leaf'],
    'min_data_in_leaf': 20,
    'learning_rate': model_param['lr'],
    'feature_fraction': 1,
    'bagging_fraction': model_param['sample'],
    'bagging_freq': 1,
    'bagging_seed': model_param['seed'],
    'verbose': 0
}

# calculate the symmetric mean absolute percentage error (SMAPE)
# NOTE(review): this redefinition shadows the get_score defined in an earlier
# cell; consider keeping a single definition.
def get_score(pred, valid_y_exp):
    """Return the SMAPE between predictions and ground truth.

    Each element contributes ``2 * |pred - y| / (|pred| + |y|)``; an element
    where both values are zero contributes 0. The result is the mean over all
    elements and lies in ``[0, 2]``.

    Args:
        pred: Array-like of predicted values.
        valid_y_exp: Array-like of true values, broadcast-compatible with
            ``pred``.

    Returns:
        The mean SMAPE as a float.
    """
    pred = np.asarray(pred, dtype=float)
    valid_y_exp = np.asarray(valid_y_exp, dtype=float)

    abs_err = np.abs(pred - valid_y_exp)
    # Absolute values in the denominator: without them a negative reading
    # could make a term negative or cancel the denominator to zero.
    denom = np.abs(pred) + np.abs(valid_y_exp)
    # Positions with a zero denominator (both values are 0) keep the 0 from `out`.
    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    return np.mean(ratio * 2)

# train and test the input X Y with the lightgbm model
# NOTE(review): this redefinition shadows the train_test defined in an earlier
# cell; consider keeping a single definition.
def train_test(model_param, params, train_X, test_X, train_Y, test_Y, save_dir):
    """Fit a LightGBM model, score it on the held-out split and pickle it.

    Args:
        model_param: Dict whose ``'tree'`` entry gives the maximum number of
            boosting rounds.
        params: LightGBM parameter dict passed to ``lgb.train``.
        train_X, train_Y: Training features and targets.
        test_X, test_Y: Held-out features and targets. They serve both as the
            early-stopping validation set and as the data the score is
            computed on, so the reported score is not from an untouched set.
        save_dir: File path the trained booster is pickled to.

    Returns:
        The SMAPE (see ``get_score``) of the predictions on ``test_X``.
    """
    time_start = time.time()
    lgb_train = lgb.Dataset(train_X, train_Y)
    lgb_eval = lgb.Dataset(test_X, test_Y, reference=lgb_train)
    # Early stopping is requested through the callback: the
    # `early_stopping_rounds=` keyword of lgb.train was removed in
    # LightGBM 4.0, while the callback is accepted by older releases too.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=model_param['tree'],
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=20)])

    test_predict = gbm.predict(test_X)
    score = get_score(test_predict, test_Y)
    save_pkl(save_dir, gbm)
    print('running time: ', time.time() - time_start)
    return score

# Retrain the PM2.5 model with the parameters set in this cell; the booster is
# pickled to a separate file so the earlier PM2.5 model is not overwritten.
score = train_test(
    model_param, params,
    train_X_PM25, test_X_PM25, train_Y_PM25, test_Y_PM25,
    save_dir="London/lightgbm_PM25_0.model",
)
print('score: ', score)

[1]	valid_0's l1: 7.21258	valid_0's l2: 112.715
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 6.90126	valid_0's l2: 102.287
[3]	valid_0's l1: 6.63338	valid_0's l2: 93.7384
[4]	valid_0's l1: 6.40057	valid_0's l2: 86.6782
[5]	valid_0's l1: 6.20101	valid_0's l2: 80.8691
[6]	valid_0's l1: 6.02859	valid_0's l2: 76.0309
[7]	valid_0's l1: 5.87512	valid_0's l2: 71.9276
[8]	valid_0's l1: 5.74434	valid_0's l2: 68.5576
[9]	valid_0's l1: 5.63215	valid_0's l2: 65.7095
[10]	valid_0's l1: 5.53556	valid_0's l2: 63.3085
[11]	valid_0's l1: 5.45028	valid_0's l2: 61.2468
[12]	valid_0's l1: 5.3757	valid_0's l2: 59.546
[13]	valid_0's l1: 5.31005	valid_0's l2: 58.0514
[14]	valid_0's l1: 5.25473	valid_0's l2: 56.8046
[15]	valid_0's l1: 5.20222	valid_0's l2: 55.6921
[16]	valid_0's l1: 5.15652	valid_0's l2: 54.6952
[17]	valid_0's l1: 5.11527	valid_0's l2: 53.8015
[18]	valid_0's l1: 5.07782	valid_0's l2: 53.0044
[19]	valid_0's l1: 5.04383	valid_0's l2: 52.2907
[20]	valid_0's l1: