<h1><center> NASA Airathon - NO2 Track </center></h1>

### <center> Forecasting: LightGBM </center>

<div style="text-align: center"> 
    Dr. Sukanta Basu <br/> Associate Professor <br/> Delft University of Technology, The Netherlands <br/> Email: s.basu@tudelft.nl<br/> https://sites.google.com/view/sukantabasu/
</div>

#### Log

Last updated: 4th April, 2022

#### User instructions

Run this notebook. It ingests the testOBS.csv, testOMI.csv, and testGFS.csv files and produces the final forecast (submission_sukantabasu.csv).

#### Load packages

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from pathlib import Path

from pickle import dump, load

import lightgbm as lgb

#For reproducibility of the results, seed NumPy's global random number generator
#with a fixed value before any other computation is run.
from numpy.random import seed
seed(20)

#### Directories

In [2]:
#All locations are expressed relative to this root
ROOT_DIR = '../../'

#In order:
#  EXTDATA_DIR    - processed datasets
#  TUNING_DIR     - saved models
#  SUBMISSION_DIR - final submission
EXTDATA_DIR, TUNING_DIR, SUBMISSION_DIR = (
    ROOT_DIR + subdir
    for subdir in ('data/airathon/processed/', 'model/', 'forecast/airathon/')
)

#### User input

In [3]:
#nTrial: required input for the halving random grid search; in this notebook it
#        is only used to build the file names of the saved models.
#nEns  : number of ensemble members, i.e., saved models loaded per training option.
nTrial, nEns = 256, 100

#### Prepare test data

In [4]:
#testOBS.csv provides the submission keys (datetime, ID) and the coordinates
df_OBS    = pd.read_csv(EXTDATA_DIR + 'test/STN/' + 'testOBS.csv')
datetime  = df_OBS['datetime'].values
ID        = df_OBS['ID'].values
nSamples  = np.size(ID)

df_OBS_subset = df_OBS[['latitude','longitude']]

df_OMI = pd.read_csv(EXTDATA_DIR + 'test/OMI/' + 'testOMI.csv')
df_OMI_subset = df_OMI[['NO2_OMI','NO2Tr_OMI']]

df_GFS   = pd.read_csv(EXTDATA_DIR + 'test/GFS/' + 'testGFS.csv')

#The three tables are joined column-wise below. pd.concat(axis=1) aligns on the
#index, so tables with different row counts would silently introduce NaN rows
#instead of failing. Stop early if the files are not row-for-row compatible.
if not (len(df_OBS) == len(df_OMI) == len(df_GFS)):
    raise ValueError('testOBS.csv, testOMI.csv and testGFS.csv must have the same number of rows; got '
                     + str(len(df_OBS)) + ', ' + str(len(df_OMI)) + ' and ' + str(len(df_GFS)))

#Feature names are generated once so that the two feature sets cannot drift apart.
#GFS columns are named <variable>_<hour>, with hours 0, 3, ..., 21.
GFS_VARS  = ['PBLH','dT','SHFX','M10','M100','alpha','beta',
             'cosX100','sinX100','VENT','T2','RH']
GFS_HOURS = range(0, 24, 3)
COLS_GFS  = [var + '_' + str(hr) for var in GFS_VARS for hr in GFS_HOURS]

COLS_LOC  = ['latitude','longitude']
COLS_OMI  = ['NO2_OMI','NO2Tr_OMI']
COLS_TIME = ['cosJDAY','sinJDAY','WDAY']

#Feature set 1: location + calendar + GFS columns (no OMI columns)
df_tst_1 = pd.concat([df_OBS_subset,df_GFS], axis=1)
df_tst_1 = df_tst_1[COLS_LOC + COLS_TIME + COLS_GFS]

#Feature set 2: same as feature set 1, with the two OMI columns inserted after the location
df_tst_2 = pd.concat([df_OBS_subset,df_OMI_subset,df_GFS], axis=1)

#Kept as arrays; they are used later to find the rows where OMI values are available
NO2_OMI   = df_tst_2['NO2_OMI'].values
NO2Tr_OMI = df_tst_2['NO2Tr_OMI'].values

df_tst_2 = df_tst_2[COLS_LOC + COLS_OMI + COLS_TIME + COLS_GFS]

In [5]:
#Summary statistics of the feature set without OMI columns (input to the trnOpt = 1 models)
df_tst_1.describe()

Unnamed: 0,latitude,longitude,cosJDAY,sinJDAY,WDAY,PBLH_0,PBLH_3,PBLH_6,PBLH_9,PBLH_12,...,T2_18,T2_21,RH_0,RH_3,RH_6,RH_9,RH_12,RH_15,RH_18,RH_21
count,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,...,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0
mean,31.346745,-27.364144,0.195975,0.10437,2.978471,252.775496,217.44432,197.969195,293.571309,821.934149,...,297.793453,294.632539,47.083622,48.885853,49.958837,46.464138,35.115148,29.950181,33.506828,41.388451
std,3.096324,101.212444,0.712993,0.665132,1.999517,245.385247,228.14311,221.40066,270.165783,593.911174,...,7.848693,7.608778,22.688716,22.969227,23.040705,22.013567,19.701075,19.063471,19.958439,21.536433
min,24.998015,-118.540188,-0.999963,-0.999991,0.0,10.256347,10.100066,10.093186,11.744179,24.047504,...,273.684509,270.635651,1.1,1.4,4.1,3.3,3.2,2.9,2.5,1.7
25%,28.645235,-117.956283,-0.459733,-0.478734,1.0,61.841543,46.426991,37.866886,100.48246,471.458527,...,291.980499,289.461029,27.200001,28.700001,30.0,27.799999,19.1,15.1,17.700001,23.4
50%,33.814243,-117.282546,0.38963,0.120208,3.0,183.072266,141.475876,116.825874,218.568748,654.42395,...,296.513306,293.325958,45.200001,47.099998,48.299999,43.900002,30.6,24.200001,28.200001,37.799999
75%,34.037858,77.192242,0.869589,0.762493,5.0,354.639374,294.08728,251.496353,382.74559,967.740753,...,303.599976,299.799988,67.099998,69.400002,71.074999,66.099998,49.0,42.177841,47.299999,58.900002
max,34.372178,121.593138,1.0,0.999991,6.0,1783.94397,1694.260254,1575.930298,1693.612793,4902.666992,...,319.073639,316.711884,98.800003,99.800003,99.0,98.400002,96.699997,93.199997,95.800003,96.099998


In [6]:
#Summary statistics of the feature set including NO2_OMI and NO2Tr_OMI (input to the trnOpt = 2 models)
df_tst_2.describe()

Unnamed: 0,latitude,longitude,NO2_OMI,NO2Tr_OMI,cosJDAY,sinJDAY,WDAY,PBLH_0,PBLH_3,PBLH_6,...,T2_18,T2_21,RH_0,RH_3,RH_6,RH_9,RH_12,RH_15,RH_18,RH_21
count,16350.0,16350.0,8838.0,8759.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,...,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0,16350.0
mean,31.346745,-27.364144,8.340485,5.873028,0.195975,0.10437,2.978471,252.775496,217.44432,197.969195,...,297.793453,294.632539,47.083622,48.885853,49.958837,46.464138,35.115148,29.950181,33.506828,41.388451
std,3.096324,101.212444,3.783766,3.910196,0.712993,0.665132,1.999517,245.385247,228.14311,221.40066,...,7.848693,7.608778,22.688716,22.969227,23.040705,22.013567,19.701075,19.063471,19.958439,21.536433
min,24.998015,-118.540188,0.138372,0.00053,-0.999963,-0.999991,0.0,10.256347,10.100066,10.093186,...,273.684509,270.635651,1.1,1.4,4.1,3.3,3.2,2.9,2.5,1.7
25%,28.645235,-117.956283,5.950994,3.339092,-0.459733,-0.478734,1.0,61.841543,46.426991,37.866886,...,291.980499,289.461029,27.200001,28.700001,30.0,27.799999,19.1,15.1,17.700001,23.4
50%,33.814243,-117.282546,7.447121,4.836471,0.38963,0.120208,3.0,183.072266,141.475876,116.825874,...,296.513306,293.325958,45.200001,47.099998,48.299999,43.900002,30.6,24.200001,28.200001,37.799999
75%,34.037858,77.192242,9.659702,7.228231,0.869589,0.762493,5.0,354.639374,294.08728,251.496353,...,303.599976,299.799988,67.099998,69.400002,71.074999,66.099998,49.0,42.177841,47.299999,58.900002
max,34.372178,121.593138,32.558712,30.773388,1.0,0.999991,6.0,1783.94397,1694.260254,1575.930298,...,319.073639,316.711884,98.800003,99.800003,99.0,98.400002,96.699997,93.199997,95.800003,96.099998


#### Ensemble prediction

In [7]:
def _ensemble_predict(trnOpt, df_tst):
    """Predict with every saved ensemble member of one training option.

    Parameters
    ----------
    trnOpt : int
        Training option; part of the saved-model file name.
    df_tst : pandas.DataFrame
        Feature table passed to each model's ``predict``.

    Returns
    -------
    pred_i : numpy.ndarray, shape (nSamples, nEns)
        Prediction of every ensemble member (one column per member).
    pred_ens : numpy.ndarray, shape (nSamples,)
        Median over the ensemble members.
    """
    pred_i = np.zeros((nSamples,nEns))
    for n in range(nEns):

        fname = TUNING_DIR + 'ESLGBTuningFS_' + str(trnOpt) + '_' + str(nTrial) + '_' + str(n) + '.pkl'

        #NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
        #The with-block closes the file handle after each load.
        with open(fname, "rb") as f:
            lgbReg = load(f)

        pred_i[:,n] = lgbReg.predict(df_tst)
        print(n)

    return pred_i, np.median(pred_i,axis=1)


#trnOpt = 1: models that take the features without OMI columns
NO2pred_1_i, NO2pred_1_ens = _ensemble_predict(1, df_tst_1)

#trnOpt = 2: models that additionally take NO2_OMI and NO2Tr_OMI
NO2pred_2_i, NO2pred_2_ens = _ensemble_predict(2, df_tst_2)

#-----
#Where both OMI values are NOT missing, use the trnOpt = 2 prediction;
#everywhere else the trnOpt = 1 prediction is kept.
hasOMI = ~np.isnan(NO2_OMI) & ~np.isnan(NO2Tr_OMI)
NO2pred_1_ens[hasOMI] = NO2pred_2_ens[hasOMI]

#Negative predictions are set to zero
NO2pred_1_ens[NO2pred_1_ens < 0] = 0

df_ens = pd.DataFrame({'datetime': datetime, 'grid_id': ID, 'value': NO2pred_1_ens})

#Make sure the output directory exists so the final write cannot fail on a missing directory
Path(SUBMISSION_DIR).mkdir(parents=True, exist_ok=True)
df_ens.to_csv(SUBMISSION_DIR+'submission_sukantabasu.csv', index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
