<h1><center> NASA Airathon - NO2 Track </center></h1>

### <center> Single Day Forecasting: LightGBM </center>

<div style="text-align: center"> 
    Dr. Sukanta Basu <br/> Associate Professor <br/> Delft University of Technology, The Netherlands <br/> Email: s.basu@tudelft.nl<br/> https://sites.google.com/view/sukantabasu/
</div>

#### Log

Last updated: 4th April, 2022

#### User instructions

Run this notebook from top to bottom. It ingests the testOBS.csv, testOMI.csv, and testGFS.csv files, loads the previously tuned LightGBM models from the model directory, and produces the final forecast (submission_sukantabasu.csv).

#### Load packages

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from pathlib import Path

from pickle import dump, load

import lightgbm as lgb

# Seed NumPy's global random number generator with a fixed value so that any
# NumPy-based randomness in this notebook is reproducible from run to run.
from numpy.random import seed
seed(20)

#### Directories

In [2]:
# Project root, given as a relative path; every other location is built from it.
ROOT_DIR = '../../'

# Processed single-day datasets
EXTDATA_DIR = '{}data/singleday/processed/'.format(ROOT_DIR)

# Saved (tuned) models
TUNING_DIR = '{}model/'.format(ROOT_DIR)

# Final submission file
SUBMISSION_DIR = '{}forecast/singleday/'.format(ROOT_DIR)

#### User input

In [3]:
# Number of trials of the halving random grid search; it is also part of the
# saved-model file names, so it must match the value used when tuning.
nTrial = 256

# Number of ensemble members (one saved model is loaded per member)
nEns = 100

#### Prepare test data

In [4]:
# --- Read the three test inputs -------------------------------------------
# Observation file: only the coordinates are used as features.
df_OBS        = pd.read_csv(EXTDATA_DIR + 'test/STN/' + 'testOBS.csv')
df_OBS_subset = df_OBS[['latitude','longitude']]

# OMI file: only the two NO2 columns are used.
df_OMI        = pd.read_csv(EXTDATA_DIR + 'test/OMI/' + 'testOMI.csv')
df_OMI_subset = df_OMI[['NO2_OMI','NO2Tr_OMI']]

# GFS file: supplies the calendar and meteorological features.
df_GFS = pd.read_csv(EXTDATA_DIR + 'test/GFS/' + 'testGFS.csv')

# pd.concat(axis=1) aligns rows on the index, so files with different row
# counts would silently yield NaN-padded rows instead of failing. Fail early.
if not (len(df_OBS) == len(df_OMI) == len(df_GFS)):
    raise ValueError(
        'Row count mismatch between test inputs: '
        'OBS=' + str(len(df_OBS)) + ', OMI=' + str(len(df_OMI)) + ', GFS=' + str(len(df_GFS))
    )

# --- Feature column lists (built once, shared by both feature sets) -------
COORD_COLS = ['latitude','longitude']
OMI_COLS   = ['NO2_OMI','NO2Tr_OMI']
TIME_COLS  = ['cosJDAY','sinJDAY','WDAY']

# Each GFS variable appears with the numeric suffixes 0, 3, ..., 21
# (presumably hours of the day -- confirm against the GFS preprocessing).
GFS_VARS     = ['PBLH','dT','SHFX','M10','M100','alpha','beta',
                'cosX100','sinX100','VENT','T2','RH']
GFS_SUFFIXES = range(0, 24, 3)

# Variable-major order: PBLH_0 ... PBLH_21, dT_0 ... dT_21, ..., RH_21.
# This reproduces the column order of the previously hand-written lists.
GFS_COLS = [var + '_' + str(sfx) for var in GFS_VARS for sfx in GFS_SUFFIXES]

# --- Feature set 1: no OMI columns -----------------------------------------
df_tst_1 = pd.concat([df_OBS_subset,df_GFS], axis=1)
df_tst_1 = df_tst_1[COORD_COLS + TIME_COLS + GFS_COLS]

# --- Feature set 2: with OMI columns ---------------------------------------
df_tst_2 = pd.concat([df_OBS_subset,df_OMI_subset,df_GFS], axis=1)

# Keep the raw OMI values as arrays; they are used later to detect rows
# where the OMI inputs are missing.
NO2_OMI   = df_tst_2['NO2_OMI'].values
NO2Tr_OMI = df_tst_2['NO2Tr_OMI'].values

df_tst_2 = df_tst_2[COORD_COLS + OMI_COLS + TIME_COLS + GFS_COLS]

In [5]:
# Inspect the feature table without the OMI columns (input to the trnOpt = 1 models)
df_tst_1

Unnamed: 0,latitude,longitude,cosJDAY,sinJDAY,WDAY,PBLH_0,PBLH_3,PBLH_6,PBLH_9,PBLH_12,...,T2_18,T2_21,RH_0,RH_3,RH_6,RH_9,RH_12,RH_15,RH_18,RH_21
0,33.66484,-117.327462,-0.605056,-0.796183,1,10.615895,10.65457,10.664719,144.277786,985.858521,...,305.985779,294.799988,80.300003,84.0,83.800003,57.700001,19.6,13.0,16.200001,45.0


In [6]:
# Inspect the feature table that includes the OMI columns (input to the trnOpt = 2 models)
df_tst_2

Unnamed: 0,latitude,longitude,NO2_OMI,NO2Tr_OMI,cosJDAY,sinJDAY,WDAY,PBLH_0,PBLH_3,PBLH_6,...,T2_18,T2_21,RH_0,RH_3,RH_6,RH_9,RH_12,RH_15,RH_18,RH_21
0,33.66484,-117.327462,6.747592,3.842983,-0.605056,-0.796183,1,10.615895,10.65457,10.664719,...,305.985779,294.799988,80.300003,84.0,83.800003,57.700001,19.6,13.0,16.200001,45.0


#### Ensemble prediction

In [7]:
def _predict_ensemble(trnOpt, features):
    """Predict with every saved ensemble member for one training option.

    Parameters
    ----------
    trnOpt : int
        Training option used in the saved-model file names.
    features : pandas.DataFrame
        Feature table passed to each model's ``predict``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(features), nEns)``; column ``n`` holds the
        predictions of ensemble member ``n``.
    """
    # Size the array from the input instead of assuming a single test row.
    member_preds = np.zeros((len(features), nEns))
    for n in range(nEns):
        model_path = (TUNING_DIR + 'ESLGBTuningFS_' + str(trnOpt) + '_'
                      + str(nTrial) + '_' + str(n) + '.pkl')

        # NOTE: unpickling can execute arbitrary code -- only load model
        # files produced by the trusted tuning step.
        # The context manager closes the file handle after each load.
        with open(model_path, "rb") as model_file:
            model = load(model_file)

        member_preds[:, n] = model.predict(features)
        print(n)  # progress indicator

    return member_preds


#Load the tuned models without OMI features
trnOpt        = 1
NO2pred_1_i   = _predict_ensemble(trnOpt, df_tst_1)
NO2pred_1_ens = np.median(NO2pred_1_i, axis=1)

#-----
#Load the tuned models with OMI features
trnOpt        = 2
NO2pred_2_i   = _predict_ensemble(trnOpt, df_tst_2)
NO2pred_2_ens = np.median(NO2pred_2_i, axis=1)

#-----
#Find where NO2 values are NOT missing. We should use trnOpt = 2 for these cases.
indx = np.where(~np.isnan(NO2_OMI) & ~np.isnan(NO2Tr_OMI))
NO2pred_1_ens[indx] = NO2pred_2_ens[indx]

#Negative predictions are replaced by zero
indx = np.where(NO2pred_1_ens < 0)
NO2pred_1_ens[indx] = 0

#Make sure the output directory exists before writing the submission file
Path(SUBMISSION_DIR).mkdir(parents=True, exist_ok=True)

df_ens = pd.DataFrame(data=NO2pred_1_ens, columns=['value'])
df_ens.to_csv(SUBMISSION_DIR + 'submission_sukantabasu.csv', index=False)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
