<h1><center> NASA Airathon - NO2 Track </center></h1>

### <center> Training: Light GBM </center>

<div style="text-align: center"> 
    Dr. Sukanta Basu <br/> Associate Professor <br/> Delft University of Technology, The Netherlands <br/> Email: s.basu@tudelft.nl<br/> https://sites.google.com/view/sukantabasu/
</div>

#### Log

Last updated: 4th April, 2022

#### User instructions

Run this notebook twice: first with `trnOpt = 1`, then with `trnOpt = 2` (set in the "User input" cell).

#### Load packages

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from pathlib import Path

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

import lightgbm as lgb

from pickle import dump, load
import time

#For reproducibility of the results, seed NumPy's global random number generator.
#The np.random.randint calls further down (model seed, validation start index)
#draw from this generator.
from numpy.random import seed
seed(20)

#### Directories

In [None]:
#Project root, given relative to the current working directory
ROOT_DIR = '../../'

#Processed datasets are read from this folder
EXTDATA_DIR = f'{ROOT_DIR}data/airathon/processed/'

#Tuned models are pickled into this folder
TUNING_DIR = f'{ROOT_DIR}model/'

#### User input

In [None]:
#Number of candidates sampled by the halving random search
nTrial = 256

#Number of ensemble members to train and save
nEns = 100

#Feature set used for training -- 1: OBS+GFS; 2: OBS+GFS+OMI
trnOpt = 2

#### Prepare training data

In [None]:
#Start the wall-clock timer; the elapsed time is printed after the coarse tuning
tini = time.time()

#Station file: only the coordinates are kept
df_OBS   = pd.read_csv(EXTDATA_DIR + 'train/STN/' + 'trainOBS.csv')
df_OBS_subset = df_OBS[['latitude','longitude']]

#OMI file: only the two NO2 columns are kept
df_OMI = pd.read_csv(EXTDATA_DIR + 'train/OMI/' + 'trainOMI.csv')
df_OMI_subset = df_OMI[['NO2_OMI','NO2Tr_OMI']]

#GFS file: supplies every remaining column selected below
df_GFS   = pd.read_csv(EXTDATA_DIR + 'train/GFS/' + 'trainGFS.csv')

#Columns common to both training options: cosJDAY/sinJDAY/WDAY followed by
#each variable with the suffixes _0, _3, ..., _21 (variable-major order)
gfsVars     = ['PBLH','dT','SHFX','M10','M100','alpha','beta',
               'cosX100','sinX100','VENT','T2','RH']
gfsSuffixes = range(0, 24, 3)
gfsCols     = ['cosJDAY','sinJDAY','WDAY'] + \
              [var + '_' + str(sfx) for var in gfsVars for sfx in gfsSuffixes]

if trnOpt == 1:
    #OBS+GFS
    frames  = [df_OBS_subset, df_GFS]
    omiCols = []
elif trnOpt == 2:
    #OBS+GFS+OMI
    frames  = [df_OBS_subset, df_OMI_subset, df_GFS]
    omiCols = ['NO2_OMI','NO2Tr_OMI']
else:
    #Fail here with a clear message instead of a NameError on df_trn below
    raise ValueError('trnOpt must be 1 (OBS+GFS) or 2 (OBS+GFS+OMI); got ' + repr(trnOpt))

#Side-by-side concatenation (rows are aligned on the index), then keep the
#predictors in a fixed order with the target 'NO2' as the last column
df_trn   = pd.concat(frames, axis=1)
df_trn   = df_trn[['latitude','longitude'] + omiCols + gfsCols + ['NO2']]

#Discard every row with at least one missing value
df_trn   = df_trn.dropna()

In [None]:
#Histogram of the target variable (100 bins)
no2 = df_trn['NO2']
plt.hist(no2, bins=100);
plt.show()

#Selected percentiles and the maximum of the target
p10, p50, p90, p95, p99 = (np.percentile(no2, q) for q in (10, 50, 90, 95, 99))
pmax = np.max(no2)
print((p10,p50,p90,p95,p99,pmax))

#Summary statistics of all columns (displayed as the cell output)
df_trn.describe()

#### Get rid of "rare" events

In [None]:
#Threshold: 99.9th percentile of the target
thres = np.percentile(df_trn['NO2'],99.9)

#Count the rows that are about to be removed. The filter below keeps only
#NO2 < thres, so rows equal to the threshold are removed too and must be
#counted (>=) for the printed number to match what is actually dropped.
indx = np.where(df_trn['NO2'] >= thres)
print(np.size(indx))

#NOTE: df_trn is overwritten, so re-running this cell trims the data again
df_trn = df_trn[df_trn['NO2'] < thres]

In [None]:
#The target is the last column of df_trn; everything before it is a predictor
nFeatures = df_trn.shape[1] - 1

XTrnVal = df_trn.iloc[:, :nFeatures]
yTrnVal = df_trn.iloc[:, nFeatures:]   #kept as a one-column DataFrame

XTrnVal.describe()

In [None]:
#Summary statistics of the target column (displayed as the cell output)
yTrnVal.describe()

#### Training

In [None]:
#Seed shared by the regressor and the search, drawn from NumPy's global RNG
randSeed = np.random.randint(1000)

#----------------------------
#Coarse hyperparameter tuning

#Search space sampled by the halving random search.
#NOTE(review): LightGBM documents that row subsampling is only active when
#subsample_freq > 0; it is left at its default here, so the 'subsample' axis
#may have no effect -- confirm against the installed LightGBM version.
params = {
    'num_leaves':       2**np.arange(2,10,1),
    'max_depth':        np.arange(1,11,1),
    'learning_rate':    np.array([0.005,0.01,0.025]),
    'reg_lambda':       np.arange(0,3.01,0.5),
    'reg_alpha':        np.arange(0,3.01,0.5),
    'subsample':        np.arange(0.1,1.01,0.1),
    'colsample_bytree': np.arange(0.1,1.01,0.1),
}

#min_child_samples=100 (LightGBM alias: min_data_in_leaf) is important to avoid
#overfitting. It must be passed to the constructor: a plain attribute
#assignment on the estimator is not registered as a parameter, so it is lost
#when the search clones the estimator and is never forwarded to LightGBM.
lgbReg = lgb.LGBMRegressor(objective='mse', random_state = randSeed, force_col_wise = True, metric='None',
                           first_metric_only=True, n_jobs=-1, min_child_samples=100)

#Successive halving over n_estimators: nTrial candidates start with 100 trees,
#the budget doubles each round (factor = 2) up to 1600 trees, 10-fold CV
randLgbReg = HalvingRandomSearchCV(lgbReg, params, resource = 'n_estimators', min_resources = 100,
                                   n_candidates = nTrial, max_resources = 1600, factor = 2,
                                   scoring = 'neg_mean_squared_error',
                                   cv = 10, return_train_score=True, random_state = randSeed, verbose = 10, n_jobs=1)

randLgbReg.fit(XTrnVal, yTrnVal)

#Mean CV scores of every evaluated candidate (negated MSE, as set by 'scoring')
trnScore = randLgbReg.cv_results_['mean_train_score']
valScore = randLgbReg.cv_results_['mean_test_score']

#Validation MSE of the selected candidate, with the sign flipped back to positive;
#it is reused as the acceptance threshold of the early-stopping stage
bestScore = -1*randLgbReg.best_score_
#Training score of the same candidate, looked up by its index in cv_results_
#rather than by floating-point equality on the validation score
bestTrnScore = trnScore[randLgbReg.best_index_]
globalBestScore = bestScore

et = time.time() - tini
print((et,trnScore,valScore,bestTrnScore,bestScore))

bestParam = randLgbReg.best_params_
print(bestParam)
lgbRegNew   = randLgbReg.best_estimator_

#Save the refitted best estimator; the context manager closes the file handle
with open(TUNING_DIR + 'CoarseLGBTuningFS_' + str(trnOpt) + '_' + str(nTrial) + '.pkl', 'wb') as fid:
    dump(lgbRegNew, fid)

#----------------------------
#Early stopping

#Refit the best model with early stopping. For every ensemble member a
#contiguous window holding 10% of the rows, starting at a random position,
#is set aside for validation and the remaining rows are used for training.
nSamples    = len(yTrnVal)
nSamplesVal = int(np.rint(0.1*nSamples))

#The best estimator is reused (same object, refitted in every iteration)
#with the best parameters; only the settings below are overridden.
lgbRegES = lgbRegNew
lgbRegES.n_jobs = -1

#Overwrite n_estimators to 10000 (upper bound; early stopping decides the actual number)
lgbRegES.n_estimators = 10000

sumTrnScore = 0
sumValScore = 0
count = 0
#At most 100000 attempts; the loop ends as soon as nEns members are accepted
for n in range(100000):
    tini = time.time()

    #Pick a random start location for the validation window; the upper bound
    #guarantees that the window [iVal, iVal+nSamplesVal) lies inside the data
    iVal        = np.random.randint(0,nSamples-nSamplesVal+1)
    iEnd        = iVal + nSamplesVal

    print((nSamples,nSamplesVal,iVal))

    #Validation window: exactly nSamplesVal rows
    XVal        = XTrnVal.iloc[iVal:iEnd,:]
    yVal        = yTrnVal.iloc[iVal:iEnd,:]

    #Training data: everything before and after the validation window
    XTrn        = pd.concat([XTrnVal.iloc[0:iVal,:], XTrnVal.iloc[iEnd:,:]], axis=0)
    yTrn        = pd.concat([yTrnVal.iloc[0:iVal,:], yTrnVal.iloc[iEnd:,:]], axis=0)

    eval_set = [(XTrn, yTrn),(XVal, yVal)]

    #NOTE(review): early_stopping_rounds/verbose as fit() keyword arguments were
    #removed in LightGBM 4.0 (replaced by callbacks); this call targets the 3.x API
    lgbRegES.fit(XTrn, yTrn, early_stopping_rounds=25, eval_metric="l2", eval_set=eval_set, verbose=-1)

    #l2 (MSE) on the training and validation parts at the best iteration
    trnScore    = lgbRegES.best_score_["training"]["l2"]
    valScore    = lgbRegES.best_score_["valid_1"]["l2"]

    et          = time.time() - tini

    #Accept the member only if both scores beat the coarse-tuning CV score
    if (trnScore < globalBestScore) and (valScore < globalBestScore):

        #Save the accepted member; the context manager closes the file handle
        with open(TUNING_DIR + 'ESLGBTuningFS_' + str(trnOpt) + '_' + str(nTrial) + '_' + str(count) + '.pkl', 'wb') as fid:
            dump(lgbRegES, fid)

        count = count + 1
        sumTrnScore = sumTrnScore + trnScore
        sumValScore = sumValScore + valScore
        #Running averages over the accepted members
        print((count,et,sumTrnScore/count,sumValScore/count))
        if count == nEns:
            break
