# NOTEBOOK 3A - Run Seasonal Forecasts.
### This script uses the selected features (from the best solution of the optimisation algorithm).
### ERA5 predictors are used to train ML models (e.g. Random Forest).
### Output: forecasts of the target (NDQ90) over the 1993-2020 period.

In [18]:
import numpy as np
from netCDF4 import Dataset
import glob
import matplotlib.pyplot as plt
import scipy.stats as sp
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import cos, asin, sqrt, pi

#import warnings
#warnings.filterwarnings('ignore')

import sys
sys.path.insert(1, '../Modules/')
from seasonal_forecasts import *

## STEP 1 - Choose ML model.
### Select your model (clf_) from: Linear Regression (LR), Support Vector (SVR), Decision Tree (DT), Random Forest (RF), K Nearest Neighbour (neigh), 
### AdaBoost (AB), Multi-Layer Perceptron (MLP), Light Gradient Boost (LGBM)

In [19]:
from ML_models_regressors import clf_LR, clf_SVR, clf_DT, clf_RF, clf_neigh, clf_AB, clf_MLP, clf_LGBM

# Lookup from short model label to the regressor object imported above.
# Selecting the model through this table keeps `clf` and the label used in the
# output file name in sync (previously they were two separate manual edits).
_MODELS = {
    "LR": clf_LR,
    "SVR": clf_SVR,
    "DT": clf_DT,
    "RF": clf_RF,
    "neigh": clf_neigh,
    "AB": clf_AB,
    "MLP": clf_MLP,
    "LGBM": clf_LGBM,
}

mod="LR" # change this label ("RF", "AB", etc...) to switch model; it also names the saved file
clf=_MODELS[mod] # raises KeyError if `mod` is not one of the labels above

# Destination CSV for the forecasts; the model label is embedded in the name.
output_file = f"Output/DDHWSF_Forecasts_{mod}_19792021.csv"

## STEP 2 - Extract Optimal Predictors.
### Solutions files for each grid point

In [20]:
# Paths of the optimisation solution file(s) matching the pattern, in sorted
# order so the processing order is reproducible. Empty list if nothing matches.
files = sorted(glob.iglob("Output/optimisation_output.csv"))

In [21]:
# One entry is appended to each list per non-empty solutions file.
nevals=[] # number of evaluations (rows in the solutions file)
cv_best=[] # best (lowest) cross-validation/training score
test_best=[] # test score corresponding to cv_best
sols_best=[] # predictors corresponding to cv_best

for file in files:
    # Solutions files are space-separated with a header row.
    sol_file_av = pd.read_csv(file, index_col=None, sep=' ', header=0)
    if sol_file_av.shape[0] == 0:
        print ("Empty file - no solutions")
        continue

    # Index label of the row with the lowest CV score. Computed once and
    # reused for every column lookup below instead of re-sorting each time.
    best_row = sol_file_av.sort_values(by=['CV'], ascending=True).index[0]

    # The solution is stored as text such as "[1. 0. 1. ...]" (possibly
    # wrapped over several lines); strip the brackets/newlines and parse the
    # space-separated numbers into a float array.
    sol_text = sol_file_av.Sol[best_row].replace('[', '').replace(']', '').replace('\n', '')

    nevals.append(sol_file_av.shape[0])
    sols_best.append(np.fromstring(sol_text, dtype=float, sep=' '))
    cv_best.append(sol_file_av.CV[best_row])
    test_best.append(sol_file_av.Test[best_row])


## STEP 3 - Open Target Data.
### Training on past2k (0-1850)
### Testing on ERA5

In [22]:
### Open HW target dataset ###

# Target series: the NumberHWDays column of the past2k file.
df1 = pd.read_csv("Output/NumberHWdays_past2k_Cluj-Napoca_thresh90_dur3.csv")
target_past2k=df1.NumberHWDays

# Target series: the NumberHWDays column of the ERA5 file.
df2 = pd.read_csv("Output/NumberHWdays_ERA5_Cluj-Napoca_thresh90_dur3.csv")
target_ERA5=df2.NumberHWDays#[1993-1993:2021-1940]

#===============================#

# Predictor tables; the first CSV column is used as the row index.
pred_dataframe_era5 = pd.read_csv('../DATA/Predictors_dataset_ERA5_weekly-smallsample.csv', index_col=0)

pred_dataframe_past2k = pd.read_csv('../DATA/Predictors_dataset_past2k_weekly-smallsample.csv', index_col=0)
# Stack the past2k rows first, followed by the ERA5 rows, in one table.
pred_dataframe=pd.concat([pred_dataframe_past2k,pred_dataframe_era5])

# Convert ERA5 predictor to past2k units
# Soil Moisture kg/m2 , ERA5 - m3/s3 (divide by 0.1m, divide by 1000 kg.m3, times by 0.7 = divide by 70)
# NOTE(review): the note above says "divide by 70" but the code multiplies by 70,
# and "m3/s3" is presumably "m3/m3" -- confirm the intended conversion.
#
# Rows with index label '1979-01-01' onwards are rescaled. The update is done
# with a single .loc assignment on the DataFrame itself: the previous chained
# form (df[col][rows] = ...) may write to a temporary copy and is a no-op when
# pandas Copy-on-Write is enabled.
sm_columns = [f'smEurope_cluster{i}' for i in range(1, 6)]
pred_dataframe.loc['1979-01-01':, sm_columns] = pred_dataframe.loc['1979-01-01':, sm_columns].values * 70

# SIC Arctic
# past2k - percentage , ERA5 - proportion 
# !!! Uncomment if using all predictors !!!
#sic_columns = [f'sicArctic_cluster{i}' for i in range(1, 6)]
#pred_dataframe.loc['1979-01-01':, sic_columns] = pred_dataframe.loc['1979-01-01':, sic_columns].values * 100

## STEP 4 - Run Forecast.

In [23]:
# Flag forwarded to forecast() below.
# NOTE(review): presumably controls removal of a CO2 predictor/trend -- confirm
# against the seasonal_forecasts module.
remove_co2=True

# Fail with a clear message when no solutions were loaded, instead of a bare
# "list index out of range" from sols_best[0].
if not sols_best:
    raise IndexError("No optimisation solutions were loaded - check the solutions file(s) in Output/")

# Run the forecast with the first best solution and the selected model.
# The flag is passed through so that changing `remove_co2` above actually
# takes effect (it was previously hard-coded to True in the call).
preds=forecast(target_past2k, target_ERA5, [1993,2020], sols_best[0], clf, pred_dataframe, remove_co2=remove_co2)
print (preds)
# preds is a (model name, predictions array) pair; only the array is saved.
saver(output_file,preds[1],target_ERA5.size)

('LinearRegression', array([ 8.15291924,  8.1031216 ,  6.19226069,  9.96107485,  9.05450493,
        9.59704052, 10.78968721,  8.79909978,  8.16939287,  9.35034391,
       14.31040951,  8.28833678,  9.89577317, 11.39757648, 10.45070499,
        8.46135872, 10.30436324, 10.40887242, 13.25679161,  9.71839182,
       11.50186403, 10.64323431, 11.46467606,  8.61131834, 11.48408093,
        8.0116867 , 15.08120479, 13.54607919]))
Saved predictions with metadata to Output/DDHWSF_Forecasts_LR_19792021.csv
