In [14]:
from tqdm.auto import tqdm 
import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nf
from netCDF4 import Dataset
%matplotlib inline
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import ast
from copy import deepcopy

# Custom packages
import read_config
from util.data_process import read_vars, proc_dataset
from util.models import performance_scores,train_baseline

In [12]:
# Read the run configuration through the project-local read_config module
config_set = read_config.read_config()

# Seed handed to the data splitting / processing helpers in later cells
seed = 100

# Define the prediction target from the configured lag.
# Only a lag of 4 is mapped to a target column here; fail fast for anything
# else instead of leaving `target` unbound and hitting a NameError in a later cell.
if int(config_set['target_lag']) == 4:
    target = 'delv24'
else:
    raise ValueError(
        f"Unsupported target_lag={config_set['target_lag']!r}: "
        "only a lag of 4 (target 'delv24') is handled in this notebook."
    )

In [3]:
# Parse the list of TCs to filter out; the config stores it as a Python-literal string
TC_tofilt_list = ast.literal_eval(config_set['TCfilt'])

# Names of the TCs that remain after removing the filtered set
filt_TClist = read_vars.remove_storms(
    trackpath=config_set['track_path'],
    basinID='NA',
    yearmin=int(config_set['start_year']),
    yearmax=int(config_set['end_year']),
    remove_set=TC_tofilt_list,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 663.65it/s]


In [4]:
# Parse the ERA5 variable names to drop; the config stores them as a Python-literal string
ERA5_dropvarname = ast.literal_eval(config_set['ERA5_dropvarname'])

# Load the saved ERA5 csv files for the retained storms over the configured year range
storeERA5 = read_vars.read_ERA5_csv(
    startyear=int(config_set['start_year']),
    endyear=int(config_set['end_year']),
    vars_path=config_set['vars_path'],
    filted_TCnames=filt_TClist,
    suffixlist=['obsw_dwmax', 'tigramite_6hr'],
    era5_dropvar=ERA5_dropvarname,
)

22it [00:02, 10.59it/s]


In [5]:
# Pass the ERA5 store through the TC-PRIMED reader for the same years and storms.
# NOTE(review): presumably this augments the ERA5 tables with TC-PRIMED fields at
# the configured levels - confirm against read_vars.read_TCPRIMED_df.
storeERA5_PRIMED = read_vars.read_TCPRIMED_df(
    startyear=int(config_set['start_year']),
    endyear=int(config_set['end_year']),
    ERA5dict=storeERA5,
    filted_TCnames=filt_TClist,
    PRIMEDpath=config_set['PRIMED_path'],
    PRIMEDlevels=ast.literal_eval(config_set['PRIMED_levels']),
)

22it [00:10,  2.01it/s]


In [6]:
# Build the final per-year, per-storm tables from the ERA5 + TC-PRIMED store,
# the requested ERA5SPS variables, and the target column at the configured lag.
# Indexed below as storeERA5_all[year][storm_name].
storeERA5_all = read_vars.create_ERA5_df(
    startyear=int(config_set['start_year']),
    endyear=int(config_set['end_year']),
    ERA5SPS_path=config_set['ERA5SPS_path'],
    ERA5SPS_suffix='all_storms_ships23vars_obswmax.pkl',
    ERA5dict=storeERA5_PRIMED,
    wantvarnames=ast.literal_eval(config_set['ERA5SPS_varname']),
    targetname=target,
    filted_TCnames=filt_TClist,
    lagnum=int(config_set['target_lag']),
)

22it [00:00, 61.65it/s]


In [8]:
# Column names, read from one reference storm.
# NOTE(review): this hard-codes year 2001 / storm 'ALLISON'; it raises KeyError if the
# configured year range or the TC filter excludes that storm, and it assumes every
# storm table shares these columns - confirm.
var_names = storeERA5_all[2001]['ALLISON'].columns.values.tolist()

# Flatten the {year: {storm_name: table}} mapping into a single dict keyed
# "<year>_<storm_name>". Years are iterated as plain ints (inclusive range), so
# no float year keys or int() round-trips are involved.
TC_fulllist = {}
for year in range(int(config_set['start_year']), int(config_set['end_year']) + 1):
    for name, storm_table in storeERA5_all[year].items():
        TC_fulllist[f'{year}_{name}'] = storm_table

In [15]:
#---------------------------------------------------------------------------------------------------------
# ML-ready dataset
#---------------------------------------------------------------------------------------------------------
# Split the storms into train / valid / test sets.
# The call requests a year-based split with 2020 and 2021 as the test years.
# NOTE(review): the original note here said "0.15 test, 0.15 valid split"; how the
# validation set is chosen is decided inside splitdata_handler / config - confirm.
datastorer = proc_dataset.splitdata_handler(
    df=TC_fulllist,
    method='year',
    seed=seed,
    config=config_set,
    testyears=[2020, 2021],
)

# Drop storms whose table has zero rows, one split at a time (train, valid, test)
traincleaned, validcleaned, testcleaned = (
    {
        storm: table
        for storm, table in datastorer[split].items()
        if table.shape[0] > 0
    }
    for split in ('train', 'valid', 'test')
)

# Work on a deep copy so the original split container stays untouched,
# then swap in the cleaned splits
datastorer_n = deepcopy(datastorer)
datastorer_n['train'] = traincleaned
datastorer_n['valid'] = validcleaned
datastorer_n['test'] = testcleaned

In [17]:
# Smooth the 'pmin' series (sigma=3) and get the argmin indices per split
# (per the original note: smoothed MSLP data and its minimum positions)
smoothed_MSLP, MSLP_argmin = (
    proc_dataset.proc_data(df=datastorer_n, seed=seed)
    .smooth_and_minindices(varname='pmin', sigma=3)
)

# Align the inputs of each split with its minimum-SLP indices.
# A fresh proc_data object is built for every split, in train -> valid -> test order.
aligned_train, aligned_valid, aligned_test = (
    proc_dataset.proc_data(df=datastorer_n, seed=seed).do_data_align(
        datastorer_n[split], MSLP_argmin[split], var_names
    )
    for split in ('train', 'valid', 'test')
)

# Concatenate the individual TCs into long datasets
X, y, size = proc_dataset.df_proc_separate(
    aligned_train, aligned_valid, aligned_test, target
)

# Normalization statistics come from the training set only (rows with NaN dropped)
trainmean = X['train'].dropna().mean(axis=0)
trainstd = X['train'].dropna().std(axis=0)

# Normalize every split with the training mean / std
Xnorml = proc_dataset.normalized_TCs_handler(
    train=aligned_train,
    valid=aligned_valid,
    test=aligned_test,
    trainmean=trainmean,
    trainstd=trainstd,
    dropcol=[target],
    target=target,
)

# Refresh var_names from the first normalized training storm.
# NOTE(review): this rebinds var_names, which is also an input to do_data_align
# above; re-running this cell on its own passes the rebound value instead.
var_names = Xnorml['train'][
    list(Xnorml['train'].keys())[0]
].columns