In [1]:
from tqdm.auto import tqdm 
import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nf
from netCDF4 import Dataset
%matplotlib inline
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import ast,gc
from copy import deepcopy

# Custom packages
import read_config
from util.data_process import read_vars, proc_dataset
from util.models import performance_scores,train_baseline,causal_settings,train_PC1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read configuration file
config_set = read_config.read_config()
# Define Target
if int(config_set['target_lag'])==4:
    target='DELV24'
elif int(config_set['target_lag'])==8:
    target='DELV48'
seed = 10

# Create Pandas DataFrame

In [3]:
# Process the filted TC list in the config file
TC_tofilt_list = ast.literal_eval(config_set['TCfilt'])
# Get the names of the remaining TCs
filt_TClist = read_vars.remove_storms(trackpath=config_set['track_path'],basinID='NA',yearmin=int(config_set['start_year']),yearmax=int(config_set['end_year']),
                                      remove_set=TC_tofilt_list)
# Read saved SHIPS csvs
storeSHIPS = read_vars.read_SHIPS_csv(startyear=int(config_set['start_year']),endyear=int(config_set['end_year']),vars_path=config_set['vars_path'],
                                      filted_TCnames=filt_TClist,suffixlist=['newships_dev_POT'])
# Read selected variables from the pandas dfs
SHIPS_df = read_vars.create_SHIPS_df(startyear=int(config_set['start_year']),endyear=int(config_set['end_year']),SHIPSdict=storeSHIPS,
                                     wantvarnames=config_set['SHIPSops_varname'],targetname=target,filted_TCnames=filt_TClist,
                                     lagnum=int(config_set['target_lag'])
                                    )
# Add derived variables stored separately
store_dfstorms_ships = read_vars.add_derive_df(startyear=int(config_set['start_year']),
                                     endyear=int(config_set['end_year']),
                                     SHIPSdict=SHIPS_df,
                                     addfilepath='/work/FAC/FGSE/IDYST/tbeucler/default/saranya/causal/SHIPS/ships_pkl/all_storms_ships23vars_obswmax.pkl',
                                     addvarname=['pc20'],
                                     filted_TCnames=filt_TClist,
                                     lagnum=int(config_set['target_lag'])
                                    )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 623.51it/s]
22it [00:00, 47.71it/s]
22it [00:00, 120.92it/s]
22it [00:00, 153.95it/s]


In [4]:
var_names=store_dfstorms_ships[2001]['ALLISON'].columns.values.tolist()

TC_fulllist = {}
for year in np.linspace(int(config_set['start_year']),int(config_set['end_year']),int(config_set['end_year'])-int(config_set['start_year'])+1):
    temp = store_dfstorms_ships[year]
    for ind,name in enumerate(temp.keys()):
        TC_fulllist[str(int(year))+'_'+name] = temp[name]

# ML-ready dataset

In [5]:
# Split data with a 0.15 test, 0.15 valid split
datastorer = proc_dataset.splitdata_handler(df=TC_fulllist,method='year',seed=seed,config=config_set,testyears=[2020,2021])

# Remove empty storms in the data
traincleaned = {key: datastorer['train'][key] for ind,key in enumerate(datastorer['train'].keys()) if datastorer['train'][key].shape[0]>0}
validcleaned = {key: datastorer['valid'][key] for ind,key in enumerate(datastorer['valid'].keys()) if datastorer['valid'][key].shape[0]>0}
testcleaned = {key: datastorer['test'][key] for ind,key in enumerate(datastorer['test'].keys()) if datastorer['test'][key].shape[0]>0}

# Replace original training data with the cleaned version
datastorer_n = deepcopy(datastorer)

# Replace
datastorer_n['train'] = traincleaned
datastorer_n['valid'] = validcleaned
datastorer_n['test'] = testcleaned

In [6]:
# Get smoothed MSLP data and argmin values
smoothed_MSLP, MSLP_argmin = proc_dataset.proc_data(df=datastorer_n,
                                                    seed=seed).smooth_and_minindices(varname='MSLP',sigma=3)
# Aligned the inputs with the minimum SLP data
aligned_train = proc_dataset.proc_data(df=datastorer_n,seed=seed).do_data_align(datastorer_n['train'],MSLP_argmin['train'],var_names)
aligned_valid = proc_dataset.proc_data(df=datastorer_n,seed=seed).do_data_align(datastorer_n['valid'],MSLP_argmin['valid'],var_names)
aligned_test = proc_dataset.proc_data(df=datastorer_n,seed=seed).do_data_align(datastorer_n['test'],MSLP_argmin['test'],var_names)

# Combine different TCs into a long dataset
X,y,size = proc_dataset.df_proc_separate(aligned_train,aligned_valid,aligned_test,target)

In [7]:
# Find the mean and std of the training set for normalization
trainmean,trainstd = X['train'].dropna().mean(axis=0),X['train'].dropna().std(axis=0)

# Data normalization
Xnorml = proc_dataset.normalized_TCs_handler(train=aligned_train,
                                             valid=aligned_valid,
                                             test=aligned_test,
                                             trainmean=trainmean,
                                             trainstd=trainstd,
                                             dropcol=[target],
                                             target=target
                                            )
var_names = Xnorml['train'][list(Xnorml['train'].keys())[0]].columns

# Causal

In [8]:
onlyships_lag = causal_settings.link_onlyships(numvar=aligned_train[list(aligned_train.keys())[0]].shape[1],
                                               lag=int(config_set['target_lag']),
                                               target_ind=[0])

In [42]:
for key in aligned_train.keys():
    plt.plot(np.ma.masked_less(aligned_train[key][:,2],800))
#plt.ylim(0,1010)
plt.show()

InvalidIndexError: (slice(None, None, None), 2)

In [9]:
results = []
for pc_alpha in tqdm([0.0001, 0.00015 ,0.001,0.0015,0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,
                      0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.86,0.88,0.9]):
    Xnorml_c = {'train': {ind: np.asarray(Xnorml['train'][key].replace(np.nan,-999.0)) for ind,key in enumerate(Xnorml['train'].keys())},
            'valid': {ind: np.asarray(Xnorml['valid'][key].replace(np.nan,-999.0)) for ind,key in enumerate(Xnorml['valid'].keys())},
            'test': {ind: np.asarray(Xnorml['test'][key].replace(np.nan,-999.0)) for ind,key in enumerate(Xnorml['test'].keys())}
           }
    result = train_PC1.Pipeline(Xnorml_c['train'],
                                pc_alpha,
                                pc_type='run_pcstable',
                                tau_min0=int(config_set['tau_min']),
                                tau_max0=int(config_set['tau_max']),
                                var_name=var_names,
                                link_assumptions=onlyships_lag).run_tigramite()
    del Xnorml_c
    gc.collect()
    results.append(result)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:48<00:00,  1.52s/it]


# Performance Skill

## No causally-informed feature selection

In [10]:
ytrain = np.concatenate([np.asarray(Xnorml['train'][key].dropna()[target]) for key in Xnorml['train'].keys()],axis=0)
Xtrain = np.concatenate([np.asarray(Xnorml['train'][key].dropna().drop(columns=[target])) for key in Xnorml['train'].keys()],axis=0)
yvalid = np.concatenate([np.asarray(Xnorml['valid'][key].dropna()[target]) for key in Xnorml['valid'].keys()],axis=0)
Xvalid = np.concatenate([np.asarray(Xnorml['valid'][key].dropna().drop(columns=[target])) for key in Xnorml['valid'].keys()],axis=0)
ytest = np.concatenate([np.asarray(Xnorml['test'][key].dropna()[target]) for key in Xnorml['test'].keys()],axis=0)
Xtest = np.concatenate([np.asarray(Xnorml['test'][key].dropna().drop(columns=[target])) for key in Xnorml['test'].keys()],axis=0)

In [11]:
Xnorml_nocausal = {'train': Xtrain, 'valid': Xvalid, 'test': Xtest}
y = {'train': ytrain, 'valid': yvalid, 'test': ytest}

In [12]:
regr = train_baseline.train_baseline_MLR(Xnorml_nocausal,y)

In [13]:
MLR_scoreboard = performance_scores.scoreboard(regr).store_scores(Xnorml_nocausal,y)
MLR_scoreboard['train']['r2'],MLR_scoreboard['valid']['r2'],MLR_scoreboard['test']['r2']

(0.39057866103395666, 0.32414988553072477, 0.3810791945004669)

## With causally-informed feature selection

In [14]:
def benchmark_causal(PC1_results=None,Xnorml=None):
    causal_predictor_list = [var_names[i] for i in [obj[0] for obj in PC1_results[0]]]
    while target in causal_predictor_list: 
        causal_predictor_list.remove(target)
        
    Xtrain_causal = np.concatenate([np.asarray(Xnorml['train'][key].dropna()[causal_predictor_list]) for key in Xnorml['train'].keys()],axis=0)
    Xvalid_causal = np.concatenate([np.asarray(Xnorml['valid'][key].dropna()[causal_predictor_list]) for key in Xnorml['valid'].keys()],axis=0)
    Xtest_causal = np.concatenate([np.asarray(Xnorml['test'][key].dropna()[causal_predictor_list]) for key in Xnorml['test'].keys()],axis=0)
    
    Xnorml_causal = {'train': Xtrain_causal, 'valid': Xvalid_causal, 'test': Xtest_causal}
    regr_causal = train_baseline.train_baseline_MLR(Xnorml_causal,y)
    return performance_scores.scoreboard(regr_causal).store_scores(Xnorml_causal,y),Xnorml_causal,regr_causal

In [15]:
scores,Xs,regr = [],[],[]
for obj in results:
    score,X,regrz = benchmark_causal(PC1_results=obj,Xnorml=Xnorml)
    scores.append(score)
    Xs.append(X)
    regr.append(regrz)

In [16]:
[obj['valid']['r2'] for obj in scores]

[0.1021555358079308,
 0.1021555358079308,
 0.1021555358079308,
 0.1021555358079308,
 0.2452320855925929,
 0.2452320855925929,
 0.24405800371330577,
 0.24379775741868182,
 0.24379775741868182,
 0.24379775741868182,
 0.2446481135565478,
 0.2709626786485583,
 0.2709626786485583,
 0.268247385815237,
 0.2853524218899478,
 0.28628801845663976,
 0.28628801845663976,
 0.28628801845663976,
 0.28628801845663976,
 0.28628801845663976,
 0.28628801845663976,
 0.28628801845663976,
 0.28863877427561013,
 0.28863877427561013,
 0.28863877427561013,
 0.28863877427561013,
 0.2744416432922989,
 0.28155526779540596,
 0.28448719302397096,
 0.28448719302397096,
 0.28448719302397096,
 0.292464581726671]

In [17]:
[obj['test']['r2'] for obj in scores]

[0.009977204607032775,
 0.009977204607032775,
 0.009977204607032775,
 0.009977204607032775,
 0.2716924929790153,
 0.2716924929790153,
 0.2750141812017797,
 0.27450444848794664,
 0.27450444848794664,
 0.27450444848794664,
 0.27774375521667305,
 0.27871858308850184,
 0.27871858308850184,
 0.28418523034269094,
 0.29524352448890256,
 0.2973222397860653,
 0.2973222397860653,
 0.2973222397860653,
 0.2973222397860653,
 0.2973222397860653,
 0.2973222397860653,
 0.2973222397860653,
 0.2944053005698549,
 0.2944053005698549,
 0.2944053005698549,
 0.2944053005698549,
 0.28052070333758905,
 0.2981138735772464,
 0.30615659889086966,
 0.30615659889086966,
 0.30615659889086966,
 0.3178415691993145]