In [1]:
from tqdm.auto import tqdm 
import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nf
from netCDF4 import Dataset
%matplotlib inline
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import ast

In [114]:
# Custom packages
import read_config
from util.data_process import read_vars, proc_dataset
from util.models import performance_scores,train_baseline

In [3]:
# Read the run configuration (paths, year range, target lag, variable lists).
config_set = read_config.read_config()

# Map the configured target lag onto the name of the target column.
# Only a lag of 4 (-> 'DELV24') is handled here. Fail immediately for any
# other value instead of leaving `target` undefined, which would otherwise
# surface as a NameError (or a stale value from an earlier run) in a later cell.
target_lag = int(config_set['target_lag'])
if target_lag == 4:
    target = 'DELV24'
else:
    raise ValueError(
        f"Unsupported target_lag={target_lag!r}: only 4 (target 'DELV24') is handled."
    )

# Create Pandas DataFrame

In [4]:
# Parse the list of TCs to filter out; the config file stores it as a Python literal string
TC_tofilt_list = ast.literal_eval(config_set['TCfilt'])
# Names of the TCs that remain (basin 'NA', configured year range) once the filtered ones are removed
filt_TClist = read_vars.remove_storms(
    trackpath=config_set['track_path'],
    basinID='NA',
    yearmin=int(config_set['start_year']),
    yearmax=int(config_set['end_year']),
    remove_set=TC_tofilt_list,
)

  0%|          | 0/22 [00:00<?, ?it/s]

In [5]:
# Load the saved SHIPS csv files for the configured years, restricted to the retained TCs
storeSHIPS = read_vars.read_SHIPS_csv(
    startyear=int(config_set['start_year']),
    endyear=int(config_set['end_year']),
    vars_path=config_set['vars_path'],
    filted_TCnames=filt_TClist,
    suffixlist=['newships_dev_POT'],
)
# Pull the configured variables and the target out of the loaded pandas dfs
SHIPS_df = read_vars.create_SHIPS_df(
    startyear=int(config_set['start_year']),
    endyear=int(config_set['end_year']),
    SHIPSdict=storeSHIPS,
    wantvarnames=config_set['SHIPSops_varname'],
    targetname=target,
    filted_TCnames=filt_TClist,
    lagnum=int(config_set['target_lag']),
)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
# Add derived variables stored separately
start_year = int(config_set['start_year'])
end_year = int(config_set['end_year'])

store_dfstorms_ships = read_vars.add_derive_df(
    startyear=start_year,
    endyear=end_year,
    SHIPSdict=SHIPS_df,
    # NOTE(review): absolute, user-specific path; consider moving it into the config file.
    addfilepath='/work/FAC/FGSE/IDYST/tbeucler/default/saranya/causal/SHIPS/ships_pkl/all_storms_ships23vars_obswmax.pkl',
    addvarname=['pc20'],
    filted_TCnames=filt_TClist,
    lagnum=int(config_set['target_lag']),
)

# Column names of the per-storm tables, read off one reference storm.
# NOTE(review): the 2001 / 'ALLISON' lookup is hard-coded and will fail if that
# storm lies outside the configured years or has been filtered out.
var_names = store_dfstorms_ships[2001]['ALLISON'].columns.values.tolist()

# Flatten the nested {year: {storm name: table}} mapping into one dict keyed
# by '<year>_<storm name>'. Iterate over integer years with range() rather
# than float samples from np.linspace, so no float is used as a mapping key.
TC_fulllist = {}
for year in range(start_year, end_year + 1):
    for name, storm_df in store_dfstorms_ships[year].items():
        TC_fulllist[f'{year}_{name}'] = storm_df

0it [00:00, ?it/s]

# Create ML-ready dataset

In [118]:
# Split the storms into train/valid/test sets by year, passing 2020 and 2021 as the test years.
# NOTE(review): this cell was previously described as a "0.15 test, 0.15 valid split";
# confirm how the validation set is chosen (presumably from `config` and `seed`).
datastorer = proc_dataset.splitdata_handler(
    df=TC_fulllist,
    method='year',
    seed=41,
    config=config_set,
    testyears=[2020, 2021],
)
# Combine different TCs into a long dataset. The target column is the
# config-derived `target` (the same name passed to create_SHIPS_df) instead of
# a repeated 'DELV24' literal, so the two cannot drift apart.
X, y, size = proc_dataset.df_proc_separate(
    datastorer['train'],
    datastorer['valid'],
    datastorer['test'],
    target,
)

In [127]:
# Normalization statistics are computed on the training set only
trainmean = X['train'].mean(axis=0)
trainstd = X['train'].std(axis=0)
# Normalize every split with the training mean and std
Xnorml = {
    split: proc_dataset.normalize_data(X[split], trainmean, trainstd)
    for split in ('train', 'valid', 'test')
}

# Train MLR baselines and evaluate skill scores

In [131]:
# Train the MLR baseline. The helper receives the normalized predictors for all
# three splits (Xnorml) together with the targets (y); which split it fits on is
# decided inside util.models.train_baseline and is not visible here.
regr = train_baseline.train_baseline_MLR(Xnorml,y)

In [133]:
# Wrap the trained baseline in a scoreboard and store its scores for the
# normalized inputs and targets. The metrics and the structure of the result
# are defined in util.models.performance_scores, not in this notebook.
MLR_scoreboard = performance_scores.scoreboard(regr).store_scores(Xnorml,y)

In [121]:
# Development helper: reload util.data_process.proc_dataset so that edits to the
# module's source are picked up without restarting the kernel.
# NOTE(review): the execution counts show this cell was run before the
# normalization/training cells even though it sits last in the notebook; it
# should be unnecessary on a fresh Restart & Run All.
import importlib
importlib.reload(proc_dataset)

<module 'util.data_process.proc_dataset' from '/work/FAC/FGSE/IDYST/tbeucler/default/freddy0218/2024_causalML/util/data_process/proc_dataset.py'>