In [1]:
from tqdm.auto import tqdm 
import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nf
from netCDF4 import Dataset
%matplotlib inline
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import ast

In [2]:
# Custom packages
import read_config
from util.data_process import read_vars, proc_dataset

In [3]:
# Read configuration file
config_set = read_config.read_config()
# Define Target
if int(config_set['target_lag'])==4:
    target='DELV24'

# Create Pandas DataFrame

In [4]:
# Process the filted TC list in the config file
TC_tofilt_list = ast.literal_eval(config_set['TCfilt'])
# Get the names of the remaining TCs
filt_TClist = read_vars.remove_storms(trackpath=config_set['track_path'],basinID='NA',yearmin=int(config_set['start_year']),yearmax=int(config_set['end_year']),remove_set=TC_tofilt_list)

  0%|          | 0/22 [00:00<?, ?it/s]

In [5]:
# Read saved SHIPS csvs
storeSHIPS = read_vars.read_SHIPS_csv(startyear=int(config_set['start_year']),endyear=int(config_set['end_year']),vars_path=config_set['vars_path'],
                                      filted_TCnames=filt_TClist,suffixlist=['newships_dev_POT'])
# Read selected variables from the pandas dfs
SHIPS_df = read_vars.create_SHIPS_df(startyear=int(config_set['start_year']),endyear=int(config_set['end_year']),SHIPSdict=storeSHIPS,
                                     wantvarnames=config_set['SHIPSops_varname'],targetname=target,filted_TCnames=filt_TClist,
                                     lagnum=int(config_set['target_lag'])
                                    )

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
# Add derived variables stored separately
store_dfstorms_ships = read_vars.add_derive_df(startyear=int(config_set['start_year']),
                                     endyear=int(config_set['end_year']),
                                     SHIPSdict=SHIPS_df,
                                     addfilepath='/work/FAC/FGSE/IDYST/tbeucler/default/saranya/causal/SHIPS/ships_pkl/all_storms_ships23vars_obswmax.pkl',
                                     addvarname=['pc20'],
                                     filted_TCnames=filt_TClist,
                                     lagnum=int(config_set['target_lag'])
                                    )

var_names=store_dfstorms_ships[2001]['ALLISON'].columns.values.tolist()

TC_fulllist = {}
for year in np.linspace(int(config_set['start_year']),int(config_set['end_year']),int(config_set['end_year'])-int(config_set['start_year'])+1):
    temp = store_dfstorms_ships[year]
    for ind,name in enumerate(temp.keys()):
        TC_fulllist[str(int(year))+'_'+name] = temp[name]

0it [00:00, ?it/s]

# Create ML-ready dataset

In [46]:
# Split data with a 0.15 test, 0.15 valid split
datastorer = proc_dataset.splitdata_handler(df=TC_fulllist,method='year',seed=40,config=config_set,testyears=[2020,2021])
# Combine different TCs into a long dataset
X,y,size = proc_dataset.df_proc_separate(datastorer['train'],datastorer['valid'],datastorer['test'],'DELV24')

In [79]:
from sklearn.linear_model import LinearRegression,Lasso,ElasticNet

In [86]:
regr = LinearRegression()
regr.fit(X['train'],y['train'])

LinearRegression()

In [110]:
scoreboard(regr).store_scores(X,y)

{'train': {'pred': array([12.90912713,  9.44948454, 15.10632819, ..., 15.24289035,
          1.69313479, -9.07919447]),
  'r2': 0.298019844881509,
  'RMSE': 14.770239868936553,
  'MAE': 10.769055662293773},
 'valid': {'pred': array([ 1.82793953e+01,  1.48913432e+01,  1.60049397e+01,  2.10385766e+01,
          2.03887730e+01,  1.71682970e+01,  1.53308861e+01,  1.67820081e+01,
          1.95858478e+01,  1.49501433e+01,  5.31063204e+00,  3.61417371e+00,
         -2.78867915e+00, -4.13730534e+00, -3.75241544e+00, -4.48507363e+00,
         -3.66036415e+00, -2.91024519e+00, -1.03087425e+00, -4.87654302e+00,
         -4.97725321e+00,  2.62250562e-03,  6.97372431e+00,  6.31399271e+00,
          3.87455087e+00,  2.50180624e+00,  3.01920392e+00, -2.25560063e-01,
          2.47317187e+00, -3.31608060e+00, -9.90199012e-01, -2.81434165e+00,
         -3.26013595e+00,  1.57055391e+01,  1.36749508e+01,  1.64633889e+01,
          1.84603820e+01,  2.26588777e+01,  2.04320753e+01,  1.88659278e+01,
      

In [96]:
import importlib
importlib.reload(proc_dataset)

<module 'util.data_process.proc_dataset' from '/work/FAC/FGSE/IDYST/tbeucler/default/freddy0218/2024_causalML/util/data_process/proc_dataset.py'>