In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import copy
from datetime import datetime, timedelta
from keras.utils import to_categorical
# import visualkeras
# import tensorflow as tf
from sklearn.metrics import balanced_accuracy_score
import optuna
from optuna.samplers import TPESampler
import keras
from keras.callbacks import ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import sys
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import glob 

sys.path.append("/glade/u/home/jhayron/WR_Predictability/3_MLModels/")
from model_builders_v2 import *
from sklearn import datasets, ensemble
from sklearn.model_selection import RandomizedSearchCV

2023-10-16 06:24:53.307045: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
  from .autonotebook import tqdm as notebook_tqdm


# Load outputs

In [2]:
week_out=0
week_out_str = f'week{week_out}'

wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',\
                index_col=0,names=['week0'],skiprows=1,parse_dates=True)
for wk in range(2,10):
    series_temp = copy.deepcopy(wr_series["week0"])
    series_temp.index = series_temp.index - timedelta(weeks = wk-1)
    series_temp.name = f'week{wk-1}'
    if wk==2:
        df_shifts = pd.concat([pd.DataFrame(wr_series["week0"]),pd.DataFrame(series_temp)],axis=1)  
    else:
        df_shifts = pd.concat([df_shifts,pd.DataFrame(series_temp)],axis=1)

# Load inputs

In [3]:
list_folders = np.sort(glob.glob('/glade/u/home/jhayron/WR_Predictability/4_PCA_Analysis/figures/*/'))

In [4]:
list_vars = [list_folders[i].split('/')[-2] for i in range(len(list_folders))]

In [5]:
list_vars

['IC_SODA',
 'IT_SODA',
 'MLD_SODA',
 'OHC100_SODA',
 'OHC200_SODA',
 'OHC300_SODA',
 'OHC50_SODA',
 'OHC700_SODA',
 'OLR_ERA5',
 'SD_ERA5',
 'SSH_SODA',
 'SST_OISSTv2',
 'SST_SODA',
 'STL_1m_ERA5',
 'STL_28cm_ERA5',
 'STL_7cm_ERA5',
 'STL_full_ERA5',
 'SWVL_1m_ERA5',
 'SWVL_28cm_ERA5',
 'SWVL_7cm_ERA5',
 'SWVL_full_ERA5',
 'U10_ERA5',
 'U200_ERA5',
 'Z500_ERA5',
 'Z500_ERA5_Region']

In [6]:
indices_atmosphere = np.array([8,21,22,23])

In [7]:
dic_inputs = {}
for ivar in indices_atmosphere:
    dic_inputs[list_vars[ivar]] = pd.read_csv(f'{list_folders[ivar]}PC_{list_vars[ivar]}.csv',index_col=0,parse_dates=True)

In [8]:
# Create an empty DataFrame to store the combined data
combined_df = pd.DataFrame()

# Loop through the dictionary and concatenate the dataframes
for key, data in dic_inputs.items():
    # Convert the dictionary for the current key into a DataFrame
    df = pd.DataFrame(data)
    # Concatenate the current DataFrame with the combined DataFrame
    combined_df = pd.concat([combined_df, df], axis=1)

In [9]:
dic_inputs

{'OLR_ERA5':                    0         1         2         3         4         5
 1981-01-05 -0.968588 -0.238542 -1.333144  1.923289 -2.598784  0.909056
 1981-01-08 -1.016249 -0.018750 -2.139704  1.658078 -2.874292  1.259667
 1981-01-12 -0.689263  0.246856 -2.087065  1.259903 -3.305622  1.411027
 1981-01-15 -0.179971  0.155327 -2.092652  0.474640 -2.836408  1.346053
 1981-01-19 -0.121285 -0.067485 -2.469213  0.299682 -2.099917  0.905846
 ...              ...       ...       ...       ...       ...       ...
 2020-11-12 -0.255296 -0.034393  0.695331  0.474000 -0.196351 -0.176301
 2020-11-16  0.733943 -0.508264 -0.100867  0.688417 -1.075825 -0.689831
 2020-11-19  1.059217 -0.894245  0.149965  0.902031 -1.813075 -0.881752
 2020-11-23 -0.179422 -1.250273  1.378278  1.053626 -1.705519 -0.587166
 2020-11-26 -0.730713 -1.386602  1.347865  1.329752 -1.176415  0.056545
 
 [4164 rows x 6 columns],
 'U10_ERA5':                    0         1         2         3         4         5
 1981-01-05 

# Train model

In [45]:
fully_combined_df = pd.concat([combined_df,df_shifts[week_out_str]],axis=1)
fully_combined_df = fully_combined_df.dropna()

X_train = fully_combined_df['1980':'2015'].iloc[:,:-1].values
y_train = fully_combined_df['1980':'2015'].iloc[:,-1].values

# X_val = fully_combined_df['2011':'2015'].iloc[:,:-1].values
# y_val = fully_combined_df['2011':'2015'].iloc[:,-1].values

X_test = fully_combined_df['2016':'2020'].iloc[:,:-1].values
y_test = fully_combined_df['2016':'2020'].iloc[:,-1].values

In [32]:
# clf = ensemble.GradientBoostingClassifier()
# clf.fit(X_train, y_train)

# acc_train = clf.score(X_train, y_train)
# acc_val = clf.score(X_val, y_val)
# acc_test = clf.score(X_test, y_test)
# print("Accuracy train: {:.4f}".format(acc_train))
# print("Accuracy val: {:.4f}".format(acc_val))
# print("Accuracy test: {:.4f}".format(acc_test))

# optimize parameters for week 3

In [33]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2', None],
    'random_state': [42],  # Set to a specific value for reproducibility
    'criterion': ['friedman_mse', 'squared_error'],  # Splitting criterion
    'min_impurity_decrease': [0.0, 0.1, 0.2],  # Minimum impurity decrease for split
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],  # Minimum weighted fraction for leaf
}

In [34]:
# Create the GradientBoostingClassifier
clf = ensemble.GradientBoostingClassifier()

# Create the Grid Search object
grid_search = RandomizedSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=18,n_iter=100,return_train_score=True)

# Fit the model with training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_clf = grid_search.best_estimator_
best_score = grid_search.best_score_

In [35]:
acc_train = best_clf.score(X_train, y_train)
# acc_val = best_clf.score(X_val, y_val)
acc_test = best_clf.score(X_test, y_test)
print("Accuracy train: {:.4f}".format(acc_train))
# print("Accuracy val: {:.4f}".format(acc_val))
print("Accuracy test: {:.4f}".format(acc_test))

Accuracy train: 0.5096
Accuracy test: 0.3867


# Iterate atmosphere, land, ocean

In [41]:
list_vars

['IC_SODA',
 'IT_SODA',
 'MLD_SODA',
 'OHC100_SODA',
 'OHC200_SODA',
 'OHC300_SODA',
 'OHC50_SODA',
 'OHC700_SODA',
 'OLR_ERA5',
 'SD_ERA5',
 'SSH_SODA',
 'SST_OISSTv2',
 'SST_SODA',
 'STL_1m_ERA5',
 'STL_28cm_ERA5',
 'STL_7cm_ERA5',
 'STL_full_ERA5',
 'SWVL_1m_ERA5',
 'SWVL_28cm_ERA5',
 'SWVL_7cm_ERA5',
 'SWVL_full_ERA5',
 'U10_ERA5',
 'U200_ERA5',
 'Z500_ERA5',
 'Z500_ERA5_Region']

In [42]:
indices_atmosphere = np.array([8,21,22,23])
indices_land = np.array([9,13,16,17,20])
indices_ocean = np.array([0,1,2,4,6,10,12])

In [56]:
for component in ['atm','ocn','lnd']:
    if component=='atm':
        dic_inputs = {}
        for ivar in indices_atmosphere:
            dic_inputs[list_vars[ivar]] = pd.read_csv(f'{list_folders[ivar]}PC_{list_vars[ivar]}.csv',index_col=0,parse_dates=True)
    elif component =='ocn':
        dic_inputs = {}
        for ivar in indices_ocean:
            dic_inputs[list_vars[ivar]] = pd.read_csv(f'{list_folders[ivar]}PC_{list_vars[ivar]}.csv',index_col=0,parse_dates=True)
    elif component =='lnd':
        dic_inputs = {}
        for ivar in indices_land:
            dic_inputs[list_vars[ivar]] = pd.read_csv(f'{list_folders[ivar]}PC_{list_vars[ivar]}.csv',index_col=0,parse_dates=True)
    print(dic_inputs.keys())
    # Create an empty DataFrame to store the combined data
    combined_df = pd.DataFrame()

    # Loop through the dictionary and concatenate the dataframes
    for key, data in dic_inputs.items():
        # Convert the dictionary for the current key into a DataFrame
        df = pd.DataFrame(data)
        # Concatenate the current DataFrame with the combined DataFrame
        combined_df = pd.concat([combined_df, df], axis=1)
    
    scores_test = []
    scores_val = []
    
    for week_out in np.arange(1,9):
        print(week_out)
        week_out_str = f'week{week_out}'
        fully_combined_df = pd.concat([combined_df,df_shifts[week_out_str]],axis=1)
        fully_combined_df = fully_combined_df.dropna()

        X_train = fully_combined_df['1980':'2015'].iloc[:,:-1].values
        y_train = fully_combined_df['1980':'2015'].iloc[:,-1].values

        X_test = fully_combined_df['2016':'2020'].iloc[:,:-1].values
        y_test = fully_combined_df['2016':'2020'].iloc[:,-1].values
        
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6, 7],
            'min_samples_split': [2, 3, 4, 5],
            'min_samples_leaf': [1, 2, 3, 4],
            'subsample': [0.8, 0.9, 1.0],
            'max_features': ['sqrt', 'log2', None],
            'random_state': [42],  # Set to a specific value for reproducibility
            'criterion': ['friedman_mse', 'squared_error'],  # Splitting criterion
            'min_impurity_decrease': [0.0, 0.1, 0.2],  # Minimum impurity decrease for split
            'min_weight_fraction_leaf': [0.0, 0.1, 0.2],  # Minimum weighted fraction for leaf
        }
        # Create the GradientBoostingClassifier
        clf = ensemble.GradientBoostingClassifier()

        # Create the Grid Search object
        grid_search = RandomizedSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=18,n_iter=100,return_train_score=True)

        # Fit the model with training data
        grid_search.fit(X_train, y_train)

        # Get the best parameters and best estimator
        best_params = grid_search.best_params_
        best_clf = grid_search.best_estimator_
        best_score = grid_search.best_score_
        
        scores_test.append(best_clf.score(X_test, y_test))
        scores_val.append(best_score)
        results = pd.DataFrame(np.array([best_clf.predict(X_test),y_test]).T,
                     index=fully_combined_df['2016':'2020'].index,
                     columns=['y_predicted','y_test'])
        
        np.save(f'results/{component}_scores_test',np.array(scores_test))
        np.save(f'results/{component}_scores_val',np.array(scores_val))
        print(best_score)
        print(scores_test[-1])
        results.to_csv(f'results/results_{component}_{week_out_str}.csv')

dict_keys(['OLR_ERA5', 'U10_ERA5', 'U200_ERA5', 'Z500_ERA5'])
1
0.30257556734066676
0.32941176470588235
2
0.24616494574892714
0.25984251968503935
3
0.24507355283623483
0.22134387351778656
4
0.2412405599385342
0.19444444444444445
5
0.23959485036448475
0.17729083665338646
6
0.24425463336019343
0.22
7
0.240142420778442
0.1746987951807229
8
0.24014204598691977
0.17540322580645162
dict_keys(['IC_SODA', 'IT_SODA', 'MLD_SODA', 'OHC200_SODA', 'OHC50_SODA', 'SSH_SODA', 'SST_SODA'])
1
0.23877368213931
0.1803921568627451
2
0.239320877761745
0.17716535433070865
3
0.23932087776174504
0.17786561264822134
4
0.23959485036448475
0.1765873015873016
5
0.23959485036448475
0.17729083665338646
6
0.23986844817570224
0.176
7
0.240142420778442
0.1746987951807229
8
0.24014204598691977
0.17540322580645162
dict_keys(['SD_ERA5', 'STL_1m_ERA5', 'STL_full_ERA5', 'SWVL_1m_ERA5', 'SWVL_full_ERA5'])
1
0.23877368213931
0.1803921568627451
2
0.239320877761745
0.17716535433070865
3
0.23932087776174504
0.17786561264822134
4