In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import copy
from datetime import datetime, timedelta
from keras.utils import to_categorical
# import visualkeras
# import tensorflow as tf
from sklearn.metrics import balanced_accuracy_score
import optuna
from optuna.samplers import TPESampler
import keras
from keras.callbacks import ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import sys
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

2023-09-13 14:25:30.923803: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the project's model-builder module importable from its absolute location on disk.
sys.path.append("/glade/u/home/jhayron/WR_Predictability/3_MLModels/")
# NOTE(review): the star import hides where names come from. Later cells use `tf` and
# `build_predesigned_model` without importing them explicitly (the `tensorflow` import
# in the first cell is commented out), so they presumably arrive through this line —
# confirm and consider importing them by name.
from model_builders_v2 import *

## Helper functions

In [3]:

def create_tf_datasets(input_data, output_data):
    """Convert an input/target pair into plain NumPy arrays.

    Despite its name, this function returns NumPy arrays, not a
    ``tf.data.Dataset``. Callers index the result as ``(inputs, targets)``.

    Parameters
    ----------
    input_data : xarray.DataArray
        Array with dimensions ``time``, ``lat``, ``lon`` and ``channel`` (in any
        order); it is transposed to exactly that order before conversion.
    output_data : pandas.DataFrame
        Targets, one row per sample; only ``.values`` is read.

    Returns
    -------
    tuple
        ``(input_images, output_one_hot)`` where ``input_images`` is ordered
        ``(time, lat, lon, channel)`` and ``output_one_hot`` is
        ``output_data.values``.
    """
    # Fix the axis order so downstream models always see (time, lat, lon, channel).
    input_images = input_data.transpose('time', 'lat', 'lon', 'channel').values
    output_one_hot = output_data.values

    return (input_images, output_one_hot)

def create_datasets(input_anoms, var_name, df_shifts, week_out):
    """Build train/validation/test array pairs for one variable and one lead time.

    The variable ``var_name`` is copied out of ``input_anoms``, its non-finite
    values are set to zero, and a trailing singleton ``channel`` axis is added.
    The label column ``week{week_out}`` of ``df_shifts`` is then aligned with the
    inputs on their shared dates, one-hot encoded, and both are cut into three
    fixed calendar periods: 1980-2010 (training), 2011-2015 (validation) and
    2016-2020 (testing).

    Parameters
    ----------
    input_anoms : xarray.Dataset
        Must contain ``var_name`` as a three-dimensional variable with a ``time``
        coordinate.
    var_name : str
        Name of the variable to extract.
    df_shifts : pandas.DataFrame
        Date-indexed frame holding a ``week{week_out}`` column of labels.
    week_out : int
        Lead time used to pick the label column.

    Returns
    -------
    tuple
        ``(train, val, test)``, each being the pair returned by
        ``create_tf_datasets`` for that period.

    Notes
    -----
    The one-hot width equals the number of distinct labels present after
    alignment, so labels are presumably expected to be ``0..K-1`` with none
    missing — TODO confirm.
    """
    # Work on a private copy so the fill below does not touch the caller's data.
    field = copy.deepcopy(input_anoms[var_name])

    # Replace every non-finite entry (NaN, +/-inf) with zero.
    raw_values = field.data
    raw_values[~np.isfinite(raw_values)] = 0
    field.data = raw_values

    # Append a singleton trailing axis so the array carries an explicit channel dimension.
    n_time, n_lat, n_lon = field.shape[0], field.shape[1], field.shape[2]
    field = xr.DataArray(
        field.values.reshape(n_time, n_lat, n_lon, 1),
        coords=field.coords,
        dims=('time', 'lat', 'lon', 'channel'),
    )

    # Labels for the requested lead time, without missing entries.
    targets = copy.deepcopy(df_shifts[f'week{week_out}']).dropna()

    # Keep only the dates present in both the inputs and the labels.
    shared_dates = np.intersect1d(field['time'].values, targets.index)
    field = field.sel(time=shared_dates)
    targets = targets.loc[shared_dates]

    # One-hot encode; the width is the number of distinct labels observed.
    n_labels = len(targets.unique())
    targets_one_hot = pd.DataFrame(
        to_categorical(targets, num_classes=n_labels),
        index=targets.index,
    )

    # Boolean masks for the three fixed calendar periods.
    dates = targets.index
    in_train = (dates >= '1980-01-01') & (dates <= '2010-12-31')
    in_val = (dates >= '2011-01-01') & (dates <= '2015-12-31')
    in_test = (dates >= '2016-01-01') & (dates <= '2020-12-31')

    train_pair = create_tf_datasets(field.sel(time=in_train), targets_one_hot.loc[in_train])
    val_pair = create_tf_datasets(field.sel(time=in_val), targets_one_hot.loc[in_val])
    test_pair = create_tf_datasets(field.sel(time=in_test), targets_one_hot.loc[in_test])

    return train_pair, val_pair, test_pair

def get_output_from_dataset(dataset):
    """Collect the target element of every ``(input, target)`` pair in ``dataset``.

    Parameters
    ----------
    dataset : object exposing ``as_numpy_iterator()``
        Must yield two-element ``(input, target)`` items.

    Returns
    -------
    numpy.ndarray
        The targets stacked in iteration order.
    """
    targets = [target for _, target in dataset.as_numpy_iterator()]
    return np.array(targets)

def balanced_accuracy(y_true, y_pred):
    """Keras-compatible metric wrapping scikit-learn's ``balanced_accuracy_score``.

    Both arguments are reduced to class indices with ``argmax`` along axis 1, and
    the scikit-learn function is then called through ``tf.py_function`` with a
    ``tf.float32`` output type.

    Parameters
    ----------
    y_true : tensor
        One row of per-class scores per sample (one-hot targets in this notebook).
    y_pred : tensor
        One row of per-class predicted scores per sample.

    Returns
    -------
    tensor
        Result of ``tf.py_function`` with output type ``tf.float32``.
    """
    true_labels = tf.argmax(y_true, axis=1)
    predicted_labels = tf.argmax(y_pred, axis=1)
    return tf.py_function(
        balanced_accuracy_score,
        (true_labels, predicted_labels),
        tf.float32,
    )

def logging_callback(study, frozen_trial):
    """Study callback that prints a line only when the study's best value changes.

    The last reported best value is kept in the study's user attribute
    ``"previous_best_value"``. When it differs from ``study.best_value`` the
    attribute is updated and the trial's number, value and parameters are printed.

    Parameters
    ----------
    study : object
        Must expose ``user_attrs`` (mapping), ``best_value`` and
        ``set_user_attr(key, value)``.
    frozen_trial : object
        Must expose ``number``, ``value`` and ``params``.
    """
    last_reported = study.user_attrs.get("previous_best_value", None)
    if last_reported == study.best_value:
        # Best value unchanged since the last report: stay silent.
        return

    study.set_user_attr("previous_best_value", study.best_value)
    message = "Trial {} finished with best value: {} and parameters: {}. ".format(
        frozen_trial.number,
        frozen_trial.value,
        frozen_trial.params,
    )
    print(message)

In [4]:
def create_datasets_multichannel(input_data, df_shifts, week_out):
    """Split a multi-channel input array and its lead-time labels into train/val/test sets.

    The label column ``week{week_out}`` of ``df_shifts`` is aligned with
    ``input_data`` on their shared dates, one-hot encoded, and both are cut into
    three fixed calendar periods: 1980-2010 (training), 2011-2015 (validation)
    and 2016-2020 (testing).

    Parameters
    ----------
    input_data : xarray.DataArray
        Inputs with a ``time`` coordinate and the dimensions required by
        ``create_tf_datasets``.
    df_shifts : pandas.DataFrame
        Date-indexed frame holding a ``week{week_out}`` column of labels.
    week_out : int
        Lead time used to pick the label column.

    Returns
    -------
    tuple
        ``(train, val, test, train_time, val_time, test_time)``. The first three
        are the pairs returned by ``create_tf_datasets``; the last three are the
        ``time`` coordinates of the matching input subsets.

    Notes
    -----
    The one-hot width equals the number of distinct labels present after
    alignment, so labels are presumably expected to be ``0..K-1`` with none
    missing — TODO confirm.
    """
    # Labels for the requested lead time, without missing entries.
    targets = copy.deepcopy(df_shifts[f'week{week_out}']).dropna()

    # Keep only the dates present in both the inputs and the labels.
    shared_dates = np.intersect1d(input_data['time'].values, targets.index)
    input_data = input_data.sel(time=shared_dates)
    targets = targets.loc[shared_dates]

    # One-hot encode; the width is the number of distinct labels observed.
    n_labels = len(targets.unique())
    targets_one_hot = pd.DataFrame(
        to_categorical(targets, num_classes=n_labels),
        index=targets.index,
    )

    # Boolean masks for the three fixed calendar periods.
    dates = targets.index
    in_train = (dates >= '1980-01-01') & (dates <= '2010-12-31')
    in_val = (dates >= '2011-01-01') & (dates <= '2015-12-31')
    in_test = (dates >= '2016-01-01') & (dates <= '2020-12-31')

    inputs_train = input_data.sel(time=in_train)
    inputs_val = input_data.sel(time=in_val)
    inputs_test = input_data.sel(time=in_test)

    train_pair = create_tf_datasets(inputs_train, targets_one_hot.loc[in_train])
    val_pair = create_tf_datasets(inputs_val, targets_one_hot.loc[in_val])
    test_pair = create_tf_datasets(inputs_test, targets_one_hot.loc[in_test])

    return (
        train_pair,
        val_pair,
        test_pair,
        inputs_train.time,
        inputs_val.time,
        inputs_test.time,
    )

In [5]:
def normalize_data_with_scaling(data, start_year, end_year):
    """Min-max scale every channel of ``data`` using a reference period.

    For each channel, the minimum and maximum over ``time`` are taken from
    ``start_year``-01-01 through ``end_year``-12-31 and applied to the whole
    record as ``(x - min) / (max - min)``. Values outside the reference period
    can therefore fall outside ``[0, 1]``.

    Parameters
    ----------
    data : xarray.DataArray
        Array with ``time`` and ``channel`` dimensions.
    start_year, end_year : int or str
        First and last year (inclusive) of the reference period.

    Returns
    -------
    xarray.DataArray
        A copy of ``data`` holding the scaled values; ``data`` itself is not
        modified.
    """
    normalized_data = data.copy()

    for channel in data.channel:
        data_channel = data.sel(channel=channel)
        reference_period = data_channel.sel(
            time=slice(f"{start_year}-01-01", f"{end_year}-12-31")
        )
        min_value = reference_period.min(dim='time')
        max_value = reference_period.max(dim='time')

        scaled = ((data_channel - min_value) / (max_value - min_value)).values
        # Points that are constant over the reference period divide by zero, and
        # missing inputs stay NaN; both end up as 0 in the output.
        scaled[~np.isfinite(scaled)] = 0
        normalized_data.loc[dict(channel=channel)] = scaled

    return normalized_data

In [6]:
# NOTE(review): this import sits apart from the import cell at the top of the notebook.
import warnings
# Silence every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

## GLOBAL SEED ##
# Seed NumPy's and TensorFlow's global random generators.
# NOTE(review): `tf` is not imported explicitly in the visible cells; it presumably
# comes from `from model_builders_v2 import *` — confirm.
np.random.seed(42)
tf.random.set_seed(42)

# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


2023-09-13 14:25:36.984291: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-09-13 14:25:36.985762: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-09-13 14:25:37.027277: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:88:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-09-13 14:25:37.027320: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-09-13 14:25:37.181293: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-09-13 14:25:37.181379: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.10
2

# Training

# olr

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [7]:
# Data family for this section; interpolated into the model-directory and input-file
# paths built in the following cells.
name_var = 'atm'

In [8]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}_olr/'
os.makedirs(path_models, exist_ok=True)

In [9]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [10]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [11]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [12]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # Only the test inputs and their timestamps are used for prediction; the training
    # arrays are needed just to read the input shape the network expects.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data.sel(channel=[1,5]), df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/atm_olr_{week_out_str}.csv')


2023-09-13 14:26:40.680109: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-13 14:26:40.680294: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-09-13 14:26:40.681393: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:88:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-09-13 14:26:40.681475: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-09-13 14:26:40.681521: I tensorflow/stream_executor/platfor

# swvl

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [19]:
# Data family for this section; interpolated into the model-directory and input-file
# paths built in the following cells.
name_var = 'lnd'

In [20]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}_swvl/'
os.makedirs(path_models, exist_ok=True)

In [21]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [22]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [23]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [24]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # Only the test inputs and their timestamps are used for prediction; the training
    # arrays are needed just to read the input shape the network expects.
    # NOTE(review): channels [1,5] are the same indices used in the olr section —
    # confirm they are the intended channels of the `lnd` file.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data.sel(channel=[1,5]), df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/lnd_swvl_{week_out_str}.csv')


# sst

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [25]:
# Data family for this section; interpolated into the model-directory and input-file
# paths built in the following cells.
name_var = 'ocn'

In [26]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}_sst/'
os.makedirs(path_models, exist_ok=True)

In [27]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [28]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [29]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [None]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # Only the test inputs and their timestamps are used for prediction; the training
    # arrays are needed just to read the input shape the network expects.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data.sel(channel=[2,6]), df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/ocn_sst_{week_out_str}.csv')


# z_only

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [7]:
# Data family for this section; interpolated into the model-directory and input-file
# paths built in the following cells.
name_var = 'atm'

In [8]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}_z/'
os.makedirs(path_models, exist_ok=True)

In [9]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [10]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [11]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [24]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # Only the test inputs and their timestamps are used for prediction; the training
    # arrays are needed just to read the input shape the network expects.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data.sel(channel=[0,4]), df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/atm_z_only_{week_out_str}.csv')


# atm

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [25]:
# Data family for this section; interpolated into the model-directory, input-file and
# result-file paths built in the following cells.
name_var = 'atm'

In [26]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}/'
os.makedirs(path_models, exist_ok=True)

In [27]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [28]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [29]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [None]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # All channels of the file are used here. Only the test inputs and their timestamps
    # are used for prediction; the training arrays are needed just to read the input
    # shape the network expects.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data, df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/{name_var}_{week_out_str}.csv')


# land

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [None]:
# Data family for this section; interpolated into the model-directory, input-file and
# result-file paths built in the following cells.
name_var = 'lnd'

In [None]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}/'
os.makedirs(path_models, exist_ok=True)

In [None]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [None]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [None]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [36]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # All channels of the file are used here. Only the test inputs and their timestamps
    # are used for prediction; the training arrays are needed just to read the input
    # shape the network expects.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data, df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/{name_var}_{week_out_str}.csv')


# ocean

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.ncm

In [37]:
# Data family for this section; interpolated into the model-directory, input-file and
# result-file paths built in the following cells.
name_var = 'ocn'

In [38]:
# Directory holding the trained weights for this experiment. `exist_ok=True` makes the
# cell safe to re-run, while real failures (e.g. permissions) are no longer hidden.
path_models = f'/glade/work/jhayron/Data4Predictability/models/CNN_Sep13_2023/test_{name_var}/'
os.makedirs(path_models, exist_ok=True)

In [39]:
# Open the NetCDF file for the selected data family as an xarray Dataset.
normalized_data = xr.open_dataset(f'/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/{name_var}.nc')

In [40]:
# Unwrap the single variable stored in the file. The guard makes this cell idempotent:
# re-running it after `normalized_data` is already a DataArray is a no-op rather than
# an AttributeError.
if isinstance(normalized_data, xr.Dataset):
    normalized_data = normalized_data['__xarray_dataarray_variable__']

In [41]:
# Date-indexed label series; the file's own header row is skipped and the single data
# column is named `week0`.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',
                        index_col=0, names=['week0'], skiprows=1, parse_dates=True)

# Column `week{k}` at date d holds the `week0` label found k weeks after d, obtained by
# moving the index back k weeks. All columns are collected first and joined once on the
# union of dates instead of growing the frame inside the loop.
shifted_columns = [wr_series["week0"]]
for lead_weeks in range(1, 9):
    shifted = wr_series["week0"].copy()
    shifted.index = shifted.index - timedelta(weeks=lead_weeks)
    shifted.name = f'week{lead_weeks}'
    shifted_columns.append(shifted)
df_shifts = pd.concat(shifted_columns, axis=1)

In [42]:
# Predictions are written to a relative `results/` directory; make sure it exists.
os.makedirs('results', exist_ok=True)

for week_out in range(0,9):
    week_out_str = f'week{week_out}'

    # All channels of the file are used here. Only the test inputs and their timestamps
    # are used for prediction; the training arrays are needed just to read the input
    # shape the network expects.
    train_joint_dataset, val_joint_dataset, test_joint_dataset, _, _, test_index  = \
        create_datasets_multichannel(normalized_data, df_shifts, week_out)

    # Must describe the same architecture as the saved weights loaded below.
    dict_params = {'model_base':'densenet',
                   'type_pooling':'avg',
                   'do':0.3,
                   'md':16,
                   'activation':'LeakyReLU',
                   'weighted_loss':True,
                   'bs':16,
                   'lr':0.001,
                   'input_shape':train_joint_dataset[0].shape[1:]}
    model = build_predesigned_model(dict_params['model_base'],
                                    dict_params['type_pooling'],
                                    dict_params['do'],
                                    dict_params['md'],
                                    dict_params['activation'],
                                    dict_params['input_shape'])
    # `learning_rate` replaces the deprecated `lr` keyword of the Adam optimizer.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(learning_rate=dict_params['lr']),
                  metrics=[balanced_accuracy,'accuracy'])

    filepath = f'{path_models}{name_var}/model_{week_out_str}_v0.h5'

    model.load_weights(filepath)
    predictions_prob = model.predict(test_joint_dataset[0])
    # One row per test date, one column per model output.
    df_results = pd.DataFrame(predictions_prob,index=test_index)
    df_results.to_csv(f'results/{name_var}_{week_out_str}.csv')
