In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import copy
from datetime import datetime, timedelta
from keras.utils import to_categorical
# import visualkeras
# import tensorflow as tf
from sklearn.metrics import balanced_accuracy_score
import optuna
from optuna.samplers import TPESampler
import keras
from keras.callbacks import ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import sys
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

2023-09-19 23:23:14.258544: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the project-local model-builder module importable from its absolute location.
sys.path.append("/glade/u/home/jhayron/WR_Predictability/3_MLModels/")
# NOTE(review): star import -- the names it provides are not visible from this notebook.
# Helper functions in this notebook reference `tf` although `import tensorflow as tf`
# is commented out in the import cell, so `tf` is presumably supplied by this module.
# TODO confirm and import the required names explicitly.
from model_builders_v2 import *

## Helper functions

In [3]:
# def create_tf_datasets(input_data, output_data):
#     # Convert xarray dataset to numpy array for TensorFlow Dataset
#     input_images = input_data.transpose('time', 'lat', 'lon','channel').values
#     output_one_hot = output_data.values

#     # Create TensorFlow Datasets
#     input_dataset = tf.data.Dataset.from_tensor_slices(input_images)
#     output_dataset = tf.data.Dataset.from_tensor_slices(output_one_hot)

#     # Combine input and output datasets into a joint dataset
#     joint_dataset = tf.data.Dataset.zip((input_dataset, output_dataset))

#     return joint_dataset
def create_tf_datasets(input_data, output_data):
    """Convert one data split into a pair of plain NumPy arrays.

    Despite its name, this function returns NumPy arrays, not a
    ``tf.data.Dataset``. Earlier code also built TensorFlow datasets here and
    then discarded them; that dead code was removed because it copied the
    whole split into tensors for nothing and required ``tf`` to be in scope.

    Parameters
    ----------
    input_data : xarray.DataArray-like
        Object with dimensions ``time``, ``lat``, ``lon`` and ``channel``
        exposing ``.transpose(...).values``.
    output_data : pandas.DataFrame-like
        Object exposing ``.values`` (the encoded targets).

    Returns
    -------
    tuple
        ``(input_images, output_one_hot)`` where ``input_images`` is the input
        reordered to ``(time, lat, lon, channel)`` and ``output_one_hot`` is
        ``output_data.values``.
    """
    # Force a channels-last layout regardless of the order stored in input_data.
    input_images = input_data.transpose('time', 'lat', 'lon', 'channel').values
    output_one_hot = output_data.values

    return (input_images, output_one_hot)

def create_datasets(input_anoms, var_name, df_shifts, week_out):
    """Build train/validation/test splits for a single variable and lead time.

    The selected variable is copied, its non-finite values are set to 0, a
    trailing singleton ``channel`` axis is added, and the result is aligned on
    common dates with the label column ``f'week{week_out}'`` of ``df_shifts``.
    Labels are one-hot encoded and everything is split by calendar period.

    Parameters
    ----------
    input_anoms : xarray.Dataset
        Dataset containing ``var_name`` with dimensions (time, lat, lon).
    var_name : str
        Name of the variable to extract from ``input_anoms``.
    df_shifts : pandas.DataFrame
        Holds one label column per lead time, named ``week<k>``.
    week_out : int
        Lead time selecting the label column.

    Returns
    -------
    tuple
        ``(train, val, test)``; each element is whatever ``create_tf_datasets``
        returns for that split (1980-2010, 2011-2015 and 2016-2020).
    """
    # Work on a private copy so the caller's dataset is left untouched.
    input_data = copy.deepcopy(input_anoms[var_name])

    # Replace NaN / inf with 0.
    array_temp = input_data.data
    array_temp[~np.isfinite(array_temp)] = 0
    input_data.data = array_temp

    # Append a trailing singleton 'channel' axis.
    values_reshaped = input_data.values.reshape(
        input_data.shape[0], input_data.shape[1], input_data.shape[2], 1
    )
    input_data = xr.DataArray(
        values_reshaped,
        coords=input_data.coords,
        dims=('time', 'lat', 'lon', 'channel'),
    )
    output_data = copy.deepcopy(df_shifts[f'week{week_out}']).dropna()

    # Keep only the dates present in both the inputs and the labels.
    common_dates = np.intersect1d(input_data['time'].values, output_data.index)
    input_data = input_data.sel(time=common_dates)
    output_data = output_data.loc[common_dates]

    # One-hot encode the class labels. The width is at least max label + 1, so a
    # class that happens to be absent from the aligned period neither breaks the
    # encoding nor shrinks it; it never drops below the number of distinct
    # labels, which was the previous width.
    if len(output_data):
        num_classes = max(output_data.nunique(), int(output_data.max()) + 1)
    else:
        num_classes = 0
    output_data_encoded = to_categorical(output_data, num_classes=num_classes)
    output_data_encoded = pd.DataFrame(output_data_encoded, index=output_data.index)

    # Boolean masks for the training, validation and testing periods.
    train_mask = (output_data.index >= '1980-01-01') & (output_data.index <= '2010-12-31')
    val_mask = (output_data.index >= '2011-01-01') & (output_data.index <= '2015-12-31')
    test_mask = (output_data.index >= '2016-01-01') & (output_data.index <= '2020-12-31')

    # Split inputs and encoded labels with the same masks so they stay aligned.
    input_train = input_data.sel(time=train_mask)
    input_val = input_data.sel(time=val_mask)
    input_test = input_data.sel(time=test_mask)

    output_train = output_data_encoded.loc[train_mask]
    output_val = output_data_encoded.loc[val_mask]
    output_test = output_data_encoded.loc[test_mask]

    train_joint_dataset = create_tf_datasets(input_train, output_train)
    val_joint_dataset = create_tf_datasets(input_val, output_val)
    test_joint_dataset = create_tf_datasets(input_test, output_test)

    return train_joint_dataset, val_joint_dataset, test_joint_dataset

def get_output_from_dataset(dataset):
    """Gather the second element of every pair yielded by ``dataset``.

    Parameters
    ----------
    dataset : object
        Anything exposing ``as_numpy_iterator()`` that yields
        ``(input, output)`` pairs.

    Returns
    -------
    numpy.ndarray
        The outputs stacked into one array, in iteration order.
    """
    # Inputs are discarded; only the outputs are collected.
    collected = [target for _, target in dataset.as_numpy_iterator()]
    return np.array(collected)

def balanced_accuracy(y_true, y_pred):
    """Balanced accuracy computed on the arg-max class of each row.

    Both arguments are reduced with ``tf.argmax`` along axis 1, and
    ``balanced_accuracy_score`` is then evaluated through ``tf.py_function``
    with a ``tf.float32`` output.
    """
    true_classes = tf.argmax(y_true, axis=1)
    predicted_classes = tf.argmax(y_pred, axis=1)
    class_indices = (true_classes, predicted_classes)
    return tf.py_function(balanced_accuracy_score, class_indices, tf.float32)

def logging_callback(study, frozen_trial):
    """Print a trial summary whenever the study's best value has changed.

    The last reported best value is kept in the study's user attribute
    ``"previous_best_value"``. Nothing is printed when it already equals
    ``study.best_value``.
    """
    last_reported = study.user_attrs.get("previous_best_value", None)
    if last_reported == study.best_value:
        return

    # Remember the new best value so it is only reported once.
    study.set_user_attr("previous_best_value", study.best_value)
    message = "Trial {} finished with best value: {} and parameters: {}. ".format(
        frozen_trial.number,
        frozen_trial.value,
        frozen_trial.params,
    )
    print(message)

In [4]:
def create_datasets_multichannel(input_data, df_shifts, week_out):
    """Split a multi-channel input array and its labels into train/val/test sets.

    Parameters
    ----------
    input_data : xarray.DataArray
        Predictor fields indexed by ``time``. It is handed to
        ``create_tf_datasets``, which transposes it to
        ``('time', 'lat', 'lon', 'channel')``.
    df_shifts : pandas.DataFrame
        Holds one label column per lead time, named ``week<k>``.
    week_out : int
        Lead time selecting the label column ``f'week{week_out}'``.

    Returns
    -------
    tuple
        ``(train, val, test)``; each element is whatever ``create_tf_datasets``
        returns for that split (1980-2010, 2011-2015 and 2016-2020).
    """
    # Labels for the requested lead time, without missing entries.
    output_data = copy.deepcopy(df_shifts[f'week{week_out}']).dropna()

    # Keep only the dates present in both the inputs and the labels.
    common_dates = np.intersect1d(input_data['time'].values, output_data.index)
    input_data = input_data.sel(time=common_dates)
    output_data = output_data.loc[common_dates]

    # One-hot encode the class labels. The width is at least max label + 1, so a
    # class that happens to be absent from the aligned period neither breaks the
    # encoding nor shrinks it; it never drops below the number of distinct
    # labels, which was the previous width.
    if len(output_data):
        num_classes = max(output_data.nunique(), int(output_data.max()) + 1)
    else:
        num_classes = 0
    output_data_encoded = to_categorical(output_data, num_classes=num_classes)
    output_data_encoded = pd.DataFrame(output_data_encoded, index=output_data.index)

    # Boolean masks for the training, validation and testing periods.
    train_mask = (output_data.index >= '1980-01-01') & (output_data.index <= '2010-12-31')
    val_mask = (output_data.index >= '2011-01-01') & (output_data.index <= '2015-12-31')
    test_mask = (output_data.index >= '2016-01-01') & (output_data.index <= '2020-12-31')

    # Split inputs and encoded labels with the same masks so they stay aligned.
    input_train = input_data.sel(time=train_mask)
    input_val = input_data.sel(time=val_mask)
    input_test = input_data.sel(time=test_mask)

    output_train = output_data_encoded.loc[train_mask]
    output_val = output_data_encoded.loc[val_mask]
    output_test = output_data_encoded.loc[test_mask]

    train_joint_dataset = create_tf_datasets(input_train, output_train)
    val_joint_dataset = create_tf_datasets(input_val, output_val)
    test_joint_dataset = create_tf_datasets(input_test, output_test)

    return train_joint_dataset, val_joint_dataset, test_joint_dataset

In [5]:
def normalize_data_with_scaling(data, start_year, end_year):
    """Min-max scale every grid point of every channel using a reference period.

    For each channel, the minimum and maximum over ``time`` are computed on the
    slice ``start_year-01-01`` .. ``end_year-12-31`` and applied to the whole
    record as ``(x - min) / (max - min)``. Values outside the reference period
    can therefore fall outside [0, 1]. Non-finite results (for example where
    ``max == min``) are set to 0.

    Parameters
    ----------
    data : xarray.DataArray
        Array with ``time`` and ``channel`` dimensions.
    start_year, end_year : int
        First and last year (inclusive) of the reference period.

    Returns
    -------
    xarray.DataArray
        A copy of ``data`` holding the scaled values; ``data`` is not modified.
    """
    normalized_data = data.copy()

    # Channels are processed one at a time to limit the size of temporaries.
    for channel in data.channel:
        data_channel = data.sel(channel=channel)
        reference_period = data_channel.sel(
            time=slice(f"{start_year}-01-01", f"{end_year}-12-31")
        )
        min_value = reference_period.min(dim='time')
        max_value = reference_period.max(dim='time')

        scaled_values = ((data_channel - min_value) / (max_value - min_value)).values
        # Division by a zero range (or NaN input) yields inf/NaN; map those to 0.
        scaled_values[~np.isfinite(scaled_values)] = 0
        normalized_data.loc[dict(channel=channel)] = scaled_values

    return normalized_data

IC_SODA.nc   OHC100_SODA.nc  OHC50_SODA.nc   SD_ERA5.nc      SST_SODA.nc       STL_7cm_ERA5.nc   SWVL_28cm_ERA5.nc  U10_ERA5.nc
IT_SODA.nc   OHC200_SODA.nc  OHC700_SODA.nc  SSH_SODA.nc     STL_1m_ERA5.nc    STL_full_ERA5.nc  SWVL_7cm_ERA5.nc   U200_ERA5.nc
MLD_SODA.nc  OHC300_SODA.nc  OLR_ERA5.nc     SST_OISSTv2.nc  STL_28cm_ERA5.nc  SWVL_1m_ERA5.nc   SWVL_full_ERA5.nc  Z500_ERA5.nc

# Atmosphere

In [6]:
## array_anoms
# Stack the weekly anomaly field of each atmospheric variable into one
# (time, lat, lon, channel) array.
names_vars = ['Z500_ERA5','OLR_ERA5','U200_ERA5','U10_ERA5']

path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_DetrendedStd/'
# path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_Std_withTrends/'

list_input_anoms = []

var_names = []
for name_var in names_vars:
    list_input_anoms.append(xr.open_dataset(f'{path_weekly_anoms}{name_var}.nc'))
    # Only the first data variable of each file is used.
    var_names.append(list(list_input_anoms[-1].data_vars.keys())[0])

# Take the array size from the first field instead of hard-coding a 240 x 720 grid;
# every other field must share this (time, lat, lon) shape.
n_time, n_lat, n_lon = list_input_anoms[0][var_names[0]].shape
full_input_array = np.zeros((n_time, n_lat, n_lon, len(list_input_anoms)))

for ichannel in range(len(list_input_anoms)):
    full_input_array[:,:,:,ichannel] = list_input_anoms[ichannel][var_names[ichannel]].values

# Non-finite values (NaN / inf) become 0.
full_input_array[~np.isfinite(full_input_array)] = 0

# The values now live in full_input_array, so release the file handles before
# dropping the dataset references.
while list_input_anoms:
    list_input_anoms.pop().close()
del(list_input_anoms)

In [7]:
## array_mean
# Stack the weekly-mean field of each atmospheric variable into one
# (time, lat, lon, channel) array.
names_vars = ['Z500_ERA5','OLR_ERA5','U200_ERA5','U10_ERA5']

path_weekly_mean = '/glade/scratch/jhayron/Data4Predictability/WeeklyDatasets/'
# path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_Std_withTrends/'

# These datasets are left open: a later cell reads their time/lat/lon coordinates.
list_input_mean = []

var_names = []
for name_var in names_vars:
    list_input_mean.append(xr.open_dataset(f'{path_weekly_mean}{name_var}.nc'))
    # Only the first data variable of each file is used.
    var_names.append(list(list_input_mean[-1].data_vars.keys())[0])

# Take the array size from the first field instead of hard-coding a 240 x 720 grid;
# every other field must share this (time, lat, lon) shape.
n_time, n_lat, n_lon = list_input_mean[0][var_names[0]].shape
full_input_array_mean = np.zeros((n_time, n_lat, n_lon, len(list_input_mean)))

for ichannel in range(len(list_input_mean)):
    full_input_array_mean[:,:,:,ichannel] = list_input_mean[ichannel][var_names[ichannel]].values

# Non-finite values (NaN / inf) become 0.
full_input_array_mean[~np.isfinite(full_input_array_mean)] = 0


In [8]:
# Join the anomaly channels and the weekly-mean channels along the last axis.
merged_array = np.concatenate(
    [full_input_array, full_input_array_mean],
    axis=-1,
)

In [9]:
# Both inputs were copied into merged_array; drop them to free memory.
del full_input_array, full_input_array_mean

In [10]:

# Wrap the stacked channels in a labelled DataArray. Coordinates come from the
# first weekly-mean dataset (assumes all stacked fields share them -- TODO confirm).
da_input = xr.DataArray(
    data=merged_array,
    dims=["time","lat", "lon","channel"],
    coords=dict(
        time=(["time"], list_input_mean[0][var_names[0]].time.values),
        lat=(["lat"], list_input_mean[0][var_names[0]].lat.values),
        lon=(["lon"], list_input_mean[0][var_names[0]].lon.values),
        channel=(["channel"], np.arange(merged_array.shape[-1])),
    )
)

# Coarsen the grid by averaging blocks of reduction_factor x reduction_factor cells.
reduction_factor = 3
coarsened_data = da_input.coarsen(lat=reduction_factor, lon=reduction_factor).mean()
# del(full_input_array)
# The coordinates were copied into da_input above, so close the source files
# before dropping the references to them.
while list_input_mean:
    list_input_mean.pop().close()
del(list_input_mean)
del(var_names)
del(da_input)

# Min-max scale each grid point and channel with the extremes of 1981-2015.
start_year = 1981
end_year = 2015
normalized_data = normalize_data_with_scaling(coarsened_data, start_year, end_year)

week_out=3
week_out_str = f'week{week_out}'

# Label series (presumably weather-regime classes, per the file name), one value per date.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',\
                index_col=0,names=['week0'],skiprows=1,parse_dates=True)

# Column 'week<k>' holds, at date t, the label recorded k weeks after t.
# The columns are collected first and concatenated once, rather than growing
# the DataFrame inside the loop.
shifted_columns = [wr_series["week0"]]
for wk in range(2,10):
    series_temp = wr_series["week0"].copy()
    series_temp.index = series_temp.index - timedelta(weeks = wk-1)
    series_temp.name = f'week{wk-1}'
    shifted_columns.append(series_temp)
df_shifts = pd.concat(shifted_columns, axis=1)
        
# train_joint_dataset, val_joint_dataset, test_joint_dataset = \
#     create_datasets_multichannel(normalized_data, df_shifts, week_out)


In [11]:
# Write the normalized atmospheric array to NetCDF.
normalized_data.to_netcdf(
    path='/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/atm_1p5deg.nc'
)

# Ocean

IC_SODA.nc OHC100_SODA.nc OHC50_SODA.nc SD_ERA5.nc SST_SODA.nc STL_7cm_ERA5.nc SWVL_28cm_ERA5.nc U10_ERA5.nc IT_SODA.nc OHC200_SODA.nc OHC700_SODA.nc SSH_SODA.nc STL_1m_ERA5.nc STL_full_ERA5.nc SWVL_7cm_ERA5.nc U200_ERA5.nc MLD_SODA.nc OHC300_SODA.nc OLR_ERA5.nc SST_OISSTv2.nc STL_28cm_ERA5.nc SWVL_1m_ERA5.nc SWVL_full_ERA5.nc Z500_ERA5.nc

In [12]:
## array_anoms
# Stack the weekly anomaly field of each ocean variable into one
# (time, lat, lon, channel) array.
names_vars = ['IT_SODA','OHC100_SODA','SST_SODA','MLD_SODA']

path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_DetrendedStd/'
# path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_Std_withTrends/'

list_input_anoms = []

var_names = []
for name_var in names_vars:
    list_input_anoms.append(xr.open_dataset(f'{path_weekly_anoms}{name_var}.nc'))
    # Only the first data variable of each file is used.
    var_names.append(list(list_input_anoms[-1].data_vars.keys())[0])

# Take the array size from the first field instead of hard-coding a 240 x 720 grid;
# every other field must share this (time, lat, lon) shape.
n_time, n_lat, n_lon = list_input_anoms[0][var_names[0]].shape
full_input_array = np.zeros((n_time, n_lat, n_lon, len(list_input_anoms)))

for ichannel in range(len(list_input_anoms)):
    full_input_array[:,:,:,ichannel] = list_input_anoms[ichannel][var_names[ichannel]].values

# Non-finite values (NaN / inf) become 0.
full_input_array[~np.isfinite(full_input_array)] = 0

# The values now live in full_input_array, so release the file handles before
# dropping the dataset references.
while list_input_anoms:
    list_input_anoms.pop().close()
del(list_input_anoms)

In [13]:
## array_mean
# Stack the weekly-mean field of each ocean variable into one
# (time, lat, lon, channel) array.
names_vars = ['IT_SODA','OHC100_SODA','SST_SODA','MLD_SODA']

path_weekly_mean = '/glade/scratch/jhayron/Data4Predictability/WeeklyDatasets/'
# path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_Std_withTrends/'

# These datasets are left open: a later cell reads their time/lat/lon coordinates.
list_input_mean = []

var_names = []
for name_var in names_vars:
    list_input_mean.append(xr.open_dataset(f'{path_weekly_mean}{name_var}.nc'))
    # Only the first data variable of each file is used.
    var_names.append(list(list_input_mean[-1].data_vars.keys())[0])

# Take the array size from the first field instead of hard-coding a 240 x 720 grid;
# every other field must share this (time, lat, lon) shape.
n_time, n_lat, n_lon = list_input_mean[0][var_names[0]].shape
full_input_array_mean = np.zeros((n_time, n_lat, n_lon, len(list_input_mean)))

for ichannel in range(len(list_input_mean)):
    full_input_array_mean[:,:,:,ichannel] = list_input_mean[ichannel][var_names[ichannel]].values

# Non-finite values (NaN / inf) become 0.
full_input_array_mean[~np.isfinite(full_input_array_mean)] = 0


In [14]:
# Join the anomaly channels and the weekly-mean channels along the last axis.
merged_array = np.concatenate(
    [full_input_array, full_input_array_mean],
    axis=-1,
)

In [15]:
# Both inputs were copied into merged_array; drop them to free memory.
del full_input_array, full_input_array_mean

In [16]:

# Wrap the stacked channels in a labelled DataArray. Coordinates come from the
# first weekly-mean dataset (assumes all stacked fields share them -- TODO confirm).
da_input = xr.DataArray(
    data=merged_array,
    dims=["time","lat", "lon","channel"],
    coords=dict(
        time=(["time"], list_input_mean[0][var_names[0]].time.values),
        lat=(["lat"], list_input_mean[0][var_names[0]].lat.values),
        lon=(["lon"], list_input_mean[0][var_names[0]].lon.values),
        channel=(["channel"], np.arange(merged_array.shape[-1])),
    )
)

# Coarsen the grid by averaging blocks of reduction_factor x reduction_factor cells.
reduction_factor = 3
coarsened_data = da_input.coarsen(lat=reduction_factor, lon=reduction_factor).mean()
# del(full_input_array)
# The coordinates were copied into da_input above, so close the source files
# before dropping the references to them.
while list_input_mean:
    list_input_mean.pop().close()
del(list_input_mean)
del(var_names)
del(da_input)

# Min-max scale each grid point and channel with the extremes of 1981-2015.
start_year = 1981
end_year = 2015
normalized_data = normalize_data_with_scaling(coarsened_data, start_year, end_year)

week_out=3
week_out_str = f'week{week_out}'

# Label series (presumably weather-regime classes, per the file name), one value per date.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',\
                index_col=0,names=['week0'],skiprows=1,parse_dates=True)

# Column 'week<k>' holds, at date t, the label recorded k weeks after t.
# The columns are collected first and concatenated once, rather than growing
# the DataFrame inside the loop.
shifted_columns = [wr_series["week0"]]
for wk in range(2,10):
    series_temp = wr_series["week0"].copy()
    series_temp.index = series_temp.index - timedelta(weeks = wk-1)
    series_temp.name = f'week{wk-1}'
    shifted_columns.append(series_temp)
df_shifts = pd.concat(shifted_columns, axis=1)
        
# train_joint_dataset, val_joint_dataset, test_joint_dataset = \
#     create_datasets_multichannel(normalized_data, df_shifts, week_out)


In [17]:
# Write the normalized ocean array to NetCDF.
normalized_data.to_netcdf(
    path='/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/ocn_1p5deg.nc'
)

# Land

IC_SODA.nc OHC100_SODA.nc OHC50_SODA.nc SD_ERA5.nc SST_SODA.nc STL_7cm_ERA5.nc SWVL_28cm_ERA5.nc U10_ERA5.nc IT_SODA.nc OHC200_SODA.nc OHC700_SODA.nc SSH_SODA.nc STL_1m_ERA5.nc STL_full_ERA5.nc SWVL_7cm_ERA5.nc U200_ERA5.nc MLD_SODA.nc OHC300_SODA.nc OLR_ERA5.nc SST_OISSTv2.nc STL_28cm_ERA5.nc SWVL_1m_ERA5.nc SWVL_full_ERA5.nc Z500_ERA5.nc

In [18]:
## array_anoms
# Stack the weekly anomaly field of each land variable into one
# (time, lat, lon, channel) array.
names_vars = ['SWVL_28cm_ERA5','SWVL_full_ERA5','STL_full_ERA5','SD_ERA5']

path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_DetrendedStd/'
# path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_Std_withTrends/'

list_input_anoms = []

var_names = []
for name_var in names_vars:
    list_input_anoms.append(xr.open_dataset(f'{path_weekly_anoms}{name_var}.nc'))
    # Only the first data variable of each file is used.
    var_names.append(list(list_input_anoms[-1].data_vars.keys())[0])

# Take the array size from the first field instead of hard-coding a 240 x 720 grid;
# every other field must share this (time, lat, lon) shape.
n_time, n_lat, n_lon = list_input_anoms[0][var_names[0]].shape
full_input_array = np.zeros((n_time, n_lat, n_lon, len(list_input_anoms)))

for ichannel in range(len(list_input_anoms)):
    full_input_array[:,:,:,ichannel] = list_input_anoms[ichannel][var_names[ichannel]].values

# Non-finite values (NaN / inf) become 0.
full_input_array[~np.isfinite(full_input_array)] = 0

# The values now live in full_input_array, so release the file handles before
# dropping the dataset references.
while list_input_anoms:
    list_input_anoms.pop().close()
del(list_input_anoms)

In [19]:
## array_mean
# Stack the weekly-mean field of each land variable into one
# (time, lat, lon, channel) array.
names_vars = ['SWVL_28cm_ERA5','SWVL_full_ERA5','STL_full_ERA5','SD_ERA5']

path_weekly_mean = '/glade/scratch/jhayron/Data4Predictability/WeeklyDatasets/'
# path_weekly_anoms = '/glade/scratch/jhayron/Data4Predictability/WeeklyAnoms_Std_withTrends/'

# These datasets are left open: a later cell reads their time/lat/lon coordinates.
list_input_mean = []

var_names = []
for name_var in names_vars:
    list_input_mean.append(xr.open_dataset(f'{path_weekly_mean}{name_var}.nc'))
    # Only the first data variable of each file is used.
    var_names.append(list(list_input_mean[-1].data_vars.keys())[0])

# Take the array size from the first field instead of hard-coding a 240 x 720 grid;
# every other field must share this (time, lat, lon) shape.
n_time, n_lat, n_lon = list_input_mean[0][var_names[0]].shape
full_input_array_mean = np.zeros((n_time, n_lat, n_lon, len(list_input_mean)))

for ichannel in range(len(list_input_mean)):
    full_input_array_mean[:,:,:,ichannel] = list_input_mean[ichannel][var_names[ichannel]].values

# Non-finite values (NaN / inf) become 0.
full_input_array_mean[~np.isfinite(full_input_array_mean)] = 0


In [20]:
# Join the anomaly channels and the weekly-mean channels along the last axis.
merged_array = np.concatenate(
    [full_input_array, full_input_array_mean],
    axis=-1,
)

In [21]:
# Both inputs were copied into merged_array; drop them to free memory.
del full_input_array, full_input_array_mean

In [22]:

# Wrap the stacked channels in a labelled DataArray. Coordinates come from the
# first weekly-mean dataset (assumes all stacked fields share them -- TODO confirm).
da_input = xr.DataArray(
    data=merged_array,
    dims=["time","lat", "lon","channel"],
    coords=dict(
        time=(["time"], list_input_mean[0][var_names[0]].time.values),
        lat=(["lat"], list_input_mean[0][var_names[0]].lat.values),
        lon=(["lon"], list_input_mean[0][var_names[0]].lon.values),
        channel=(["channel"], np.arange(merged_array.shape[-1])),
    )
)

# Coarsen the grid by averaging blocks of reduction_factor x reduction_factor cells.
reduction_factor = 3
coarsened_data = da_input.coarsen(lat=reduction_factor, lon=reduction_factor).mean()
# del(full_input_array)
# The coordinates were copied into da_input above, so close the source files
# before dropping the references to them.
while list_input_mean:
    list_input_mean.pop().close()
del(list_input_mean)
del(var_names)
del(da_input)

# Min-max scale each grid point and channel with the extremes of 1981-2015.
start_year = 1981
end_year = 2015
normalized_data = normalize_data_with_scaling(coarsened_data, start_year, end_year)

week_out=3
week_out_str = f'week{week_out}'

# Label series (presumably weather-regime classes, per the file name), one value per date.
wr_series = pd.read_csv('/glade/work/jhayron/Data4Predictability/WR_Series_20230824.csv',\
                index_col=0,names=['week0'],skiprows=1,parse_dates=True)

# Column 'week<k>' holds, at date t, the label recorded k weeks after t.
# The columns are collected first and concatenated once, rather than growing
# the DataFrame inside the loop.
shifted_columns = [wr_series["week0"]]
for wk in range(2,10):
    series_temp = wr_series["week0"].copy()
    series_temp.index = series_temp.index - timedelta(weeks = wk-1)
    series_temp.name = f'week{wk-1}'
    shifted_columns.append(series_temp)
df_shifts = pd.concat(shifted_columns, axis=1)
        
# train_joint_dataset, val_joint_dataset, test_joint_dataset = \
#     create_datasets_multichannel(normalized_data, df_shifts, week_out)


In [23]:
# Write the normalized land array to NetCDF.
normalized_data.to_netcdf(
    path='/glade/scratch/jhayron/Data4Predictability/FinalTrainingDataArrays/lnd_1p5deg.nc'
)