In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
# Basics
import tensorflow as tf
import xarray as xr

# Helpful
import tqdm

# Visualization
import matplotlib.pyplot as plt

# My Methods
from src.utils.CRPS import *  # CRPS metrics
from src.utils.data_split import *  # Splitting data into X and y
from src.utils.drn_make_X_array import *  # Import make train array functions (make_X_array)
from src.models.EMOS import *  # EMOS implementation
from src.models.EMOS_global.EMOS_global_load_score import *  # Load EMOS_global_scores
from src.models.EMOS_global.EMOS_global_load_model import *  # Load EMOS_global_models
import data.raw.load_data_raw as ldr  # Load raw data
import data.processed.load_data_processed as ldp  # Load processed data normed
import data.processed.load_data_processed_denormed as ldpd  # Load processed data denormed
from src.models.CRPS_baseline.CRPS_load import *  # Load CRPS scores

<IPython.core.display.Javascript object>

### 0. Set up

In [4]:
# Forecast lead time this notebook works with (make_X_train_array documents the valid range as 0 - 30).
lead_time = 0

<IPython.core.display.Javascript object>

### 1. Load Data

In [5]:
# Load the processed, denormalized train and test data sets through the project loader module.
dat_train_denorm = ldpd.load_data_all_train_proc_denorm()
dat_test_denorm = ldpd.load_data_all_test_proc_denorm()

<IPython.core.display.Javascript object>

### 2. Data Split

In [6]:
# Split the training data into predictors (X) and targets (y).
# The X result is indexed as [variable][lead_time] further down; presumably y follows the same layout — TODO confirm.
dat_X_lead_all_denorm, dat_y_lead_all_denorm = split_var_lead(dat_train_denorm)

<IPython.core.display.Javascript object>

### 3. Data preparation

Idea: `X_train` is constructed by flattening the value arrays of the individual variables and concatenating them, so that in the end the array has the shape (n, n_variables). Keep in mind that we use both the mean and the std of each variable for training, so in total we will have 10 or 12 columns, depending on the variable. Important: for the DRN we also need the embeddings, hence the idea to keep a second flattened array of shape (n, 1) that holds the corresponding embedding ID (1 - 15600) for each row.

In [7]:
# Build the DRN feature array and the matching embedding IDs for lead time 0.
# NOTE(review): the lead time is hard-coded to 0 here although `lead_time` is defined in the set-up cell — consider passing it.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so both names may be undefined afterwards.
drn_X_train_0_array, drn_embe = make_X_array(dat_X_lead_all_denorm, 0)

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

In [30]:
def make_X_train_array(X_array_all_denorm, lead_time):
    """
    Build the training feature array for one lead time, plus the matching grid-point IDs.

    For each of the 6 variables and for both indices along the ``mean_std``
    dimension, the selected DataArray is flattened with
    ``flatten_with_grid_ids`` and stored as one column of the result.

    Args:
        X_array_all_denorm (nested_list): 6x31, X values for each variable and lead_time
        lead_time (int): the lead_time for which to construct the training array (0 - 30)

    Returns:
        result (array): float64 array of shape (n_samples, 12) with all X means and
            stds combined; the two ``mean_std`` columns of a variable are adjacent
        embedding (array): grid-point IDs in the same row order as ``result``
            (taken from the last flattened slice)
    """
    n_variables = 6  # number of variables expected in X_array_all_denorm
    n_statistics = 2  # number of entries selected along the "mean_std" dimension

    columns = []
    embedding = None
    for var in range(n_variables):
        for mean_std in range(n_statistics):
            # Read from the argument rather than a notebook-level global, so the
            # function works on whatever data set the caller passes in.
            column, embedding = flatten_with_grid_ids(
                X_array_all_denorm[var][lead_time].isel(mean_std=mean_std)
            )
            columns.append(column)

    # Pre-allocate (np.empty defaults to float64) and fill column by column;
    # this avoids holding a second stacked copy of the data in memory.
    result = np.empty((len(columns[0]), len(columns)))
    for i, column in enumerate(columns):
        result[:, i] = column

    return result, embedding

<IPython.core.display.Javascript object>

In [14]:
def flatten_with_grid_ids(da):
    """
    Flatten an xarray DataArray and generate corresponding grid point IDs.

    The array is first brought into (forecast_date, lat, lon) order, so the
    flattened values and the grid IDs line up regardless of the order in which
    the dimensions are stored. Grid IDs start at 1 and run row by row over the
    (lat, lon) grid; the same ID sequence is repeated for every forecast date.

    Args:
        da (xarray.DataArray): The DataArray to flatten. It must have exactly the
            dimensions "forecast_date", "lat" and "lon" (in any order).

    Returns:
        A tuple (flattened_values, grid_ids), where:
            - flattened_values (numpy.ndarray): A 1D array with all values from the DataArray.
            - grid_ids (numpy.ndarray): A 1D array with the corresponding grid point ID for each value.

    Raises:
        KeyError: If one of the three dimensions is missing from ``da.sizes``.
        Whatever ``da.transpose`` raises when ``da`` has additional dimensions.
    """
    n_dates = da.sizes["forecast_date"]
    n_lat = da.sizes["lat"]
    n_lon = da.sizes["lon"]

    # Enforce the dimension order the ID layout below assumes; this is a no-op
    # when the data is already stored as (forecast_date, lat, lon).
    ordered = da.transpose("forecast_date", "lat", "lon")
    flattened_values = ordered.values.flatten()

    # IDs 1..n_lat*n_lon for a single date, repeated once per forecast date.
    ids_single_date = np.arange(1, n_lat * n_lon + 1)
    grid_ids = np.tile(ids_single_date, n_dates)

    return flattened_values, grid_ids

<IPython.core.display.Javascript object>

In [15]:
# Quick check of the helper on a single slice: variable 0, lead time 0, index 0 along `mean_std`.
flattened_values, grid_ids = flatten_with_grid_ids(
    dat_X_lead_all_denorm[0][0].isel(mean_std=0)
)

<IPython.core.display.Javascript object>

In [18]:
# Display the flattened values (numpy abbreviates the printed array).
flattened_values

array([-0.463738 , -1.2450967, -2.1260755, ...,  1.8941275,  1.6559246,
        1.4866699], dtype=float32)

<IPython.core.display.Javascript object>

In [45]:
def build_emb_model(
    n_features,
    n_outputs,
    hidden_nodes,
    emb_size,
    max_id,
    compile=False,
    optimizer="adam",
    lr=0.01,
    loss=crps_cost_function,
    activation="relu",
    reg=None,
):
    """
    Build a feed-forward network that combines numeric features with an ID embedding.

    The model has two inputs: a feature vector of length ``n_features`` and a
    single integer ID. The ID is mapped to an ``emb_size``-dimensional
    embedding, concatenated with the features and passed through the hidden
    Dense layers, followed by a linear output layer.

    Args:
        n_features: Number of features
        n_outputs: Number of outputs
        hidden_nodes: int, or list/tuple of ints, with the width of each hidden layer
        emb_size: Embedding size
        max_id: Max embedding ID (the embedding table has ``max_id + 1`` rows)
        compile: If true, compile model
        optimizer: Name of the optimizer, looked up in ``keras.optimizers``
        lr: learning rate
        loss: loss function
        activation: Activation function for hidden layer
        reg: Kernel regularizer applied to every Dense layer (None for no regularization)

    Returns:
        model: Keras model
    """
    # Accept a single width as well as a list/tuple of widths.
    if not isinstance(hidden_nodes, (list, tuple)):
        hidden_nodes = [hidden_nodes]

    features_in = Input(shape=(n_features,))
    id_in = Input(shape=(1,))

    # One embedding row per ID in 0..max_id; flatten to drop the length-1 axis.
    emb = Embedding(max_id + 1, emb_size)(id_in)
    emb = Flatten()(emb)

    x = Concatenate()([features_in, emb])
    for h in hidden_nodes:
        x = Dense(h, activation=activation, kernel_regularizer=reg)(x)
    x = Dense(n_outputs, activation="linear", kernel_regularizer=reg)(x)
    model = Model(inputs=[features_in, id_in], outputs=x)

    if compile:
        optimizer_cls = keras.optimizers.__dict__[optimizer]
        try:
            # `learning_rate` is the current keyword; newer optimizers ignore or
            # reject the deprecated `lr` alias.
            opt = optimizer_cls(learning_rate=lr)
        except TypeError:
            # Older Keras releases only know the `lr` keyword.
            opt = optimizer_cls(lr=lr)
        model.compile(optimizer=opt, loss=loss)
    return model

<IPython.core.display.Javascript object>