In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from pathlib import Path

from cnn_training import generate_climprob_inputs, ModelRegistry, construct_climdev_cnn, DEFAULT_FIT, DEFAULT_COMPILE, DEFAULT_CONSTRUCT

2022-09-09 08:22:33.199966: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-09 08:22:33.200000: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Experiment configuration.
# experiment_name is the filename prefix of the .npy / .h5 files that are read from scratchdir.
experiment_name = 'trial5_ensmean_something'
scratchdir = Path('/scratch/')

# NOTE(review): the settings below are not referenced in the visible cells;
# presumably they describe how the stored inputs were generated - confirm.
varlist = ['tcw']
ensmean = True
season = None
# If single value then the number of degrees for square latlon patch,
# otherwise supply tuple of ((latmin,latmax),(lonmin,lonmax))
patchsize = 40

In [3]:
# Preparation of input and target data
# Inputs: (nsamples,nlat,nlon,nchannels), if ensmean then channels is just the number of variables, nlat & nlon depend on patch size
train_inputs = np.load(scratchdir / f'{experiment_name}.training_inputs.npy')
# Targets: spatial average 4-week rainfall classified into terciles, one hot encoded (low, mid, high)
train_target = np.load(scratchdir / f'{experiment_name}.training_terciles.npy')
train_timestamps = pd.read_hdf(scratchdir / f'{experiment_name}.training_timestamps.h5')

# These are the forecasts that have been kept separate
test_inputs = np.load(scratchdir / f'{experiment_name}.testing_inputs.npy')
test_target = np.load(scratchdir / f'{experiment_name}.testing_terciles.npy')
test_timestamps = pd.read_hdf(scratchdir / f'{experiment_name}.testing_timestamps.h5')

# Stacking along valid_time/sample dimension (axis 0); train samples come first, test samples last
full_inputs = np.concatenate([train_inputs, test_inputs], axis = 0)
full_target = np.concatenate([train_target, test_target], axis = 0)
full_timestamps = pd.concat([train_timestamps, test_timestamps])

# Now we want to prepare input to the second branch of the neural network (the base probabilities for each of the classes)
n_classes = full_target.shape[-1]
# The climatological class frequencies are estimated from the training targets only.
# Using full_target here would let the held-out test outcomes leak into a model input,
# and would disagree with the train_target.mean(axis = 0) that this notebook exports
# as climprobs.npy further down. The frequencies are still applied to every sample in full_inputs.
full_clim_logprobs = generate_climprob_inputs(full_inputs, climprobs = train_target.mean(axis = 0))

In [4]:
# Now is the moment to choose your hyperparameters.
# They are handed over as keyword arguments to the structure that will
# initialize and contain all models.
registry_kwargs = dict(
    xdata = [full_inputs, full_clim_logprobs],  # two inputs: the patches and the base class probabilities
    ydata = full_target,
    timestamps = full_timestamps.index,
    compile_kwargs = DEFAULT_COMPILE,
    construct_kwargs = DEFAULT_CONSTRUCT,
    fit_kwargs = DEFAULT_FIT,
)
registry = ModelRegistry(**registry_kwargs)

In [5]:
# Select test and validation years, run model
test_year = 2020
validation_years = [2005, 2010, 2015]

# Year of every sample, aligned with the sample axis of the full_* arrays
sample_years = full_timestamps.index.year
years = sample_years.unique()
remaining_years = years.drop(test_year).sort_values() # sort to make sure that the next leaving out is blockwise
training_years = years.drop(test_year).drop(validation_years)
print('test', test_year)
print('validation', validation_years)
print('training', training_years)

# From boolean mask to numeric (positional) index.
# Index.isin is the vectorized membership test and replaces a per-element Python lambda.
test_indices = np.where(sample_years == test_year)[0]
val_indices = np.where(sample_years.isin(validation_years))[0]
train_indices = np.where(sample_years.isin(training_years))[0]

# Register an untrained model for this split, then fit it
modelindex = registry.initialize_untrained_model(train_indices = train_indices, val_indices = val_indices, test_indices = test_indices)
print(modelindex)
registry.train_model(modelindex = modelindex)

test 2020
validation [2005, 2010, 2015]
training Int64Index([2000, 2001, 2002, 2003, 2004, 2006, 2007, 2008, 2009, 2011, 2012,
            2013, 2014, 2016, 2017, 2018, 2019, 2021],
           dtype='int64', name='valid_time')
0
Epoch 1/10


2022-09-09 08:24:35.901848: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-09-09 08:24:35.901882: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-09-09 08:24:35.901903: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyternoteboo): /proc/driver/nvidia/version does not exist
2022-09-09 08:24:35.902078: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# Positions (not dates) of the test samples that are exported below: 7, 11 and 15
test_date_indices = range(7,17,4)
# test_timestamps is indexed by date, so integer positions must go through .iloc;
# plain [] with integers relies on a positional fallback that pandas has deprecated.
test_timestamps.iloc[list(test_date_indices)]

valid_time
2020-03-06   2020-03-06
2020-04-03   2020-04-03
2020-05-01   2020-05-01
dtype: datetime64[ns]

In [17]:
# Export the selected test samples (input patch and one-hot target), one pair of .npy files per date
for n_sample in test_date_indices:
    # Positional lookup: n_sample is a row number, whereas the Series is indexed by date
    sample_date = test_timestamps.iloc[n_sample]
    yyyy = sample_date.year
    mm = sample_date.month
    dd = sample_date.day
    # np.save does not create missing directories, so make sure the target exists
    outdir = Path(f'dianna/test_{yyyy}')
    outdir.mkdir(parents = True, exist_ok = True)
    np.save(outdir / f'test_inputs_{mm}-{dd}.npy', test_inputs[n_sample])
    np.save(outdir / f'test_target_{mm}-{dd}.npy', test_target[n_sample])

In [16]:
# Save the last entry of the registry next to the exported test samples.
# NOTE(review): `yyyy` is whatever the export loop over test_date_indices left behind,
# so this cell only works after that loop has run - confirm the intended year.
pretrained_dir = f'dianna/test_{yyyy}/pre-trained'
registry.registry[-1].save(pretrained_dir)



INFO:tensorflow:Assets written to: dianna/test_2020/pre-trained/assets


INFO:tensorflow:Assets written to: dianna/test_2020/pre-trained/assets


In [20]:
# Class frequencies of the one-hot training targets (mean over the sample axis),
# stored alongside the exported test samples.
# NOTE(review): `yyyy` is the leftover value from the export loop over test_date_indices.
climprobs_file = f'dianna/test_{yyyy}/climprobs.npy'
climprobs = np.mean(train_target, axis = 0)
np.save(climprobs_file, climprobs)