### Train best model according to report to try to reproduce results

- All static features: 
    * Z500, 
    * T850, 
    * latitude, 
    * orography, 
    * land-sea mask, 
    * soil type, and 
    * top-of-atmosphere radiation
- L=2
- $\Delta t$ = 6

#### Import libraries

In [2]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
sys.path.append('/'.join(sys.path[0].split('/')[:-1]))

import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import time
import os
import healpy as hp
import random

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from modules.utils import train_model_2steps_temp, init_device
from modules.data import WeatherBenchDatasetXarrayHealpixTemp
from modules.healpix_models import UNetSphericalHealpix, UNetSphericalTempHealpix
from modules.test import create_iterative_predictions_healpix_temp
from modules.test import compute_rmse_healpix
from modules.plotting import plot_rmses

datadir = "../data/healpix/"
input_dir = datadir + "5.625deg_nearest/"
model_save_path = datadir + "models/"
pred_save_path = datadir + "predictions/"

if not os.path.isdir(model_save_path):
    os.mkdir(model_save_path)
    
if not os.path.isdir(pred_save_path):
    os.mkdir(pred_save_path)

Define constants and load data

In [10]:
chunk_size = 720*3

In [11]:
train_years = ('1979', '2012')#('1979', '2012')
val_years = ('2013', '2016')
test_years = ('2017', '2018')

nodes = 12*16*16
max_lead_time = 5*24
nb_timesteps = 2

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,2"
gpu = [0, 1]
num_workers = 10
pin_memory = True

nb_epochs = 20
learning_rate = 8e-3

obs = xr.open_mfdataset(pred_save_path + 'observations_nearest.nc', combine='by_coords', chunks={'time':chunk_size})
#rmses_weyn = xr.open_dataset(datadir + 'metrics/rmses_weyn.nc')

Define functions:

**TODO**
Check if the code is the same as the functions with the same name in ```modules/*.py``` and subtitute by imports in such a case. 

#### Load data

In [12]:
z500 = xr.open_mfdataset(f'{input_dir}geopotential_500/*.nc', combine='by_coords', chunks={'time':chunk_size}).rename({'z':'z500'})
t850 = xr.open_mfdataset(f'{input_dir}temperature_850/*.nc', combine='by_coords', chunks={'time':chunk_size}).rename({'t':'t850'})
rad = xr.open_mfdataset(f'{input_dir}toa_incident_solar_radiation/*.nc', combine='by_coords', chunks={'time':chunk_size})

z500 = z500.isel(time=slice(7, None))
t850 = t850.isel(time=slice(7, None))

constants = xr.open_dataset(f'{input_dir}constants/constants_5.625deg.nc').rename({'orography' :'orog'})
constants = constants.assign(cos_lon=lambda x: np.cos(np.deg2rad(x.lon)))
constants = constants.assign(sin_lon=lambda x: np.sin(np.deg2rad(x.lon)))

temp = xr.DataArray(np.zeros(z500.dims['time']), coords=[('time', z500.time.values)])
constants, _ = xr.broadcast(constants, temp)

orog = constants['orog']
lsm = constants['lsm']
lats = constants['lat2d']
slt = constants['slt']
cos_lon = constants['cos_lon']
sin_lon = constants['sin_lon']

In [13]:
z = xr.open_mfdataset(f'{input_dir}geopotential_500/*.nc', combine='by_coords', chunks={'time':chunk_size})['z']\
.assign_coords(level=1)

t = xr.open_mfdataset(f'{input_dir}temperature_850/*.nc', combine='by_coords', chunks={'time':chunk_size})['t']\
.assign_coords(level=1)

predictors = xr.concat([z, t], 'level')

In [5]:
#predictors_mean = predictors.mean(('time','node')).compute()
#predictors_std = predictors.std('time').mean('node').compute()

#const_mean = constants.mean(('time','node')).compute()
#const_std = constants.std('time').mean(('node')).compute()

In [15]:
# z500, t850, orog, lats, lsm, slt, rad
in_features = 7
out_features = 2
ds = xr.merge([z500, t850, orog, lats, lsm, slt, rad], compat='override')

ds_train = ds.sel(time=slice(*train_years))
ds_valid = ds.sel(time=slice(*val_years))
ds_test = ds.sel(time=slice(*test_years))

In [7]:
#train_mean = ds_train.mean(('time','node')).compute()
#train_std = ds_train.std('time').mean('node').compute()

In [8]:
#train_mean.to_netcdf(f'{input_dir}mean_train_all_vars.nc')
#train_std.to_netcdf(f'{input_dir}std_train_all_vars.nc')

In [16]:
train_mean = xr.open_mfdataset(f'{input_dir}mean_train_all_vars.nc').to_array(dim='level')
train_std = xr.open_mfdataset(f'{input_dir}std_train_all_vars.nc').to_array(dim='level')

#### Define model parameters

In [17]:
# define length of sequence to take into account for loss
len_sqce = 2
# define time resolution
delta_t = 6

# predict 5days data
max_lead_time = 5*24

feature_idx = list(range(7))
in_features = 7
out_features = 2

In [18]:
#del train_mean_
#del train_std_

In [19]:
description = "all_const_len{}_delta{}".format(len_sqce, delta_t)

model_filename = model_save_path + "spherical_unet_" + description + ".h5"
pred_filename = pred_save_path + "spherical_unet_" + description + ".nc"
rmse_filename = datadir + 'metrics/rmse_' + description + '.nc'

**Attention:**

If ```load=True``` the kernel dies. Check problem origin and if it's necessary to load the data

In [20]:
def train_model_2steps_temp_old(model, device, train_generator, batch_size, epochs, lr, validation_data):    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-7, weight_decay=0, amsgrad=False)
    
    train_losses = []
    val_losses = []
    n_samples = training_ds.n_samples
    
    for epoch in range(epochs):
        print('\rEpoch : {}'.format(epoch), end="")
        time1 = time.time()
        
        val_loss = 0
        train_loss = 0
        
        model.train() 
        
        batch_idx = 0
        for (batch, labels) in train_generator:
            # Transfer to GPU
            batch1 = batch[0].to(device)
            constants1 = batch[1].to(device)

            label1 = labels[0].to(device)
            label2 = labels[1].to(device)

            batch_size = batch1.shape[0]

            # Model
            output1 = model(batch1)
            batch2 = torch.cat((batch1[:, :, 1:, :], torch.cat((output1, constants1), dim=2).unsqueeze(2)), dim=2)
            output2 = model(batch2)
            
            loss = criterion(output1, label1) + criterion(output2, label2)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss = train_loss + loss.item() * batch_size
            print('\rBatch idx: {}; Loss: {:.3f}'.format(batch_idx, train_loss/(batch_size*(batch_idx+1))), end="")
            batch_idx += 1
            
        train_loss = train_loss / (len(train_generator.dataset))
        train_losses.append(train_loss)
        
        model.eval()
        with torch.set_grad_enabled(False):
            index = 0
            
            for batch, labels in validation_data:
                #batch.load()
                # Transfer to GPU
                batch1 = batch[0].to(device)
                constants1 = batch[1].to(device)

                label1 = labels[0].to(device)
                label2 = labels[1].to(device)

                batch_size = batch1.shape[0]
                
                output1 = model(batch1)
                batch2 = torch.cat((batch1[:, :, 1:, :], torch.cat((output1, constants1), dim=2).unsqueeze(2)), dim=2)
                output2 = model(batch2)
                
                val_loss = val_loss + (criterion(output1, label1).item() 
                                       + criterion(output2, label2).item()) * batch_size
                index = index + batch_size
                
        val_loss = val_loss / (len(validation_data.dataset))
        val_losses.append(val_loss)
        
        time2 = time.time()
        
        # Print stuff
        print('Epoch: {e:3d}/{n_e:3d}  - loss: {l:.3f}  - val_loss: {v_l:.5f}  - time: {t:2f}'
              .format(e=epoch+1, n_e=epochs, l=train_loss, v_l=val_loss, t=time2-time1))
        
    return train_losses, val_losses

In [21]:
def train_model_2steps_temp(model, device, training_ds, batch_size, epochs, lr, validation_data):    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-7, weight_decay=0, amsgrad=False)
    
    train_losses = []
    val_losses = []
    n_samples = training_ds.n_samples
    
    for epoch in range(epochs):
        print('\rEpoch : {}'.format(epoch), end="")
        time1 = time.time()
        
        val_loss = 0
        train_loss = 0
        
        model.train() 
        
        random.shuffle(training_ds.idxs)
        idxs = training_ds.idxs
        
        batch_idx = 0
        for i in range(0, n_samples - batch_size, batch_size):
            i_next = min(i + batch_size, n_samples)
            
            print('Reading batch...')
            batch, labels = training_ds[idxs[i:i_next]]
            print('Done')
        #for (batch, labels) in train_generator:
            #print('\rBatch idx: {}'.format(batch_idx), end="")
            # Transfer to GPU
            #batch.load()
            batch1 = batch[0].to(device)
            constants1 = batch[1].to(device)

            label1 = labels[0].to(device)
            label2 = labels[1].to(device)

            batch_size = batch1.shape[0]

            # Model
            print(batch1.shape)
            output1 = model(batch1)
            batch2 = torch.cat((batch1[:, :, 1:, :], torch.cat((output1, constants1), dim=2).unsqueeze(2)), dim=2)
            output2 = model(batch2)
            
            loss = criterion(output1, label1) + criterion(output2, label2)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss = train_loss + loss.item() * batch_size
            print('\rBatch idx: {}; Loss: {:.3f}'.format(batch_idx, train_loss/(batch_size*(batch_idx+1))), end="")
            batch_idx += 1
            
        train_loss = train_loss / (len(train_generator.dataset))
        train_losses.append(train_loss)
        
        model.eval()
        with torch.set_grad_enabled(False):
            index = 0
            
            for batch, labels in validation_data:
                #batch.load()
                # Transfer to GPU
                batch1 = batch[0].to(device)
                constants1 = batch[1].to(device)

                label1 = labels[0].to(device)
                label2 = labels[1].to(device)

                batch_size = batch1.shape[0]
                
                output1 = model(batch1)
                batch2 = torch.cat((batch1[:, :, 1:, :], torch.cat((output1, constants1), dim=2).unsqueeze(2)), dim=2)
                output2 = model(batch2)
                
                val_loss = val_loss + (criterion(output1, label1).item() 
                                       + criterion(output2, label2).item()) * batch_size
                index = index + batch_size
                
        val_loss = val_loss / (len(validation_data.dataset))
        val_losses.append(val_loss)
        
        time2 = time.time()
        
        # Print stuff
        print('Epoch: {e:3d}/{n_e:3d}  - loss: {l:.3f}  - val_loss: {v_l:.5f}  - time: {t:2f}'
              .format(e=epoch+1, n_e=epochs, l=train_loss, v_l=val_loss, t=time2-time1))
        
    return train_losses, val_losses

In [22]:
class WeatherBenchDatasetXarrayHealpixTemp(Dataset):
    
    """ Dataset used for graph models (1D), where data is loaded from stored numpy arrays.
    
    Parameters
    ----------
    ds : xarray Dataset
        Dataset containing the input data
    out_features : int
        Number of output features
    delta_t : int
        Temporal spacing between samples in temporal sequence (in hours)
    len_sqce : int
        Length of the input and output (predicted) sequences
    years : tuple(str)
        Years used to split the data
    nodes : float
        Number of nodes each sample has
    max_lead_time : int
        Maximum lead time (in case of iterative predictions) in hours
    load : bool
        If true, load dataset to RAM
    mean : np.ndarray of shape 2
        Mean to use for data normalization. If None, mean is computed from data
    std : np.ndarray of shape 2
        std to use for data normalization. If None, mean is computed from data
    """
        
    def __init__(self, ds, out_features, delta_t, len_sqce, years, nodes, nb_timesteps, 
                 max_lead_time=None, load=True, mean=None, std=None):
        
        
        self.delta_t = delta_t
        self.len_sqce = len_sqce
        self.years = years
        
        self.nodes = nodes
        self.out_features = out_features
        self.max_lead_time = max_lead_time
        self.nb_timesteps = nb_timesteps
        
        self.data = ds.to_array(dim='level', name='Dataset').transpose('time', 'node', 'level')
        self.in_features = self.data.shape[-1]
        
        self.mean = self.data.mean(('time', 'node')).compute() if mean is None else mean
        self.std = self.data.std(('time', 'node')).compute() if std is None else std
        
        
        
        # Count total number of samples
        total_samples = self.data.shape[0]        
        
        if max_lead_time is None:
            self.n_samples = total_samples - (len_sqce+1) * delta_t
        else:
            self.n_samples = total_samples - (len_sqce+1) * delta_t - max_lead_time
        
        # Normalize
        self.data = (self.data - self.mean) / self.std
        
        # Create indexes
        #self.idxs = [[[[sample_idx + delta_t*k for k in range(len_sqce)], sample_idx + delta_t * len_sqce], 
        #              [sample_idx + delta_t * len_sqce, sample_idx + delta_t * (len_sqce+1)]] 
        #             for sample_idx in range(self.n_samples)]
        
        self.idxs = np.array(range(self.n_samples))
        
        
        #if load: 
        #    print('Loading data into RAM')
        #    self.data.load()
            
        
    def __len__(self):
        return self.n_samples
    
    def __getitem__(self, idx):
        """ Returns sample and label corresponding to an index as torch.Tensor objects
            The return tensor shapes are (for the sample and the label): [n_vertex, len_sqce, n_features]
        """
        idx_data = self.idxs[idx]
        #1,0,2
        X = (
            torch.tensor([self.data.isel(time=[idx_d + self.delta_t*k for k in range(self.len_sqce)]).values for idx_d in idx_data], \
                         dtype=torch.float).permute(0, 2,1,3),\
            
             torch.tensor([self.data.isel(time=[idx_d + self.delta_t * self.len_sqce]).values[:,:,self.out_features:] for idx_d in idx_data],\
                         dtype=torch.float)
        )
        
        y = ( torch.tensor([self.data.isel(time=[idx_d + self.delta_t * self.len_sqce]).values[:,:,:self.out_features] for idx_d in idx_data],\
                         dtype=torch.float), 
             torch.tensor([self.data.isel(time=[idx_d + self.delta_t * (self.len_sqce+1)]).values[:,:,:self.out_features] for idx_d in idx_data],\
                         dtype=torch.float)
        
        )
        
        #X = (torch.tensor(self.data.isel(time=self.idxs[idx][0][0]).values).float().permute(1, 0, 2), 
        #     torch.tensor(self.data.isel(time=self.idxs[idx][0][1]).values[:, self.out_features:]).float())
        
        #y = (torch.Tensor(self.data.isel(time=self.idxs[idx][1][0]).values[:, :self.out_features]).float(), 
        #     torch.Tensor(self.data.isel(time=self.idxs[idx][1][1]).values[:, :self.out_features]).float())
        
        return X, y 

In [23]:
# Train and validation data
training_ds = WeatherBenchDatasetXarrayHealpixTemp(ds=ds_train, out_features=out_features,
                                                   len_sqce=len_sqce, delta_t=delta_t, years=train_years,
                                                   nodes=nodes, nb_timesteps=nb_timesteps, 
                                                   mean=train_mean, std=train_std)

In [24]:
validation_ds = WeatherBenchDatasetXarrayHealpixTemp(ds=ds_valid, out_features=out_features, 
                                                     len_sqce=len_sqce, delta_t=delta_t, years=val_years, 
                                                     nodes=nodes, nb_timesteps=nb_timesteps, 
                                                     mean=train_mean, std=train_std)

In [26]:
chunk_size

2160

In [27]:
batch_size = 70

In [31]:
dl_train = DataLoader(training_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, 
                      pin_memory=pin_memory)

In [32]:
dl_val = DataLoader(validation_ds, batch_size=batch_size*2, shuffle=False, num_workers=num_workers, 
                    pin_memory=pin_memory)

Define model

In [28]:
# Model
#spherical_unet = UNetSphericalTempHealpix(N=nodes, len_sqce=len_sqce, in_channels=in_features, out_channels=out_features, graph_width=3)

spherical_unet = UNetSphericalHealpix(N=nodes, in_channels=in_features*len_sqce, out_channels=out_features, kernel_size=3)
spherical_unet, device = init_device(spherical_unet, gpu=gpu)


Train and test. Plot results

In [29]:
import warnings
warnings.filterwarnings("ignore")

**Note:**
When defining ```WeatherBenchDatasetXarrayHealpixTemp```, set torch tensor to float and not float64 (double); otherwise it raises an error.

In [33]:
# Train model
train_loss, val_loss = train_model_2steps_temp(model=spherical_unet, device=device, training_ds=training_ds, batch_size=batch_size, epochs=nb_epochs, lr=learning_rate, validation_data=dl_val)

torch.save(spherical_unet.state_dict(), model_filename)

Epoch : 0Reading batch...
Done
torch.Size([70, 3072, 2, 7])


ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/bolon/miniconda3/envs/weather/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/bolon/miniconda3/envs/weather/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/scratch/students/bolon/weather_prediction/modules/healpix_models.py", line 209, in forward
    x_encoded = self.encode(x)
  File "/mnt/scratch/students/bolon/weather_prediction/modules/healpix_models.py", line 136, in encode
    x_enc1 = self.conv11(x)
  File "/home/bolon/miniconda3/envs/weather/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/scratch/students/bolon/weather_prediction/modules/healpix_models.py", line 59, in forward
    x = self.conv(x)
  File "/home/bolon/miniconda3/envs/weather/lib/python3.7/site-packages/torch/nn/modules/module.py", line 541, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/scratch/students/bolon/weather_prediction/modules/layers.py", line 340, in forward
    outputs = self._conv(self.laplacian, inputs, self.weight)
  File "/mnt/scratch/students/bolon/weather_prediction/modules/layers.py", line 113, in cheb_conv
    B, V, Fin = inputs.shape
ValueError: too many values to unpack (expected 3)


**IMPORTANT:**
The model used in the other notebooks is UnetSphericalHelapix and NOT the temporal version! 

If the original model is used, there is a missmatch with the dimensions. Check that!

In [None]:
t = time.time()
for (batch,labels) in dl_train:
    print(batch[0].shape)
    print(batch[1].shape)
    print(labels[0].shape)
    print(labels[1].shape)
    print('Time ', time.time()-t)
    break

In [None]:
# Train model
train_loss, val_loss = train_model_2steps_temp_old(spherical_unet, device, dl_train, batch_size, epochs=nb_epochs, 
                                               lr=learning_rate, validation_data=dl_val)
torch.save(spherical_unet.state_dict(), model_filename)


In [None]:
training_ds.data.isel(time=training_ds.idxs[0][0][0])

In [None]:
training_ds.idxs[]

In [None]:
# Show training losses
plt.plot(train_loss, label='Training loss')
plt.plot(val_loss, label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')
plt.legend()
plt.show()

del dl_train, dl_val, training_ds, validation_ds
torch.cuda.empty_cache()

In [None]:
# Testing data
testing_ds = WeatherBenchDatasetXarrayHealpixTemp(ds=ds_test, out_features=out_features,
                                                  len_sqce=len_sqce, delta_t=delta_t, years=test_years, 
                                                  nodes=nodes, nb_timesteps=nb_timesteps, 
                                                  mean=train_mean, std=train_std, 
                                                  max_lead_time=max_lead_time)

dataloader_test = DataLoader(testing_ds, batch_size=int(0.7*batch_size), shuffle=False,
                             num_workers=num_workers)



In [None]:
# Compute predictions
preds = create_iterative_predictions_healpix_temp(spherical_unet, device, dataloader_test)
preds.to_netcdf(pred_filename)


In [None]:
# Compute and save RMSE
rmse = compute_rmse_healpix(preds, obs).load()
rmse.to_netcdf(rmse_filename)

# Show RMSE
print('Z500 - 0:', rmse.z.values[0])
print('T850 - 0:', rmse.t.values[0])
plot_rmses(rmse, rmses_weyn, lead_time=6)

del spherical_unet, preds, rmse
torch.cuda.empty_cache()