### Check influence of different parameters in performance. 

Check influence of:
- specifying different chunk sizes and chunking along different dimensions
- use already standardized data --> does it save memory?
- use ``` .persist()``` to load data in a distributed way and speed up reading



In [1]:
standardization_contants = False

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import sys
sys.path.append('/'.join(sys.path[0].split('/')[:-1]))

import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import time
import os
import healpy as hp
import random

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from modules.utils import train_model_2steps, init_device
from modules.data import WeatherBenchDatasetXarrayHealpix
from modules.healpix_models import UNetSphericalHealpix
from modules.test import create_iterative_predictions_healpix
from modules.test import compute_rmse_healpix
from modules.plotting import plot_rmses

datadir = "../data/healpix/"
input_dir = datadir + "5.625deg_nearest/"
model_save_path = datadir + "models/"
pred_save_path = datadir + "predictions/"

train_years = ('1979', '2012')
val_years = ('2013', '2016')
test_years = ('2017', '2018')

nodes = 12*16*16
max_lead_time = 5*24
lead_time = 6
out_features = 2
nb_timesteps = 2
len_sqce = 2
# define time resolution
delta_t = 6

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2,4"
gpu = [0,1]
num_workers = 10
pin_memory = True
batch_size = 95

nb_epochs = 10
learning_rate = 8e-3


In [5]:
from modules.data import WeatherBenchDatasetIterative
class WeatherBenchDatasetXarrayHealpixTemp(Dataset):
    
    """ Dataset used for graph models (1D), where data is loaded from stored numpy arrays.
    
    Parameters
    ----------
    ds : xarray Dataset
        Dataset containing the input data
    out_features : int
        Number of output features
    delta_t : int
        Temporal spacing between samples in temporal sequence (in hours)
    len_sqce : int
        Length of the input and output (predicted) sequences
    years : tuple(str)
        Years used to split the data
    nodes : float
        Number of nodes each sample has
    max_lead_time : int
        Maximum lead time (in case of iterative predictions) in hours
    load : bool
        If true, load dataset to RAM
    mean : np.ndarray of shape 2
        Mean to use for data normalization. If None, mean is computed from data
    std : np.ndarray of shape 2
        std to use for data normalization. If None, mean is computed from data
    """
        
    def __init__(self, ds, out_features, delta_t, len_sqce, years, nodes, nb_timesteps, 
                 max_lead_time=None, load=False, mean=None, std=None, standardize=True):
        
        
        self.delta_t = delta_t
        self.len_sqce = len_sqce
        self.years = years
        
        self.nodes = nodes
        self.out_features = out_features
        self.max_lead_time = max_lead_time
        self.nb_timesteps = nb_timesteps
        
        self.data = ds.to_array(dim='level', name='Dataset').transpose('time', 'node', 'level')
        
        self.in_features = self.data.shape[-1]
        
        if standardize:
            self.mean = self.data.mean(('time', 'node')).compute() if mean is None else mean
            self.std = self.data.std(('time', 'node')).compute() if std is None else std
        
        eps = 0.001 #add to std to avoid division by 0
        
        # Count total number of samples
        total_samples = self.data.shape[0]        
        
        if max_lead_time is None:
            self.n_samples = total_samples - (len_sqce+1) * delta_t
        else:
            self.n_samples = total_samples - (len_sqce+1) * delta_t - max_lead_time
        
        # Normalize
        if standardize:
            self.data = (self.data - self.mean.to_array(dim='level')) / (self.std.to_array(dim='level') + eps)
        
        self.data.persist()
        self.idxs = np.array(range(self.n_samples))
        
        if load:
            print('Loading data to RAM...')
            self.data.load()
        
        
    def __len__(self):
        return self.n_samples
    
    def __getitem__(self, idx):
        """ Returns sample and label corresponding to an index as torch.Tensor objects
            The return tensor shapes are (for the sample and the label): [n_vertex, len_sqce, n_features]
            
        """
        idx_data = idx#self.idxs[idx]
        #1,0,2
        
        #batch[0] --> (batch_size, num_nodes, n_features*len_sq)
        idx_full = np.concatenate([idx_data+delta_t,  idx_data + delta_t * len_sqce, idx_data + delta_t * (len_sqce+1)])
        dat = self.data.isel(time=idx_full).values
        
        
        X = (
            torch.tensor(dat[:len(idx),:,:] , \
                         dtype=torch.float).reshape(len(idx), self.nodes, -1),
        )
        
        y = (torch.tensor(dat[len(idx):len(idx)*2,:,:],\
                         dtype=torch.float).reshape(len(idx), self.nodes, -1),\
             torch.tensor(dat[len(idx)*2:,:,:out_features],\
                         dtype=torch.float).reshape(len(idx), self.nodes, -1)
        
        )
        return X, y 

Define model

In [6]:
#description = "no_const"
description = "all_const_hd5"

model_filename = model_save_path + "spherical_unet_" + description + ".h5"
pred_filename = pred_save_path + "spherical_unet_" + description + ".nc"
rmse_filename = datadir + 'metrics/rmse_' + description + '.nc'

In [7]:
# Model #old: in_channels=in_features*len_sqce
spherical_unet = UNetSphericalHealpix(N=nodes, in_channels=7, out_channels=2, 
                                      kernel_size=3)
spherical_unet, device = init_device(spherical_unet, gpu=gpu)

In [8]:
def train_model_2steps_custom(model, device, training_ds, constants, batch_size, epochs, lr, validation_ds):    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, eps=1e-7, weight_decay=0, amsgrad=False)
    
    train_losses = []
    val_losses = []
    n_samples = training_ds.n_samples
    n_samples_val = validation_ds.n_samples
    num_nodes = training_ds.nodes
    num_constants = constants.shape[1]
    out_features = training_ds.out_features
    
    constants_expanded = constants.expand(batch_size, num_nodes, num_constants)
    constants1 = constants_expanded.to(device)
    idxs_val = validation_ds.idxs
    
    for epoch in range(epochs):
        
        print('\rEpoch : {}'.format(epoch), end="")
        
        time1 = time.time()
        
        val_loss = 0
        train_loss = 0
        
        model.train()  
        
        random.shuffle(training_ds.idxs)
        idxs = training_ds.idxs
        
        batch_idx = 0
        times_read = []
        for i in range(0, n_samples - batch_size, batch_size):
            i_next = min(i + batch_size, n_samples)
            
            if len(idxs[i:i_next]) < batch_size:
                constants_expanded = contants.expand(len(idxs[i:i_next]), num_nodes, num_constants)
                constants1 = constants_expanded.to(device)
        
            
            t1 = time.time()
            batch, labels = training_ds[idxs[i:i_next]]
            
            t2 = time.time()
            
            # Transfer to GPU
            
            
            batch1 = torch.cat((batch[0], constants_expanded), dim=2).to(device)
            label1 = labels[0].to(device)
            label2 = labels[1].to(device)
            
            
            t3 = time.time()
            batch_size = batch1.shape[0]
            
            # Model
            
            t4 = time.time()
            output1 = model(batch1)  
            t5 = time.time()
            batch2 = torch.cat((output1, label1[:,:,-1].view(-1, num_nodes, 1), constants1), dim=2)
            t6 = time.time()
            output2 = model(batch2)
            t7 = time.time()
            loss = criterion(output1, label1[:,:,:out_features]) + criterion(output2, label2)
            t8 = time.time()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss = train_loss + loss.item() * batch_size
            
            
            print('\nTime to read batch: {}s'.format(t2-t1))
            print('Time to transfer data to GPU: {}s'.format(t3-t2))
            print('Time to process input 1: {}s'.format(t5-t4))
            print('Time to process input 2: {}s'.format(t7-t6))
            print('Time to compute loss: {}s'.format(t8-t7))
            print('\n')
            print('\rBatch idx: {}; Loss: {:.3f}'.format(batch_idx, train_loss/(batch_size*(batch_idx+1))), end="")
            times_read.append(t2-t1)
            
            if len(times_read) == 10: 
                print('Reading time: {} +- {}'.format(np.mean(times_read), np.std(times_read)))
                return times_read
            batch_idx += 1
        
        train_loss = train_loss / n_samples
        train_losses.append(train_loss)
        
        model.eval()
        
        constants1 = constants_expanded.to(device)
        with torch.set_grad_enabled(False):
            index = 0
            
            for i in range(0, n_samples_val - batch_size, batch_size):
                i_next = min(i + batch_size, n_samples_val)

                if len(idxs_val[i:i_next]) < batch_size:
                    constants_expanded = contants.expand(len(idxs_val[i:i_next]), num_nodes, num_constants)
                    constants1 = constants_expanded.to(device)


                #t1 = time.time()
                batch, labels = validation_ds[idxs_val[i:i_next]]
                # Transfer to GPU
                batch1 = torch.cat((batch[0], constants_expanded), dim=2).to(device)
                label1 = labels[0].to(device)
                label2 = labels[1].to(device)

                batch_size = batch1.shape[0]
                
                output1 = model(batch1)
                batch2 = torch.cat((output1, label1[:,:,-1].view(-1, num_nodes, 1), constants1), dim=2)
                output2 = model(batch2)
                
                val_loss = val_loss + (criterion(output1, label1[:,:,:out_features]).item() 
                                       + criterion(output2, label2).item()) * batch_size
                index = index + batch_size
                
        val_loss = val_loss / n_samples_val
        val_losses.append(val_loss)
        
        time2 = time.time()
        
        # Print stuff
        print('Epoch: {e:3d}/{n_e:3d}  - loss: {l:.3f}  - val_loss: {v_l:.5f}  - time: {t:2f}'
              .format(e=epoch+1, n_e=epochs, l=train_loss, v_l=val_loss, t=time2-time1))
        
    return train_losses, val_losses

## Explore effect of different parameters on the training time

In [9]:
class results_time():
    def __init__(self, time, chunk_size, memory, standardization):
        self.times = time
        self.chunk_size = chunk_size
        self.loaded_in_memory = memory
        self.standardized = standardization
        self.mean_time = np.mean(time)
        self.std_time = np.std(time)
        self.max_time = np.max(time)
        self.min_time = np.min(time)
        
    def print(self):
        print('Parameters: \n\t* Chunk size: {}\n\t* Loaded in memory: {}\n\t* Previously standardized: {}'.\
             format(self.chunk_size, self.loaded_in_memory, self.standardized))
        print('Loading time: {:.3f}s $\pm$ {:.3f}'.format(self.mean_time, self.std_time))
        

In [25]:
def generate_comparison(chunk_size, load_ram, data_standardized, chunk=True):
    if chunk:
        z500 = xr.open_mfdataset(f'{input_dir}geopotential_500/*.nc', combine='by_coords', \
                             chunks={'time':chunk_size}, parallel=False).rename({'z':'z500'})
        t850 = xr.open_mfdataset(f'{input_dir}temperature_850/*.nc', combine='by_coords', \
                                 chunks={'time':chunk_size}, parallel=False).rename({'t':'t850'})
        rad = xr.open_mfdataset(f'{input_dir}toa_incident_solar_radiation/*.nc', combine='by_coords', \
                                chunks={'time':chunk_size}, parallel=False)
    else:
        z500 = xr.open_mfdataset(f'{input_dir}geopotential_500/*.nc', combine='by_coords', \
                             parallel=False).rename({'z':'z500'})
        t850 = xr.open_mfdataset(f'{input_dir}temperature_850/*.nc', combine='by_coords', \
                                 parallel=False).rename({'t':'t850'})
        rad = xr.open_mfdataset(f'{input_dir}toa_incident_solar_radiation/*.nc', combine='by_coords', \
                                parallel=False)

    z500 = z500.isel(time=slice(7, None))
    t850 = t850.isel(time=slice(7, None))

    constants = xr.open_dataset(f'{input_dir}constants/constants_5.625deg_standardized.nc')
    orog = constants['orog']
    lsm = constants['lsm']
    lats = constants['lat2d']
    slt = constants['slt']
    cos_lon = constants['cos_lon']
    sin_lon = constants['sin_lon']

    num_constants = len([orog, lats, lsm, slt])
    constants_tensor = torch.tensor(xr.merge([orog, lats, lsm, slt], compat='override').to_array().values, \
                                dtype=torch.float)
    
    
    in_features = 7 #len(feature_idx)
    train_mean_ = xr.open_mfdataset(f'{input_dir}mean_train_features_dynamic.nc')
    train_std_ = xr.open_mfdataset(f'{input_dir}std_train_features_dynamic.nc')
    
    if data_standardized:
        ds = xr.merge([z500, t850, rad], compat='override')
        #ds = xr.merge([z500, t850, orog, lats, lsm, slt, rad], compat='override')

        ds_train = ds.sel(time=slice(*train_years))
        ds_valid = ds.sel(time=slice(*val_years))
        ds_test = ds.sel(time=slice(*test_years))
    
    else:
        ds = xr.open_mfdataset(f'{input_dir}ds_standardized.nc')

        ds_train = ds.sel(time=slice(*train_years))
        ds_valid = ds.sel(time=slice(*val_years))
        ds_test = ds.sel(time=slice(*test_years))
    
    # Train and validation data
    training_ds = WeatherBenchDatasetXarrayHealpixTemp(ds=ds_train, out_features=out_features, delta_t=delta_t,
                                                       len_sqce=len_sqce, max_lead_time=max_lead_time,
                                                       years=train_years, nodes=nodes, nb_timesteps=nb_timesteps, 
                                                       mean=train_mean_, std=train_std_, load=load_ram, standardize=data_standardized)
    validation_ds = WeatherBenchDatasetXarrayHealpixTemp(ds=ds_valid, out_features=out_features, delta_t=delta_t,
                                                         len_sqce=len_sqce, max_lead_time=max_lead_time,
                                                         years=val_years, nodes=nodes, nb_timesteps=nb_timesteps, 
                                                         mean=train_mean_, std=train_std_, load=load_ram, standardize=data_standardized)

    
    torch.cuda.empty_cache()
    
    times1 = train_model_2steps_custom(spherical_unet, device, training_ds, constants_tensor.transpose(1,0), batch_size, epochs=7, \
                                           lr=learning_rate, validation_ds=validation_ds)
    
    del z500, t850, ds, ds_train, ds_valid, ds_test, training_ds, validation_ds
    return times1

### Option 1. 

* chunk size = 521
* data loaded to memory = False
* data previously standardized = False

In [11]:
chunk_size = 521#483*2 #483
load_ram = False
data_standardized = True

In [12]:
times1 = generate_comparison(chunk_size, load_ram, data_standardized)


Epoch : 0
Time to read batch: 3.419006586074829s
Time to transfer data to GPU: 0.017685651779174805s
Time to process input 1: 2.754517078399658s
Time to process input 2: 0.10320544242858887s
Time to compute loss: 0.04997611045837402s


Batch idx: 0; Loss: 21.044
Time to read batch: 2.7278189659118652s
Time to transfer data to GPU: 0.004885435104370117s
Time to process input 1: 0.04137897491455078s
Time to process input 2: 0.03589057922363281s
Time to compute loss: 0.15802240371704102s


Batch idx: 1; Loss: 15.557
Time to read batch: 2.5874085426330566s
Time to transfer data to GPU: 0.005301475524902344s
Time to process input 1: 0.047299861907958984s
Time to process input 2: 0.043466806411743164s
Time to compute loss: 0.1377248764038086s


Batch idx: 2; Loss: 12.312
Time to read batch: 2.4888930320739746s
Time to transfer data to GPU: 0.004891872406005859s
Time to process input 1: 0.048787593841552734s
Time to process input 2: 0.04067730903625488s
Time to compute loss: 0.146593570709228

### Option 2. 

* chunk size = 521
* data loaded to memory = True
* data previously standardized = False --> data_standardized = True

In [13]:
chunk_size = 521#483*2 #483
load_ram = True
data_standardized = True

In [14]:
times2 = generate_comparison(chunk_size, load_ram, data_standardized)

Loading data to RAM...
Loading data to RAM...
Epoch : 0
Time to read batch: 0.01496434211730957s
Time to transfer data to GPU: 0.0043544769287109375s
Time to process input 1: 0.14259696006774902s
Time to process input 2: 0.10696983337402344s
Time to compute loss: 0.05048060417175293s


Batch idx: 0; Loss: 1.618
Time to read batch: 0.04442238807678223s
Time to transfer data to GPU: 0.021195173263549805s
Time to process input 1: 0.037370920181274414s
Time to process input 2: 0.03946328163146973s
Time to compute loss: 0.15044331550598145s


Batch idx: 1; Loss: 2.578
Time to read batch: 0.04675745964050293s
Time to transfer data to GPU: 0.01607823371887207s
Time to process input 1: 0.0394594669342041s
Time to process input 2: 0.040471553802490234s
Time to compute loss: 0.14900612831115723s


Batch idx: 2; Loss: 2.583
Time to read batch: 0.025327444076538086s
Time to transfer data to GPU: 0.004659414291381836s
Time to process input 1: 0.03720903396606445s
Time to process input 2: 0.04345583

### Option 3. 

* chunk size = 1042
* data loaded to memory = False
* data previously standardized = False --> data_standardized = True

In [15]:
chunk_size = 1042#483*2 #483
load_ram = False
data_standardized = True

In [16]:
times3 = generate_comparison(chunk_size, load_ram, data_standardized)

Epoch : 0
Time to read batch: 5.038513660430908s
Time to transfer data to GPU: 0.0050580501556396484s
Time to process input 1: 0.1417396068572998s
Time to process input 2: 0.14063715934753418s
Time to compute loss: 0.03820323944091797s


Batch idx: 0; Loss: 1.344
Time to read batch: 4.678669691085815s
Time to transfer data to GPU: 0.005233049392700195s
Time to process input 1: 0.04192686080932617s
Time to process input 2: 0.04330945014953613s
Time to compute loss: 0.14320993423461914s


Batch idx: 1; Loss: 4.119
Time to read batch: 5.5206780433654785s
Time to transfer data to GPU: 0.00513148307800293s
Time to process input 1: 0.04539132118225098s
Time to process input 2: 0.041292428970336914s
Time to compute loss: 0.14367341995239258s


Batch idx: 2; Loss: 3.327
Time to read batch: 5.225013017654419s
Time to transfer data to GPU: 0.005410194396972656s
Time to process input 1: 0.04780411720275879s
Time to process input 2: 0.04814791679382324s
Time to compute loss: 0.14158368110656738s



### Option 4. 

* chunk size = 1042
* data loaded to memory = True
* data previously standardized = False --> data_standardized = True

In [17]:
chunk_size = 521*2#483*2 #483
load_ram = True
data_standardized = True

In [18]:
times4 = generate_comparison(chunk_size, load_ram, data_standardized)

Loading data to RAM...
Loading data to RAM...
Epoch : 0
Time to read batch: 0.06084752082824707s
Time to transfer data to GPU: 0.004936695098876953s
Time to process input 1: 0.15724921226501465s
Time to process input 2: 0.11062240600585938s
Time to compute loss: 0.04253649711608887s


Batch idx: 0; Loss: 1.578
Time to read batch: 0.025967121124267578s
Time to transfer data to GPU: 0.00445556640625s
Time to process input 1: 0.03679609298706055s
Time to process input 2: 0.04136204719543457s
Time to compute loss: 0.1475214958190918s


Batch idx: 1; Loss: 2.405
Time to read batch: 0.022838592529296875s
Time to transfer data to GPU: 0.004541873931884766s
Time to process input 1: 0.0363919734954834s
Time to process input 2: 0.03941154479980469s
Time to compute loss: 0.14887571334838867s


Batch idx: 2; Loss: 2.217
Time to read batch: 0.022765636444091797s
Time to transfer data to GPU: 0.004614830017089844s
Time to process input 1: 0.04070091247558594s
Time to process input 2: 0.0392043590545

### Option 5. 

* chunk size = 483*2
* data loaded to memory = False
* data previously standardized = False --> data_standardized = True

In [19]:
chunk_size = 483*2#483*2 #483
load_ram = False
data_standardized = True

In [20]:
times5 = generate_comparison(chunk_size, load_ram, data_standardized)

Epoch : 0
Time to read batch: 4.467376470565796s
Time to transfer data to GPU: 0.004725217819213867s
Time to process input 1: 0.1329643726348877s
Time to process input 2: 0.1218101978302002s
Time to compute loss: 0.03996849060058594s


Batch idx: 0; Loss: 1.254
Time to read batch: 4.220887184143066s
Time to transfer data to GPU: 0.005463361740112305s
Time to process input 1: 0.04322171211242676s
Time to process input 2: 0.041930437088012695s
Time to compute loss: 0.14554357528686523s


Batch idx: 1; Loss: 1.553
Time to read batch: 3.678936004638672s
Time to transfer data to GPU: 0.005243062973022461s
Time to process input 1: 0.04355621337890625s
Time to process input 2: 0.043515920639038086s
Time to compute loss: 0.14034295082092285s


Batch idx: 2; Loss: 1.522
Time to read batch: 3.908228635787964s
Time to transfer data to GPU: 0.005069732666015625s
Time to process input 1: 0.04314708709716797s
Time to process input 2: 0.03963589668273926s
Time to compute loss: 0.14670014381408691s




### Option 6. 

* chunk size = 521
* data loaded to memory = False
* data previously standardized = True --> data_standardized = False

In [21]:
chunk_size = 521#483*2 #483
load_ram = False
data_standardized = False

In [26]:
times6 = generate_comparison(chunk_size, load_ram, data_standardized)

Epoch : 0
Time to read batch: 16.40645933151245s
Time to transfer data to GPU: 0.010524988174438477s
Time to process input 1: 0.15302515029907227s
Time to process input 2: 0.08313894271850586s
Time to compute loss: 0.07191610336303711s


Batch idx: 0; Loss: 3679.374
Time to read batch: 16.69389009475708s
Time to transfer data to GPU: 0.0060651302337646484s
Time to process input 1: 0.040924072265625s
Time to process input 2: 0.04015660285949707s
Time to compute loss: 0.1479175090789795s


Batch idx: 1; Loss: 3648.454
Time to read batch: 16.5794780254364s
Time to transfer data to GPU: 0.0057947635650634766s
Time to process input 1: 0.039808034896850586s
Time to process input 2: 0.037888288497924805s
Time to compute loss: 0.1605701446533203s


Batch idx: 2; Loss: 3617.992
Time to read batch: 17.111807823181152s
Time to transfer data to GPU: 0.01013636589050293s
Time to process input 1: 0.04415488243103027s
Time to process input 2: 0.041107177734375s
Time to compute loss: 0.149336814880371

### Option 7. 

* chunk size = 521
* data loaded to memory = True
* data previously standardized = True --> data_standardized = False

In [27]:
chunk_size = 521#483*2 #483
load_ram = True
data_standardized = False

In [28]:
times7 = generate_comparison(chunk_size, load_ram, data_standardized)

Loading data to RAM...
Loading data to RAM...
Epoch : 0
Time to read batch: 0.03540205955505371s
Time to transfer data to GPU: 0.020819425582885742s
Time to process input 1: 0.11373329162597656s
Time to process input 2: 0.10968923568725586s
Time to compute loss: 0.05250096321105957s


Batch idx: 0; Loss: 3068.629
Time to read batch: 0.015705585479736328s
Time to transfer data to GPU: 0.0044286251068115234s
Time to process input 1: 0.036890268325805664s
Time to process input 2: 0.039411067962646484s
Time to compute loss: 0.15049958229064941s


Batch idx: 1; Loss: 3040.797
Time to read batch: 0.013686895370483398s
Time to transfer data to GPU: 0.004465818405151367s
Time to process input 1: 0.03734326362609863s
Time to process input 2: 0.03717613220214844s
Time to compute loss: 0.15335655212402344s


Batch idx: 2; Loss: 3010.812
Time to read batch: 0.013140678405761719s
Time to transfer data to GPU: 0.004460334777832031s
Time to process input 1: 0.03710055351257324s
Time to process input 

### Option 8. 

* chunk size = No chunking
* data loaded to memory = False
* data previously standardized = False --> data_standardized = True

In [29]:
chunk_size = 0#483*2 #483
load_ram = False
data_standardized = True

In [30]:
times8 = generate_comparison(chunk_size, load_ram, data_standardized, chunk=False)

Epoch : 0
Time to read batch: 29.508171319961548s
Time to transfer data to GPU: 0.020882606506347656s
Time to process input 1: 0.1598670482635498s
Time to process input 2: 0.11409187316894531s
Time to compute loss: 0.04235720634460449s


Batch idx: 0; Loss: 281.655
Time to read batch: 36.39236617088318s
Time to transfer data to GPU: 0.014907598495483398s
Time to process input 1: 0.06324267387390137s
Time to process input 2: 0.0420374870300293s
Time to compute loss: 0.16190505027770996s


Batch idx: 1; Loss: 246.762
Time to read batch: 35.60328912734985s
Time to transfer data to GPU: 0.020716428756713867s
Time to process input 1: 0.045294761657714844s
Time to process input 2: 0.0397646427154541s
Time to compute loss: 0.1466660499572754s


Batch idx: 2; Loss: 216.223
Time to read batch: 33.10098910331726s
Time to transfer data to GPU: 0.01052999496459961s
Time to process input 1: 0.06509852409362793s
Time to process input 2: 0.04654264450073242s
Time to compute loss: 0.1512775421142578s


### Option 9. 

* chunk size = No chunking
* data loaded to memory = True
* data previously standardized = False --> data_standardized = True

In [31]:
chunk_size = 0#483*2 #483
load_ram = True
data_standardized = True

In [32]:
times9 = generate_comparison(chunk_size, load_ram, data_standardized, chunk=False)

Loading data to RAM...
Loading data to RAM...
Epoch : 0
Time to read batch: 0.018594741821289062s
Time to transfer data to GPU: 0.004164695739746094s
Time to process input 1: 0.15126490592956543s
Time to process input 2: 0.11709141731262207s
Time to compute loss: 0.04013204574584961s


Batch idx: 0; Loss: 43.448
Time to read batch: 0.051525115966796875s
Time to transfer data to GPU: 0.004364013671875s
Time to process input 1: 0.0355229377746582s
Time to process input 2: 0.0403444766998291s
Time to compute loss: 0.14962267875671387s


Batch idx: 1; Loss: 38.708
Time to read batch: 0.023848533630371094s
Time to transfer data to GPU: 0.00469970703125s
Time to process input 1: 0.037389516830444336s
Time to process input 2: 0.04007863998413086s
Time to compute loss: 0.1486983299255371s


Batch idx: 2; Loss: 34.891
Time to read batch: 0.019304990768432617s
Time to transfer data to GPU: 0.004502534866333008s
Time to process input 1: 0.036222219467163086s
Time to process input 2: 0.03931140899

## Compare results 

In [34]:
t = []
for times in [times1, times2, times3, times4, times5, times6, times7, times8, times9]:
    t.append()

AttributeError: 'list' object has no attribute 'printimes'

In [None]:
times_plot = [t1, t2, t3, t4, t5]

t_loaded = [t for t in times_plot if t.loaded_in_memory]
t_not_loaded = [t for t in times_plot if not t.loaded_in_memory]

fig, ax = plt.subplots(1,2,figsize=(15,5))

ax[0].set_title('Previously load on memory')
ax[1].set_title('Loading when required')

for i, t_list in enumerate([t_loaded, t_not_loaded]):
    ax[i].boxplot([t.times for t in t_list])
    ax[i].set_xticklabels([t.chunk_size for t in t_list])
    ax[i].set_xlabel('Chunk_size')
    ax[i].set_ylabel('Reading time [s]')
    
plt.show()

In [None]:
times_plot = [t1, t2, t6, t7]

t_loaded = [t for t in times_plot if t.loaded_in_memory]
t_not_loaded = [t for t in times_plot if not t.loaded_in_memory]

fig, ax = plt.subplots(1,2,figsize=(15,5))

ax[0].set_title('Previously load on memory')
ax[1].set_title('Loading when required')

for i, t_list in enumerate([t_loaded, t_not_loaded]):
    ax[i].boxplot([t.times for t in t_list])
    ax[i].set_xticklabels([t.standardized for t in t_list])
    ax[i].set_xlabel('Previously standardized')
    ax[i].set_ylabel('Reading time [s]')
    
plt.show()

In [None]:
times_plot = [t1, t2, t6, t7, t8, t9]

t_loaded = [t for t in times_plot if t.loaded_in_memory]
t_not_loaded = [t for t in times_plot if not t.loaded_in_memory]

fig, ax = plt.subplots(1,2,figsize=(15,5))

ax[0].set_title('Previously load on memory')
ax[1].set_title('Loading when required')

for i, t_list in enumerate([t_loaded, t_not_loaded]):
    ax[i].boxplot([t.times for t in t_list])
    ax[i].set_xticklabels([t.chunk_size for t in t_list])
    ax[i].set_xlabel('Chunk_size')
    ax[i].set_ylabel('Reading time [s]')
    
plt.show()

Comments:
* Loading the data previously is the most significant difference regarding the total training time.

* Chunking the data has a large effect when it is not previously loaded. Also, the chunk size affects the reading time.

* Using previously standardized data instead of performing the standardization for each batch helps reducing the time. 