ConvLSTM trained on gridded forcings for all stations

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from datetime import datetime, timedelta
from sklearn import preprocessing
import netCDF4 as nc
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from src import load_data, evaluate, conv_lstm
import torch.autograd as autograd
import pickle

# Run identifier (YYYYMMDD-HHMMSS); reused below when the model and results are pickled.
time_stamp = f"{datetime.now():%Y%m%d-%H%M%S}"
time_stamp



'20190718-123412'

In [2]:
# Run on the GPU whenever PyTorch can see one, otherwise on the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    print('CUDA Available')
device = torch.device('cuda') if USE_CUDA else torch.device('cpu')

# Seed both random number generators used in this notebook.
np.random.seed(0)
torch.manual_seed(0)

In [3]:
# Load the RDRS forcings in gridded form together with their variable names and time index.
# Later cells index rdrs_data with four axes and read axes 1-3 as (variables, rows, cols);
# presumably axis 0 is time, aligned with rdrs_time_index -- TODO confirm in load_data.
rdrs_data, rdrs_var_names, rdrs_time_index = load_data.load_rdrs_forcings(as_grid=True)
# Runoff table; later cells rely on its 'date', 'station' and 'runoff' columns.
data_runoff = load_data.load_discharge_gr4j_vic()

  data = pd.read_csv(os.path.join(dir, f), skiprows=2, skipfooter=1, index_col=False, header=None, names=['runoff'], na_values='-1.2345')
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
# Attach each gauge's latitude/longitude to its runoff rows; the join key 'ID' is dropped afterwards.
gauge_info = pd.read_csv('../data/gauge_info.csv').loc[:, ['ID', 'Lat', 'Lon']]
data_runoff = data_runoff.merge(gauge_info, left_on='station', right_on='ID').drop(columns='ID')

In [168]:
# Keep only the forcing variables at positions 4 and 5 of axis 1, and the matching two names.
# NOTE(review): this cell overwrites its own inputs, so it is not re-runnable -- after one run axis 1
# presumably has only two entries and positions 4/5 no longer exist. Reload the forcings before re-running.
rdrs_data = rdrs_data[:,[4,5],:,:]
rdrs_var_names = rdrs_var_names[4:6]

In [6]:
# Number of forcing variables and grid extent, read from axes 1-3 of the forcing array.
n_vars, n_rows, n_cols = rdrs_data.shape[1:4]
n_vars, n_rows, n_cols

In [109]:
predictions = {}
actuals = {}
seq_len = 3 * 24  # length of the forcing window per sample; also used below as an hour offset for train_start
validation_fraction = 0.2  # share of training dates held out for validation

torch.manual_seed(0)
np.random.seed(0)

train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(hours=seq_len)  # first day for which to make a prediction in train set
train_end = '2012-12-31'
test_start = '2013-01-01'
test_end = '2014-12-31'

# Drop samples without an observation, restrict to the train+test period and order by date.
# Later cells split by position (first num_train_samples rows / first num_train_dates dates),
# so the date ordering established here is load-bearing.
data_runoff = data_runoff[~pd.isna(data_runoff['runoff'])]
# drop=True (a bool): the original passed the string 'True', which only worked because it is truthy.
data_runoff = data_runoff[(data_runoff['date'] >= train_start) & (data_runoff['date'] <= test_end)].sort_values(by='date').reset_index(drop=True)

# Build the train-period mask once and derive both counts from it.
is_train = (data_runoff['date'] >= train_start) & (data_runoff['date'] <= train_end)
num_train_samples = int(is_train.sum())
num_train_dates = len(data_runoff.loc[is_train, 'date'].unique())
num_total_samples = data_runoff.shape[0]
num_train_samples, num_train_dates, num_total_samples

(49113, 1091, 82693)

In [110]:
# Layout of x_conv: (seq_len, dates, variables, height, width).
# The forcing window depends only on the date, not on the station, so a single window is stored per
# distinct date and later repeated once for every station that has a sample on that date.
prediction_dates = data_runoff['date'].unique()
x_conv = np.zeros((seq_len, len(prediction_dates), n_vars, n_rows, n_cols))
samples_per_date = []
for date_pos, date in enumerate(prediction_dates):
    # Position of this date in the forcing time index, shifted by 23 steps towards the end of the day.
    # NOTE(review): the slice end below is exclusive, so the window stops one step *before* this index
    # (22:00 if the index is hourly), although the original comment said it ends with 23:00 -- confirm.
    end_of_day_index = rdrs_time_index[rdrs_time_index == date].index.values[0] + 23
    window_start = end_of_day_index - seq_len
    x_conv[:, date_pos, :, :, :] = rdrs_data[window_start:end_of_day_index]
    # Number of station samples sharing this date.
    samples_per_date.append((data_runoff['date'] == date).sum())

# Standardise the gridded training inputs, one scaler per forcing variable.
# The fitted scalers are kept so the test data can be transformed identically later.
conv_scalers = []
x_conv_train = x_conv[:, :num_train_dates, :, :, :].copy()
for var_idx in range(x_conv.shape[2]):
    scaler = preprocessing.StandardScaler()
    conv_scalers.append(scaler)
    channel = x_conv_train[:, :, var_idx, :, :]
    scaled = scaler.fit_transform(channel.reshape((-1, 1))).reshape(channel.shape)
    x_conv_train[:, :, var_idx, :, :] = np.nan_to_num(scaled)

# Standardise the per-station features (latitude, longitude), again one scaler per column.
x_fc = data_runoff[['Lat', 'Lon']].to_numpy()
fc_scalers = []
x_fc_train = x_fc[:num_train_samples, :].copy()
for col_idx in range(x_fc.shape[1]):
    scaler = preprocessing.StandardScaler()
    fc_scalers.append(scaler)
    column = x_fc_train[:, col_idx]
    x_fc_train[:, col_idx] = np.nan_to_num(scaler.fit_transform(column.reshape((-1, 1))).reshape(column.shape))

# Training targets: observed runoff for every sample in the training period.
in_train_period = (data_runoff['date'] >= train_start) & (data_runoff['date'] <= train_end)
y_train = data_runoff[in_train_period]['runoff'].to_numpy()

In [124]:
# Get validation split
# Each x_conv belongs to multiple x_fc's and y's. Therefore, we select the x_conv_val's first and then
# pick the x_fc's and y's that belong to them.
# NOTE(review): this cell overwrites x_conv_train / x_fc_train / y_train, so re-running it without
# re-running the previous cell splits the already-reduced training set again.
num_validation_dates = int(x_conv_train.shape[1] * validation_fraction)
validation_indices = np.random.choice(x_conv_train.shape[1], size=num_validation_dates, replace=False)
# Remaining dates in ascending order (same order the original membership loop produced).
train_indices = np.setdiff1d(np.arange(x_conv_train.shape[1]), validation_indices)

# date_offsets[d] is the row in x_fc_train / y_train where the samples of date d start.
date_offsets = np.concatenate([[0], np.cumsum(samples_per_date, dtype=np.int64)])


def _rows_for_dates(date_indices):
    """Return the sample-row indices of the given dates, concatenated in the given date order."""
    if len(date_indices) == 0:
        return np.empty(0, dtype=np.int64)
    return np.concatenate([np.arange(date_offsets[d], date_offsets[d + 1]) for d in date_indices])


# Resolve the sample rows before x_conv_train is overwritten; a single fancy-index per array replaces
# the per-date concatenation loops and the repeated prefix sums.
val_rows = _rows_for_dates(validation_indices)
train_rows = _rows_for_dates(train_indices)

x_conv_val = x_conv_train[:,validation_indices,:,:,:]
x_conv_train = x_conv_train[:,train_indices,:,:,:]
samples_per_date_train = np.array(samples_per_date)[train_indices]
samples_per_date_val = np.array(samples_per_date)[validation_indices]
samples_per_date_test = np.array(samples_per_date)[num_train_dates:]

# Cast to float64 to match the dtype the original concatenation with float64 empties produced.
x_fc_val = x_fc_train[val_rows, :].astype(np.float64)
y_val = y_train[val_rows].astype(np.float64)
x_fc_train = x_fc_train[train_rows, :].astype(np.float64)
y_train = y_train[train_rows].astype(np.float64)
print('Shapes: x_conv_train {}, x_fc_train {}, y_train {}, x_conv_val {}, x_fc_val {}, y_val {}'.format(x_conv_train.shape, x_fc_train.shape, y_train.shape, 
                                                                                                        x_conv_val.shape, x_fc_val.shape, y_val.shape))

In [142]:
def _as_device_tensor(array):
    """Turn a NumPy array into a float tensor placed on `device`."""
    return torch.from_numpy(array).float().to(device)


# Converted one at a time so each NumPy array can be released as soon as its name is rebound.
x_conv_train = _as_device_tensor(x_conv_train)
x_conv_val = _as_device_tensor(x_conv_val)
x_fc_train = _as_device_tensor(x_fc_train)
x_fc_val = _as_device_tensor(x_fc_val)
y_train = _as_device_tensor(y_train)
y_val = _as_device_tensor(y_val)

In [239]:
# --- Training schedule ---
num_epochs = 100
learning_rate = 3e-3
patience = 50           # training stops once `patience` epochs have passed since the best epoch
min_improvement = 0.05  # validation MSE must drop by more than this to count as a new best
best_loss_model = (-1, np.inf, None)  # (epoch, validation MSE, model state)

# --- Architecture ---
H = 4
H_fc = 4
batch_size = 10
lstm_layers = 2
fc_layers = 1
kernel_size = (5,5)
dropout = 0.2
pooling = [False, True]

grid_shape = x_conv_train.shape[3:]       # (height, width) of the forcing grid
n_input_channels = x_conv_train.shape[2]  # number of forcing variables
# The literal 2 is presumably the number of x_fc features (Lat, Lon) -- confirm against ConvLSTMRegression.
model = conv_lstm.ConvLSTMRegression(
    grid_shape, 2, batch_size, n_input_channels, H,
    kernel_size, lstm_layers, dropout, fc_layers, H_fc, pooling,
).to(device)
loss_fn = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# --- Logging: record the full configuration of this run in TensorBoard ---
writer = SummaryWriter()
param_description = {
    'H': H,
    'H_fc': H_fc,
    'batch_size': batch_size,
    'lstm_layers': lstm_layers,
    'fc_layers': fc_layers,
    'kernel_size': kernel_size,
    'loss': loss_fn,
    'optimizer': optimizer,
    'lr': learning_rate,
    'patience': patience,
    'min_improvement': min_improvement,
    'pooling': pooling,
    'num_epochs': num_epochs,
    'seq_len': seq_len,
    'validation_fraction': validation_fraction,
    'dropout': dropout,
    'train_start': train_start,
    'train_end': train_end,
    'test_start': test_start,
    'test_end': test_end,
}
writer.add_text('Parameter Description', str(param_description))

In [240]:
def expand_x_conv(x_conv, conv_idx, samples_done, samples_per_date, batch_size):
    """
    Build the gridded input for the next batch by repeating per-date entries of x_conv.

    x_conv only contains one entry per distinct date. x_fc and y contain one entry per training sample.
    This method will repeat entries of x_conv as needed to get to the dimensions of x_fc and y.

    Args:
        x_conv: tensor of shape (seq_len, dates, variables, height, width).
        conv_idx: index of the date the batch starts in.
        samples_done: number of samples of date `conv_idx` already consumed by earlier batches.
        samples_per_date: sequence with the number of samples belonging to each date.
        batch_size: number of samples the caller will take from the result.

    Returns:
        (x_conv_in, conv_idx, samples_done): x_conv_in has at least `batch_size` entries along
        dim 1 (the caller slices the first `batch_size`); conv_idx / samples_done are the
        position to pass to the next call.
    """
    x_conv_in = x_conv[:,[conv_idx],:,:,:].repeat(1, int(samples_per_date[conv_idx]), 1, 1, 1)[:,samples_done:,:,:,:]
    samples_done += batch_size
    # Keep pulling in following dates until the batch is full. The original looked ahead exactly
    # one date and could return a short batch when that date had too few samples.
    while x_conv_in.shape[1] < batch_size:
        conv_idx += 1
        if conv_idx >= len(samples_per_date):
            # No dates left: fill the rest of the batch with zero-valued dummies.
            num_dummies = batch_size - x_conv_in.shape[1]
            print('Filling with {} dummies'.format(num_dummies))
            # new_zeros keeps dtype and device of x_conv_in. The original called
            # np.zeros(a, b, c, d, e), which raises TypeError, and would have handed a NumPy
            # array to torch.cat.
            dummies = x_conv_in.new_zeros((x_conv_in.shape[0], num_dummies) + tuple(x_conv_in.shape[2:]))
            x_conv_in = torch.cat([x_conv_in, dummies], dim=1)
        else:
            # Samples still missing from the batch are taken from the new current date.
            samples_done = batch_size - x_conv_in.shape[1]
            additional_conv_in = x_conv[:,[conv_idx],:,:,:].repeat(1, int(samples_per_date[conv_idx]), 1, 1, 1)
            x_conv_in = torch.cat([x_conv_in, additional_conv_in], dim=1)

    return x_conv_in, conv_idx, samples_done

In [241]:
# Training loop with per-epoch validation and patience-based early stopping.
# NOTE(review): the final `del` removes the training tensors, so this cell cannot be re-run
# without re-running the data preparation cells first.
for epoch in range(num_epochs):
    epoch_losses = []

    # Shuffle train dates. x_conv_train holds one entry per date, while x_fc_train / y_train hold one
    # row per sample, so the sample rows are regrouped date by date to follow the new date order.
    shuffle_indices = np.random.choice(x_conv_train.shape[1], size=x_conv_train.shape[1], replace=False)
    x_conv_train = x_conv_train[:,shuffle_indices,:,:,:]
    x_fc_train_tmp = torch.empty((0, x_fc_train.shape[1]), device=device)
    y_train_tmp = torch.empty(0, device=device)
    for i in shuffle_indices:
        start_idx = sum(samples_per_date_train[:i])
        end_idx = start_idx + samples_per_date_train[i]
        x_fc_train_tmp = torch.cat([x_fc_train_tmp, x_fc_train[start_idx:end_idx,:]], dim=0)
        y_train_tmp = torch.cat([y_train_tmp, y_train[start_idx:end_idx]])
    
    samples_per_date_train = samples_per_date_train[shuffle_indices]
    x_fc_train = x_fc_train_tmp
    y_train = y_train_tmp
    del x_fc_train_tmp, y_train_tmp

    model.train()
    conv_idx = 0
    samples_done = 0
    # Only full batches are used; a trailing partial batch is skipped.
    for i in range(y_train.shape[0] // batch_size):
        x_conv_in, conv_idx, samples_done = expand_x_conv(x_conv_train, conv_idx, samples_done, samples_per_date_train, batch_size)
        y_pred = model(x_conv_in[:,:batch_size,:,:,:], x_fc_train[i*batch_size : (i+1)*batch_size,:])
        loss = loss_fn(y_pred, y_train[i*batch_size : (i+1)*batch_size].reshape((batch_size, 1))).to(device)
        epoch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_loss = np.array(epoch_losses).mean()
    print('Epoch', epoch, 'mean train loss:\t{}'.format(epoch_loss))
    writer.add_scalar('loss', epoch_loss, epoch)

    # eval on validation split
    model.eval()
    val_batches = []
    conv_idx = 0
    samples_done = 0
    # no_grad: validation needs no autograd graph, which saves memory and time.
    with torch.no_grad():
        for i in range(y_val.shape[0] // batch_size):
            x_conv_in, conv_idx, samples_done = expand_x_conv(x_conv_val, conv_idx, samples_done, samples_per_date_val, batch_size)
            batch_pred = model(x_conv_in[:,:batch_size,:,:,:], x_fc_val[i*batch_size : (i+1)*batch_size,:]).detach().cpu().numpy()
            val_batches.append(batch_pred)
    model.train()
    # Collect batches in a list and concatenate once: Series.append was removed in pandas 2.0,
    # and this form also copes with zero validation batches.
    val_pred = np.concatenate(val_batches).flatten().astype(np.float64) if val_batches else np.empty(0)
    val_nse, val_mse = evaluate.evaluate_daily('All Stations', pd.DataFrame(val_pred), pd.Series(y_val.cpu().numpy().flatten())[:val_pred.shape[0]])
    print('Epoch {} mean val mse:    \t{},\tnse: {}'.format(epoch, val_mse, val_nse))
    writer.add_scalar('loss_eval', val_mse, epoch)

    if val_mse < best_loss_model[1] - min_improvement:
        # Snapshot the weights. state_dict() returns references to the live parameters, so without
        # cloning, the "best" state would silently follow all later optimizer steps.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        best_loss_model = (epoch, val_mse, best_state)  # new best model
    elif epoch > best_loss_model[0] + patience:
        print('Patience exhausted in epoch {}. Best val-loss was {}'.format(epoch, best_loss_model[1]))
        break

print('Using best model from epoch', str(best_loss_model[0]), 'which had loss', str(best_loss_model[1]))
model.load_state_dict(best_loss_model[2])
load_data.pickle_model('ConvLSTM+LinearLayer_VIC', model, 'allStations', time_stamp)

# Release the large training tensors before the test data is moved to the device.
del x_conv_in, x_conv_train, x_fc_train, x_conv_val, x_fc_val, y_pred

KeyboardInterrupt: 

In [None]:
model.eval()

# scale test data with the scalers that were fitted on the training data
x_conv_test = x_conv[:,num_train_dates:,:,:,:].copy()
for i in range(x_conv.shape[2]):
    x_conv_test[:,:,i,:,:] = np.nan_to_num(conv_scalers[i].transform(x_conv_test[:,:,i,:,:].reshape((-1, 1))).reshape(x_conv_test[:,:,i,:,:].shape))
x_fc_test = x_fc[num_train_samples:,:].copy()
for i in range(x_fc.shape[1]):
    x_fc_test[:,i] = np.nan_to_num(fc_scalers[i].transform(x_fc_test[:,i].reshape((-1,1))).reshape(x_fc_test[:,i].shape))
    
print('x_conv_test shape: {}, x_fc_test shape: {}'.format(x_conv_test.shape, x_fc_test.shape))
# if batch size doesn't align with number of samples, add dummies to the last batch
num_dummies = batch_size - (x_fc_test.shape[0] % batch_size)
if num_dummies != batch_size:
    x_fc_test = np.concatenate([x_fc_test, np.zeros((num_dummies, x_fc_test.shape[1]))], axis=0)
    print('Appended {} dummy entries to x_test. New shape: x_fc_test {}'.format(num_dummies, x_fc_test.shape))

# Predict
x_conv_test = torch.from_numpy(x_conv_test).float().to(device)
x_fc_test = torch.from_numpy(x_fc_test).float().to(device)
predict = data_runoff[(data_runoff['date'] >= test_start) & (data_runoff['date'] <= test_end)].copy()
predict['runoff'] = np.nan
print('Predicting')

batch_preds = []
conv_idx = 0
samples_done = 0
# no_grad: inference needs no autograd graph, which saves memory and time.
with torch.no_grad():
    for i in range(x_fc_test.shape[0] // batch_size):
        x_conv_in, conv_idx, samples_done = expand_x_conv(x_conv_test, conv_idx, samples_done, samples_per_date_test, batch_size)
        batch_pred = model(x_conv_in[:,:batch_size,:,:,:], x_fc_test[i*batch_size : (i+1)*batch_size,:])
        batch_preds.append(batch_pred.detach().cpu().numpy().reshape(batch_size))
# Concatenate once instead of re-copying the growing result array after every batch.
pred_array = np.concatenate(batch_preds) if batch_preds else np.array([])
predict['runoff'] = pred_array[:predict.shape[0]]  # ignore dummies

In [None]:
# Observed runoff for the test period; same row selection that `predict` was built from.
in_test_period = (data_runoff['date'] >= test_start) & (data_runoff['date'] <= test_end)
actuals = data_runoff[in_test_period].copy()

# Score every station separately, then summarise across stations.
nse_list, mse_list = [], []
for station in predict['station'].unique():
    station_pred = predict.loc[predict['station'] == station, 'runoff']
    station_actual = actuals.loc[actuals['station'] == station, 'runoff']
    nse, mse = evaluate.evaluate_daily(station, station_pred, station_actual, writer=writer)
    nse_list.append(nse)
    mse_list.append(mse)

    print(station, '\tNSE:', nse, '\tMSE:', mse, '(clipped to 0)')

print('Median NSE (clipped to 0)', np.median(nse_list), '/ Min', np.min(nse_list), '/ Max', np.max(nse_list))
print('Median MSE (clipped to 0)', np.median(mse_list), '/ Min', np.min(mse_list), '/ Max', np.max(mse_list))

In [14]:
# Close the TensorBoard writer that was opened for this run.
writer.close()

In [156]:
# Put prediction and observation side by side, one row per (date, station), and persist the table.
predicted = predict.rename(columns={'runoff': 'prediction'})
observed = actuals.rename(columns={'runoff': 'actual'})
save_df = pd.merge(predicted, observed, on=['date', 'station'])[['date', 'station', 'prediction', 'actual']]

load_data.pickle_results('ConvLSTM+LinearLayer_VIC', save_df, time_stamp)

'ConvLSTM+LinearLayer_VIC_20190718-123412.pkl'

In [157]:
# Show the current wall-clock time in the same format as time_stamp, to mark the end of the run.
datetime.now().strftime('%Y%m%d-%H%M%S')

'20190718-143116'