ConvLSTM trained on gridded forcings for all stations

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from datetime import datetime, timedelta
from sklearn import preprocessing
import netCDF4 as nc
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from src import load_data, evaluate, conv_lstm
import torch.autograd as autograd
import pickle

time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
time_stamp

CUDA Available


'20190717-122732'

In [2]:
USE_CUDA = False
if torch.cuda.is_available():
    print('CUDA Available')
    USE_CUDA = True
device = torch.device('cuda' if USE_CUDA else 'cpu')
torch.manual_seed(0)
np.random.seed(0)

writer = SummaryWriter()

CUDA Available


In [3]:
rdrs_data, rdrs_var_names, rdrs_time_index = load_data.load_rdrs_forcings(as_grid=True)
data_runoff = load_data.load_discharge_gr4j_vic()
data_runoff = data_runoff.pivot(index='date', columns='station', values='runoff')
data_runoff = data_runoff.loc[:,data_runoff.columns != '04214500']  # This station has >1K nan target values

  rdrs_data[:,i,:,:] = rdrs_nc[forcing_variables[i]][:]
  data = pd.read_csv(os.path.join(dir, f), skiprows=2, skipfooter=1, index_col=False, header=None, names=['runoff'], na_values='-1.2345')
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
rdrs_data = rdrs_data[:,[4,5],:,:]  # only keep precipitation and temperature
rdrs_var_names = rdrs_var_names[4:6]

In [5]:
class ConvLSTMRegression(nn.Module):
    def __init__(self, input_size, output_size, batch_size, input_dim, hidden_dim, kernel_size, num_layers):
        super(ConvLSTMRegression, self).__init__()
        self.batch_size = batch_size
        self.conv_lstm = conv_lstm.ConvLSTM(input_size, input_dim, hidden_dim, kernel_size, num_layers)
        self.linear = nn.Linear(hidden_dim * input_size[0] * input_size[1], output_size)

    def forward(self, input):
        lstm_out, hidden = self.conv_lstm(input)
        linear_in = lstm_out[-1][:,-1,:,:,:].reshape((batch_size, -1))
        
        return self.linear(linear_in)

In [6]:
n_vars, n_rows, n_cols = rdrs_data.shape[1], rdrs_data.shape[2], rdrs_data.shape[3]

In [7]:
predictions = {}
actuals = {}
seq_len = 5 * 24
validation_fraction = 0.1

torch.manual_seed(0)
np.random.seed(0)

train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(hours=seq_len)  # first day for which to make a prediction in train set
train_end = '2012-12-31'
test_start = '2013-01-01'
test_end = '2014-12-31'

# For training, only include dates where no station has an nan target value.
data_runoff = data_runoff.loc[train_start:test_end]
non_nan_train_dates = pd.Series(data_runoff[(pd.isna(data_runoff).sum(axis=1)==0) | (data_runoff.index >= test_start)].index)
data_runoff = data_runoff.loc[non_nan_train_dates,:]

num_train_days = non_nan_train_dates[(non_nan_train_dates >= train_start) & (non_nan_train_dates <= train_end)].shape[0]
num_total_days = num_train_days + len(pd.date_range(test_start, test_end, freq='D'))

In [8]:
# input: (seq_len, samples, variables, height, width)
x = np.zeros((seq_len, num_total_days, n_vars, n_rows, n_cols))
i = 0
for day in range(len(pd.date_range(train_start, test_end, freq='D'))):
    # For each day that is to be predicted, cut out a sequence that ends with that day's 23:00 and is seq_len long
    day_date = train_start + timedelta(days=day)
    if len(non_nan_train_dates[non_nan_train_dates == day_date]) == 0:
        continue
    end_of_day_index = rdrs_time_index[rdrs_time_index == day_date].index.values[0] + 23
    x[:,i,:,:,:] = rdrs_data[end_of_day_index - seq_len : end_of_day_index]
    i += 1
    
# Scale training data
scalers = []  # save scalers to apply them to test data later
x_train = x[:,:num_train_days,:,:,:].copy()
for i in range(x.shape[2]):
    scalers.append(preprocessing.StandardScaler())
    x_train[:,:,i,:,:] = np.nan_to_num(scalers[i].fit_transform(x_train[:,:,i,:,:].reshape((-1, 1))).reshape(x_train[:,:,i,:,:].shape))
x_train = torch.from_numpy(x_train).float().to(device)    
y_train = torch.from_numpy(data_runoff.loc[train_start:train_end].to_numpy()).float().to(device)

In [9]:
# Get validation split
num_validation_samples = int(x_train.shape[1] * validation_fraction)
validation_indices = np.random.choice(range(x_train.shape[1]), size=num_validation_samples)
shuffle_indices = np.arange(x_train.shape[1])
np.random.shuffle(shuffle_indices)
x_train = x_train[:,shuffle_indices,:,:,:]
y_train = y_train[shuffle_indices,:]

x_val, x_train = x_train[:,-num_validation_samples:,:], x_train[:,:-num_validation_samples,:]
y_val, y_train = y_train[-num_validation_samples:,:], y_train[:-num_validation_samples,:]
print('Shapes: x_train {}, y_train {}, x_val {}, y_val {}'.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape))

Shapes: x_train torch.Size([120, 915, 2, 34, 39]), y_train torch.Size([915, 45]), x_val torch.Size([120, 101, 2, 34, 39]), y_val torch.Size([101, 45])


In [10]:
# Train model
learning_rate = 2e-3
patience = 100
min_improvement = 0.05
best_loss_model = (-1, np.inf, None)

# Prepare model
H = 5
batch_size = 20
lstm_layers = 10
kernel_size = (3,3)
model = ConvLSTMRegression(x_train.shape[3:], data_runoff.shape[1], batch_size, x_train.shape[2], H, kernel_size, lstm_layers).to(device)
loss_fn = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
for epoch in range(300):
    epoch_losses = []

    shuffle_indices = np.arange(x_train.shape[1])
    np.random.shuffle(shuffle_indices)
    x_train = x_train[:,shuffle_indices,:,:,:]
    y_train = y_train[shuffle_indices,:]

    model.train()
    #model.init_hidden()
    for i in range(x_train.shape[1] // batch_size):
        #model.hidden = model.init_hidden()
        y_pred = model(x_train[:,i*batch_size : (i+1)*batch_size,:,:,:])
        loss = loss_fn(y_pred, y_train[i*batch_size : (i+1)*batch_size,:].reshape((batch_size,y_train.shape[1]))).to(device)
        epoch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_loss = np.array(epoch_losses).mean()
    print('Epoch', epoch, 'mean train loss:\t{}'.format(epoch_loss))
    writer.add_scalar('loss', epoch_loss, epoch)

    # eval on validation split
    model.eval()
    val_pred = pd.Series()
    for i in range(x_val.shape[1] // batch_size):
        #model.hidden = model.init_hidden()
        batch_pred = model(x_val[:,i*batch_size : (i+1)*batch_size,:,:,:]).detach().cpu().numpy().reshape((batch_size, y_val.shape[1]))
        val_pred = val_pred.append(pd.DataFrame(batch_pred))
    model.train()
    val_nse, val_mse = evaluate.evaluate_daily('All Stations', pd.DataFrame(val_pred.values.flatten()), pd.Series(y_val.cpu().numpy().flatten())[:val_pred.shape[0]*val_pred.shape[1]])
    print('Epoch {} mean val mse:    \t{},\tnse: {}'.format(epoch, val_mse, val_nse))
    writer.add_scalar('loss_eval', val_mse, epoch)

    if val_mse < best_loss_model[1] - min_improvement:
        best_loss_model = (epoch, val_mse, model.state_dict())  # new best model
    elif epoch > best_loss_model[0] + patience:
        print('Patience exhausted in epoch {}. Best val-loss was {}'.format(epoch, best_loss_model[1]))
        break

print('Using best model from epoch', str(best_loss_model[0]), 'which had loss', str(best_loss_model[1]))
model.load_state_dict(best_loss_model[2])
load_data.pickle_model('ConvLSTM+LinearLayer_VIC', model, 'allStations', time_stamp)
model.eval()

Epoch 0 mean train loss:	3209.1125528971356
Epoch 0 mean val mse:    	1347.7435302734375,	nse: 0.0908656607782371
Epoch 1 mean train loss:	2752.0862616644963
Epoch 1 mean val mse:    	1211.7293701171875,	nse: 0.18261556581716742
Epoch 2 mean train loss:	2454.5735500759547
Epoch 2 mean val mse:    	1317.2901611328125,	nse: 0.11140834413622513
Epoch 3 mean train loss:	2497.5356838650173
Epoch 3 mean val mse:    	1368.138916015625,	nse: 0.0771078299749881
Epoch 4 mean train loss:	2230.266506618924
Epoch 4 mean val mse:    	922.4185180664062,	nse: 0.3777731006275896
Epoch 5 mean train loss:	2129.22047390408
Epoch 5 mean val mse:    	834.2853393554688,	nse: 0.4372242126584335
Epoch 6 mean train loss:	2055.3392137315536
Epoch 6 mean val mse:    	921.92529296875,	nse: 0.37810583582732427
Epoch 7 mean train loss:	1850.2706902398004
Epoch 7 mean val mse:    	1122.9857177734375,	nse: 0.24247833316636525
Epoch 8 mean train loss:	1744.6233072916666
Epoch 8 mean val mse:    	796.8050537109375,	nse:

  "type " + obj.__name__ + ". It won't be checked "


Saved model as ../pickle/models/ConvLSTM+LinearLayer_VIC_allStations_20190717-122732.pkl


ConvLSTMRegression(
  (conv_lstm): ConvLSTM(
    (cell_list): ModuleList(
      (0): ConvLSTMCell(
        (conv): Conv2d(7, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (1): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (2): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (3): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (4): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (5): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (6): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (7): ConvLSTMCell(
        (conv): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (

In [12]:
# scale test data
x_test = x[:,num_train_days:,:,:,:].copy()
for i in range(x.shape[2]):
    x_test[:,:,i,:,:] = np.nan_to_num(scalers[i].transform(x_test[:,:,i,:,:].reshape((-1, 1))).reshape(x_test[:,:,i,:,:].shape))
print('x_test shape: {}'.format(x_test.shape))
# if batch size doesn't align with number of samples, add dummies to the last batch
num_dummies = x_test.shape[1] % batch_size
if num_dummies != 0:
    x_test = np.concatenate([x_test, np.zeros((x_test.shape[0], batch_size - (x_test.shape[1] % batch_size), 
                                               x_test.shape[2], x_test.shape[3], x_test.shape[4]))], axis=1)
    print('Appended dummy entries to x_test. New shape: {}'.format(x_test.shape))

# Predict
x_test = torch.from_numpy(x_test).float().to(device)
predict = data_runoff.loc[test_start:test_end].copy()
for station in predict.columns:
    predict[station] = np.nan
print('Predicting')
for i in range(x_test.shape[1] // batch_size):
    #model.hidden = model.init_hidden()
    pred = model(x_test[:,i*batch_size : (i+1)*batch_size,:,:,:]).detach().cpu().numpy().reshape((batch_size, predict.shape[1]))
    if (i+1) * batch_size <= predict.shape[0]:
        predict.iloc[i*batch_size:(i+1)*batch_size,:] = pred
    else:
        predict.iloc[i*batch_size:,:] = pred[:-num_dummies,:]

x_test shape: (120, 730, 2, 34, 39)
Appended dummy entries to x_test. New shape: (120, 740, 2, 34, 39)
Predicting


In [13]:
actuals = data_runoff.loc[test_start:test_end].copy()

nse_list = []
mse_list = []
for station in predict.columns:
    nse, mse = evaluate.evaluate_daily(station, predict[station], actuals[station], writer=writer)
    nse_list.append(nse)
    mse_list.append(mse)
    
    print(station, '\tNSE:', nse, '\tMSE:', mse, '(clipped to 0)')

print('Median NSE (clipped to 0)', np.median(nse_list), '/ Min', np.min(nse_list), '/ Max', np.max(nse_list))
print('Median MSE (clipped to 0)', np.median(mse_list), '/ Min', np.min(mse_list), '/ Max', np.max(mse_list))


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


02GA010 	NSE: 0.17250661764345487 	MSE: 394.52695607520377 (clipped to 0)
02GA018 	NSE: 0.1112776088977605 	MSE: 223.20545510870343 (clipped to 0)
02GA038 	NSE: 0.10893136249828372 	MSE: 146.61505863231716 (clipped to 0)
02GA047 	NSE: -0.34763858117028157 	MSE: 105.64142161698133 (clipped to 0)
02GB001 	NSE: 0.15146505331818128 	MSE: 6372.391551233281 (clipped to 0)
02GB007 	NSE: -0.23131307811435242 	MSE: 38.505640468197704 (clipped to 0)
02GC002 	NSE: 0.2770323033988562 	MSE: 92.90354294583504 (clipped to 0)
02GC007 	NSE: -0.08230978527763577 	MSE: 32.60062586972414 (clipped to 0)
02GC010 	NSE: 0.10517271811337325 	MSE: 53.19741014199898 (clipped to 0)
02GC018 	NSE: 0.20160353952258614 	MSE: 54.07283877597912 (clipped to 0)
02GC026 	NSE: 0.05301496808233508 	MSE: 152.30621005660112 (clipped to 0)
02GD004 	NSE: 0.0832117626647807 	MSE: 50.481178016536674 (clipped to 0)
02GE007 	NSE: 0.2822775933803692 	MSE: 25.96365003160486 (clipped to 0)
02GG002 	NSE: 0.28275122204252423 	MSE: 172.9

In [14]:
writer.close()

In [15]:
prediction = predict.unstack().reset_index().rename({0: 'runoff'}, axis=1)
save_df = pd.merge(prediction, actuals.unstack().reset_index().rename({0: 'actual'}, axis=1), on=['date', 'station'])

load_data.pickle_results('ConvLSTM+LinearLayer_VIC', save_df, time_stamp)

'ConvLSTM+LinearLayer_VIC_20190717-122732.pkl'

In [16]:
datetime.now().strftime('%Y%m%d-%H%M%S')

'20190717-150931'