LSTM trained on gridded forcings for each station

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from sklearn import preprocessing
import netCDF4 as nc
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from src import load_data, evaluate
import torch.autograd as autograd

In [2]:
USE_CUDA = False
if torch.cuda.is_available():
    print('CUDA Available')
    USE_CUDA = True
device = torch.device('cuda' if USE_CUDA else 'cpu')

writer = SummaryWriter()

CUDA Available


In [3]:
data_runoff = load_data.load_discharge_gr4j_vic()

  data = pd.read_csv(os.path.join(dir, f), skiprows=2, skipfooter=1, index_col=False, header=None, names=['runoff'], na_values='-1.2345')
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [4]:
# For each station, read which grid cells belong to its subwatershed
station_cell_mapping = pd.read_csv('../data/station_cell_mapping.csv', skiprows=1, names=['station', 'lat', 'lon', 'row', 'col', 'area'])

In [5]:
rdrs_data = load_data.load_rdrs_forcings()

  var_data = pd.DataFrame(rdrs_nc[var][:].reshape(43825,34*39))


In [6]:
class LSTMRegression(nn.Module):
        def __init__(self, input_dim, hidden_dim, num_layers, batch_size):
            super(LSTMRegression, self).__init__()
            self.batch_size = batch_size
            self.hidden_dim = hidden_dim
            self.num_layers = num_layers
            self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers)
            self.linear = nn.Linear(hidden_dim, 1)
            self.hidden = self.init_hidden()
        def init_hidden(self):
            return (torch.randn(self.num_layers, self.batch_size, self.hidden_dim, device=device),
                    torch.randn(self.num_layers, self.batch_size, self.hidden_dim, device=device))

        def forward(self, input):
            lstm_out, self.hidden = self.lstm(input, self.hidden)
            return self.linear(lstm_out[-1])

In [7]:
predictions = {}
actuals = {}
models = {}
seq_len = 7 * 24
train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(days=seq_len // 24 + 1)
train_end = '2013-12-31'
test_start = '2014-01-01'
test_end = '2014-12-31'

for station in data_runoff['station'].unique():
    print(station)
    station_runoff = data_runoff[data_runoff['station'] == station].set_index('date')
    station_cell_ids = 39 * station_cell_mapping[station_cell_mapping['station'] == station]['col'] \
        + station_cell_mapping[station_cell_mapping['station'] == station]['row']
    station_rdrs = rdrs_data.filter(regex='_(' + '|'.join(map(lambda x: str(x), station_cell_ids)) + ')$', axis=1)
    
    month_onehot = pd.get_dummies(station_rdrs.index.month, prefix='month')
    month_onehot.index = station_rdrs.index
    station_rdrs = station_rdrs.join(month_onehot)
    if any(station_runoff['runoff'].isna()):
        print('Station', station, 'had NA runoff values. Skipping.')
        continue
    
    station_train = station_rdrs.loc[train_start : train_end]
    station_test = station_rdrs.loc[test_start : test_end]
    num_train_days = len(pd.date_range(train_start, train_end, freq='D'))
    
    x = np.zeros((seq_len, len(pd.date_range(train_start, test_end, freq='D')), station_rdrs.shape[1]))
    for day in range(x.shape[1]):
        x[:,day,:] = station_rdrs[train_start - timedelta(hours = seq_len - 1) + timedelta(days=day) : train_start + timedelta(days=day)]
    
    # Scale training data
    scalers = []  # save scalers to apply them to test data later
    x_train = x[:,:num_train_days,:]
    for i in range(x.shape[2]):
        scalers.append(preprocessing.StandardScaler())
        x_train[:,:,i] = scalers[i].fit_transform(x_train[:,:,i].reshape((-1, 1))).reshape(x_train[:,:,i].shape)
    x_train = torch.from_numpy(x_train).float().to(device)
    y_train = torch.from_numpy(station_runoff.loc[train_start:train_end, 'runoff'].to_numpy()).float().to(device)
    
    # Train model
    learning_rate = 2e-3
    patience = 50
    min_improvement = 0.05
    best_loss_model = (-1, np.inf, None)
    
    # Prepare model
    H = 200
    batch_size = 3
    lstm_layers = 2
    model = LSTMRegression(station_rdrs.shape[1], H, lstm_layers, batch_size).to(device)
    loss_fn = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # warm-start if possible
    if best_loss_model[2] is not None:
        model.load_state_dict(best_loss_model[2])
    for epoch in range(300):
        epoch_losses = []
        for i in range(num_train_days // batch_size):
            model.hidden = model.init_hidden()
            y_pred = model(x_train[:,i*batch_size : (i+1)*batch_size,:])

            loss = loss_fn(y_pred, y_train[i*batch_size : (i+1)*batch_size].reshape((batch_size,1))).to(device)
            epoch_losses.append(loss.item())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        epoch_loss = np.array(epoch_losses).mean()
        print('Epoch', epoch, 'mean loss:', epoch_loss)
        writer.add_scalar('loss_' + station, epoch_loss, epoch)
        if epoch_loss < best_loss_model[1] - min_improvement:
            best_loss_model = (epoch, epoch_loss, model.state_dict())  # new best model
        elif epoch > best_loss_model[0] + patience:
            print('Patience exhausted in epoch {}. Best loss was {}'.format(epoch, best_loss_model[1]))
            break

    print('Using best model from epoch', str(best_loss_model[0]), 'which had loss', str(best_loss_model[1]))
    model.load_state_dict(best_loss_model[2])
    model.eval()        
    
    # scale test data
    x_test = x[:,num_train_days:,:]
    for i in range(x.shape[2]):
        x_test[:,:,i] = scalers[i].transform(x_test[:,:,i].reshape((-1, 1))).reshape(x_test[:,:,i].shape)
    # if batch size doesn't align with number of samples, add dummies to the last batch
    if x_test.shape[1] % batch_size != 0:
        x_test = np.concatenate([x_test, np.zeros((x_test.shape[0], batch_size - (x_test.shape[1] % batch_size), x_test.shape[2]))], axis=1)
    
    x_test = torch.from_numpy(x_test).float().to(device)
    predict = station_runoff[test_start:test_end].copy()
    predict['runoff'] = np.nan
    pred_array = np.array([])
    print('  Predicting')
    for i in range(x_test.shape[1] // batch_size):
        pred_array = np.concatenate([pred_array, model(x_test[:,i*batch_size : (i+1)*batch_size,:]).detach().cpu().numpy().reshape(batch_size)])
    predict['runoff'] = pred_array[:predict.shape[0]]  # ignore dummies
    predictions[station] = predict
    actuals[station] = station_runoff['runoff'].loc[test_start:test_end]
    models[station] = model

04159900
Epoch 0 mean loss: 41.908054062828214
Epoch 1 mean loss: 41.05602528911562
Epoch 2 mean loss: 39.54978373519377
Epoch 3 mean loss: 38.612018249770685
Epoch 4 mean loss: 36.484691924773315
Epoch 5 mean loss: 37.99502525895099
Epoch 6 mean loss: 36.31324566454179
Epoch 7 mean loss: 33.04339137317205
Epoch 8 mean loss: 28.199339747773088
Epoch 9 mean loss: 26.058544146239303
Epoch 10 mean loss: 27.560022486248528
Epoch 11 mean loss: 22.85691627181421
Epoch 12 mean loss: 22.631627450427732
Epoch 13 mean loss: 21.695456259960427
Epoch 14 mean loss: 18.392350009640737
Epoch 15 mean loss: 20.54314268730708
Epoch 16 mean loss: 16.58552919440551
Epoch 17 mean loss: 17.504006397171995
Epoch 18 mean loss: 13.037509387228658
Epoch 19 mean loss: 13.14212381971029
Epoch 20 mean loss: 13.184221856636048
Epoch 21 mean loss: 10.97933406881369
Epoch 22 mean loss: 8.784514903150846
Epoch 23 mean loss: 10.765022820038316
Epoch 24 mean loss: 12.991331527356929
Epoch 25 mean loss: 11.54719897941397

In [8]:
nse_list = []
for station, predict in predictions.items():
    nse = evaluate.evaluate_daily(station, predict['runoff'], actuals[station], writer=writer)
    nse_list.append(nse)
    
    print(station, '\tNSE: (clipped to 0)', nse_list[-1])

print('Median NSE (clipped to 0)', np.median(nse_list), '/ Min', np.min(nse_list), '/ Max', np.max(nse_list))


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


04159900 	NSE: (clipped to 0) 0.07555177164821347
02GE007 	NSE: (clipped to 0) 0.10223855243834301
04215500 	NSE: (clipped to 0) 0.28020533719271534
04174500 	NSE: (clipped to 0) 0.3425259378756733
04177000 	NSE: (clipped to 0) 0.17493143628889452
04165500 	NSE: (clipped to 0) 0.36339362511318973
04200500 	NSE: (clipped to 0) 0.3073001606337943
02GG009 	NSE: (clipped to 0) -0.010345404968821192
04199500 	NSE: (clipped to 0) 0.3056176717609139
04207200 	NSE: (clipped to 0) 0.45238336289408576
04208504 	NSE: (clipped to 0) 0.4260165468568047
04213000 	NSE: (clipped to 0) 0.30721041423949735
02GC010 	NSE: (clipped to 0) 0.28029324305837877
02GB001 	NSE: (clipped to 0) 0.15612362150482217
04159492 	NSE: (clipped to 0) 0.06366013566794015
04195820 	NSE: (clipped to 0) 0.1985995107960219
04161820 	NSE: (clipped to 0) 0.2974335211317911
04166500 	NSE: (clipped to 0) 0.21780111197988528
04215000 	NSE: (clipped to 0) 0.04773756179089206
04196800 	NSE: (clipped to 0) 0.14628019201070175
02GA010 

In [9]:
writer.close()

In [10]:
load_data.pickle_results('LSTM_VIC', (predictions, actuals), models=models)

'LSTM_VIC_20190701-084118.pkl'