ConvLSTM trained on measured streamflow at gauge stations.

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from datetime import datetime, timedelta
import netCDF4 as nc
import torch
from torch import nn, utils
from torch.utils.tensorboard import SummaryWriter
from src import load_data, evaluate, conv_lstm, datasets, utils, stgcn
import random
import pickle
import json
import networkx as nx

time_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
time_stamp

'20190823-201832'

In [2]:
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='../log.out', mode='a')
chandler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - {} - %(message)s'.format(time_stamp))
fhandler.setFormatter(formatter)
chandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.addHandler(chandler)
logger.setLevel(logging.INFO)

In [3]:
USE_CUDA = False
if torch.cuda.is_available():
    print('CUDA Available')
    USE_CUDA = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
device = torch.device('cuda:0' if USE_CUDA else 'cpu')
num_devices = torch.cuda.device_count() if USE_CUDA else 0
logger.warning('cuda devices: {}'.format(list(torch.cuda.get_device_name(i) for i in range(num_devices))))
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

CUDA Available
2019-08-23 20:18:32,542 - 20190823-201832 - cuda devices: ['Tesla V100-SXM2-16GB']


In [4]:
partitioning_strategy = 'distance'  # 'distance' or 'unilabel', see https://arxiv.org/abs/1801.07455
max_hops = 10 if partitioning_strategy == 'distance' else None
rdrs_vars = [4, 5]
agg = ['sum', 'minmax']
include_month = True
dem, landcover, soil, groundwater = True, False, False, False
landcover_types = []
seq_len = 8
seq_steps = 1

train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(days=seq_len * seq_steps)  # first day for which to make a prediction in train set
train_end = '2012-12-31'
test_start = '2013-01-01'
test_end = '2014-12-31'
val_fraction = 0.1

In [5]:
with open('../data/simulations_shervan/subbasins.geojson', 'r') as f:
     subbasin_shapes = json.loads(f.read())

subbasin_graph = utils.create_subbasin_graph()
subbasin_graph.remove_nodes_from(['sub-1'])
subbasins = list(int(n[3:]) for n in subbasin_graph.nodes)

if partitioning_strategy == 'unilabel':
    adjacency = torch.unsqueeze(torch.from_numpy(nx.to_numpy_array(subbasin_graph)), 0).float().to(device)
elif partitioning_strategy == 'distance':  # use distances in upstream-graph, i.e. in reversed downstream-graph
    adjacency = utils.create_hop_matrix(subbasin_graph.reverse(), max_hops).float().to(device)
else:
    raise Exception('Unsupported partitioning strategy')

In [6]:
train_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, subbasins, seq_len, seq_steps, train_start, train_end, aggregate_daily=agg, include_months=include_month, 
                                                   dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)
test_dataset = datasets.SubbasinAggregatedDataset(rdrs_vars, subbasins, seq_len, seq_steps, test_start, test_end, aggregate_daily=agg, include_months=include_month, 
                                                  conv_scalers=train_dataset.scalers, dem=dem, landcover=landcover, soil=soil, groundwater=groundwater, landcover_types=landcover_types)

station_subbasins = list(train_dataset.subbasin_to_station.keys())

Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_4-5_8-1_2010-01-09 00:00:00-2012-12-31_dem__sum-minmax_month.pkl
Using saved dataset in file /home/mgauch/runoff-nn/src/../data/train_test/SubbasinAggregatedDataset_4-5_8-1_2013-01-01-2014-12-31_dem__sum-minmax_month.pkl


In [7]:
# Train model
num_epochs = 800
learning_rate = 2e-3
patience = 300
min_improvement = 0.01
best_loss_model = (-1, np.inf, None)
dropout = 0.3
weight_decay = 1e-5
edge_weighting = False

batch_size = 32
model = stgcn.Model(train_dataset.x.shape[2], adjacency.shape[0], edge_importance_weighting=edge_weighting, 
                    adjacency_shape=adjacency.shape, dropout=dropout).to(device)
if num_devices > 1:
    model = torch.nn.DataParallel(model, device_ids=list(range(num_devices)))
loss_fn = evaluate.NSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

writer = SummaryWriter(comment='STGCN_stationTraining')
param_description = {'time_stamp': time_stamp, 'batch_size': batch_size, 'loss': loss_fn, 'include_month': include_month, 'aggregate_daily': agg, 'rdrs_vars': rdrs_vars, 'dropout': dropout, 'val_fraction': val_fraction,
                     'optimizer': optimizer, 'lr': learning_rate, 'patience': patience, 'min_improvement': min_improvement, 'x_train_shape': train_dataset.x.shape, 'partitioning_strategy': partitioning_strategy, 'max_hops': max_hops,
                     'x_test_shape': test_dataset.x.shape, 'num_epochs': num_epochs, 'seq_len': seq_len, 'seq_steps': seq_steps, 'train_start': train_start, 'train_end': train_end, 'weight_decay': weight_decay, 'edge_weighting': edge_weighting,
                     'landcover_types': landcover_types, 'test_start': test_start, 'test_end': test_end, 'model': str(model).replace('\n','').replace(' ', ''),}
writer.add_text('Parameter Description', str(param_description))
str(param_description)

"{'time_stamp': '20190823-201832', 'batch_size': 32, 'loss': NSELoss(), 'include_month': True, 'aggregate_daily': ['sum', 'minmax'], 'rdrs_vars': [4, 5], 'dropout': 0.3, 'val_fraction': 0.1, 'optimizer': Adam (\nParameter Group 0\n    amsgrad: False\n    betas: (0.9, 0.999)\n    eps: 1e-08\n    lr: 0.002\n    weight_decay: 1e-05\n), 'lr': 0.002, 'patience': 300, 'min_improvement': 0.01, 'x_train_shape': torch.Size([1088, 8, 49, 724]), 'partitioning_strategy': 'distance', 'max_hops': 10, 'x_test_shape': torch.Size([730, 8, 49, 724]), 'num_epochs': 800, 'seq_len': 8, 'seq_steps': 1, 'train_start': datetime.datetime(2010, 1, 9, 0, 0), 'train_end': '2012-12-31', 'weight_decay': 1e-05, 'edge_weighting': False, 'landcover_types': [], 'test_start': '2013-01-01', 'test_end': '2014-12-31', 'model': 'Model((st_gcn_networks):ModuleList((0):st_gcn((gcn):ConvTemporalGraphical((conv):Conv2d(49,640,kernel_size=(1,1),stride=(1,1)))(tcn):Sequential((0):BatchNorm2d(64,eps=1e-05,momentum=0.1,affine=True,

In [8]:
val_indices = np.random.choice(len(train_dataset), size=int(val_fraction * len(train_dataset)), replace=False)
train_indices = list(i for i in range(len(train_dataset)) if i not in val_indices)
train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, sampler=train_sampler, pin_memory=True, drop_last=False)
val_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, sampler=val_sampler, pin_memory=True, drop_last=False)
    
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size, shuffle=False, pin_memory=True, drop_last=False)

In [9]:
y_train_means = train_dataset.y_means.to(device)

In [10]:
torch.manual_seed(0)
np.random.seed(0)
for epoch in range(num_epochs):
    model.train()

    train_losses = torch.tensor(0.0)
    for i, train_batch in enumerate(train_dataloader):
        y_mask = train_batch['y_mask'].all(dim=0).to(device)  # only backprop for stations where we have non-NA values for the whole batch
        if not torch.any(y_mask):
            continue
            
        y_pred = model(train_batch['x'].permute(0,2,1,3).to(device), adjacency)
        train_loss = loss_fn(y_pred[:,y_mask], train_batch['y'][:,y_mask].to(device), means=y_train_means[y_mask])
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        train_losses += train_loss.detach()
        
    train_loss = (train_losses / len(train_dataloader)).item()
    print('Epoch', epoch, 'mean train loss:\t{}'.format(train_loss))
    writer.add_scalar('loss_nse', train_loss, epoch)
    
    model.eval()
    val_losses = torch.tensor(0.0)
    for i, val_batch in enumerate(val_dataloader):
        y_mask = val_batch['y_mask'].all(dim=0).to(device)
        if not torch.any(y_mask):
            continue
            
        y_pred = model(val_batch['x'].permute(0,2,1,3).to(device), adjacency).detach()
        val_losses += loss_fn(y_pred[:,y_mask], val_batch['y'][:,y_mask].to(device), means=y_train_means[y_mask]).detach()
            
    val_loss = (val_losses / len(val_dataloader)).item()
    print('Epoch', epoch, 'mean val loss:\t{}'.format(val_loss))
    writer.add_scalar('loss_nse_val', val_loss, epoch)
    
    if val_loss < best_loss_model[1] - min_improvement:
        best_loss_model = (epoch, val_loss, model.state_dict())  # new best model
        load_data.pickle_model('STGCN_stationTraining', model, 'allStations', time_stamp, model_type='torch.dill')
    elif epoch > best_loss_model[0] + patience:
        print('Patience exhausted in epoch {}. Best val-loss was {}'.format(epoch, best_loss_model[1]))
        break
    
print('Using best model from epoch', str(best_loss_model[0]), 'which had loss', str(best_loss_model[1]))
model.load_state_dict(best_loss_model[2])
load_data.save_model_with_state('STGCN_stationTraining', best_loss_model[0], model, optimizer, time_stamp, use_dill=True)

Epoch 0 mean train loss:	1.2367256879806519
Epoch 0 mean val loss:	1.6778547763824463
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190823-201832.pkl
Epoch 1 mean train loss:	1.1475633382797241
Epoch 1 mean val loss:	1.0966538190841675
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190823-201832.pkl
Epoch 2 mean train loss:	1.0023038387298584
Epoch 2 mean val loss:	1.089839220046997
Epoch 3 mean train loss:	0.9417170882225037
Epoch 3 mean val loss:	0.9162752628326416
Saved model as /home/mgauch/runoff-nn/src/../pickle/models/STGCN_stationTraining_allStations_20190823-201832.pkl
Epoch 4 mean train loss:	0.9190874695777893
Epoch 4 mean val loss:	0.9505383968353271
Epoch 5 mean train loss:	0.8884377479553223
Epoch 5 mean val loss:	1.0225058794021606
Epoch 6 mean train loss:	0.8860957026481628
Epoch 6 mean val loss:	0.9850835800170898
Epoch 7 mean train loss:	0.8919770121574402
Epoch 7 mean val

In [11]:
del y_train_means, y_pred, y_mask
if USE_CUDA:
    torch.cuda.empty_cache()

In [12]:
logger.warning('predicting')
model.eval()

predictions = []  # test on same graph but different time
for i, test_batch in enumerate(test_dataloader):
    pred = model(test_batch['x'].permute(0,2,1,3).to(device), adjacency).detach().cpu()
    predictions.append(pred)
    
predictions = torch.cat(predictions, dim=0)

2019-08-24 00:36:53,360 - 20190823-201832 - predicting


In [13]:
actuals = test_dataset.data_streamflow.copy()
if len(actuals['date'].unique()) != len(predictions):
    print('Warning: length of prediction {} and actuals {} does not match.'.format(len(predictions), len(actuals['date'].unique())))

nse_dict = {}
mse_dict = {}
predictions_df = pd.DataFrame(columns=actuals.columns)
for i in range(len(subbasins)):
    subbasin = subbasins[i]
    if subbasin not in station_subbasins:
        continue
    station = test_dataset.subbasin_to_station[subbasin]
    act = actuals[actuals['station'] == station].set_index('date')['runoff']
    if predictions.shape[0] != act.shape[0]:
        print('Warning: length of prediction {} and actuals {} does not match for subbasin {}. Ignoring excess actuals.'.format(len(predictions), len(act), subbasin))
        act = act.iloc[:predictions.shape[0]]
            
    pred = pd.DataFrame({'runoff': predictions[:,i]}, index=act.index)
    pred['subbasin'] = subbasin
    pred['station'] = station
    predictions_df = predictions_df.append(pred.reset_index(), sort=True)
    
    nse, mse = evaluate.evaluate_daily(station, pred['runoff'], act, writer=writer)
    nse_dict[subbasin] = nse
    mse_dict[subbasin] = mse
    print(station, subbasin, '\tNSE:', nse, '\tMSE:', mse, '(clipped to 0)')


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


02GA038 676 	NSE: 0.10491611025044156 	MSE: 147.2757220413576 (clipped to 0)
02GA047 677 	NSE: 0.13653443943008303 	MSE: 67.68708659015742 (clipped to 0)
04159492 678 	NSE: 0.2748030826073138 	MSE: 336.7044345175085 (clipped to 0)
04159900 680 	NSE: 0.4172837874030568 	MSE: 25.317250684882573 (clipped to 0)
04160600 681 	NSE: 0.4201359931870571 	MSE: 16.775996552307234 (clipped to 0)
02GG006 682 	NSE: 0.1792364817852251 	MSE: 43.9988746558405 (clipped to 0)
02GD004 683 	NSE: 0.31469232570993866 	MSE: 37.73514677990548 (clipped to 0)
02GA018 684 	NSE: 0.16574811678491574 	MSE: 209.52501380928308 (clipped to 0)
02GA010 685 	NSE: 0.14333154593330055 	MSE: 408.43685853545514 (clipped to 0)
02GB007 686 	NSE: 0.4881506636199011 	MSE: 16.006559883795276 (clipped to 0)
02GB001 687 	NSE: 0.08036034597101416 	MSE: 6906.379029443659 (clipped to 0)
04215000 688 	NSE: 0.10667581132323634 	MSE: 85.80630108237908 (clipped to 0)
04166100 689 	NSE: -0.14361442048872486 	MSE: 8.386476419730366 (clipped 

In [14]:
def print_nse_mse(name, nse_dict, mse_dict, subbasins):
    nses = list(nse_dict[s] for s in subbasins)
    mses = list(mse_dict[s] for s in subbasins)
    print(name, 'Median NSE (clipped to 0)', np.nanmedian(nses), '/ Min', np.nanmin(nses), '/ Max', np.nanmax(nses))
    print(' ' * len(name), 'Median MSE (clipped to 0)', np.nanmedian(mses), '/ Min', np.nanmin(mses), '/ Max', np.nanmax(mses))
    
    return np.nanmedian(nses)

nse_median_stations = print_nse_mse('Stations test', nse_dict, mse_dict, list(s for s in station_subbasins))

writer.add_scalar('nse_median_stations_temporal', nse_median_stations)

Stations test Median NSE (clipped to 0) 0.24454255462425906 / Min -0.5117260848680631 / Max 0.4881506636199011
              Median MSE (clipped to 0) 125.42436461066076 / Min 8.386476419730366 / Max 44931.53098051141


In [15]:
nse_dict

{676: 0.10491611025044156,
 677: 0.13653443943008303,
 678: 0.2748030826073138,
 680: 0.4172837874030568,
 681: 0.4201359931870571,
 682: 0.1792364817852251,
 683: 0.31469232570993866,
 684: 0.16574811678491574,
 685: 0.14333154593330055,
 686: 0.4881506636199011,
 687: 0.08036034597101416,
 688: 0.10667581132323634,
 689: -0.14361442048872486,
 690: -0.09365615064942112,
 691: 0.04053819625086241,
 692: 0.21598371805661398,
 693: 0.056987242208799804,
 694: 0.31148922246978705,
 695: 0.2491880556869689,
 696: 0.443459017898302,
 697: 0.387195183127955,
 698: 0.4243285371849709,
 699: 0.3255957064870766,
 700: 0.2338690327125612,
 701: 0.33615918264848044,
 702: 0.23989705356154922,
 703: 0.42252198115721173,
 704: 0.33056234645470584,
 705: 0.22049829220413952,
 706: 0.26053962783883566,
 707: -0.15410309791251442,
 709: -0.10401042979627761,
 710: 0.28344666688150033,
 712: -0.01035663384048946,
 713: 0.3653805723081148,
 714: 0.44216999546015534,
 715: 0.47034662490107626,
 716: 0.3

In [16]:
writer.close()

In [17]:
save_df = pd.merge(predictions_df.rename({'runoff': 'prediction'}, axis=1), 
                   test_dataset.simulated_streamflow, on=['date', 'subbasin'])
save_df = pd.merge(save_df, actuals.rename({'runoff': 'actual'}, axis=1), how='left', on=['date', 'station'])\
            [['date', 'subbasin', 'station', 'prediction', 'actual', 'simulated_streamflow']]
load_data.pickle_results('STGCN_stationTraining', save_df, time_stamp)

'STGCN_stationTraining_20190823-201832.pkl'

In [18]:
datetime.now().strftime('%Y%m%d-%H%M%S')

'20190824-003709'