In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn import metrics, svm, neural_network, ensemble
from datetime import datetime, timedelta
import hydroeval
import netCDF4 as nc

In [2]:
dir = 'ObservedDischarge_GR4J+VIC'  # Read runoff observations
data_runoff = pd.DataFrame(columns=['date','runoff', 'station'])
for f in os.listdir(dir):
    if not f.endswith('.rvt'):
        continue
    data = pd.read_csv(os.path.join(dir, f), skiprows=2, skipfooter=1, index_col=False, header=None, names=['runoff'], na_values='-1.2345')
    data['date'] = pd.date_range('2010-01-01', periods=len(data), freq='D')
    data['station'] = f[11:-4]
    data_runoff = data_runoff.append(data, ignore_index=True)

  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [3]:
data_runoff[pd.isna(data_runoff['runoff'])].groupby(['station'])['date'].count()  # Count NAs per station

station
02GC018       44
02GG013       31
04214500    1003
Name: date, dtype: int64

In [4]:
forcing_variables = ['RDRS_FB_SFC', 'RDRS_FI_SFC', 'RDRS_HU_40m', 'RDRS_P0_SFC', 'RDRS_PR0_SFC', 'RDRS_TT_40m', 'RDRS_UVC_40m', 'RDRS_WDC_40m']
rdrs_nc = nc.Dataset('RDRS_CaPA24hr_forcings_final.nc', 'r')

In [5]:
rdrs_data = pd.DataFrame(index=pd.date_range('2010-01-01 7:00', '2015-01-01 7:00', freq='H')) # Using 7:00 because forcings are UTC, while runoff is local time

In [6]:
for var in forcing_variables:
    var_data = pd.DataFrame(rdrs_nc[var][:].reshape(43825,34*39))
    var_data.dropna(axis=1, how='all', inplace=True)
    var_data.columns = [var + '_' + str(c) for c in var_data.columns]
    rdrs_data = rdrs_data.reset_index(drop=True).join(var_data.reset_index(drop=True))
rdrs_data.index = pd.date_range('2010-01-01 7:00', '2015-01-01 7:00', freq='H')

In [7]:
resampled = rdrs_data.resample('D')
rdrs_daily = resampled.mean().join(resampled.min(), lsuffix='_mean', rsuffix='_min').join(resampled.max().rename(lambda c: c + '_max', axis=1))

In [8]:
data = data_runoff.join(rdrs_daily, on='date')

In [9]:
rdrs_nc.close()
del resampled, rdrs_daily, rdrs_nc, var_data, rdrs_data, data_runoff

In [None]:
# Create test and train splits for each station (by time), then create predictions for each subbasin
history = 7
train_start = datetime.strptime('2010-01-01', '%Y-%m-%d') + timedelta(days=history + 1)
use_runoff_history = False

predictions = {}
actuals = {}
independent_vars = list(col for col in data.columns if col not in ['date', 'station', 'runoff'])
for station in data['station'].unique():
    print(station)
    station_data = data[data['station'] == station].set_index('date')

    for i in range(1, history + 1):
        print('   {}'.format(i))
        station_data[['-{}_{}'.format(i, var) for var in independent_vars]] = station_data[independent_vars].shift(i, axis=0)
        if use_runoff_history:
            station_data['-{}_runoff'.format(i)] = station_data['runoff'].shift(i, axis=0)
    
    station_train = station_data.loc[train_start : '2013-12-31']
    station_test = station_data.loc['2014-01-01' : '2014-12-31']
    
    print('  Fitting model')
    m = linear_model.LinearRegression()
    m.fit(station_train.drop(['station', 'runoff'], axis=1), station_train['runoff'])
    
    print('  Creating prediction dataframe')
    station_test = station_test[~pd.isna(station_test['runoff'])]
    if len(station_test) == 0:
        print('Skipping', station)
        continue
    predict = pd.DataFrame(columns=[x for x in station_test.columns if x not in ['station']], index=station_test.index)
    predict.loc[:,independent_vars] = station_test[independent_vars]
    for i in range(history):
        print('   {}'.format(i))
        predict[['-{}_{}'.format(i + 1, var) for var in independent_vars]] = \
            station_test[['-{}_{}'.format(i + 1, var) for var in independent_vars]]
        if use_runoff_history:
            for j in range(i + 1, history + 1):
                predict.iloc[i]['-{}_runoff'.format(j)] = station_test.iloc[i]['-{}_runoff'.format(j)]
    print('  Predicting')
    if not use_runoff_history:
        predict['runoff'] = m.predict(predict.drop('runoff', axis=1))
    else:
        for i in range(len(predict)):
            print('   day {}'.format(i))
            predict.iloc[i]['runoff'] = m.predict([predict.iloc[i].drop('runoff')])[0]
            for j in range(1, history + 1):
                if (i + j) >= len(predict):
                    break
                predict.iloc[i + j]['-{}_runoff'.format(j)] = predict.iloc[i]['runoff']
    predictions[station] = predict
    actuals[station] = station_test['runoff']

02GA010
   1
  Fitting model
  Predicting
   0


In [None]:
# Evaluate each subbasin
nse_list = []
plot_list = ['02GA010']
for station, predict in predictions.items():
    mse = metrics.mean_squared_error(actuals[station], predict['runoff'])

    predict_clipped = predict.copy()
    predict_clipped['runoff'] = predict['runoff'].clip(0)
    mse_clip = metrics.mean_squared_error(actuals[station], predict_clipped['runoff'])
    nse_list.append(hydroeval.evaluator(hydroeval.nse, predict_clipped['runoff'].to_numpy(), actuals[station].to_numpy())[0])
    
    print(station, '\n\tRMSE (clipped to 0):', np.sqrt(mse_clip))
    print('\tNSE: (clipped to 0)', nse_list[-1])
    
    if station in plot_list:
        plt.figure(figsize=(17,4))
        plt.title(station)
        plt.plot(actuals[station], label='Test')
        plt.plot(predict_clipped['runoff'], label='Prediction')
        plt.legend()
print('Median NSE (clipped to 0)', np.median(nse_list))