In [16]:
import numpy as np
import pandas as pd

In [17]:
# From https://github.com/kratzert/pangeo_lstm_example/blob/master/LSTM_for_rainfall_runoff_modelling.ipynb
def calc_nse(obs: np.array, sim: np.array) -> float:
    """Calculate Nash-Sutcliff-Efficiency.

    :param obs: Array containing the observations
    :param sim: Array containing the simulations
    :return: NSE value.
    """
    # only consider time steps, where observations are available
    sim = np.delete(sim, np.argwhere(obs < 0), axis=0)
    obs = np.delete(obs, np.argwhere(obs < 0), axis=0)

    # check for NaNs in observations
    sim = np.delete(sim, np.argwhere(np.isnan(obs)), axis=0)
    obs = np.delete(obs, np.argwhere(np.isnan(obs)), axis=0)

    denominator = np.sum((obs - np.mean(obs)) ** 2)
    numerator = np.sum((sim - obs) ** 2)
    nse_val = 1 - numerator / denominator

    return nse_val

In [18]:
# Read all input txt files
import glob

path = "../Input files (.txt)"
all_files = glob.glob(path + "/*.txt")


def dateparse(dates, times):
    return [
        pd.datetime.strptime(date + time, "%d.%m.%Y%H:%M:%S")
        for date, time in zip(dates, times)
    ]


df_dict = {}
for file_path in all_files:
    print(f"Reading file: {file_path}")
    # Name is formatted `./Input files (.txt)/nve_inp_XX.txt`
    number = int(file_path.split("_")[-1].split(".")[0])

    df = pd.read_csv(
        file_path,
        encoding="cp1252",
        skiprows=[0],
        delimiter=r"\s+",
        parse_dates={"timestamp": ["dd.mm.yyyy", "hh:mm:ss"]},
        date_parser=dateparse,
    )
    
    # All files have equal values for grC and grC.1
    df = df.drop(['grC.1'], axis=1)
    df_dict[number] = df

Reading file: ../Input files (.txt)/nve_inp_32.txt
Reading file: ../Input files (.txt)/nve_inp_26.txt
Reading file: ../Input files (.txt)/nve_inp_27.txt
Reading file: ../Input files (.txt)/nve_inp_33.txt
Reading file: ../Input files (.txt)/nve_inp_19.txt
Reading file: ../Input files (.txt)/nve_inp_25.txt
Reading file: ../Input files (.txt)/nve_inp_31.txt
Reading file: ../Input files (.txt)/nve_inp_30.txt
Reading file: ../Input files (.txt)/nve_inp_24.txt
Reading file: ../Input files (.txt)/nve_inp_18.txt
Reading file: ../Input files (.txt)/nve_inp_20.txt
Reading file: ../Input files (.txt)/nve_inp_34.txt
Reading file: ../Input files (.txt)/nve_inp_35.txt
Reading file: ../Input files (.txt)/nve_inp_21.txt
Reading file: ../Input files (.txt)/nve_inp_37.txt
Reading file: ../Input files (.txt)/nve_inp_23.txt
Reading file: ../Input files (.txt)/nve_inp_22.txt
Reading file: ../Input files (.txt)/nve_inp_36.txt
Reading file: ../Input files (.txt)/nve_inp_1.txt
Reading file: ../Input files (.t

In [19]:
print(df_dict[1].keys())
n_without_gaps = 0
without_gaps = {}
for n in df_dict:
    df = df_dict[n]
    df1 = (
        df.sort_values("timestamp")
        .apply(lambda x: x.diff().max())
        .reset_index(name="max_diff")
    )
    display(df1)
    if df1["max_diff"][0].days < 2:
        print("Without gap")
        n_without_gaps += 1
        without_gaps[n] = df

print(f"Number of timeseries without gaps: {n_without_gaps}")

Index(['timestamp', 'mm', 'grC', 'm3/s'], dtype='object')


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,96.85
2,grC,11.09
3,m3/s,33.32


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,66.22
2,grC,11.01
3,m3/s,60.99


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,86.42
2,grC,11.13
3,m3/s,107.95


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,60.83
2,grC,11.74
3,m3/s,13.9


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,40.95
2,grC,11.83
3,m3/s,2.95


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,66.13
2,grC,10.78
3,m3/s,3.75


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,61.81
2,grC,12.01
3,m3/s,22.58


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,137.21
2,grC,10.74
3,m3/s,28.19


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,70.51
2,grC,9.2
3,m3/s,9.66


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,60.67
2,grC,12.78
3,m3/s,18.19


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,46.95
2,grC,13.35
3,m3/s,2.28


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,26.21
2,grC,12.28
3,m3/s,5.96


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,52.68
2,grC,11.32
3,m3/s,24.61


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,60.76
2,grC,12.78
3,m3/s,0.1


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,65.19
2,grC,14
3,m3/s,14.61


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,67.39
2,grC,11.58
3,m3/s,7.73


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,67.96
2,grC,9.32
3,m3/s,40.11


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,67.49
2,grC,12.63
3,m3/s,28.68


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,36.02
2,grC,14.08
3,m3/s,57


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,98.36
2,grC,10.26
3,m3/s,36.07


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,54.87
2,grC,11.73
3,m3/s,0.3


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,56.8
2,grC,9.7
3,m3/s,1.28


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,45.9
2,grC,13.62
3,m3/s,20.5


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,99.38
2,grC,13.9
3,m3/s,13.47


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,58.11
2,grC,12.79
3,m3/s,50.84


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,44.68
2,grC,15.28
3,m3/s,9.34


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,64.09
2,grC,11.06
3,m3/s,7.08


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,38.88
2,grC,9.85
3,m3/s,1.6


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,117.38
2,grC,15.44
3,m3/s,12.33


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,47.7
2,grC,12.25
3,m3/s,7.81


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,57.71
2,grC,11.44
3,m3/s,3.32


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,39.02
2,grC,15.11
3,m3/s,3.87


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,69.37
2,grC,14.44
3,m3/s,30.67


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,32.63
2,grC,11.51
3,m3/s,6.81


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,32.64
2,grC,12.96
3,m3/s,8.23


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,61.35
2,grC,11.54
3,m3/s,20


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,42.02
2,grC,15.09
3,m3/s,8.14


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,43.39
2,grC,8.8
3,m3/s,1.63


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,49.95
2,grC,9.13
3,m3/s,2.66


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,46.9
2,grC,10.42
3,m3/s,28.11


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,57.56
2,grC,11.64
3,m3/s,21.23


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,87.9
2,grC,8.93
3,m3/s,0.25


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,66.59
2,grC,15.64
3,m3/s,6.12


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,54.13
2,grC,12.66
3,m3/s,31.65


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,69.02
2,grC,12.58
3,m3/s,24.07


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,110.98
2,grC,9.65
3,m3/s,34.09


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,37.88
2,grC,13.02
3,m3/s,4.04


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,43.14
2,grC,15.04
3,m3/s,10.02


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,59.39
2,grC,10.57
3,m3/s,7.61


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,51.61
2,grC,13.3
3,m3/s,9.4


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,36.69
2,grC,10.75
3,m3/s,21.65


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,50.39
2,grC,14.53
3,m3/s,10.15


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,95.78
2,grC,11.83
3,m3/s,7.77


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,70.29
2,grC,7.14
3,m3/s,1.33


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,51.27
2,grC,9.71
3,m3/s,2.4


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,113.27
2,grC,7.6
3,m3/s,1.13


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,95.25
2,grC,12.56
3,m3/s,10.24


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,66.7
2,grC,8.5
3,m3/s,0.86


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,45.15
2,grC,14.6
3,m3/s,1.63


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,35.17
2,grC,9.32
3,m3/s,5.33


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,47.67
2,grC,11.51
3,m3/s,1.7


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,41.58
2,grC,11.48
3,m3/s,7.53


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,25.21
2,grC,9.87
3,m3/s,6.98


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,76.8
2,grC,9.7
3,m3/s,0.63


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,78.62
2,grC,11.99
3,m3/s,11.57


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,76.12
2,grC,14.21
3,m3/s,26.53


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,71.75
2,grC,14.4
3,m3/s,1.79


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,37.85
2,grC,9.92
3,m3/s,6.4


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,80.53
2,grC,9.3
3,m3/s,4.87


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,49.52
2,grC,8.41
3,m3/s,10.64


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,72.81
2,grC,14.74
3,m3/s,18.68


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,78.07
2,grC,13.87
3,m3/s,6.23


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,86.5
2,grC,10.41
3,m3/s,13.34


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,74.54
2,grC,12.82
3,m3/s,32.24


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,81.89
2,grC,9.4
3,m3/s,37.78


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,37.46
2,grC,13.2
3,m3/s,6.8


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,66.1
2,grC,7.47
3,m3/s,0.39


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,72.31
2,grC,13.21
3,m3/s,15.75


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,70.58
2,grC,14.57
3,m3/s,10.13


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,95.39
2,grC,15.6
3,m3/s,32.74


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,59.64
2,grC,12.64
3,m3/s,33.04


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,72.93
2,grC,15.04
3,m3/s,18.64


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,24.93
2,grC,8.87
3,m3/s,5.7


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,51.1
2,grC,12.84
3,m3/s,11.63


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,84.78
2,grC,8.51
3,m3/s,9.28


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,32.8
2,grC,8
3,m3/s,0.51


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,80.91
2,grC,15.73
3,m3/s,41.8


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,52.04
2,grC,9.47
3,m3/s,12.07


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,35.93
2,grC,14.09
3,m3/s,6.82


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,37.21
2,grC,12.41
3,m3/s,29.68


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,61.13
2,grC,11.91
3,m3/s,44.1


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,32.24
2,grC,10.33
3,m3/s,25.97


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,47.05
2,grC,13.31
3,m3/s,2.28


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,66.73
2,grC,12.87
3,m3/s,31.06


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,37.61
2,grC,11.45
3,m3/s,14.21


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,94.21
2,grC,11.04
3,m3/s,40.06


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,58.99
2,grC,9.03
3,m3/s,10.97


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,76.07
2,grC,8.38
3,m3/s,9.05


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,43.72
2,grC,11.73
3,m3/s,8.97


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,71
2,grC,10.39
3,m3/s,12.04


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,49.64
2,grC,13.36
3,m3/s,2.65


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,67
2,grC,13.56
3,m3/s,35.63


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,71.6
2,grC,15.89
3,m3/s,17.62


Without gap


Unnamed: 0,index,max_diff
0,timestamp,1 days 00:00:00
1,mm,41.32
2,grC,12.02
3,m3/s,1.07


Without gap
Number of timeseries without gaps: 104


In [20]:
long_without_gaps = {}
# Catchments without measurement gaps
for id in sorted(without_gaps.keys()):
    df = without_gaps[id]
    # print(df)
    _min = df["timestamp"].min()
    _max = df["timestamp"].max()
    _diff = _max - _min
    print(f"{id:3d} {_min} {_max} {_diff}")
    if _diff.days >= 5843:
        long_without_gaps[id] = without_gaps[id]
        # print('Added catchment')

print(f"\nNumber of catchment with long measurement period: {len(long_without_gaps)}")
print(f"Ids: {list(long_without_gaps.keys())}")

1 2000-01-01 00:00:00 2015-12-31 00:00:00 5843 days 00:00:00
  2 2003-03-11 00:00:00 2015-12-31 00:00:00 4678 days 00:00:00
  3 2006-10-22 00:00:00 2015-12-31 00:00:00 3357 days 00:00:00
  4 2005-01-01 00:00:00 2015-12-31 00:00:00 4016 days 00:00:00
  5 2007-03-13 00:00:00 2015-12-31 00:00:00 3215 days 00:00:00
  6 2000-01-01 00:00:00 2015-12-31 00:00:00 5843 days 00:00:00
  7 2004-03-25 00:00:00 2015-12-31 00:00:00 4298 days 00:00:00
  8 2006-06-28 00:00:00 2015-12-31 00:00:00 3473 days 00:00:00
  9 2000-01-01 00:00:00 2015-12-31 00:00:00 5843 days 00:00:00
 10 2008-02-04 00:00:00 2015-12-31 00:00:00 2887 days 00:00:00
 11 2002-12-24 00:00:00 2015-12-31 00:00:00 4755 days 00:00:00
 12 2005-07-06 00:00:00 2015-12-31 00:00:00 3830 days 00:00:00
 13 2005-08-26 00:00:00 2015-12-31 00:00:00 3779 days 00:00:00
 14 2000-01-01 00:00:00 2015-12-31 00:00:00 5843 days 00:00:00
 15 2002-11-18 00:00:00 2015-12-31 00:00:00 4791 days 00:00:00
 16 2003-09-19 00:00:00 2015-12-31 00:00:00 4486 days 00:

In [21]:
import matplotlib.pyplot as plt

%matplotlib inline

In [22]:
def plot_multi_histogram(catchments):
    # plot with various axes scales

    plt.figure(figsize=(5, 5))

    all_nses = []
    max_y = None
    for offset in range(12):
        plt.subplot(4, 4, offset + 1)
        offset += 1
        nses = []
        all_nses.append(nses)
        # Calculate nse based on model which predicts the previous day for all timeseries without gaps
        for id in sorted(catchments.keys()):
            runoff = catchments[id]["m3/s"].to_numpy()
            offset = offset
            nse = calc_nse(runoff[offset:], runoff[:-offset])
            nses.append(nse)
            # print(f'{id:02d}, nse: {nse:.3f}')

        # the histogram of the data
        n, bins, patches = plt.hist(
            all_nses[-1],
            bins=[0, 0.10, 0.20, 0.30, 0.40, 0.50, 0.6, 0.7, 0.8, 0.9, 1.0],
        )
        plt.grid(False)
        plt.xticks([])
        plt.yticks([])
        plt.xlabel(f"{offset} days")
        if max_y is None:
            max_y = max(n)
        plt.ylim(top=max_y, bottom=0)

    plt.show()

In [23]:
plot_multi_histogram(long_without_gaps)
plot_multi_histogram(df_dict)

AttributeError: 'Series' object has no attribute 'to_numpy'

In [27]:
def create_test_and_validation_set(
    df_dict, split_date, length, x_parameters=["mm", "grC"], y_parameters=["m3/s"]
):
    TRAIN_X_DICT = {}
    TRAIN_Y_DICT = {}
    TEST_X_DICT = {}
    TEST_Y_DICT = {}
    for i, _id in enumerate(df_dict):
        # if i % 2 == 0:
        #     print(f'{i+1}/{len(df_dict)}: Parsing catchment {_id}')
        catchment_dataframe = df_dict[_id]
        train_dataframe = catchment_dataframe[
            catchment_dataframe["timestamp"] <= split_date
        ]
        test_dataframe = catchment_dataframe[
            catchment_dataframe["timestamp"] > split_date
        ]

        TRAIN_X_DICT[_id] = substring_array(
            train_dataframe[x_parameters].values[:-1], length
        )
        TRAIN_Y_DICT[_id] = train_dataframe[y_parameters].values[length:]

        TEST_X_DICT[_id] = substring_array(
            test_dataframe[x_parameters].values[:-1], length
        )
        TEST_Y_DICT[_id] = test_dataframe[y_parameters].values[length:]

        assert len(TRAIN_X_DICT[_id]) == len(TRAIN_Y_DICT[_id])
        assert len(TEST_X_DICT[_id]) == len(TEST_Y_DICT[_id])

    return TRAIN_X_DICT, TRAIN_Y_DICT, TEST_X_DICT, TEST_Y_DICT


def substring_array(array, length):
    python_list = array.tolist()
    new_list = []
    for i in range(len(python_list) - length + 1):
        new_list.append(python_list[i : i + length])
    return np.array(new_list)

In [28]:
import datetime

TRAIN_X_DICT, TRAIN_Y_DICT, TEST_X_DICT, TEST_Y_DICT = create_test_and_validation_set(
    long_without_gaps,
    datetime.datetime(2007, 8, 31),
    1,
    x_parameters=["mm", "grC", "m3/s"],
)

In [134]:
# Training using k_nearest_neighbour
from sklearn.neighbors import KNeighborsRegressor

stats_dict = {"desc": []}
lookback_days = [1]
x_parameters_list = [["mm", "grC"], ["mm", "grC", "m3/s"]]

for lookback in lookback_days:
    print(f"Number of days lookback: {lookback}")
    for use_rainoff_for_x in [False]:
        x_parameters = (
            x_parameters_list[1] if use_rainoff_for_x else x_parameters_list[0]
        )
        (
            TRAIN_X_DICT,
            TRAIN_Y_DICT,
            TEST_X_DICT,
            TEST_Y_DICT,
        ) = create_test_and_validation_set(
            long_without_gaps,
            datetime.datetime(2007, 8, 31),
            lookback,
            x_parameters=x_parameters,
        )

        for n_neighbors in [1]:
            stats_dict["desc"].append(
                f"lookback {lookback}, use_rainoff {use_rainoff_for_x}, n_neighbors {n_neighbors}"
            )
            for _id in TRAIN_X_DICT.keys():
            # for _id in [94]:
                # print(_id)
                train_x, train_y, test_x, test_y = (
                    TRAIN_X_DICT[_id],
                    TRAIN_Y_DICT[_id],
                    TEST_X_DICT[_id],
                    TEST_Y_DICT[_id],
                )

                model = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
                train_x_flatend = train_x.reshape(
                    (train_x.shape[0], -1), order="F"
                )  # https://stackoverflow.com/a/37500847
                # print(train_x_flatend[0:2])
                # print(train_y[0])
                model.fit(train_x_flatend, train_y)

                test_x_flatend = test_x.reshape((test_x.shape[0], -1), order="F")
                pred_y = model.predict(test_x_flatend)
                nse = calc_nse(pred_y, test_y)
                print(f"Catchment {_id:2d}: {nse:.3f}")
                if _id not in stats_dict:
                    stats_dict[_id] = []
                stats_dict[_id].append(nse)

display(nse)

Number of days lookback: 1
Catchment  1: -0.488
Catchment  6: -0.333
Catchment  9: -0.927
Catchment 14: -0.072
Catchment 18: -0.308
Catchment 25: -0.620
Catchment 30: -0.527
Catchment 38: -0.219
Catchment 44: -0.251
Catchment 65: -0.644
Catchment 66: -0.232
Catchment 88: -0.539
Catchment 92: -0.792
Catchment 94: 0.477
Catchment 97: -0.111
Catchment 99: -0.148


-0.14765602772976405

In [79]:
display(stats_dict)

{'desc': ['lookback 1, use_rainoff False, n_neighbors 1',
  'lookback 1, use_rainoff False, n_neighbors 2',
  'lookback 1, use_rainoff False, n_neighbors 3',
  'lookback 1, use_rainoff False, n_neighbors 4'],
 94: [0.4773603406289034,
  0.5260722391477675,
  0.5541837571473378,
  0.5558224686477836]}

In [80]:
stats_df = pd.DataFrame(data=stats_dict)

In [81]:
stats_df.to_excel("knn.xlsx")

In [82]:
import plotly.graph_objects as go
import pandas as pd

for _id in list(long_without_gaps)[:3]:
    df = df_dict[_id]

    fig = go.Figure()
    for y_name in ['m3/s', 'mm', 'grC']:
        fig.add_trace(go.Scatter(x=df['timestamp'], y=df[y_name], name=y_name))

    fig.update_layout(title_text=f'Catchment {_id}',
                    xaxis_rangeslider_visible=True)
fig.show()

In [160]:

n_neighbors = 3
off = 1
for lookback in [1, 5, 10, 20]:
    (
    TRAIN_X_DICT,
    TRAIN_Y_DICT,
    TEST_X_DICT,
    TEST_Y_DICT,
    ) = create_test_and_validation_set(
        long_without_gaps,
        datetime.datetime(2007, 8, 31),
        lookback,
        x_parameters=["mm", "grC"],
    )
    for _id in [6]:
        # print(_id)
        train_x, train_y, test_x, test_y = (
            TRAIN_X_DICT[_id],
            TRAIN_Y_DICT[_id],
            TEST_X_DICT[_id],
            TEST_Y_DICT[_id],
        )

        model = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
        train_x_flatend = train_x.reshape(
            (train_x.shape[0], -1), order="F"
        )  # https://stackoverflow.com/a/37500847
        # print(train_x_flatend[0:2])
        # print(train_y[0])
        model.fit(train_x_flatend, train_y)

        test_x_flatend = test_x.reshape((test_x.shape[0], -1), order="F")
        pred_y = model.predict(test_x_flatend)
        if off > 0:
            test_y = test_y[:-off]
            pred_y = running_mean(pred_y, 0.5)
        nse = calc_nse(pred_y, test_y)
        print(f"Catchment {_id:2d}: {nse:.3f}")
        fig = go.Figure()

        fig.add_trace(go.Scatter(y=pred_y.flatten()[100:400], name='pred'))
        fig.add_trace(go.Scatter(y=test_y.flatten()[100:400], name='obs'))
        fig.update_layout(title_text=f'Catchment',
                            xaxis_rangeslider_visible=True)
        fig.show()

Catchment  6: -0.907


Catchment  6: 0.006


Catchment  6: 0.181


Catchment  6: 0.132


In [139]:
fig = go.Figure()

fig.add_trace(go.Scatter(y=pred_y.flatten(), name='pred'))
fig.add_trace(go.Scatter(y=test_y.flatten(), name='obs'))
fig.update_layout(title_text=f'Catchment',
                    xaxis_rangeslider_visible=True)
fig.show()

In [113]:
# https://stackoverflow.com/a/27681394
def running_mean(x, ratio):
    # cumsum = np.cumsum(np.insert(x, 0, 0)) 
    # return (cumsum[N:] - cumsum[:-N]) / float(N)
    return ratio*x[1:] + (1-ratio)*x[:-1]


print(len(pred_y_smooth))
print(len(test_y))

3038
3039


In [129]:
print(calc_nse(pred_y, test_y))
movement = 1
print(calc_nse(pred_y[movement:], test_y[:-movement]))

0.128739008842218
0.29780863021884374


In [130]:
N = 5
for N in range(2, 3):
    pred_y_smooth = running_mean(pred_y.flatten(), 0.7)
    test_y_for_smooth = test_y.flatten()[:-1]
    print(f"N={N}, nse: {calc_nse(pred_y_smooth, test_y_for_smooth)}")

    fig = go.Figure()

    fig.add_trace(go.Scatter(y=pred_y_smooth, name='pred'))
    fig.add_trace(go.Scatter(y=test_y_for_smooth, name='obs'))
    fig.update_layout(title_text=f'Catchment',
                        xaxis_rangeslider_visible=True)
    fig.show()

N=2, nse: 0.2750370580811261


In [163]:
from typing import Tuple
# Residual
def load_training_data(path: str, catchment: int) -> Tuple[pd.DataFrame, int]:
    """Load the meteorological forcing data and other characteristics of a specific catchment.

    :param catchment: number (id)
    
    :return: pd.DataFrame containing the meteorological forcing and other characteristics data.
    """
    # path = '../Training_data'
    all_files = glob.glob(path + "/*.csv")

    file_exist = False
    
    # Loop through files and find correct catchment
    for file_path in all_files:
        # Name is formatted `./Input files (.txt)/nve_inp_XX.txt`
        number = int(file_path.split('_')[-1].split('.')[0])
        
        if number == catchment:
            file_exist = True
            df = pd.read_csv(file_path)
            df["timestamp"] = pd.to_datetime(df["timestamp"])
            # Delay by one day
            df["Residual_in"] = np.roll(df["Residual"], 1)
            
    # Return None if catchment does not exist
    if file_exist == False:
        print("Catchment does not exist")
        return None
    else:
        return df

In [166]:
# Number of catchment with long measurement period: 16
# Ids: [1, 6, 9, 14, 18, 25, 30, 38, 44, 65, 66, 88, 92, 94, 97, 99]
path = '../Training_data'
df = load_training_data(path, 6)
display(df)

Unnamed: 0.1,Unnamed: 0,timestamp,mm,grC,grC.1,m3/s,evapo_transp,grC1,grC2,grC3,...,mm6,mm7,mm8,mm9,mm10,EPOT,Q_obs,Q_sim,Residual,Residual_in
0,0,2000-01-01,1.29,2.19,2.19,1.45,0.1,1.252955,1.300455,1.415455,...,1.968412,2.028455,2.106159,2.190422,2.381653,0.100,1.45,0.014,-1.436,0.303
1,1,2000-01-02,11.52,4.28,4.28,1.50,0.1,3.342955,3.390455,3.505455,...,17.578373,18.114576,18.808487,19.560974,21.268713,0.100,1.50,0.180,-1.320,-1.436
2,2,2000-01-03,11.79,3.35,3.35,1.86,0.1,2.412955,2.460455,2.575455,...,17.990366,18.539137,19.249310,20.019434,21.767199,0.100,1.86,0.478,-1.382,-1.320
3,3,2000-01-04,38.43,3.94,3.94,3.05,0.1,3.002955,3.050455,3.165455,...,58.640354,60.429095,62.743936,65.254185,70.951098,0.100,3.05,1.405,-1.645,-1.382
4,4,2000-01-05,2.58,2.16,2.16,3.51,0.1,1.222955,1.270455,1.385455,...,3.936823,4.056910,4.212317,4.380843,4.763306,0.100,3.51,2.382,-1.128,-1.645
5,5,2000-01-06,3.37,5.59,5.59,3.52,0.1,4.652955,4.700455,4.815455,...,5.142285,5.299143,5.502135,5.722264,6.221837,0.100,3.52,2.158,-1.362,-1.128
6,6,2000-01-07,21.29,4.09,4.09,4.16,0.1,3.152955,3.200455,3.315455,...,32.486421,33.477373,34.759781,36.150445,39.306502,0.100,4.16,2.676,-1.484,-1.362
7,7,2000-01-08,20.41,5.43,5.43,5.07,0.1,4.492955,4.540455,4.655455,...,31.143628,32.093620,33.323022,34.656204,37.681809,0.100,5.07,3.706,-1.364,-1.484
8,8,2000-01-09,23.76,4.07,4.07,6.18,0.1,3.132955,3.180455,3.295455,...,36.255395,37.361314,38.792504,40.344508,43.866721,0.100,6.18,4.692,-1.488,-1.364
9,9,2000-01-10,7.80,2.90,2.90,5.92,0.1,1.962955,2.010455,2.125455,...,11.902024,12.265078,12.734913,13.244409,14.400691,0.100,5.92,5.033,-0.887,-1.488


In [168]:
from numba import njit

@njit
def reshape_data(x: np.ndarray, y: np.ndarray, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Reshape matrix data into sample shape for LSTM training.

    :param x: Matrix containing input features column wise and time steps row wise
    :param y: Matrix containing the output feature.
    :param seq_length: Length of look back days for one day of prediction
    
    :return: Two np.ndarrays, the first of shape (samples, length of sequence,
        number of features), containing the input data for the LSTM. The second
        of shape (samples, 1) containing the expected output for each input
        sample.
    """
    num_samples, num_features = x.shape

    x_new = np.zeros((num_samples - seq_length + 1, seq_length, num_features))
    y_new = np.zeros((num_samples - seq_length + 1, 1))

    for i in range(0, x_new.shape[0]):
        x_new[i, :, :num_features] = x[i:i + seq_length, :]
        y_new[i, :] = y[i + seq_length - 1, 0]

    return x_new, y_new

TypeError: not enough arguments: expected 3, got 1

In [200]:
# input_features = ['mm1','mm2','mm3','mm4','mm5','mm6','mm7','mm8','mm9','mm10','grC1','grC2','grC3','grC4','grC5','grC6','grC7','grC8','grC9','grC10','Residual_in','EPOT']
# input_features = ['mm', 'grC', 'Q_sim', 'EPOT']
input_features = ['Residual_in']
output_features = ['Residual']

def dataframe_to_x_y(df, start_date, end_date, input_features, output_features, seq_length):
    df = df[df["timestamp"] >= start_date]
    df = df[df["timestamp"] <= end_date]
    x = np.array([df[feature].values for feature in input_features]).T
    y = np.array([df[feature].values for feature in output_features]).T
    return reshape_data(x, y, seq_length)

def create_test_and_training_data(catchement, training_dates, test_dates, input_features, output_features, seq_length, flatten=True):
    catchment_dataframe = load_training_data('../Training_data', catchement)
    x_train, y_train = dataframe_to_x_y(catchment_dataframe, training_dates[0], training_dates[1], input_features, output_features, seq_length)
    x_test, y_test = dataframe_to_x_y(catchment_dataframe, test_dates[0], test_dates[1], input_features, output_features, seq_length)
    if flatten:
        x_train = x_train.reshape((x_train.shape[0], -1), order="F")
        y_train = y_train.reshape((y_train.shape[0], -1), order="F")
        x_test = x_test.reshape((x_test.shape[0], -1), order="F")
        y_test = y_test.reshape((y_test.shape[0], -1), order="F")
    return x_train, y_train, x_test, y_test

In [230]:
catchment = 6
seq_length = 10
x_train, y_train, x_test, y_test = create_test_and_training_data(6, (datetime.datetime(2000, 9, 1), datetime.datetime(2007, 8, 31)), (datetime.datetime(2007, 9, 1), datetime.datetime(2015, 8, 31)), input_features, output_features, seq_length)

In [231]:
model = KNeighborsRegressor(n_neighbors=1, weights='distance')
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_test = y_test
print(calc_nse(y_pred, y_test))

0.008526158605329903


In [226]:
fig = go.Figure()

fig.add_trace(go.Scatter(y=y_pred.flatten(), name='pred'))
fig.add_trace(go.Scatter(y=y_test.flatten(), name='obs'))
fig.update_layout(title_text=f'Catchment',
                    xaxis_rangeslider_visible=True)
fig.show()