In [125]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

import pandas as pd
import numpy as np
import random

import sys
sys.path.append('../resources/')

from get_data import stations


def get_time_series_for_window(start_date, end_date, station, dataframe):
    return dataframe.loc[start_date:end_date, [station]]


def check_nan_values(dataframe):
    return dataframe.isnull().values.any()


def get_valid_sequences(df):
    valid_sequences = []
    starting_idx = 0

    for i, (_, row) in enumerate(df.iterrows()):
        flow = row[0]

        if np.isnan(flow):

            if starting_idx < i-1:
                valid_sequences.append((starting_idx, i))
                starting_idx = i+1
            else:
                starting_idx = i+1
                continue
    return valid_sequences


def valid_seqs_minimum_len(valid_seqs, seq_len):
    
    valid_seqs_min_len = []
    pops = []
    for i, (start, end) in enumerate(valid_seqs):
        if end - start >= seq_len:
            valid_seqs_min_len.append((start, end))

    return valid_seqs_min_len



def split_sequences(possible_seqs, split_len):
    
    usable_seqs = []
    for seq in possible_seqs:
        usable_seqs += get_seq_splits(seq, split_len)
        
    return usable_seqs
        
        
        
def get_seq_splits(seq, split_len):
    
    start = seq[0]
    end = seq[1]
    
    chunks = (end - start) // (split_len+1)     # +1 because there must be an unobserved item after each chunk
                                                # which will be the y (after window value)

    splits = []
    prev_end_chunk = start
    for i in range(chunks):

        start_chunk = prev_end_chunk
        end_chunk = start_chunk + split_len
        splits.append((start_chunk, end_chunk))
        prev_end_chunk = end_chunk+1
        
    return splits


def get_seq_obs_values(seq, df):
    return np.array(df.iloc[seq[0]:seq[1], :]), np.array(df.iloc[seq[1], :])


def split_seqs_train_test(train_frac, usable_seqs):
    
    total_seqs = len(usable_seqs)
    train_amount = round(total_seqs * train_frac)
    
    random.shuffle(usable_seqs)
    train_seqs = usable_seqs[0:train_amount]
    test_seqs = usable_seqs[train_amount:]
    
    return train_seqs, test_seqs


def mount_trainable_testable_arrays(seqs, df):
    
    x_data = []
    y_data = []
    for seq in seqs:
        x, y = get_seq_obs_values(seq, df)
        x_data.append(x)
        y_data.append(y)
    
    return np.array(x_data), np.array(y_data)

In [7]:
interested_stations = ['13180000', '13300000', '13405000', '13410000', '13450000', '13470000', '13550000', '13600002', 
'13650000', '13710001', '13740000', '13750000', '13870000', '13880000', '13885000', '13886000']

df, nodata = stations(interested_stations, '3')

df1 = get_time_series_for_window('2000-01-01', '2015-12-31', '13450000', df)
valid_sequences = get_valid_sequences(df1)

val_seq = valid_seqs_minimum_len(valid_sequences, 31)

100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:37<00:00,  2.33s/it]


In [129]:
usable_chunks = split_sequences(val_seq, 30)

In [126]:
(val_seq[6][1] - val_seq[6][0]) // 30

4

In [109]:
val_seq[6]

(4930, 5074)

In [127]:
get_seq_splits(val_seq[6], 29)

[(4930, 4959), (4960, 4989), (4990, 5019), (5020, 5049)]

In [97]:
usable_chunks[71]

(2130, 2159)

In [94]:
val_seq

[(0, 2122),
 (2155, 2922),
 (3274, 3353),
 (3487, 3528),
 (3774, 4429),
 (4432, 4837),
 (4930, 5074),
 (5285, 5510)]

In [130]:
for i in range(len(usable_chunks)):
    val = get_seq_obs_values(usable_chunks[i], df1)[1]
    if np.isnan(val):
        print(i)
        if input("c?") == 'b':
            break

In [85]:
usable_chunks[]

(5054, 5084)

In [74]:
split_seqs_train_test(0.9, usable_chunks)

([(4494, 4524),
  (806, 836),
  (1457, 1487),
  (4961, 4991),
  (403, 433),
  (2434, 2464),
  (4425, 4455),
  (3805, 3835),
  (62, 92),
  (620, 650),
  (1829, 1859),
  (2558, 2588),
  (124, 154),
  (4711, 4741),
  (31, 61),
  (3991, 4021),
  (1488, 1518),
  (1364, 1394),
  (496, 526),
  (4618, 4648),
  (4930, 4960),
  (775, 805),
  (2170, 2200),
  (2682, 2712),
  (2403, 2433),
  (2620, 2650),
  (5471, 5501),
  (682, 712),
  (3518, 3548),
  (3836, 3866),
  (4680, 4710),
  (2217, 2247),
  (1953, 1983),
  (4177, 4207),
  (899, 929),
  (1643, 1673),
  (1240, 1270),
  (2372, 2402),
  (2139, 2169),
  (5347, 5377),
  (2108, 2138),
  (3898, 3928),
  (4022, 4052),
  (2651, 2681),
  (2341, 2371),
  (744, 774),
  (1984, 2014),
  (2744, 2774),
  (5316, 5346),
  (2837, 2867),
  (4270, 4300),
  (1023, 1053),
  (1736, 1766),
  (4432, 4462),
  (5054, 5084),
  (3305, 3335),
  (4649, 4679),
  (434, 464),
  (1612, 1642),
  (372, 402),
  (3867, 3897),
  (558, 588),
  (5502, 5532),
  (2465, 2495),
  (1767,

In [132]:
train_seqs, test_seqs = split_seqs_train_test(0.9, usable_chunks)
train_seqs[0]

(1333, 1363)

In [135]:
x_train, y_train = mount_trainable_testable_arrays(train_seqs, df1)
x_test, y_test = mount_trainable_testable_arrays(test_seqs, df1)

In [45]:
model = Sequential()
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, input_shape=(None, 1)))
model.add(Dense(32))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [136]:
model.fit(x_train, y_train,validation_data=(x_test,y_test))

Train on 126 samples, validate on 14 samples


<tensorflow.python.keras.callbacks.History at 0x230f860ed48>