In [84]:
import pandas as pd
import numpy as np


# Locations of the three tab-separated input files.
flow_path = "/home/colombelli/Documents/hydro-ml/data/Vazao.txt"
rain_path = "/home/colombelli/Documents/hydro-ml/data/Chuva.txt"
et_path = "/home/colombelli/Documents/hydro-ml/data/ET.txt"


def _read_measurements(path, value_column):
    """Read one header-less, tab-separated file and name its five columns.

    The first four columns are labelled day, month, year and hour; the fifth
    receives ``value_column``.
    """
    table = pd.read_csv(path, sep="\t", header=None)
    table.columns = ["day", "month", "year", "hour", value_column]
    return table


flow_df = _read_measurements(flow_path, "flow")
rain_df = _read_measurements(rain_path, "rain")
et_df = _read_measurements(et_path, "et")

In [109]:
def get_time_series_for_window(start_date, end_date, station, dataframe):
    """Select one column of ``dataframe`` between two index labels.

    Parameters
    ----------
    start_date, end_date : index labels
        Bounds of the label-based slice handed to ``DataFrame.loc``.
    station : column label
        The single column to keep.
    dataframe : pandas.DataFrame
        Frame to slice.

    Returns
    -------
    pandas.DataFrame
        A one-column frame (the column is requested as a list, so the result
        is a frame rather than a Series).
    """
    row_window = slice(start_date, end_date)
    wanted_columns = [station]
    return dataframe.loc[row_window, wanted_columns]


def check_nan_values(dataframe):
    """Tell whether ``dataframe`` holds at least one missing value.

    Returns a truthy value when any cell is null, a falsy one otherwise
    (including for an empty frame).
    """
    missing_mask = dataframe.isnull().values
    return missing_mask.any()


def get_valid_sequences(df):
    """Find the row ranges of ``df`` whose first column has no missing values.

    Ranges are positional and half-open: ``(start, end)`` covers rows
    ``start .. end - 1``. For every range except a trailing one, ``end`` is
    the position of the missing value that terminates it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is scanned for missing values.

    Returns
    -------
    list of (int, int)
        The valid ranges in increasing order.

    Notes
    -----
    * A run that is terminated by a missing value is only reported when it
      has at least two rows.
    * The trailing run (after the last missing value) is reported when it is
      non-empty and no column of ``df`` has a missing value in it. An empty
      frame, or data ending with a missing value, therefore no longer
      produces an empty ``(n, n)`` range.
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    # Positional access to the first column; avoids the slow per-row
    # ``iterrows`` loop and integer-key lookups on a label-indexed Series.
    is_missing = df.iloc[:, 0].isnull().values

    valid_sequences = []
    starting_idx = 0
    for missing_pos in np.flatnonzero(is_missing):
        missing_pos = int(missing_pos)  # plain ints in the returned tuples
        if missing_pos - starting_idx >= 2:
            valid_sequences.append((starting_idx, missing_pos))
        # Whatever happened, the next candidate run starts after the gap.
        starting_idx = missing_pos + 1

    # Trailing run: only meaningful when at least one row is left.
    if starting_idx < n_rows:
        tail_has_missing = df.iloc[starting_idx:, :].isnull().values.any()
        if not tail_has_missing:
            valid_sequences.append((starting_idx, n_rows))

    return valid_sequences


def valid_seqs_minimum_len(valid_seqs, seq_len):
    """Keep only the ranges that span at least ``seq_len`` rows.

    Parameters
    ----------
    valid_seqs : iterable of (start, end)
        Candidate ranges.
    seq_len : int
        Minimum value of ``end - start`` for a range to be kept.

    Returns
    -------
    list of (start, end)
        The surviving ranges, in their original order.
    """
    return [
        (first, last)
        for first, last in valid_seqs
        if last - first >= seq_len
    ]



def split_sequences(possible_seqs, split_len):
    """Cut every range in ``possible_seqs`` into windows of ``split_len`` rows.

    Each range is handed to ``get_seq_splits`` and the resulting windows are
    concatenated into one flat list, preserving order.
    """
    return [
        window
        for candidate in possible_seqs
        for window in get_seq_splits(candidate, split_len)
    ]
        
        
        
def get_seq_splits(seq, split_len):
    """Cut one ``(start, end)`` range into consecutive windows.

    Every window spans ``split_len`` rows and is followed by one extra row
    that is left out of all windows: it is the value to predict (the "y")
    for the window before it. Each window therefore consumes
    ``split_len + 1`` rows of the range.

    Returns
    -------
    list of (int, int)
        Windows as ``(window_start, window_start + split_len)`` pairs; empty
        when the range is too short for a single window plus its target.
    """
    first, last = seq[0], seq[1]

    stride = split_len + 1  # window rows + the single target row after them
    n_windows = (last - first) // stride

    return [
        (first + k * stride, first + k * stride + split_len)
        for k in range(n_windows)
    ]


def get_seq_obs_values(seq, df):
    """Extract the observed window and its following target row from ``df``.

    Parameters
    ----------
    seq : (int, int)
        Positional ``(start, end)`` pair.
    df : pandas.DataFrame
        Source of the values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Rows ``start .. end - 1`` as a 2-D array, and row ``end`` as a 1-D
        array. Row ``end`` must exist in ``df``.
    """
    first_row, target_row = seq[0], seq[1]
    window = df.iloc[first_row:target_row, :]
    target = df.iloc[target_row, :]
    return np.array(window), np.array(target)


def split_seqs_train_test(train_frac, usable_seqs, seed=None):
    """Randomly partition ``usable_seqs`` into a train and a test list.

    Parameters
    ----------
    train_frac : float
        Fraction of the sequences assigned to the train list; the count is
        ``round(len(usable_seqs) * train_frac)``.
    usable_seqs : sequence
        Sequences to distribute. It is not modified: a copy is shuffled.
    seed : optional
        When given, a dedicated ``random.Random(seed)`` generator is used so
        the split is reproducible. When ``None`` (default) the module-level
        ``random`` generator is used, so a prior ``random.seed(...)`` call
        still controls the result.

    Returns
    -------
    (list, list)
        ``(train_seqs, test_seqs)``; together they contain every input
        sequence exactly once.
    """
    # Imported here because this notebook's import cell does not import it.
    import random

    shuffled = list(usable_seqs)  # leave the caller's list untouched
    train_amount = round(len(shuffled) * train_frac)

    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    return shuffled[:train_amount], shuffled[train_amount:]


def mount_trainable_testable_arrays(seqs, df):
    """Stack the windows and targets of every sequence into two arrays.

    Each sequence is resolved with ``get_seq_obs_values``; the windows go
    into the first returned array and the matching targets into the second,
    both in the order of ``seqs``.
    """
    observations = [get_seq_obs_values(seq, df) for seq in seqs]
    windows = [window for window, _ in observations]
    targets = [target for _, target in observations]
    return np.array(windows), np.array(targets)

In [86]:
def transform_cleb_df_into_wal_df(cleb_df):
    """Convert a day/month/year/hour frame into a date-indexed value frame.

    Parameters
    ----------
    cleb_df : pandas.DataFrame
        Frame with ``day``, ``month``, ``year`` and ``hour`` columns plus the
        measurement column(s). It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame that
        * is indexed by ``"<year>-<month>-<day>-<hour>"`` strings built from
          the integer part of the date columns (no zero padding),
        * no longer contains the four date columns,
        * has every ``-1`` in the remaining columns replaced by NaN.

    Notes
    -----
    The row order is unchanged, so positional (``iloc``) access behaves the
    same as on the input frame.
    """
    date_columns = ['day', 'month', 'year', 'hour']

    def _as_text(column):
        # Truncate to int first so 2010.0 becomes "2010", not "2010.0".
        return cleb_df[column].astype(int).astype(str)

    index_labels = (
        _as_text('year') + '-' + _as_text('month') + '-'
        + _as_text('day') + '-' + _as_text('hour')
    )

    # ``drop`` returns a new frame, so assigning its index below cannot
    # touch the caller's data. The keyword form works on every pandas
    # version (the positional axis argument was removed in pandas 2).
    wal_df = cleb_df.drop(columns=date_columns)
    wal_df.index = index_labels.values

    return wal_df.replace(-1, np.nan)

In [87]:
# Drop the day/month/year/hour columns of each frame and turn -1 readings into NaN.
# NOTE: every frame is overwritten by its transformed version, so re-running this
# cell on already-transformed frames fails (the date columns are gone); re-execute
# the loading cell first.
flow_df = transform_cleb_df_into_wal_df(flow_df)
rain_df = transform_cleb_df_into_wal_df(rain_df)
et_df = transform_cleb_df_into_wal_df(et_df)

In [88]:
# Positional (start, end) row ranges of the flow series that lie between missing values.
valid_sequences = get_valid_sequences(flow_df)

In [89]:
# Display the ranges found in the flow series.
valid_sequences

[(0, 5953), (5962, 6051), (6069, 17486), (17487, 18718), (18741, 23437)]

In [90]:
# Keep only the flow ranges that span at least 24 rows.
val_seq = valid_seqs_minimum_len(valid_sequences, 24)

In [91]:
# Display the ranges that passed the minimum-length filter.
val_seq

[(0, 5953), (5962, 6051), (6069, 17486), (17487, 18718), (18741, 23437)]

In [92]:
def check_for_other_dfs_missing_values(val_seq, df):
    """Print, for each range in ``val_seq``, whether ``df`` has missing values in it.

    Parameters
    ----------
    val_seq : iterable of (start, end)
        Positional, half-open row ranges to inspect.
    df : pandas.DataFrame
        Frame checked with ``check_nan_values`` over each range.

    Nothing is returned; two lines are printed per range.
    """
    for seq in val_seq:
        first_row, stop_row = seq[0], seq[1]
        rows_in_range = df.iloc[first_row:stop_row, :]

        print("Seq:", seq)
        print("Missing?", check_nan_values(rows_in_range))

In [93]:
# Report, for every retained flow range, whether the ET series has missing values inside it.
check_for_other_dfs_missing_values(val_seq, et_df)

Seq: (0, 5953)
Missing? False
Seq: (5962, 6051)
Missing? False
Seq: (6069, 17486)
Missing? False
Seq: (17487, 18718)
Missing? False
Seq: (18741, 23437)
Missing? False


In [94]:
# Same check against the rain series.
check_for_other_dfs_missing_values(val_seq, rain_df)

Seq: (0, 5953)
Missing? True
Seq: (5962, 6051)
Missing? False
Seq: (6069, 17486)
Missing? True
Seq: (17487, 18718)
Missing? False
Seq: (18741, 23437)
Missing? False


In [110]:
# Row ranges of the rain series that lie between missing values, displayed so they
# can be compared by eye with the flow ranges.
valid_seq_rain = get_valid_sequences(rain_df)
valid_seq_rain

[(0, 389),
 (390, 598),
 (599, 3000),
 (3391, 3930),
 (3931, 4386),
 (4387, 5646),
 (5647, 5953),
 (5962, 6077),
 (6078, 25954)]

In [111]:
# Piece of code taken from: https://stackoverflow.com/questions/32480423/how-to-check-if-a-range-is-a-part-of-another-range-in-python-3-x
def range_subset(range1, range2):
    """Return True when every element of ``range1`` is also in ``range2``.

    Both arguments are ``range`` objects. The check relies on the end points
    and the steps only, so it never iterates over the ranges.
    """
    # An empty range is contained in anything.
    if not range1:
        return True

    # Nothing non-empty fits inside an empty range.
    if not range2:
        return False

    # With more than one element, range1 must advance by a whole multiple
    # of range2's step, otherwise it lands between range2's elements.
    has_several_items = len(range1) > 1
    if has_several_items and range1.step % range2.step != 0:
        return False

    first_item, last_item = range1.start, range1[-1]
    return first_item in range2 and last_item in range2


# Not used for now; kept here for possible future use.

In [118]:
# Display the retained flow ranges before adjusting them by hand.
val_seq

[(0, 5953), (5962, 6051), (6069, 17486), (17487, 18718), (18741, 23437)]

In [119]:
# Discard the first range, (0, 5953): the rain check recorded in this notebook
# reports missing values inside it.
# NOTE: this mutates val_seq in place, so running the cell again removes one more range.
val_seq.pop(0)

(0, 5953)

In [120]:
# Display the ranges left after the removal.
val_seq

[(5962, 6051), (6069, 17486), (17487, 18718), (18741, 23437)]

In [122]:
# Move the start of the range that was (6069, 17486) to 6078, the start of the last
# gap-free rain range listed in this notebook's recorded output.
# NOTE: index 1 is only correct after the first range has been popped exactly once.
val_seq[1] = (6078, 17486)

In [123]:
# Display the hand-adjusted ranges.
val_seq

[(5962, 6051), (6078, 17486), (17487, 18718), (18741, 23437)]

In [124]:
# Re-run the rain check on the hand-adjusted ranges.
check_for_other_dfs_missing_values(val_seq, rain_df)

Seq: (5962, 6051)
Missing? False
Seq: (6078, 17486)
Missing? False
Seq: (17487, 18718)
Missing? False
Seq: (18741, 23437)
Missing? False
