
Query the HYDAT database for runoff events in BC and Alberta that fall within 175km of a radar station in Western Canada.  

Filter for watersheds with data after June 2007.  

Filter for watersheds larger than 15 km^2 and smaller than 500 km^2.

Output table like:

| Station | ID | Drainage Area [$km^2$] | Start Date | End Date |
|---|---|---|---|---|
| Elaho | EHBN008 | 400 | 2007 | 2017 |


In [91]:
import pandas as pd
import numpy as np
import os 
import sys
import math
import utm
import time
import pickle

import json
import geopandas as gpd
import itertools
import fiona
from geopy import distance

from numba import jit

from sklearn import preprocessing
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

from numpy.random import seed

from multiprocessing import Pool

from shapely.geometry import shape, mapping

from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

from bokeh.plotting import ColumnDataSource, output_notebook, figure
from bokeh.transform import factor_cmap, factor_mark
from bokeh.palettes import Spectral3
from bokeh.layouts import gridplot
from bokeh.io import show
output_notebook()

from radar_scrape import get_radar_img_urls, request_img_files
from get_station_data import get_daily_runoff

import tensorflow

from keras.layers import Input, Dropout
from keras.layers.core import Dense 
from keras.models import Model, Sequential, load_model
from keras import regularizers
from keras.models import model_from_json

In [3]:
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(''))))
DB_DIR = os.path.join(BASE_DIR, 'code/hydat_db')
PROJECT_DIR = os.path.abspath('')
IMG_DIR = os.path.join(PROJECT_DIR, 'data/radar_img')
RADAR_IMG_DIR = os.path.join(PROJECT_DIR, 'data/sorted_radar_images')

In [4]:
# define radar sites
radar_stations = {'CASAG': {'lat_lon': [49.0580516, -122.470667], # radar location code, lat/lon
                       'scale': 1,
                      'alt_name': 'Aldergrove',
                        }, # km/pixel                       
               'CASPG': {'lat_lon': [53.916943, -122.749443], # radar location code, lat/lon
                       'scale': 1,
                      'alt_name': 'Prince George',}, # km/pixel}, # km/pixel
               'CASSS': {'lat_lon': [50.271790, -119.276505], # radar location code, lat/lon
                       'scale': 1,
                      'alt_name': 'Silver Star',}, # km/pixel}, # km/pixel
               'CASSI': {'lat_lon': [48.407326, -123.329773], # radar location code, lat/lon
                       'scale': 1,
                      'alt_name': 'Victoria',}, # km/pixel}, # km/pixel
               'CASSM': {'lat_lon': [51.206092, -113.399426],
                        'scale': 1,
                        'alt_name': 'Strathmore'},
              }

In [5]:
def find_closest_radar_stn(row):
    """ 
    Input the dict of all station distances,
    Return the location code of the nearest radar station.
    """
    radar_station_distances = row['radar_stn_distance_dict']
    min_dist = min(radar_station_distances.items(), key=lambda x: x[1])
    return min_dist[0]

In [6]:
def find_closest_radar_stn_distance(row):
    """ 
    Input the dict of all station distances,
    Return the location code of the nearest radar station.
    """
    radar_station_distances = row['radar_stn_distance_dict']
    min_dist = min(radar_station_distances.items(), key=lambda x: x[1])
    return min_dist[1]


In [7]:
def calc_distance(wsc_row, radar_station):
    wsc_stn_coords = (wsc_row['Latitude'], wsc_row['Longitude'])
    radar_coords = radar_stations[radar_station]['lat_lon']
    return distance.distance(radar_coords, wsc_stn_coords).km


In [8]:
def calculate_radar_stn_distances(row):
    distance_dict = {}
    for site in radar_stations:
        distance_dict[site] = calc_distance(row, site)
    return distance_dict

In [9]:
def initialize_wsc_station_info_dataframe():
    # import master station list
    stations_df = pd.read_csv(DB_DIR + '/WSC_Stations_Master.csv')
    # filter for stations that have concurrent record with the historical radar record
    stations_df['RADAR_Overlap'] = stations_df['Year To'].astype(int) - 2007
    stations_filtered = stations_df[stations_df['RADAR_Overlap'] > 0]
    # filter for stations that are natural flow regimes
    stations_filtered = stations_filtered[stations_filtered['Regulation'] == 'N']
    stations_filtered.rename(columns={'Gross Drainage Area (km2)': 'DA'}, inplace=True)
    # filter for stations in Alberta and British Columbia
    stations_filtered = stations_filtered[(stations_filtered['Province'] == 'BC') | (stations_filtered['Province'] == 'AB')]
    
    # calculate distance to each radar station
    stations_filtered['radar_stn_distance_dict'] = stations_filtered.apply(lambda row: calculate_radar_stn_distances(row), axis=1)    
    stations_filtered['closest_radar_station'] = stations_filtered.apply(lambda row: find_closest_radar_stn(row), axis=1)
    stations_filtered['radar_distance_km'] = stations_filtered.apply(lambda row: find_closest_radar_stn_distance(row), axis=1)
    
    # radar range is a 240km radius from the station
    stations_filtered = stations_filtered[stations_filtered['radar_distance_km'] < 190]
    stn_df = stations_filtered[np.isfinite(stations_filtered['DA'].astype(float))]
    # filter for stations greater than 10 km^2 (too small for meaningful results)
    stn_df = stn_df[stn_df['DA'].astype(float) >= 10]
    # filter for stations smaller than 1000 km^2 (too large and complex)
    stn_df = stn_df[stn_df['DA'].astype(float) < 1000].sort_values('DA')
    df = stn_df[['Province', 'Station Number', 'Station Name', 'DA', 
                 'Elevation', 'Latitude', 'Longitude', 'RADAR_Overlap',
                'closest_radar_station', 'radar_stn_distance_dict', 'radar_distance_km']]
#     print('After filtering, there are {} candidate stations.'.format(len(stn_df)))
    df.reset_index(inplace=True)
    return df


In [10]:
def initialize_runoff_dataframe(test_stn):
    
    runoff_df = get_daily_runoff(test_stn)
    runoff_df['Year'] = runoff_df.index.year
    runoff_df['Month'] = runoff_df.index.month
    
    # filter by minimum radar date
    runoff_df = runoff_df[runoff_df.index > pd.to_datetime('2007-05-31')]
    
    runoff_df['Date'] = runoff_df.index.values
    
    return runoff_df


In [11]:
def create_lag_df(df, stn_da):
    lag_df = df[['DAILY_FLOW']].copy()
    lag_df.rename(columns={'DAILY_FLOW': 'Q'}, inplace=True)

    num_lags = int(np.ceil(stn_da / 100) + 5)

    for i in range(1,num_lags):
        lag_df['Q{}'.format(i)] = lag_df['Q'].shift(i)

    lag_df.dropna(inplace=True)
    
    return lag_df, num_lags

In [12]:
# Based on code from Anomaly detection ML methods article:
# https://towardsdatascience.com/machine-learning-for-anomaly-detection-and-condition-monitoring-d4614e7de770
def split_train_and_test_data(data, training_months, training_year):
    time_range_check = (data.index.year == training_year[0]) & (data.index.month.isin(list(training_months)[0]))
    train_data = data[time_range_check]
    # the test data is the entire dataset because we want to extract
    # extreme events from the training year as well
    test_data = data
    return train_data, test_data

In [None]:
# def get_polygon(stn):
#     gdb_path = os.path.join(DB_DIR, 'WSC_Basins.gdb.zip')
#     data = gpd.read_file(gdb_path, driver='FileGDB', layer='EC_{}_1'.format(stn))
#     return data.geometry

# def find_closest_radar_stn(wsc_stn):
#     """
#     To retrieve radar images, we need to find the closest radar location
#     to the station of interest.  
#     Input the station number,
#     returns the location code of the nearest radar station.
#     """
#     stn_data = stn_df[stn_df['Station Number'] == wsc_stn]
#     s1 = [stn_data['Latitude'].values[0], stn_data['Longitude'].values[0]]
#     min_dist = 1E6
#     closest_stn = None
#     for site in radar_sites.keys():
#         s2 = [*radar_sites[site]['lat_lon']]
#         this_dist = np.sqrt((s2[0] - s1[0])**2 + (s2[1] - s1[1])**2)

#         if this_dist < min_dist:
#             min_dist = this_dist
#             closest_stn = site
        
#     return closest_stn

# basin_geom = get_polygon(test_stn)
# basin_geom = basin_geom.to_crs('EPSG:4326')
# basin_bbox = basin_geom.bounds
# print(basin_bbox)

In [59]:
def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
    inv_covariance_matrix = inv_cov_matrix
    vars_mean = mean_distr
    diff = data - vars_mean
    md = []
    for i in range(len(diff)):
        md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))
    return md

In [60]:
def MD_detectOutliers(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k
    outliers = []
    for i in range(len(dist)):
        if dist[i] >= threshold:
            outliers.append(i)  # index of the outlier
    return np.array(outliers)

In [35]:
def MD_threshold(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k
    return threshold

In [36]:
def is_pos_def(A):
    if np.allclose(A, A.T):
        try:
            np.linalg.cholesky(A)
            return True
        except np.linalg.LinAlgError:
            return False
    else:
        return False

In [23]:
def do_PCA(X_train, X_test, n_components):
    
    for n_components_kept in range(2, n_components + 1):

        pca = PCA(n_components=n_components_kept, svd_solver= 'full')
        X_train_PCA = pca.fit_transform(X_train)
        X_train_PCA = pd.DataFrame(X_train_PCA)
        X_train_PCA.index = X_train.index

        X_test_PCA = pca.transform(X_test)
        X_test_PCA = pd.DataFrame(X_test_PCA)
        X_test_PCA.index = X_test.index

        var_expl = 100*np.sum(pca.explained_variance_ratio_)
        if var_expl >= 90:
#             print('var > 0.9 in {} components'.format(n_components_kept))
            return X_train_PCA, X_test_PCA, var_expl, n_components_kept
#     print('var < 0.9 in {} components'.format(n_components_kept))
    return X_train_PCA, X_test_PCA, var_expl, n_components_kept


In [24]:
def initialize_input_data(wsc_stn_num):
        
    t0 = time.time()
    stn_df = initialize_wsc_station_info_dataframe()

    test_stn_info = stn_df[stn_df['Station Number'] == wsc_stn_num]
    stn_da = test_stn_info['DA'].values[0]
    wsc_stn_name = test_stn_info['Station Name'].values[0]
    closest_radar_stn = test_stn_info['closest_radar_station'].values[0]
#     print('{} ({}) has a DA of {} km^2'.format(wsc_stn_name, wsc_stn_num, stn_da))
    
    runoff_df = initialize_runoff_dataframe(wsc_stn_num)    
    lag_df, num_lags = create_lag_df(runoff_df, stn_da) 
    
    
    candidate_stations = stn_df['Station Number'].values
    
    return lag_df, closest_radar_stn, runoff_df, num_lags

In [28]:
def train_model(input_array):
    
    training_months = input_array[0], 
    training_year = input_array[1],
    training_set_len = input_array[2]
    wsc_station_num = input_array[3]
    training_sample_size = input_array[4]

    lag_df, closest_radar_stn, runoff_df, num_lags = initialize_input_data(wsc_station_num)
    
    dataset_train, dataset_test = split_train_and_test_data(lag_df, training_months, training_year)
    
    training_set_len = len(dataset_train)
    
    if len(dataset_train) < 2:
        print('exited because dataset_train is too small')
#         print(dataset_train)
        return pd.DataFrame([]), 0

    scaler = preprocessing.MinMaxScaler()

    X_train = pd.DataFrame(scaler.fit_transform(dataset_train), 
                                  columns=dataset_train.columns, 
                                  index=dataset_train.index)
    # Random shuffle training data
    X_train.sample(frac=1)

    X_test = pd.DataFrame(scaler.transform(dataset_test), 
                                 columns=dataset_test.columns, 
                                 index=dataset_test.index)
    
   
    X_train_PCA, X_test_PCA, var_expl, n_components = do_PCA(X_train, X_test, num_lags)
    
    data_train = np.array(X_train_PCA.values)
    data_test = np.array(X_test_PCA.values)
    
    def cov_matrix(data, verbose=False):
        covariance_matrix = np.cov(data, rowvar=False)
        if is_pos_def(covariance_matrix):
            inv_covariance_matrix = np.linalg.inv(covariance_matrix)
            if is_pos_def(inv_covariance_matrix):
                return True, covariance_matrix, inv_covariance_matrix
            else:
                print("Error: Inverse of Covariance Matrix is not positive definite!")
                return False, None, None
        else:
            print("Error: Covariance Matrix is not positive definite!")
            return False, None, None

               
    cov_test, cov_matrix, inv_cov_matrix = cov_matrix(data_train)
    
    if cov_test == False:
        return pd.DataFrame([]), 0

    mean_distr = data_train.mean(axis=0)

    dist_test = MahalanobisDist(inv_cov_matrix, mean_distr, data_test, verbose=False)
    dist_train = MahalanobisDist(inv_cov_matrix, mean_distr, data_train, verbose=False)
    threshold = MD_threshold(dist_train, extreme = True)
    
    anomaly_train = pd.DataFrame()
    anomaly_train['Mob dist']= dist_train
    anomaly_train['Thresh'] = threshold
    # If Mob dist above threshold: Flag as anomaly
    anomaly_train['Anomaly'] = anomaly_train['Mob dist'] > anomaly_train['Thresh']
    anomaly_train.index = X_train_PCA.index
    anomaly = pd.DataFrame()
    anomaly['Mob dist']= dist_test
    anomaly['Thresh'] = threshold
    anomaly['num_components_kept'] = n_components
    # If Mob dist above threshold: Flag as anomaly
    anomaly['Anomaly'] = anomaly['Mob dist'] > anomaly['Thresh']
    anomaly.index = X_test_PCA.index
    anomaly.head()
    
    anomaly_alldata = pd.concat([anomaly_train, anomaly], sort=True)
    
    event_times = np.where(anomaly_alldata['Anomaly'].values[:-1] != anomaly_alldata['Anomaly'].values[1:])[0]
    events = pd.merge(lag_df, anomaly_alldata.iloc[event_times,:], how='inner', 
                      left_index=True, right_index=True)

    events = events.loc[~events.index.duplicated(keep='first')]
    
    if len(events) == 0:
        print('exited because len(events) == 0')
        return pd.DataFrame([]), 0
    elif events.iloc[0]['Anomaly'] == True:
        events = events.iloc[1:]
        
    # create a column of time difference between events in days
    events['dt_days'] = events.index.to_series().diff(1)    

    a = time.time()

    last_event_end = False

    new_events = pd.DataFrame()

    # iterate through the detected event pairs 
    for i in np.arange(0, len(events) - 1, 2):
        # parse a single event pair
        this_event = events.iloc[i:i+2]
        
        check_sign_switch = this_event['Anomaly'].values[0] != this_event['Anomaly'].values[1]
        concurrent_wsc = lag_df[(lag_df.index >= this_event.index.values[0]) & (lag_df.index <= this_event.index.values[1])][['Q']]
        peak_in_middle = check_peak_in_middle(this_event, concurrent_wsc)

        if (check_sign_switch) & (peak_in_middle):

            # get the start date
            this_event_start = pd.to_datetime(this_event[this_event['Anomaly'] == False].index.values[0])
            # get the end date
            this_event_end = pd.to_datetime(this_event[this_event['Anomaly'] == True].index.values[0])

            new_event_start = lag_df[lag_df.index == this_event_start][['Q']]
            new_event_end = lag_df[lag_df.index == this_event_end][['Q']]

            adjusted_start_date = pd.to_datetime(adjust_edge_date(this_event_start, lag_df[['Q']], 'start'))

            new_event_start = lag_df[lag_df.index == adjusted_start_date][['Q']]

            if last_event_end is not False:

                # find if the start date is on the rising limb - adjust if so
                if adjusted_start_date < last_event_end:
                    new_event_start = lag_df[lag_df.index == this_event_start][['Q']]

            new_event_start['timing'] = 'start'
            new_event_end['timing'] = 'end'

            min_time_check = (new_event_end.index - new_event_start.index).days > 1
            max_time_check = (new_event_end.index - new_event_start.index).days <= 14
            start_month = new_event_start.index.month
            end_month = new_event_end.index.month
            season_check = (start_month > 5) & (start_month <= 11) & (end_month <= 11)

            if (min_time_check) & (max_time_check) & (season_check):
                # filter out events that are longer than 5 days
                new_events = new_events.append(new_event_start)
                new_events = new_events.append(new_event_end)

            last_event_end = pd.to_datetime(this_event_end)


    b = time.time()
#     print(b - a)

    new_events.sort_index(inplace=True)
    

    new_events['dt_days'] = new_events.index.to_series().diff(1)
    new_events['wsc_station'] = wsc_station_num
    new_events['training_year'] = training_year[0]
    new_events['training_months'] = training_months * len(new_events)
    new_events['training_set_len'] = training_set_len
    new_events['m_threshold'] = threshold
    new_events['var_explained'] = var_expl
    new_events['n_components'] = n_components
    new_events['num_lags'] = num_lags
                
    return new_events, n_components

In [29]:
def adjust_edge_date(initial_date, data, direction):
    """
    If the start flow is on a rising limb, adjust the start to the start of the runoff event.
    """
    initial_val = data[data.index == initial_date]['Q']

    if direction == 'end':
        search_criteria = (data.index <= initial_date + pd.Timedelta('7 days')) & (data.index >= initial_date)
        search_direction = -1
    elif direction == 'start':
        search_criteria = (data.index >= initial_date - pd.Timedelta('7 days')) & (data.index <= initial_date)
        search_direction = 1
        
        
    extended_week_vals = data[search_criteria][['Q']]
    extended_week_vals['diff'] = extended_week_vals.diff(periods=search_direction)
    extended_week_vals['pct_change'] = 100 * extended_week_vals['diff'] / extended_week_vals['Q']

    if direction == 'start':
        try:
            extended_week_vals.at[extended_week_vals.index.min(),'diff'] = -1
            change_point_row = extended_week_vals[['pct_change']].idxmax()
            if len(change_point_row) > 1:
                change_point_date = extended_week_vals.loc[change_point_row - pd.DateOffset(1)].index.values[0]
                adjusted_date = change_point_date
            else:
                adjusted_date = initial_date
            
        except ValueError as err:
            adjusted_date = initial_date

    elif direction == 'end':
        try:
            change_point = extended_week_vals[extended_week_vals['diff'] < 0][['Q']].idxmin().values[0]
            adjusted_dates = change_point

        except ValueError as err:
            change_point = extended_week_vals[extended_week_vals['Q'] == extended_week_vals['Q'].min()].index.values[0]
            adjusted_date = change_point

            
    return pd.to_datetime(adjusted_date)


def check_peak_in_middle(event, data):
    """
    Ensure there is a peak between the start and end points
    so we aren't targeting a non-runoff event.
    """
    start_time = event.index.values[0] 
    end_time = event.index.values[-1]
    max_time = data[data['Q'] == data['Q'].max()].index.values[0]
    if (max_time == start_time) | (max_time == end_time):
        return False
    else:
        return True


def get_all_combinations(months, years):
    month_combos = [list(itertools.combinations(months, n)) for n in range(1, len(months) + 1)]
    flat_combos =  [item for sublist in month_combos for item in sublist]
    return np.asarray(list(itertools.product(flat_combos, years)))


def run_AD_training(wsc_station_num, stn_df, runoff_df, radar_stn, training_sample_size=5, ):
    
    training_months = list(set(runoff_df.index.month))
    training_years = list(set(runoff_df.index.year))     

    all_combinations = get_all_combinations(training_months, training_years)
      
    # a complete search is intractable, so sample n permutations without replacement
    rand_ints = np.random.choice(range(len(all_combinations)), training_sample_size, replace=False)

    sample_list = all_combinations[rand_ints]
    
    input_array = [[*c, training_sample_size, wsc_station_num, radar_stn] for c in sample_list]
    
    results = [(i, *train_model(i)) for i in input_array]
    
    return results


In [71]:
stn_df = initialize_wsc_station_info_dataframe()

all_wsc_stations = stn_df['Station Number'].values
best_results = {}
for s_size in [55]:
    sample_path = os.path.join(PROJECT_DIR, 'data/AD_results/sample_{}/'.format(s_size))
    
    if not os.path.exists(sample_path):
        os.makedirs(sample_path)
    
    ta = time.time()
    
    n = 0
    
    for wsc_stn in all_wsc_stations[:2]:
        
        stn_df = initialize_wsc_station_info_dataframe()    
        radar_stn = stn_df[stn_df['Station Number'] == wsc_stn]['closest_radar_station'].values[0]
        runoff_df = initialize_runoff_dataframe(wsc_stn)       

        n += 1
        t0 = time.time()
        results = run_AD_training(wsc_stn, stn_df, runoff_df, radar_stn, s_size)
        t1 = time.time()
        print('{} of {}: {}: {:.2f}s'.format(n, len(all_wsc_stations), wsc_stn, t1 - t0))

        AD_model_params = []
        for r in results:
            params = r[0]
            result_info = r[1]
            n_components = r[2]
#             print(params, len(result_info), num_lags)
            months = params[0]
            year = params[1]
            n_sample = params[2]
            AD_model_params.append((months, year, wsc_stn, radar_stn, n_sample, len(result_info), n_components))

        result_df = pd.DataFrame(AD_model_params, columns=['train_months', 'train_year', 'wsc_stn', 'radar_stn', 'n_sample', 'len_results', 'num_components'])
        results_save_path = os.path.join(PROJECT_DIR, 'data/AD_results/sample_{}/{}_results.csv'.format(n_sample, wsc_stn))
        result_df.to_csv(results_save_path)
        

        best_result = result_df.sort_values(by='len_results', ascending=False).iloc[0]
        best_results[wsc_stn] = best_result
        
    tb = time.time()
    print('For n={}, execution time = {}'.format(s_size, tb-ta))
    

1 of 141: 08HB048: 70.95s
2 of 141: 08LF100: 50.10s
For n=55, execution time = 122.58240556716919


#  End of Find Events Script

In [112]:
def run_AD_training(wsc_station_num, stn_df, runoff_df, radar_stn, training_sample_size=5, ):
  
    input_array = [[*c, training_sample_size, wsc_station_num, radar_stn] for c in sample_list]
    
    results = train_model(input_array)
    
    return results

stn = list(best_results.keys())[0]

train_year = best_results[stn]['train_year']
radar_stn = best_results[stn]['radar_stn']
training_months = best_results[stn]['train_months']
training_sample_size = best_results[stn]['n_sample']

input_array = [training_months, train_year, training_sample_size, stn, radar_stn]

best_events, n_components = train_model(input_array)

lag_df, closest_radar_stn, runoff_df, num_lags = initialize_input_data(stn)
stn_df = initialize_input_data(stn)

In [116]:
print(len(best_events))
# print(best_events)
print(stn)
print(stn_df)
print(stn_df[stn_df['Station Number'] == stn])

48
08HB048
(                Q     Q1     Q2     Q3     Q4     Q5
DATE                                                
2007-06-06  0.337  0.087  0.081  0.082  0.087  0.082
2007-06-07  0.304  0.337  0.087  0.081  0.082  0.087
2007-06-08  0.157  0.304  0.337  0.087  0.081  0.082
2007-06-09  0.556  0.157  0.304  0.337  0.087  0.081
2007-06-10  1.200  0.556  0.157  0.304  0.337  0.087
...           ...    ...    ...    ...    ...    ...
2018-12-27  0.564  0.700  0.591  1.030  3.560  1.810
2018-12-28  0.662  0.564  0.700  0.591  1.030  3.560
2018-12-29  4.970  0.662  0.564  0.700  0.591  1.030
2018-12-30  1.340  4.970  0.662  0.564  0.700  0.591
2018-12-31  0.633  1.340  4.970  0.662  0.564  0.700

[4227 rows x 6 columns], 'CASSI',             DAILY_FLOW FLAG_08HB048  Year  Month       Date
DATE                                                       
2007-06-01       0.082         None  2007      6 2007-06-01
2007-06-02       0.087         None  2007      6 2007-06-02
2007-06-03       0.082  

TypeError: tuple indices must be integers or slices, not str

In [106]:
# create grid plot of individual events

plots = []

for i in np.arange(0, len(best_events) - 1, 2):
    
    # parse a single event pair
    this_event = best_events.iloc[i:i+2]
    
    s1 = figure(background_fill_color="#fafafa", x_axis_type='datetime')
    
    s1.circle(this_event.index, this_event['Q'], 
              size=12, alpha=0.8, color="red")#, legend_label='{estimated endpoints}')
    s1.xaxis.major_label_orientation = math.pi / 2
#     s1.yaxis.axis_label = 'Flow [cms]'
    this_start = pd.to_datetime(this_event.index.values[0])
    this_end = pd.to_datetime(this_event.index.values[1])
    this_dat = lag_df[(lag_df.index >= this_start) & (lag_df.index <= this_end)][['Q']]
    
    if (this_end.month < 12) & (this_start.month > 5):
        year = this_event.index.year.values[0]
        month = this_event.index.month.values[0]
        day = this_event.index.day.values[0]
        date = '{}-{}-{}'.format(year, month, day)
        s1.line(this_dat.index, this_dat['Q'], color='blue')
        plots.append(s1)

print('there are {} plots'.format(len(plots)))

there are 24 plots


In [107]:
# if len(plots) < 6:
#     grid = gridplot(plots, plot_width=150, plot_height=150)
# else:
n_cols = 8
n_rows = int(np.ceil(len(plots) / n_cols))

g = []
for i in range(0, len(plots), n_cols):
    this_plot = plots[i]
    if i % n_cols == 0:
        print(i)
        this_plot.yaxis.axis_label = 'Flow [cms]'
    g += [plots[i:i+n_cols]]
grid = gridplot(g, plot_width=150, plot_height=150)

0
8
16


In [108]:
show(grid)

In [126]:
event_pairs = []
for i in np.arange(0, len(best_events) - 1, 2):
    # parse a single event pair
    this_event = best_events.iloc[i:i+2]
    date_pair = [e.astype(str).replace('T', ' ').split('.')[0].split(' ')[0] for e in this_event.index.values]
    this_start = pd.to_datetime(this_event.index.values[0])
    this_month = this_start.month
    if (this_month > 5) & (this_month <= 11) & (this_start > pd.to_datetime('2007-01-01')):
        event_pairs.append(date_pair)

In [128]:
print(best_events.iloc[:2])

                Q timing dt_days wsc_station  training_year training_months  \
DATE                                                                          
2007-07-20  0.200  start     NaT     08HB048           2012    (2, 6, 7, 8)   
2007-07-28  0.097    end  8 days     08HB048           2012    (2, 6, 7, 8)   

            training_set_len  m_threshold  var_explained  n_components  \
DATE                                                                     
2007-07-20               121     4.827273      96.694442             5   
2007-07-28               121     4.827273      96.694442             5   

            num_lags  
DATE                  
2007-07-20         6  
2007-07-28         6  


## drop winter events and group for individual plotting

In [130]:
starts = best_events[best_events['timing'] == 'start']
ends = best_events[best_events['timing'] == 'end']
runoff_df = initialize_runoff_dataframe(stn)

In [139]:
p = figure(plot_width=800, plot_height=400, x_axis_type='datetime')

# p.circle(adj_starts.index, adj_starts['Q'], size=10, color="red", 
#          alpha=0.5, legend_label='start'.format(len(foo)))
# p.circle(adj_ends.index, adj_ends['Q'], size=10, color="blue", 
#          alpha=0.5, legend_label='end'.format(len(foo)))

p.circle(starts.index, starts['Q'], size=10, color="red", 
         alpha=0.5, legend_label='start')
p.circle(ends.index, ends['Q'], size=10, color="blue", 
         alpha=0.5, legend_label='end')
# p.line(input_sig.index, input_sig['f_sig'], color='blue')
p.line(runoff_df.index, runoff_df['DAILY_FLOW'], color='blue')

p.yaxis.axis_label = 'Flow [cms]'
# p.line()
# show the results
show(p)

##  Autoencoder Neural Network

In [136]:
seed(10)
tensorflow.random.set_seed(10)
act_func = 'elu'

# Input layer:
model=Sequential()
# First hidden layer, connected to input vector X. 
model.add(Dense(10,activation=act_func,
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(0.0),
                input_shape=(X_train.shape[1],)
               )
         )

model.add(Dense(2,activation=act_func,
                kernel_initializer='glorot_uniform'))

model.add(Dense(10,activation=act_func,
                kernel_initializer='glorot_uniform'))

model.add(Dense(X_train.shape[1],
                kernel_initializer='glorot_uniform'))

model.compile(loss='mse',optimizer='adam')

# Train model for 100 epochs, batch size of 10: 
NUM_EPOCHS=100
BATCH_SIZE=10

NameError: name 'X_train' is not defined

In [None]:
history=model.fit(np.array(X_train),np.array(X_train),
                  batch_size=BATCH_SIZE, 
                  epochs=NUM_EPOCHS,
                  validation_split=0.05,
                  verbose = 0)

In [None]:
plt.plot(history.history['loss'],
         'b',
         label='Training loss')
plt.plot(history.history['val_loss'],
         'r',
         label='Validation loss')
plt.legend(loc='upper right')
plt.xlabel('Epochs')
plt.ylabel('Loss, [mse]')
plt.ylim([0,.1])
plt.show()

In [None]:
X_pred = model.predict(np.array(X_train))
X_pred = pd.DataFrame(X_pred, 
                      columns=X_train.columns)
X_pred.index = X_train.index

scored = pd.DataFrame(index=X_train.index)
scored['Loss_mae'] = np.mean(np.abs(X_pred-X_train), axis = 1)
plt.figure()
sns.distplot(scored['Loss_mae'],
             bins = 10, 
             kde= True,
            color = 'blue');
plt.xlim([0.0,.5])

In [None]:
X_pred = model.predict(np.array(X_test))
X_pred = pd.DataFrame(X_pred, 
                      columns=X_test.columns)
X_pred.index = X_test.index

scored = pd.DataFrame(index=X_test.index)
scored['Loss_mae'] = np.mean(np.abs(X_pred-X_test), axis = 1)
scored['Threshold'] = 0.3
scored['Anomaly'] = scored['Loss_mae'] > scored['Threshold']
scored.head()

In [None]:
X_pred_train = model.predict(np.array(X_train))
X_pred_train = pd.DataFrame(X_pred_train, 
                      columns=X_train.columns)
X_pred_train.index = X_train.index

scored_train = pd.DataFrame(index=X_train.index)
scored_train['Loss_mae'] = np.mean(np.abs(X_pred_train-X_train), axis = 1)
scored_train['Threshold'] = 0.3
scored_train['Anomaly'] = scored_train['Loss_mae'] > scored_train['Threshold']
scored = pd.concat([scored_train, scored], sort=True)

In [None]:
scored[scored.index > pd.to_datetime('2016-06-01')].plot(logy=True,  figsize = (10,6), ylim = [1e-2,1e2], color = ['blue','red'])

In [None]:

event_times = np.where(scored['Anomaly'].values[:-1] != scored['Anomaly'].values[1:])[0]
events = pd.merge(input_sig, scored.iloc[event_times,:], how='inner', 
                  left_index=True, right_index=True)

starts = events[events['Anomaly'] == False]
ends = events[events['Anomaly'] == True]


In [None]:
p = figure(plot_width=800, plot_height=400, x_axis_type='datetime')

# p.circle(adj_starts.index, adj_starts['Q'], size=10, color="red", 
#          alpha=0.5, legend_label='start'.format(len(foo)))
# p.circle(adj_ends.index, adj_ends['Q'], size=10, color="blue", 
#          alpha=0.5, legend_label='end'.format(len(foo)))

p.circle(starts.index, starts['DAILY_FLOW'], size=10, color="red", 
         alpha=0.5, legend_label='start'.format(len(foo)))
p.circle(ends.index, ends['DAILY_FLOW'], size=10, color="blue", 
         alpha=0.5, legend_label='end'.format(len(foo)))
# p.line(input_sig.index, input_sig['f_sig'], color='blue')
p.line(input_sig.index, input_sig['DAILY_FLOW'], color='blue')
# p.line()
# show the results
show(p)

In [None]:
# dates_covered = []
# fldr = os.path.join(IMG_DIR, test_stn)
# for f in os.listdir(fldr):
#     date = f[:4] + '-' + f[4:6] + '-' + f[6:8]
#     dates_covered.append(date)

# dates_covered = list(set(dates_covered))
# unchecked = []
# for ep in event_pairs:
#     if (ep[0] not in dates_covered) & (ep[1] not in dates_covered):
#         unchecked.append(ep)
        
# print(unchecked)

In [None]:
def find_peaks(data, lag=7, threshold=500, influence=0.5):
    # Settings (the ones below are examples: choose what is best for your data)
#     lag = 5         # lag 5 for the smoothing functions
#     threshold = 3.5  # 3.5 standard deviations for signal
#     influence = 0.5  # between 0 and 1, where 1 is normal influence, 0.5 is half
    # Initialize variables
    signals = np.zeros(len(data))            # Initialize signal results
    filteredY = np.empty(len(data))
    filteredY[:lag] = data[:lag]             # Initialize filtered series
    avgFilter = [0]                          # Initialize average filter
    stdFilter = [0]                          # Initialize std. filter
    avgFilter = {lag: np.mean(data[:lag])}      # Initialize first value
    stdFilter = {lag: np.std(data[:lag])}     # Initialize first value
    
    for i in range(lag + 1, len(data)):
        d = data[i]
        
        af = avgFilter[i-1]
        sf = stdFilter[i-1]
        
        if abs(d - af) > threshold * sf:
            if d > af:
                signals[i] = 1                     # Positive signal
            else:
                signals[i] = -1                    # Negative signal

            
            filteredY[i] = influence*d + (1-influence)*filteredY[i-1]
        else:
            signals[i] = 0                        # No signal
            filteredY[i] = 0
        
        
        # Adjust the filters
        avgFilter[i] = np.mean(filteredY[i-lag:i])
        stdFilter[i] = np.std(filteredY[i-lag:i])
        
    return signals, filteredY

n_test = 500

dats = list(df['DAILY_FLOW'].to_numpy())
sigs, f_dat = find_peaks(dats, influence=0.75, lag=7, threshold=5)

In [None]:
from bokeh.plotting import figure, output_file, show, output_notebook

input_sig = df[['DAILY_FLOW']].copy()
signal = np.array(sigs)
input_sig['sig'] = signal.copy().astype(int)
input_sig['f_sig'] = f_dat

foo = input_sig[input_sig['sig'] == 1].copy()
p = figure(plot_width=800, plot_height=400, x_axis_type='datetime')
# add a circle renderer with a size, color, and alpha
p.circle(foo.index, foo['DAILY_FLOW'], size=10, color="red", 
         alpha=0.5, legend_label='{} pts'.format(len(foo)))
# p.line(input_sig.index, input_sig['f_sig'], color='blue')
p.line(input_sig.index, input_sig['DAILY_FLOW'], color='blue')
# p.line()
# show the results
show(p)

Find the summer baseflow

Break up the May to November records by periods where it comes back to within X% of baseflow.  

check durations of these periods, see how many there are.