In [None]:
import os
import time

import pandas as pd
import numpy as np
import xarray as xr

import scipy.stats as st

from itertools import combinations

from shapely.geometry import Point
import geopandas as gpd

from multiprocessing import Pool

# Paired Watershed Characteristics

Develop a framework to compare pairs of daily flow series from basins in the WSC database.  

## Method:

1. Generate a list of valid pairs of stations. A valid pair is one where:
    * basin geometry exists for both stations
    * the two stations share at least N years of concurrent data.

In [None]:
# Locations of the WSC station metadata and the per-station HYSETS flow series.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
WSC_db_folder = '/media/danbot/T7 Touch/hydat_db/'
metadata_fn = 'WSC_Stations_Master.csv'
hysets_folder = '/media/danbot/T7 Touch/hysets_series/'

# read the station master table
metadata_path = os.path.join(WSC_db_folder, metadata_fn)
df = pd.read_csv(metadata_path)
df.head()

In [None]:
# span between the first and last year listed for each station
df['num_years_record'] = df['Year To'].sub(df['Year From'])

In [None]:
# keep only stations located in BC or Alberta
in_bc_or_ab = df['Province'].isin(['BC', 'AB'])
df = df.loc[in_bc_or_ab]
print(len(df))

In [None]:
# every unordered pair of station numbers (n choose 2)
stn_pairs_list = [
    stn_pair for stn_pair in combinations(df['Station Number'].to_numpy(), 2)
]
print(len(df))
print(len(stn_pairs_list))

In [None]:
# Load the HYSETS watershed properties; Official_ID is read as a string.
hysets_df = pd.read_csv('data/HYSETS_watershed_properties.txt', sep=';', dtype={'Official_ID': str})
# Keep only records sourced from HYDAT. Take an explicit copy because a new
# column is assigned to this frame afterwards, which would otherwise raise
# SettingWithCopyWarning on the filtered slice.
hysets_df = hysets_df[hysets_df['Source'] == 'HYDAT'].copy()

In [None]:
# build a shapely Point (lon, lat) for each basin centroid
hysets_df['centroid_geom'] = [
    Point((lon, lat))
    for lon, lat in zip(hysets_df['Centroid_Lon_deg_E'], hysets_df['Centroid_Lat_deg_N'])
]

In [None]:
# display the column names available in the HYSETS properties table
hysets_df.columns

In [None]:
# Column groups used to build a per-station lookup of identifying
# information and basin characteristics.
basin_metadata = [
    'Watershed_ID',
    'Official_ID',
    'Name',
]

basin_centroid_geom = ['centroid_geom']

basin_characteristics_cols = [
    'Drainage_Area_km2',
    'Centroid_Lat_deg_N',
    'Centroid_Lon_deg_E',
    'Elevation_m',
    'Gravelius',
    'Aspect_deg',
    'Slope_deg',
    'Land_Use_Forest_frac',
    'Land_Use_Grass_frac',
    'Land_Use_Wetland_frac',
    'Land_Use_Water_frac',
    'Land_Use_Urban_frac',
    'Land_Use_Shrubs_frac',
    'Land_Use_Crops_frac',
    'Land_Use_Snow_Ice_frac',
    'Permeability_logk_m2',
    'Porosity_frac',
]

# {Official_ID: {column: value, ...}} for fast selection of specific watersheds
hysets_dict = (
    hysets_df[basin_metadata + basin_centroid_geom + basin_characteristics_cols]
    .set_index('Official_ID')
    .to_dict(orient='index')
)

In [None]:
# station IDs (Official_ID) that have a record in the HYSETS lookup
hysets_stns = [stn_id for stn_id in hysets_dict]
n_hydat_stns = len(hysets_stns)
print(f'There are {n_hydat_stns} HYDAT station records in the HYSETS database.')

In [None]:
# one row per candidate station pair: b1 = first station, b2 = second station
pair_df = pd.DataFrame.from_records(stn_pairs_list, columns=['b1', 'b2'])

In [None]:
def check_if_pair_in_hysets(pair):
    """Return True when both stations of `pair` appear in `hysets_stns`.

    pair: indexable whose first two items are station IDs.
    """
    first_stn, second_stn = pair[0], pair[1]
    if first_stn not in hysets_stns:
        return False
    return second_stn in hysets_stns

In [None]:
# Flag, in parallel, whether both stations of each candidate pair are in HYSETS.
pool = Pool()
t0 = time.time()
try:
    pair_df['pair_in_hysets'] = pool.map(check_if_pair_in_hysets, stn_pairs_list)
finally:
    # always release the worker processes, even if the map raises
    pool.close()
    pool.join()
t1 = time.time()
print(f't for {len(stn_pairs_list)} results: {t1-t0:.1f}s')

In [None]:
# keep only pairs where both stations have a HYSETS record
print(f'len before filter = {len(pair_df)}')
# explicit copy: a new column is assigned to pair_df afterwards, and assigning
# to a filtered slice would raise SettingWithCopyWarning
pair_df = pair_df[pair_df['pair_in_hysets']].copy()
print(f'len after filter = {len(pair_df)}')

In [None]:
def check_pair_properties(row):
    """Return True only if both basins in `row` have a value for every characteristic.

    row: mapping with keys 'b1' and 'b2' holding station IDs present in `hysets_dict`.
    Returns False as soon as any column in `basin_characteristics_cols` is NaN
    for either basin. (Previously the NaN test sat outside the loop, so only the
    last characteristic was actually checked.)
    """
    for c in basin_characteristics_cols:
        p1 = hysets_dict[row['b1']][c]
        p2 = hysets_dict[row['b2']][c]
        if np.isnan(p1) or np.isnan(p2):
            return False
    return True

In [None]:
# row-wise check of the characteristics for each pair
pair_df['char_check'] = pair_df.apply(check_pair_properties, axis=1)


In [None]:
# Pairs that failed the characteristics check. The previous version called
# .count() and then len(), which reports the number of columns rather than
# the number of failing pairs.
missing_characteristics = pair_df[~pair_df['char_check']]
print(f'{len(missing_characteristics)} basin pairs have missing characteristics')

In [None]:
# filter out pairs missing basin characteristics
# (explicit copy: a new column is assigned to pair_df afterwards, and assigning
# to a filtered slice would raise SettingWithCopyWarning)
pair_df = pair_df[pair_df['char_check']].copy()

In [None]:
def extract_streamflow_series(stn):
    """Read the saved series for station `stn` and drop rows containing NaN.

    The file is `<hysets_folder><stn>.csv` and must contain a `time` column,
    which becomes the index. Returns a DataFrame.
    """
    series_path = f'{hysets_folder}{stn}.csv'
    return pd.read_csv(series_path, index_col=['time']).dropna()

In [None]:
def check_actual_concurrence_len(pair):
    """Return the number of index entries shared by the two stations' series.

    pair: indexable whose first two items are station IDs.
    Each series is loaded with `extract_streamflow_series`, its 'discharge'
    column is relabelled with the station ID, and the two are inner-joined
    on their index.
    """
    labelled = []
    for stn in (pair[0], pair[1]):
        flows = extract_streamflow_series(stn)
        labelled.append(flows.rename(columns={'discharge': f'{stn}'}))
    return len(pd.concat(labelled, join='inner', axis=1))

In [None]:
# Count, in parallel, the concurrent daily records for every remaining pair.
# The minimum-length filter itself is applied in a later cell.
t0 = time.time()
pool = Pool()
try:
    pair_df['concurrent_days'] = pool.map(check_actual_concurrence_len,
                                          pair_df[['b1', 'b2']].to_numpy())
finally:
    # always release the worker processes, even if the map raises
    pool.close()
    pool.join()
t1 = time.time()
print(f'Time to calculate concurrent period lengths: {t1 - t0:.1f}')

In [None]:
# preview the first rows and the number of pairs
print(pair_df.head(), len(pair_df), sep='\n')


In [None]:
# keep pairs with more than 365 concurrent daily records
# NOTE(review): calculate_similarity_measure_COD uses `>= 365`; confirm which bound is intended.
long_enough = pair_df['concurrent_days'].gt(365)
pair_df = pair_df.loc[long_enough]
print(f'{len(pair_df)} basin pairs meet the concurrence length criteria.')

In [None]:
# Persist the filtered pairs so the filtering steps above need not be repeated.
# NOTE: the file is written in pickle format even though its name ends in .csv.
pd.to_pickle(pair_df, 'results/filtered_pairs_all_concurrent_lengths.csv')

In [None]:
# collapse the surviving pairs into a de-duplicated list of (b1, b2) tuples
filtered_pairs = list(pair_df[['b1', 'b2']].itertuples(index=False, name=None))
unique_concurrent = list(set(filtered_pairs))
print(unique_concurrent[:5])
print(len(unique_concurrent))

## Re-Load all Saved Results

Reload the filtered pairs saved above, then continue with the distance metric calculation.

In [None]:
# Reload the filtered pairs. The file is a pickle despite its .csv extension;
# only unpickle files produced by this notebook.
all_df = pd.read_pickle('results/filtered_pairs_all_concurrent_lengths.csv')
# explicit copy: columns are added to all_df afterwards, and assigning to a
# column slice would raise SettingWithCopyWarning
all_df = all_df[['b1', 'b2', 'concurrent_days']].copy()

## Run similarity operation on filtered pairs

1. Calculate a 'similarity' metric based on concurrent data.
2. Retrieve basin characteristics from the hysets basin characteristics file.
3. Calculate differences in basin elevation, gravelius, drainage area, and distance between basin centroids.

In [None]:
def get_param_diff(pair, param):
    """Return the value of `param` for the first basin minus that of the second.

    pair: indexable whose first two items are station IDs present in `hysets_dict`.
    param: key of the characteristic to compare.
    """
    first_value = hysets_dict[pair[0]][param]
    second_value = hysets_dict[pair[1]][param]
    return first_value - second_value

In [None]:
def get_distance(pair):
    """Return the distance between the centroids of the two basins in `pair`.

    The rows of `hysets_df` whose Official_ID is in `pair` are projected from
    EPSG:4326 to EPSG:3005 and the distance between the first two rows is
    divided by 1000 (km if the projected units are metres).
    """
    matched = hysets_df[hysets_df['Official_ID'].isin(pair)]
    projected = (
        gpd.GeoDataFrame(matched, geometry=matched['centroid_geom'], crs='EPSG:4326')
        .to_crs(3005)
        .reset_index()
    )
    first_pt = projected.loc[0, 'geometry']
    second_pt = projected.loc[1, 'geometry']
    return first_pt.distance(second_pt) / 1000
    

In [None]:
import shapely.geometry as geom

def create_line(row):
    """Return a LineString joining the centroids of the row's two basins.

    row: mapping with keys 'b1' and 'b2' holding station IDs present in `hysets_dict`.
    """
    endpoints = [hysets_dict[row[col]]['centroid_geom'] for col in ('b1', 'b2')]
    return geom.LineString(endpoints)
    

In [None]:
# One straight line per pair, joining the two basin centroids (EPSG:4326),
# then projected to EPSG:3005 so the line length can be converted to km.
geometry = gpd.GeoDataFrame(
    {'geometry': all_df.apply(create_line, axis=1)},
    crs='EPSG:4326',
).to_crs(3005)
geometry['centroid_distance_km'] = geometry.length / 1000  # convert to km
geometry.head()

In [None]:
all_df['distance_btwn_centroids_km'] = geometry['centroid_distance_km']

# normalized=True treats 0.5 as a fraction of each line's length, i.e. the midpoint.
# With normalized=False the 0.5 is an absolute distance from the line's start
# (in CRS units), which is not the midpoint.
all_df['pair_midpoint'] = geometry['geometry'].interpolate(0.5, normalized=True)

all_df.head()

In [None]:
# pair_df['normed_distance'] = (pair_df['PC_distance'] - pair_df['PC_distance'].min()) / (pair_df['PC_distance'].max() - pair_df['PC_distance'].min())

In [None]:
# `midpoint` was previously never defined, so this cell raised NameError on a
# fresh kernel. Compute the midpoint of each connecting line in the projected
# CRS (EPSG:3005), then convert to EPSG:4326 for saving geographic coordinates.
# The to_crs(3005) call makes the cell safe to re-run after `geometry` has
# already been converted back to EPSG:4326.
midpoint = (
    geometry['geometry']
    .to_crs(3005)
    .interpolate(0.5, normalized=True)
    .to_crs(4326)
)

# convert back to EPSG 4326 for saving geographic coordinates
geometry = geometry.to_crs(4326)

all_df['midpoint_lat_deg_N'] = midpoint.y
all_df['midpoint_lon_deg_E'] = midpoint.x
all_df.head()

In [None]:
def calculate_similarity_measure_COD(pair):
    """Return the coefficient of determination (r squared) between two stations' concurrent flows.

    pair: indexable whose first two items are station IDs.
    The two series are loaded with `extract_streamflow_series` and inner-joined
    on their index. Returns np.nan when fewer than 365 concurrent records exist.

    Raises:
        ValueError: if the joined frame does not have exactly two data columns.
    """
    df1 = extract_streamflow_series(pair[0])
    df2 = extract_streamflow_series(pair[1])
    concurrent_df = pd.concat([df1, df2], join='inner', axis=1)
    if len(concurrent_df) < 365:
        return np.nan

    flows = concurrent_df.to_numpy()
    if flows.shape[1] != 2:
        raise ValueError(
            f'expected exactly one data column per station, got {flows.shape[1]} columns in total'
        )
    # Pass x and y explicitly rather than relying on linregress splitting a
    # single 2-D array into the two sets of measurements.
    fit = st.linregress(flows[:, 0], flows[:, 1])
    return fit.rvalue ** 2
    

In [None]:
# add one '<characteristic>_diff' column per basin characteristic (b1 minus b2)
for char in basin_characteristics_cols:
    all_df[f'{char}_diff'] = [
        hysets_dict[first_stn][char] - hysets_dict[second_stn][char]
        for first_stn, second_stn in zip(all_df['b1'], all_df['b2'])
    ]

In [None]:
# Compute the similarity measure for every pair in parallel.
pool = Pool()
t0 = time.time()
try:
    all_df['similarity'] = pool.map(calculate_similarity_measure_COD, all_df[['b1', 'b2']].to_numpy())
finally:
    # always release the worker processes, even if the map raises
    pool.close()
    pool.join()
t1 = time.time()

In [None]:
# report the elapsed time (t0/t1 are set in the cell that runs the similarity map)
print(f't for {len(all_df)} results: {t1-t0:.1f}s')
# display the columns now present on all_df
all_df.columns

In [None]:
# drop pairs with any missing value and save the final table
print(len(all_df))
results_df = all_df.dropna(how='any')
print(len(results_df))
results_df.to_csv('results/results_min_365d_concurrent.csv', index=False)

In [None]:
import pandas as pd

# reload the saved results file to verify what was written to disk
foo = pd.read_csv('results/results_min_365d_concurrent.csv')

In [None]:
# Per-column count of rows whose similarity is NaN in the reloaded file.
# Rows with any NaN were dropped before saving, so these counts are presumably all zero.
foo[foo['similarity'].isna()].count()