In [None]:
import os
import time

import pandas as pd
import numpy as np
import xarray as xr

import scipy.stats as st

from shapely.geometry import Point
import geopandas as gpd

from multiprocessing import Pool

# Paired Watershed Characteristics

Develop a framework for comparing pairs of daily flow series from basins in the WSC database.  

## Method:

1. Generate a list of valid pairs of stations. A valid pair is one where:
    * basin geometry exists for both stations
    * there are at least N years of concurrent data between the two stations.

In [None]:
# Load the WSC station metadata table.
# NOTE(review): the folder is an absolute path to an external drive; adjust it
# for the machine the notebook runs on.
WSC_db_folder = '/media/danbot/T7 Touch/hydat_db/'
metadata_fn = 'WSC_Stations_Master.csv'

df = pd.read_csv(os.path.join(WSC_db_folder, metadata_fn))
df.head()

In [None]:
# Record span per station: last year of record minus first year of record.
df['num_years_record'] = df['Year To'].sub(df['Year From'])

In [None]:
# Restrict the metadata table to stations whose province is BC or AB,
# then report how many stations remain.
df = df.loc[df['Province'].isin(['BC', 'AB'])]
print(df.shape[0])

In [None]:
# Draw random station pairs (with replacement) from the filtered metadata table.
# A fixed seed makes the sample, and everything derived from it, repeatable
# after a kernel restart.
# Sampling with replacement can produce repeated pairs and pairs made of the
# same station twice; neither is removed in this cell.
PAIR_SAMPLE_SEED = 42
N_PAIR_SAMPLES = int(5E6)

pair_rng = np.random.default_rng(PAIR_SAMPLE_SEED)
sampled_pairs = pair_rng.choice(
    df['Station Number'].to_numpy(),
    size=(N_PAIR_SAMPLES, 2),
    replace=True,
)
# Store each pair in sorted order so (a, b) and (b, a) share one representation.
stn_pairs_list = [sorted(pair) for pair in sampled_pairs]

In [None]:
# Number of randomly sampled station pairs (shown as the cell output).
len(stn_pairs_list)

In [None]:
def check_time_periods(p, min_years=50):
    """Return the pair if the two stations' records overlap for long enough.

    The record period of each station is read from the 'Year From' and
    'Year To' columns of the module-level ``df``. The concurrent period of
    two records is the span between the later start and the earlier end.

    Parameters
    ----------
    p : sequence
        Two station numbers, each present in ``df['Station Number']``.
    min_years : int or float, optional
        The overlap (in years) must be strictly greater than this value.
        Defaults to 50.

    Returns
    -------
    The input ``p`` when the overlap exceeds ``min_years``; otherwise None.
    None is also returned when both entries are the same station, since a
    station paired with itself is not a comparison between two basins.
    """
    stn1, stn2 = p[0], p[1]
    if stn1 == stn2:
        return None

    s1 = df[df['Station Number'] == stn1]
    s2 = df[df['Station Number'] == stn2]
    start1, end1 = s1['Year From'].to_numpy()[0], s1['Year To'].to_numpy()[0]
    start2, end2 = s2['Year From'].to_numpy()[0], s2['Year To'].to_numpy()[0]

    # Overlap = earliest end minus latest start. Comparing only the end years
    # overstates the overlap when one record lies inside the other.
    # np.minimum / np.maximum propagate NaN, so a missing year yields a NaN
    # overlap, which fails the comparison below and rejects the pair.
    overlap_duration = np.minimum(end1, end2) - np.maximum(start1, start2)

    if overlap_duration > min_years:
        return p
    return None

In [None]:
# Filter for pairs that have a minimum of 50 years of concurrent data.
# Rejected pairs come back as None and are removed in a later cell.
# The context manager shuts the worker pool down even if a task raises.
with Pool() as pool:
    overlapping_records = pool.map(check_time_periods, stn_pairs_list)


In [None]:
# Drop the falsy placeholders (None) left by rejected pairs.
filtered_pairs = list(filter(None, overlapping_records))

In [None]:
def check_unique_pairs(pair):
    """Return ``pair`` unless its reversed form is present in ``filtered_pairs``.

    The lookup is a linear scan of the module-level ``filtered_pairs`` list.
    A pair whose two entries are equal is its own reverse, so it is rejected
    whenever it appears in ``filtered_pairs``.

    Returns
    -------
    The input ``pair``, or None when the reversed pair was found.
    """
    reversed_pair = pair[::-1]
    if reversed_pair in filtered_pairs:
        return None
    return pair
        

In [None]:
# De-duplicate the filtered pairs.
# Pairs are compared in sorted order so (a, b) and (b, a) count as the same
# pair; looking only for the reversed form of a pair does not detect a pair
# that is simply listed twice. dict.fromkeys keeps the first occurrence of each
# pair in its original position, and a station paired with itself is dropped
# because it is not a comparison between two basins.
distinct_pairs = dict.fromkeys(
    tuple(sorted(p)) for p in filtered_pairs if p[0] != p[1]
)
unique_pairs = [list(p) for p in distinct_pairs]
print(len(unique_pairs))
print(f'Of {len(filtered_pairs)}, {len(unique_pairs)} are unique.')

In [None]:
# Load the HYSETS watershed properties table (semicolon-delimited).
# 'Official_ID' is read as text so station identifiers keep their exact form.
hysets_df = pd.read_csv(
    'data/HYSETS_watershed_properties.txt',
    sep=';',
    dtype={'Official_ID': str},
)
print(hysets_df.columns)

In [None]:
# Build one shapely Point (longitude, latitude) per row from the centroid columns.
hysets_df['centroid_geom'] = [
    Point((lon, lat))
    for lon, lat in zip(hysets_df['Centroid_Lon_deg_E'], hysets_df['Centroid_Lat_deg_N'])
]

In [None]:
# Display the column index again, which now includes 'centroid_geom'.
hysets_df.columns

In [None]:
# Column groups used to build a per-station lookup of basin properties.
# Identifying columns:
basin_metadata = ['Watershed_ID', 'Official_ID', 'Name']

# Centroid geometry column added to hysets_df in an earlier cell:
basin_centroid_geom = ['centroid_geom']

# Characteristics that are differenced between the two basins of a pair:
basin_characteristics_cols = [
    'Drainage_Area_km2',
    'Elevation_m',
    'Gravelius',
    'Aspect_deg',
    'Slope_deg',
    'Land_Use_Forest_frac',
    'Land_Use_Grass_frac',
    'Land_Use_Wetland_frac',
    'Land_Use_Water_frac',
    'Land_Use_Urban_frac',
    'Land_Use_Shrubs_frac',
    'Land_Use_Crops_frac',
    'Land_Use_Snow_Ice_frac',
    'Permeability_logk_m2',
    'Porosity_frac',
]

# Nested dict keyed by 'Official_ID': {official_id: {column: value, ...}}.
hysets_dict = (
    hysets_df[[*basin_metadata, *basin_centroid_geom, *basin_characteristics_cols]]
    .set_index('Official_ID')
    .to_dict(orient='index')
)

In [None]:
# Folder holding the per-station '<station>.csv' series read by extract_streamflow_series.
# NOTE(review): absolute path to an external drive; adjust for the machine in use.
hysets_folder = '/media/danbot/T7 Touch/hysets_series/'

In [None]:
# Remove any falsy placeholders (None) from the pair list.
unique_pairs = list(filter(None, unique_pairs))
# Distinct stations appearing in any remaining pair.
unique_stn_set_from_pairs = list(set(stn for pair in unique_pairs for stn in pair))
# Keep the stations that have an entry in the HYSETS lookup...
unique_stn_set = [stn for stn in unique_stn_set_from_pairs if stn in hysets_dict]
print(len(unique_stn_set))
# ...and whose 'Elevation_m' value is not NaN.
unique_stn_set = [stn for stn in unique_stn_set if not np.isnan(hysets_dict[stn]['Elevation_m'])]
print(len(unique_stn_set))
print(f'{len(unique_stn_set)} unique stations from the WSC dataset fit the concurrence criteria.')


In [None]:
# Keep only the pairs whose two stations both passed the HYSETS screening.
# The station list is turned into a set once, so each pair is checked with
# hash lookups instead of scanning the whole list for every pair.
hysets_station_lookup = set(unique_stn_set)
unique_pairs_hysets = [p for p in unique_pairs if set(p).issubset(hysets_station_lookup)]

print(f'{len(unique_pairs_hysets)} unique station pairs from the WSC dataset fit the concurrence criteria and are in HYSETS.')

In [None]:
# Preview the first ten retained station pairs.
print(unique_pairs_hysets[:10])

In [None]:
# Cache the retained pairs on disk so the sampling and filtering steps
# above do not have to be repeated in a later session.
np.save('unique_pairs.npy', np.asanyarray(unique_pairs_hysets), allow_pickle=True)

## Extract the comparative characteristics for each basin pairing

1. Calculate a 'similarity' metric based on concurrent data.
2. Retrieve basin characteristics from the hysets basin characteristics file.
3. Calculate the differences in basin characteristics (e.g. elevation, Gravelius, drainage area) and the distance between basin centroids.

In [None]:
def extract_streamflow_series(stn):
    """Read one station's series from disk and discard rows with missing values.

    The file is ``<hysets_folder><stn>.csv`` (``hysets_folder`` is a
    module-level path prefix) and its 'time' column becomes the index.

    Parameters
    ----------
    stn : station identifier used as the CSV file name.

    Returns
    -------
    pandas.DataFrame indexed by 'time', without any row containing NaN.
    """
    series_path = f'{hysets_folder}{stn}.csv'
    flow_df = pd.read_csv(series_path, index_col=['time'])
    return flow_df.dropna()

In [None]:
def get_param_diff(pair, param):
    """Absolute difference of one basin property between the two stations of a pair.

    Values are looked up in the module-level ``hysets_dict`` as
    ``hysets_dict[station][param]``.

    NOTE(review): this is a plain absolute difference; if ``param`` is a
    circular quantity (e.g. an aspect angle in degrees) the wrap-around is
    not accounted for - confirm whether that matters for the analysis.
    """
    value_first = hysets_dict[pair[0]][param]
    value_second = hysets_dict[pair[1]][param]
    return abs(value_first - value_second)

In [None]:
def get_similarity_measure_COD(pair):
    """Coefficient of determination (r squared) between two stations' concurrent series.

    Each station's series is loaded with ``extract_streamflow_series``, its
    'discharge' column is renamed to the station identifier, and the two
    frames are inner-joined on their index so only shared index labels
    remain. A linear regression of the second column on the first is then
    fitted with ``scipy.stats.linregress``.

    Assumes the first two columns of the joined frame are the two stations'
    discharge series - TODO confirm the CSV files carry no other columns.

    Parameters
    ----------
    pair : sequence of two station identifiers.

    Returns
    -------
    float r**2 of the regression, or None when the concurrent data cannot
    support a regression (fewer than two shared rows, or every x value
    identical).
    """
    first = extract_streamflow_series(pair[0]).rename(columns={'discharge': f'{pair[0]}'})
    second = extract_streamflow_series(pair[1]).rename(columns={'discharge': f'{pair[1]}'})
    concurrent_df = pd.concat([first, second], join='inner', axis=1)

    # A line cannot be fitted through fewer than two concurrent observations.
    if len(concurrent_df) < 2:
        return None

    cols = concurrent_df.columns
    x_values = concurrent_df[cols[0]]
    y_values = concurrent_df[cols[1]]

    # linregress cannot fit a slope when every x value is identical
    # (recent SciPy releases raise ValueError), so treat it as unusable data.
    if x_values.nunique() < 2:
        return None

    fit = st.linregress(x_values, y_values)
    return fit.rvalue ** 2
    

In [None]:
def get_distance(pair):
    """Distance between the centroids of the two basins in ``pair``.

    Rows of the module-level ``hysets_df`` whose 'Official_ID' is in ``pair``
    are wrapped in a GeoDataFrame using the 'centroid_geom' points with CRS
    EPSG:4326, reprojected to EPSG:3005, and the distance between the first
    two rows is divided by 1000 (metres to kilometres, assuming EPSG:3005
    coordinates are in metres).

    Parameters
    ----------
    pair : sequence of two station identifiers present in ``hysets_df``.

    Returns
    -------
    float distance between the two centroids, divided by 1000.
    """
    pair_rows = hysets_df[hysets_df['Official_ID'].isin(pair)]
    centroids = gpd.GeoDataFrame(pair_rows, geometry=pair_rows['centroid_geom'], crs='EPSG:4326')
    # Reproject, then renumber rows 0..n-1 so the two points can be picked by label.
    projected = centroids.to_crs(3005).reset_index()
    first_point = projected.loc[0, 'geometry']
    second_point = projected.loc[1, 'geometry']
    return first_point.distance(second_point) / 1000
    

In [None]:
def run_similarity_to_distance_calc(pair):
    """Assemble the similarity, centroid distance and property differences for a pair.

    Parameters
    ----------
    pair : sequence of two station identifiers.

    Returns
    -------
    list laid out as ``[similarity, distance, diff_1, ..., diff_n]`` with one
    difference per entry of ``basin_characteristics_cols`` (in that order),
    or None when no similarity could be computed for the pair.
    """
    similarity = get_similarity_measure_COD(pair)
    # Without a similarity value the row is discarded, so skip the distance
    # and property lookups entirely.
    if similarity is None:
        return None

    property_diffs = [similarity, get_distance(pair)]
    property_diffs.extend(get_param_diff(pair, c) for c in basin_characteristics_cols)
    return property_diffs

In [None]:
# Display the list of characteristic columns that are differenced for each pair.
basin_characteristics_cols

In [None]:
# results = []
# i = 0
# t0 = time.time()
# for p in unique_pairs_hysets:
#     results.append(run_similarity_to_distance_calc(p))
#     if (i > 99) & (i % 100 == 0):
#         t1 = time.time()
#         print(f'time for {i} results: {t1-t0:.1f}')
        
#     i += 1

In [None]:
# Load the saved unique pairs from the cache written earlier.
unique_pairs_hysets = np.load('unique_pairs.npy').tolist()
# Column labels are given at construction so an empty pair list produces an
# empty two-column frame instead of a length-mismatch error.
pairs_df = pd.DataFrame(unique_pairs_hysets, columns=['b1', 'b2'])
print(pairs_df.head())
# print(f'There are {len(unique_pairs_hysets)} pairs')

In [None]:
# Evaluate every station pair in parallel and report the elapsed wall time.
# The context manager shuts the worker pool down even if a task raises.
with Pool() as pool:
    t0 = time.time()
    results = pool.map(run_similarity_to_distance_calc, unique_pairs_hysets)
t1 = time.time()
print(f't for {len(results)} results: {t1-t0:.1f}s')

In [None]:
# Discard falsy entries (None or empty lists) returned for unusable pairs.
results = list(filter(None, results))

In [None]:
# Tabulate the per-pair results and write them to disk.
# Column labels are given at construction so an empty result list still yields
# a correctly labelled (empty) table instead of a length-mismatch error.
results_df = pd.DataFrame(
    results,
    columns=['similarity', 'distance'] + basin_characteristics_cols,
)
results_df.to_csv('results1.csv', index=False)

In [None]:
# print(results_df.head())
# foo = results_df[0]
# bar = results_df[1]
# print(results_df.max())
# print(results_df.min())
# sum(np.where(foo > 1))

In [None]:
#