In [1]:
import os
import time

import pandas as pd
import numpy as np
import xarray as xr

import scipy.stats as st

from itertools import combinations

from shapely.geometry import Point
import geopandas as gpd

from multiprocessing import Pool



# Paired Watershed Characteristics

Develop framework to compare pairs of daily flow series from basins in the WSC database.  

## Method:

1. Generate a list of valid pairs of stations. A valid pair is one where:
    * basin geometry exists for both stations
    * there is a minimum N years of concurrent data between the two stations.

In [2]:
# import basin characteristics
WSC_db_folder = '/media/danbot/T7 Touch/hydat_db/'
metadata_fn = 'WSC_Stations_Master.csv'

df = pd.read_csv(WSC_db_folder + metadata_fn)
df.head()

Unnamed: 0,Station Number,Station Name,Province,Status,Latitude,Longitude,Year From,Year To,Gross Drainage Area (km2),Effective Drainage Area (km2),...,Data Type,Operation Schedule,Sediment,RHBN,Real-Time,Datum Name,Publishing Office,Operating Agency,Contributed,Elevation
0,01AA002,DAAQUAM (RIVIERE) EN AVAL DE LA RIVIERE SHIDGEL,QC,Discontinued,46.5575,-70.08111,1967,1977,598.0,,...,Flow,Continuous,N,N,N,ARBITRAIRE,QUEBEC CITY,MINISTERE DE L'ENVIRONNEMENT DU QUEBEC,Y,347.313904
1,01AA004,GAUTHIER (RIVIERE) A SON EMBOUCHURE,QC,Discontinued,46.80083,-70.13806,1975,1981,16.6,,...,Level,Seasonal,N,N,N,ARBITRAIRE,QUEBEC CITY,MINISTERE DE L'ENVIRONNEMENT DU QUEBEC,Y,401.0
2,01AB001,BUCKLEY (RIVIERE) EN AVAL DU LAC JOHNNY,QC,Discontinued,46.87917,-70.0875,1973,1981,6.94,,...,Level,Seasonal,N,N,N,ARBITRAIRE,QUEBEC CITY,MINISTERE DE L'ENVIRONNEMENT DU QUEBEC,Y,411.0
3,01AD001,MADAWASKA (RIVIÈRE) À 6 KM EN AVAL DU BARRAGE ...,QC,Discontinued,47.54833,-68.63639,1918,2005,2690.0,,...,Flow,Continuous,N,N,N,ARBITRAIRE,QUEBEC CITY,MINISTERE DE L'ENVIRONNEMENT DU QUEBEC,Y,143.66777
4,01AD002,SAINT JOHN RIVER AT FORT KENT,NB,Active,47.25806,-68.59583,1926,2018,14700.0,,...,Flow,Continuous,N,Y,N,INTERNATIONAL BOUNDARY COMMISSION DATUM,DARTMOUTH,UNITED STATES GEOLOGICAL SURVEY,Y,158.894211


In [3]:
df['num_years_record'] = df['Year To'] - df['Year From']

In [4]:
# filter for stations in BC and Alberta
df = df[df['Province'].isin(['BC', 'AB'])]
print(len(df))

3588


In [5]:
stn_pairs_list = list(combinations(df['Station Number'].to_numpy(), 2))
print(len(df))
print(len(stn_pairs_list))

3588
6435078


In [6]:
# pairs with < 20 years overlap return as None, so filter them out
# filtered_pairs = [e for e in overlapping_records if e]

In [7]:
hysets_df = pd.read_csv('data/HYSETS_watershed_properties.txt', sep=';', dtype={'Official_ID': str})
hysets_df = hysets_df[hysets_df['Source'] == 'HYDAT']

In [8]:
# create a centroid shapely Point
hysets_df['centroid_geom'] = hysets_df.apply(lambda xy: Point((xy['Centroid_Lon_deg_E'], xy['Centroid_Lat_deg_N'])), axis=1)

In [9]:
hysets_df.columns

Index(['Watershed_ID', 'Source', 'Name', 'Official_ID', 'Centroid_Lat_deg_N',
       'Centroid_Lon_deg_E', 'Drainage_Area_km2', 'Drainage_Area_GSIM_km2',
       'Flag_GSIM_boundaries', 'Flag_Artificial_Boundaries', 'Elevation_m',
       'Slope_deg', 'Gravelius', 'Perimeter', 'Flag_Shape_Extraction',
       'Aspect_deg', 'Flag_Terrain_Extraction', 'Land_Use_Forest_frac',
       'Land_Use_Grass_frac', 'Land_Use_Wetland_frac', 'Land_Use_Water_frac',
       'Land_Use_Urban_frac', 'Land_Use_Shrubs_frac', 'Land_Use_Crops_frac',
       'Land_Use_Snow_Ice_frac', 'Flag_Land_Use_Extraction',
       'Permeability_logk_m2', 'Porosity_frac', 'Flag_Subsoil_Extraction',
       'centroid_geom'],
      dtype='object')

In [10]:
# create a dictionary of identifying information to facilitate
# selection of specific watersheds
basin_metadata = ['Watershed_ID', 'Official_ID', 'Name']

basin_centroid_geom = ['centroid_geom']

basin_characteristics_cols = ['Drainage_Area_km2', 
                              'Elevation_m', 'Gravelius', 'Aspect_deg', 
                              'Slope_deg', 'Land_Use_Forest_frac',
                              'Land_Use_Grass_frac', 'Land_Use_Wetland_frac', 
                              'Land_Use_Water_frac', 'Land_Use_Urban_frac', 
                              'Land_Use_Shrubs_frac', 'Land_Use_Crops_frac',
                              'Land_Use_Snow_Ice_frac', 'Permeability_logk_m2', 
                              'Porosity_frac']

hysets_dict = hysets_df[basin_metadata + basin_centroid_geom + basin_characteristics_cols].set_index('Official_ID').to_dict(orient='index')

In [11]:
hysets_stns = list(hysets_dict.keys())
n_hydat_stns = len(hysets_stns)
print(f'There are {n_hydat_stns} HYDAT station records in the HYSETS database.')

There are 2375 HYDAT station records in the HYSETS database.


In [12]:
def check_if_pair_in_hysets(pair):
    return (pair[0] in hysets_stns) & (pair[1] in hysets_stns)        

In [13]:
pair_df = pd.DataFrame(stn_pairs_list, columns=['b1', 'b2'])

In [15]:
pool = Pool()
t0 = time.time()
pair_df['pair_in_hysets'] = pool.map(check_if_pair_in_hysets, stn_pairs_list)
pool.close()
pool.join()
t1 = time.time()
print(f't for {len(stn_pairs_list)} results: {t1-t0:.1f}s')

t for 6435078 results: 77.2s


In [16]:
print(f'len before filter = {len(pair_df)}')
pair_df = pair_df[pair_df['pair_in_hysets']]
print(f'len after filter = {len(pair_df)}')

len before filter = 6435078
len after filter = 529935


In [24]:
def check_time_periods(p):
    stn1, stn2 = p[0], p[1]
    s1 = df[df['Station Number']==stn1]
    s2 = df[df['Station Number']==stn2]
    start1, end1 = s1['Year From'].to_numpy()[0], s1['Year To'].to_numpy()[0]
    start2, end2 = s2['Year From'].to_numpy()[0], s2['Year To'].to_numpy()[0]
    if end1 < end2:
        overlap_duration = end1 - start2
    else:
        overlap_duration = end2 - start1
    if overlap_duration > 20:
        return p
    else:
        return None

In [26]:
# filter for pairs that have minimum 50 years of concurrent data
t0 = time.time()
pool = Pool()
# overlapping_records = pool.map(check_time_periods, pair_df[['b1', 'b2']].to_numpy())
pool.close()
pool.join()
t1 = time.time()
print(f'Time to check first pass record overlap: {t1-t0:.1f}')

Time to check first pass record overlap: 116.9


In [28]:
print(overlapping_records[:10])

[array(['05AA006', '05AA008'], dtype=object), array(['05AA006', '05AA011'], dtype=object), array(['05AA006', '05AA013'], dtype=object), array(['05AA006', '05AA022'], dtype=object), array(['05AA006', '05AA023'], dtype=object), array(['05AA006', '05AA026'], dtype=object), array(['05AA006', '05AA027'], dtype=object), array(['05AA006', '05AA028'], dtype=object), None, None]


In [62]:
unique_pairs = np.array([tuple(e) for e in overlapping_records if e is not None])

In [63]:
print(f'there are {len(unique_pairs)} unique pairs')
print(f'object type is {type(unique_pairs)}')

there are 229617 unique pairs
object type is <class 'numpy.ndarray'>


In [104]:
unique_concurrent = list(set(unique_pairs.flatten()))
num_concurrent = len(unique_concurrent)
print(f'there are {num_concurrent} unique stations meeting the concurrence criteria.')

there are 1030 unique stations meeting the concurrence criteria.


In [67]:
# write the list of unique pairs to disk so you 
# don't have to go through that process again
np.save('unique_pairs.npy', unique_pairs, allow_pickle=True)

## Extract the comparative characteristics for each basin pairing

1. Calculate a 'similarity' metric based on concurrent data.
2. Retrieve basin characteristics from the hysets basin characteristics file.
3. Calculate differences in basin elevation, gravelius, drainage area, and distance between basin centroids.

In [73]:
hysets_folder = '/media/danbot/T7 Touch/hysets_series/'

In [49]:
def extract_streamflow_series(stn):
#     ws = hysets_dict[stn]
#     df = ds.sel(watershed=ws['Watershed_ID']-1, drop=True).to_dataframe()
    df = pd.read_csv(f'{hysets_folder}{stn}.csv', index_col=['time'])
    df.dropna(inplace=True)
    return df

In [50]:
def get_param_diff(pair, param):
    return abs(hysets_dict[pair[0]][param] - hysets_dict[pair[1]][param])

In [51]:
def get_similarity_measure_COD(pair):
    df1 = extract_streamflow_series(pair[0])
    df1.rename(mapper={'discharge': f'{pair[0]}'}, inplace=True, axis=1)
    
    df2 = extract_streamflow_series(pair[1])
    df2.rename(mapper={'discharge': f'{pair[1]}'}, inplace=True, axis=1)
    concurrent_df = pd.concat([df1, df2], join='inner', axis=1)
    
    min_concurrence = 40 * 365
    if len(concurrent_df) < min_concurrence:
        return None, len(concurrent_df)
    
    cols = concurrent_df.columns
    out = st.linregress(concurrent_df[cols[0]], concurrent_df[cols[1]])    

    return out[2]**2, len(concurrent_df)
    

In [52]:
def get_distance(pair):
    foo = hysets_df[hysets_df['Official_ID'].isin(pair)]
    hdf = gpd.GeoDataFrame(foo, geometry=foo['centroid_geom'], crs='EPSG:4326')
    hdf = hdf.to_crs(3005)
    hdf.reset_index(inplace=True)
    return hdf.loc[0, 'geometry'].distance(hdf.loc[1, 'geometry']) / 1000
    

In [53]:
def run_similarity_to_distance_calc(pair):
    property_diffs = []
    similarity, n_days_concurrent = get_similarity_measure_COD(pair)
    property_diffs.append(similarity)
    property_diffs.append(get_distance(pair))
    
    for c in basin_characteristics_cols:
        property_diffs.append(get_param_diff(pair, c))

    if similarity is not None:
        return property_diffs

In [102]:
basin_characteristics_cols

['Drainage_Area_km2',
 'Elevation_m',
 'Gravelius',
 'Aspect_deg',
 'Slope_deg',
 'Land_Use_Forest_frac',
 'Land_Use_Grass_frac',
 'Land_Use_Wetland_frac',
 'Land_Use_Water_frac',
 'Land_Use_Urban_frac',
 'Land_Use_Shrubs_frac',
 'Land_Use_Crops_frac',
 'Land_Use_Snow_Ice_frac',
 'Permeability_logk_m2',
 'Porosity_frac']

In [55]:
# results = []
# i = 0
# t0 = time.time()
# for p in unique_pairs_hysets:
#     results.append(run_similarity_to_distance_calc(p))
#     if (i > 99) & (i % 100 == 0):
#         t1 = time.time()
#         print(f'time for {i} results: {t1-t0:.1f}')
        
#     i += 1

In [68]:
# load the saved unique pairs
unique_pairs_hysets = np.load('unique_pairs.npy').tolist()
pairs_df = pd.DataFrame(unique_pairs_hysets)
pairs_df.columns = ['b1', 'b2']
print(pairs_df.head())
# print(f'There are {len(unique_pairs_hysets)} pairs')

        b1       b2
0  05AA006  05AA008
1  05AA006  05AA011
2  05AA006  05AA013
3  05AA006  05AA022
4  05AA006  05AA023


In [69]:
def check_pair_properties(row):
    for c in basin_characteristics_cols:
        p1 = hysets_dict[row['b1']][c]
        p2 = hysets_dict[row['b2']][c]
    if ~np.isnan(p1) & ~np.isnan(p2):
        return True
    else:
        return False

In [70]:
pairs_df['char_check'] = pairs_df.apply(lambda row: check_pair_properties(row), axis=1)


In [71]:
pairs_df[~pairs_df['char_check']].count()

b1            62170
b2            62170
char_check    62170
dtype: int64

In [77]:
# filter out pairs missing basin characteristics
pairs_df = pairs_df[pairs_df['char_check']]

In [78]:
pool = Pool()
t0 = time.time()

results = pool.map(run_similarity_to_distance_calc, pairs_df[['b1', 'b2']].to_numpy())
pool.close()
pool.join()
t1 = time.time()
print(f't for {len(results)} results: {t1-t0:.1f}s')

t for 167447 results: 6142.8s


In [81]:
# filter out empty result arrays
print(len(results))
results1 = [e for e in results if e]
print(len(results1))

167447
22152


In [82]:
results_df = pd.DataFrame(results1)
results_df.columns = ['similarity', 'distance'] + basin_characteristics_cols
results_df
results_df.to_csv('results_40y_min1.csv', index=False)

In [93]:
# char_df['basin_id'] = list(set(pairs_df[['b1', 'b2']].to_numpy().flatten()))
filtered_stns = list(set(pairs_df[['b1', 'b2']].to_numpy().flatten()))
char_df = hysets_df.loc[hysets_df['Official_ID'].isin(filtered_stns), :].copy()
print(len(char_df))

759


In [99]:
char_df = char_df[[e for e in char_df.columns if 'Centroid' in e] + basin_characteristics_cols]


In [100]:
char_df.to_csv('filtered_basins_characteristics.csv', index=False)