### Compare CONUS2 and USGS streamflow ###
DTT, 10/22

This notebook reads in CSVs of CONUS2 daily mean flow [cms] and USGS daily mean flow [cms] and computes comparison statistics. Gages are matched between the two CSVs so that statistics are only computed at viable gage locations present in both datasets.

Inputs:
- CONUS2 daily mean flow csv in [cms] - the output of `Make_CONUS2_streamflow_csv.py`
- USGS daily mean flow csv in [cms] - the output of `point_obs_CONUS2_Streamflow.ipynb`
- USGS gage metadata csv - the output of `point_obs_CONUS2_Streamflow.ipynb`

Outputs:
- Gage summary CSV of statistics  
(bias, relative bias, correlation, Spearman rank, RMSE, NSE, scaled RMSE)
for CONUS2 simulated streamflow, compared to USGS observations.

In [2]:
import sys
from parflow.tools.io import read_pfb,write_pfb
import numpy as np
import pandas as pd
#import hydroeval as he
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from scipy import stats

import matplotlib.pyplot as plt

In [3]:
# Water year being analysed; the input file names below are derived from it,
# so switching years is a one-line change.
water_year = 2003
# Directory containing output CSVs from `1_Organize_Streamflow.py`
#organized_dir = '/glade/work/tijerina/PFCONUS2-analysis/scripts/Validation/Streamflow/Organized_Daily_Flow'
organized_dir = '/home/dtt2/CONUS2/PFCONUS2-analysis/scripts/Validation/Streamflow/Organized_Daily_Flow'

# names of csv files (all live inside organized_dir)
usgs_csv = f'USGS_Daily_matched_flow_cms_{water_year}.csv'
pf_csv = f'PFCONUS2_Daily_matched_flow_cms_{water_year}.csv'
metadata_csv = f'USGS_metadata_matched_flow_{water_year}.csv'

### Read CONUS2 & USGS flow

In [4]:
# Load the matched CONUS2 daily flow table: one row per gage (indexed by
# site_id), one column per day of the water year.
pf_path = f'{organized_dir}/{pf_csv}'
pfdata = pd.read_csv(pf_path, index_col=['site_id'])
print(pfdata.shape)
pfdata.head()

(5399, 365)


Unnamed: 0_level_0,2002-10-01,2002-10-02,2002-10-03,2002-10-04,2002-10-05,2002-10-06,2002-10-07,2002-10-08,2002-10-09,2002-10-10,...,2003-09-21,2003-09-22,2003-09-23,2003-09-24,2003-09-25,2003-09-26,2003-09-27,2003-09-28,2003-09-29,2003-09-30
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010000,43.948426,40.76587,34.389042,27.512755,22.526485,18.593126,15.312236,12.53296,10.260977,8.550327,...,0.228571,0.168567,0.238709,0.472419,0.646194,0.612115,0.546623,0.822631,8.067395,0.0
1010070,5.335298,3.581117,2.112628,1.401444,1.006803,0.803474,0.716559,0.526913,0.385837,0.335107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116555,2.476887,0.0
1010500,94.018137,87.844915,81.101017,74.425605,67.642209,60.635444,52.793281,44.862003,38.017604,32.625893,...,1.563807,1.28633,1.151007,1.301772,1.288099,1.414516,1.376162,1.64936,44.72265,0.0
1011000,37.266464,34.151593,31.949418,28.746042,24.794453,21.037749,17.72566,14.927488,12.823219,11.328315,...,0.38303,0.248542,0.270354,0.370566,0.447849,0.437744,0.346303,1.857882,32.173826,0.0
1013500,18.686342,17.994405,17.17183,14.470901,14.358695,12.452931,11.061765,10.040905,9.119768,8.49928,...,0.542694,0.269608,0.231789,0.451696,0.32128,0.249696,0.204132,1.203276,30.743592,0.0


In [5]:
# Load the matched USGS daily flow table: one row per gage (indexed by
# site_id), one column per day of the water year.
usgs_path = f'{organized_dir}/{usgs_csv}'
usgsdata = pd.read_csv(usgs_path, index_col=['site_id'])
print(usgsdata.shape)
usgsdata.head()

(5399, 365)


Unnamed: 0_level_0,2002-10-01,2002-10-02,2002-10-03,2002-10-04,2002-10-05,2002-10-06,2002-10-07,2002-10-08,2002-10-09,2002-10-10,...,2003-09-21,2003-09-22,2003-09-23,2003-09-24,2003-09-25,2003-09-26,2003-09-27,2003-09-28,2003-09-29,2003-09-30
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010000,9.1692,7.8391,7.7259,6.792,5.7732,5.1223,4.7261,4.4714,4.1318,3.9337,...,4.0752,3.9903,3.9903,4.3865,6.3958,7.075,6.8486,7.4712,8.9711,9.1975
1010070,0.59713,0.50657,0.45846,0.43582,0.42733,0.39054,0.37922,0.36224,0.37073,0.39903,...,0.94805,0.89994,0.90277,1.03578,1.35274,1.42066,1.39236,1.33576,1.64423,1.52254
1010500,8.3202,11.4615,12.9331,11.3766,11.4049,10.3012,9.2824,8.207,7.5278,7.4429,...,11.603,11.4049,11.1219,11.9143,13.1595,13.9519,15.9046,18.0554,58.581,74.146
1011000,7.4146,7.3863,6.9901,6.5656,6.3675,6.1977,5.7449,5.0657,4.528,4.4148,...,12.6501,12.3671,11.9426,12.7067,13.6972,13.6123,13.1595,14.0085,66.222,76.127
1013500,3.5092,3.4243,3.1696,3.113,3.3394,2.8583,2.8583,2.6319,2.4055,2.50455,...,14.3764,13.8104,13.6123,13.5557,13.1595,12.2539,11.9426,15.7631,58.298,51.223


In [6]:
# Replace every USGS value that is not strictly positive with a tiny flow so
# the ratio-based statistics below never divide by zero.
# NOTE: the comparison `usgsdata > 0` is False for NaN as well, so missing
# and negative values are also set to 0.0001, not only exact zeros.
usgsdata = usgsdata.mask(~(usgsdata > 0), 0.0001)

In [7]:
# Load the USGS gage metadata for the matched gages, indexed by site_id.
# The NWM gage-adjustment table (adjusted lat/long) is not read here; the
# disabled call is kept for reference.
#NWM_gage_csv = pd.read_csv('/glade/p/univ/ucsm0002/CONUS2/domain_files/NWM_Gage_Adjustments_final.csv')
metadata_path = f'{organized_dir}/{metadata_csv}'
usgs_gage_metadata = pd.read_csv(metadata_path, index_col=['site_id'])
print(usgs_gage_metadata.shape)
usgs_gage_metadata.head()

(5399, 84)


Unnamed: 0_level_0,site_type,agency,site_name,site_id_agency,site_query_url,date_metadata_last_updated,latitude,longitude,state,huc,...,topo_cd,instruments_cd,construction_dt,inventory_dt,drain_area_va,contrib_drain_area_va,local_time_fg,reliability_cd,project_no,num_obs
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010000,stream gauge,USGS,"St. John River at Ninemile Bridge, Maine",1010000,https://waterservices.usgs.gov/nwis/site/?form...,2023-05-30,46.700556,-69.715556,ME,1010001,...,,YNNNYNNNNYNNNNNNYNNNNNNNNNNNNN,,,1341.0,1341.0,Y,,442300100,365
1010070,stream gauge,USGS,"Big Black River near Depot Mtn, Maine",1010070,https://waterservices.usgs.gov/nwis/site/?form...,2023-05-30,46.893889,-69.751667,ME,1010001,...,,YNNNYNYNNYNNNNNNYNNNNNNNNNNNNN,,,171.0,171.0,Y,,442300100,365
1010500,stream gauge,USGS,"St. John River at Dickey, Maine",1010500,https://waterservices.usgs.gov/nwis/site/?form...,2023-05-30,47.113056,-69.088056,ME,1010001,...,,YNNNYNNNNYNNNYNNYNNNNNNNNNNNNN,,,2680.0,2680.0,Y,,442300100,365
1011000,stream gauge,USGS,"Allagash River near Allagash, Maine",1011000,https://waterservices.usgs.gov/nwis/site/?form...,2023-05-30,47.069722,-69.079444,ME,1010002,...,,YNNNYNNNNYNNNNNNYNNNNNNNNNNNNN,,,1478.0,1229.0,Y,,442300100,365
1013500,stream gauge,USGS,"Fish River near Fort Kent, Maine",1013500,https://waterservices.usgs.gov/nwis/site/?form...,2023-05-30,47.2375,-68.582778,ME,1010003,...,,YNNNYNYNNYNNNNNNYNNNNNNNNNNNNN,,,873.0,873.0,Y,,442300100,365


### Calculate statistics

In [8]:
# Seed the per-gage statistics table (index: gage site_id) with the location
# and HUC columns taken from the gage metadata. The statistic columns are
# appended further down.
stats_df = (
    usgs_gage_metadata[['latitude', 'longitude', 'conus2_x', 'conus2_y', 'huc']]
    .astype({'conus2_x': int, 'conus2_y': int})
    # Integer division keeps the result an integer; presumably `huc` is an
    # 8-digit HUC8 code read as an int, so this leaves its leading two digits.
    .assign(HUC2=lambda meta: meta['huc'] // 1000000)
)

print(stats_df.shape)
stats_df.head()

(5399, 6)


Unnamed: 0_level_0,latitude,longitude,conus2_x,conus2_y,huc,HUC2
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1010000,46.700556,-69.715556,4181,2731,1010001,1
1010070,46.893889,-69.751667,4172,2746,1010001,1
1010500,47.113056,-69.088056,4209,2784,1010001,1
1011000,47.069722,-69.079444,4210,2783,1010002,1
1013500,47.2375,-68.582778,4237,2810,1010003,1


In [9]:
# pfmean = np.mean(pfdata, axis = 1)
# pfmean

In [10]:
# Annual mean flow per gage [cms].
# USGS: total flow divided by the number of observations from the metadata.
usgs_flow_sum = usgsdata.sum(axis='columns')
stats_df['USGS_mean_cms'] = usgs_flow_sum.div(usgs_gage_metadata['num_obs'])
# CONUS2: plain mean across all daily columns.
stats_df['PF_mean_cms'] = pfdata.mean(axis=1)

In [11]:
# Pre-allocate one zero-filled array per statistic, with one slot per gage
# in the same order as usgsdata.index.
n_gages = usgsdata.shape[0]

# goodness-of-fit statistics
r2_daily = np.zeros(n_gages)
srho_daily = np.zeros(n_gages)
pval_daily = np.zeros(n_gages)
mse_daily = np.zeros(n_gages)
rmse_daily = np.zeros(n_gages)
nse_daily = np.zeros(n_gages)

# bias statistics
bias_daily = np.zeros(n_gages)
pbias_daily = np.zeros(n_gages)
pbias_test = np.zeros(n_gages)
absrelbias_daily = np.zeros(n_gages)
relbias_daily = np.zeros(n_gages)
pf_minus_obs_daily = np.zeros(n_gages)

In [12]:
# Per-gage statistics comparing CONUS2 simulated daily flow against USGS
# observations. Both tables are looked up by site_id label, so they do not
# need to share the same row order.
for g, gage in enumerate(usgsdata.index):
    # Fetch each gage's series and totals once instead of on every statistic.
    obs = usgsdata.loc[gage]  # USGS daily flow for this gage
    sim = pfdata.loc[gage]    # CONUS2 daily flow for this gage
    obs_total = obs.sum()
    sim_total = sim.sum()

    # Difference of total flow, then the same difference relative to the
    # observed total ('bias' is a fraction, 'pbias' the same value in percent).
    pf_minus_obs_daily[g] = sim_total - obs_total
    bias_daily[g] = pf_minus_obs_daily[g] / obs_total
    absrelbias_daily[g] = abs(bias_daily[g])  # absolute relative bias
    pbias_daily[g] = bias_daily[g] * 100

    # Spearman rank correlation and its p-value
    srho_daily[g], pval_daily[g] = stats.spearmanr(obs, sim)

    # Error metrics; RMSE reuses the MSE rather than recomputing it.
    mse_daily[g] = mean_squared_error(obs, sim)
    rmse_daily[g] = sqrt(mse_daily[g])

    # With the observations passed as y_true, r2_score evaluates
    # 1 - SS_res/SS_tot about the observed mean, which is the same formula as
    # the Nash-Sutcliffe efficiency, so the NSE array is filled from it.
    r2_daily[g] = r2_score(obs, sim)
    nse_daily[g] = r2_daily[g]



In [13]:
# Attach each per-gage statistic array to stats_df as a column. The mapping
# order determines the column order in the summary table.
daily_stat_columns = {
    'pf_minus_obs': pf_minus_obs_daily,
    'absrelbias': absrelbias_daily,
    'bias': bias_daily,
    'pbias': pbias_daily,
    'srho': srho_daily,
    'r2': r2_daily,
    'rmse': rmse_daily,
    'mse': mse_daily,
}
for column_name, column_values in daily_stat_columns.items():
    stats_df[column_name] = column_values




In [14]:
# Preview the first rows of the per-gage statistics table before it is saved
stats_df.head()

Unnamed: 0_level_0,latitude,longitude,conus2_x,conus2_y,huc,HUC2,USGS_mean_cms,PF_mean_cms,pf_minus_obs,absrelbias,bias,pbias,srho,r2,rmse,mse
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1010000,46.700556,-69.715556,4181,2731,1010001,1,54.73251,21.812227,-12015.90332,0.601476,-0.601476,-60.147585,0.859089,0.194915,77.527725,6010.548221
1010070,46.893889,-69.751667,4172,2746,1010001,1,9.014225,2.360176,-2428.727876,0.738172,-0.738172,-73.817209,0.697101,0.161414,14.367161,206.415312
1010500,47.113056,-69.088056,4209,2784,1010001,1,104.191141,51.536925,-19218.788658,0.505362,-0.505362,-50.536173,0.79535,0.387808,126.564636,16018.606962
1011000,47.069722,-69.079444,4210,2783,1010002,1,41.836549,29.714315,-4424.615543,0.289752,-0.289752,-28.975225,0.706543,0.598717,36.749715,1350.541531
1013500,47.2375,-68.582778,4237,2810,1010003,1,39.405556,18.607378,-7591.334817,0.527798,-0.527798,-52.77981,0.517411,0.220357,44.114798,1946.115399


In [15]:
# Write the per-gage summary statistics next to the input CSVs. The file name
# follows `water_year`, so the output always matches the year being analysed.
stats_df.to_csv(f'{organized_dir}/Summary_Gages_CONUS2_WY{water_year}.csv')

### Calculate PDF and CDF of stats

In [None]:
# Kernel density estimate (PDF) of one per-gage statistic across all gages.
# NOTE(review): the original cell never defined `data`; 'srho' is an assumed
# choice -- set kde_stat to whichever numeric stats_df column should be plotted.
kde_stat = 'srho'
# gaussian_kde cannot handle NaN/inf, so drop them before fitting
data = stats_df[kde_stat].replace([np.inf, -np.inf], np.nan).dropna()

# test values for the bw_method option (None selects scipy's default, Scott's rule)
bw_values = [None, 0.1, 0.01]

# generate a list of kde estimators, one for each bw
kde = [stats.gaussian_kde(data, bw_method=bw) for bw in bw_values]

# plot (normalized) histogram of the data; `density=True` replaces the
# `normed` argument that has been removed from matplotlib
fig, ax = plt.subplots()
ax.hist(data, 50, density=True, facecolor='green', alpha=0.5)

# plot density estimates over the range actually covered by the data
t_range = np.linspace(data.min(), data.max(), 200)
for estimator, bw in zip(kde, bw_values):
    ax.plot(t_range, estimator(t_range), lw=2, label='bw = '+str(bw))
ax.set_xlabel(kde_stat)
ax.set_ylabel('density')
ax.legend(loc='best');