### Compare CONUS2 and SNOTEL SWE ###
DTT, 06/23

This notebook reads in CSVs of CONUS2 daily mean flow [cms] and daily mean USGS flow [cms] and computes comparison statistics. Gages are matched between the two csv's to make sure that we're comparing viable gage locations. 

Inputs:
- CONUS2 daily mean flow csv in [cms] - the output of `Make_CONUS2_streamflow_csv.py`
- USGS daily mean flow csv in [cms] - the output of `point_obs_CONUS2_Streamflow.ipynb`
- USGS gage metadata csv - the output of `point_obs_CONUS2_Streamflow.ipynb`

Outputs:
- Gage summary CSV of statistics  
(bias, relative bias, correlation, Spearman rank, RMSE, NSE, scaled RMSE)
for CONUS 1.0 simulated streamflow, compared to USGS observations.

In [4]:
import sys
from parflow.tools.io import read_pfb,write_pfb
import numpy as np
import pandas as pd
#import hydroeval as he
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
from scipy import stats

import matplotlib.pyplot as plt

In [10]:
water_year = 2003
# Directory containing output csv's from `1_Organize_Streamflow.py`
#organized_dir = '/glade/work/tijerina/PFCONUS2-analysis/scripts/Validation/Streamflow/Organized_Daily_Flow'
organized_dir = '/home/dtt2/CONUS2/PFCONUS2-analysis/scripts/Validation/SNOTEL/SWE/Organized_Daily_SWE'

# names of csv files
pf_csv = 'PFCONUS2_Daily_matched_SWE_mm_2003.csv'
obs_csv = 'SNOTEL_Daily_matched_SWE_mm_2003.csv'
metadata_csv = 'SNOTEL_metadata_matched_SWE_mm_2003.csv'

### Read CONUS2 & USGS flow

In [6]:
### read in CONUS2 daily flow matched csv
pfdata = pd.read_csv(f'{organized_dir}/{pf_csv}',index_col=['site_id'])
print(pfdata.shape)
pfdata.head()

(640, 365)


Unnamed: 0_level_0,2002-10-01,2002-10-02,2002-10-03,2002-10-04,2002-10-05,2002-10-06,2002-10-07,2002-10-08,2002-10-09,2002-10-10,...,2003-09-21,2003-09-22,2003-09-23,2003-09-24,2003-09-25,2003-09-26,2003-09-27,2003-09-28,2003-09-29,2003-09-30
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000:OR:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1005:CO:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1006:NV:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1008:MT:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1009:MT:SNTL,0.0,0.0,0.0,0.925932,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
### read in USGS daily flow matched csv
usgsdata= pd.read_csv(f'{organized_dir}/{obs_csv}',index_col=['site_id']) 
print(usgsdata.shape)
usgsdata.head()

(640, 365)


Unnamed: 0_level_0,2002-10-01,2002-10-02,2002-10-03,2002-10-04,2002-10-05,2002-10-06,2002-10-07,2002-10-08,2002-10-09,2002-10-10,...,2003-09-21,2003-09-22,2003-09-23,2003-09-24,2003-09-25,2003-09-26,2003-09-27,2003-09-28,2003-09-29,2003-09-30
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000:OR:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1005:CO:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1006:NV:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1008:MT:SNTL,0.0,0.0,0.0,0.0,2.54,7.62,2.54,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1009:MT:SNTL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# change zero values in USGS df so that we can calculate stats without errors
usgsdata = usgsdata.where(usgsdata>0, 0.0001)

In [11]:
### read in USGS streamflow gage metadata for all gages with flow in October AND NWM csv for adjusted lat/long
#NWM_gage_csv = pd.read_csv('/glade/p/univ/ucsm0002/CONUS2/domain_files/NWM_Gage_Adjustments_final.csv')
usgs_gage_metadata = pd.read_csv(f'{organized_dir}/{metadata_csv}',index_col=['site_id'])
print(usgs_gage_metadata.shape)
usgs_gage_metadata.head()

(640, 34)


Unnamed: 0_level_0,site_type,agency,site_name,site_id_agency,site_query_url,date_metadata_last_updated,latitude,longitude,state,huc,...,file_path,county_name,elevation,shef_id,acton_id,conus1_x,conus1_y,conus2_x,conus2_y,num_obs
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000:OR:SNTL,SNOTEL station,NRCS,Annie Springs,1000:OR:SNTL,https://wcc.sc.egov.usda.gov/awdbWebService/we...,2023-03-07,42.87007,-122.16518,OR,180102030101,...,/hydrodata/national_obs/swe/data/daily/1000:OR...,Klamath,6010.0,ANSO3,22G06S,,,258,2286,365
1005:CO:SNTL,SNOTEL station,NRCS,Ute Creek,1005:CO:SNTL,https://wcc.sc.egov.usda.gov/awdbWebService/we...,2023-03-07,37.6148,-105.37322,CO,130100020604,...,/hydrodata/national_obs/swe/data/daily/1005:CO...,Costilla,10734.0,UTCC2,05M17S,1063.0,495.0,1490,1448,365
1006:NV:SNTL,SNOTEL station,NRCS,Lewis Peak,1006:NV:SNTL,https://wcc.sc.egov.usda.gov/awdbWebService/we...,2023-03-07,40.3572,-116.8647,NV,160401070903,...,/hydrodata/national_obs/swe/data/daily/1006:NV...,Lander,7370.0,LWPN2,16J01S,137.0,957.0,592,1908,365
1008:MT:SNTL,SNOTEL station,NRCS,Onion Park,1008:MT:SNTL,https://wcc.sc.egov.usda.gov/awdbWebService/we...,2023-03-07,46.91348,-110.8536,MT,100301030801,...,/hydrodata/national_obs/swe/data/daily/1008:MT...,Meagher,7410.0,ONPM8,10C22S,753.0,1575.0,1196,2499,365
1009:MT:SNTL,SNOTEL station,NRCS,Stringer Creek,1009:MT:SNTL,https://wcc.sc.egov.usda.gov/awdbWebService/we...,2023-03-07,46.9269,-110.90198,MT,100301030801,...,/hydrodata/national_obs/swe/data/daily/1009:MT...,Meagher,6550.0,STCM8,10C23S,750.0,1577.0,1193,2501,365


### Calculate statistics

In [12]:
# set up DF for daily stats, index is gage site_id
stats_df = pd.DataFrame(index = usgs_gage_metadata.index, columns = ['latitude', 'longitude', 'conus2_x','conus2_y']) 
#, 'bias', 'pbias','absrelbias', 'relbias','srho', 'r2', 'mse', 'rmse', 'nse',scaled RMSE

stats_df['latitude'] = usgs_gage_metadata['latitude']
stats_df['longitude'] = usgs_gage_metadata['longitude']
stats_df['conus2_x'] = usgs_gage_metadata['conus2_x'].astype(int)
stats_df['conus2_y'] = usgs_gage_metadata['conus2_y'].astype(int)
stats_df['huc'] = usgs_gage_metadata['huc']
stats_df['HUC2']  = usgs_gage_metadata['huc'] // 1000000 #get the HUC2 while preserving the dtype=int64

print(stats_df.shape)
stats_df.head()

(640, 6)


Unnamed: 0_level_0,latitude,longitude,conus2_x,conus2_y,huc,HUC2
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000:OR:SNTL,42.87007,-122.16518,258,2286,180102030101,180102
1005:CO:SNTL,37.6148,-105.37322,1490,1448,130100020604,130100
1006:NV:SNTL,40.3572,-116.8647,592,1908,160401070903,160401
1008:MT:SNTL,46.91348,-110.8536,1196,2499,100301030801,100301
1009:MT:SNTL,46.9269,-110.90198,1193,2501,100301030801,100301


In [9]:
# pfmean = np.mean(pfdata, axis = 1)
# pfmean

In [13]:
# compute annual average for observations
usgs_flow_sum = usgsdata.sum(axis=1)
stats_df['USGS_mean_mm'] = usgs_flow_sum/usgs_gage_metadata.num_obs
stats_df['PF_mean_mm'] = np.mean(pfdata, axis = 1)

In [14]:
# Set up array for stats
r2_daily = np.zeros(len(usgsdata.index))
srho_daily = np.zeros(len(usgsdata.index))
pval_daily = np.zeros(len(usgsdata.index))
mse_daily = np.zeros(len(usgsdata.index))
rmse_daily = np.zeros(len(usgsdata.index))
nse_daily = np.zeros(len(usgsdata.index))
bias_daily = np.zeros(len(usgsdata.index))
pbias_daily = np.zeros(len(usgsdata.index))
pbias_test = np.zeros(len(usgsdata.index))
absrelbias_daily = np.zeros(len(usgsdata.index))
relbias_daily = np.zeros(len(usgsdata.index))
r2_daily.shape
pf_minus_obs_daily = np.zeros(len(usgsdata.index))

In [15]:
for g in range(len(usgsdata.index)):
    gage = usgsdata.index[g]
    absrelbias_daily[g] = abs((pfdata.loc[gage].sum()-usgsdata.loc[gage].sum())/usgsdata.loc[gage].sum()) #absolute relative bias
    bias_daily[g] = (pfdata.loc[gage].sum()-usgsdata.loc[gage].sum())/usgsdata.loc[gage].sum()
    pbias_daily[g] = bias_daily[g]*100
    srho_daily[g], pval_daily[g] = stats.spearmanr(usgsdata.loc[gage], pfdata.loc[gage])
    mse_daily[g] = mean_squared_error(usgsdata.loc[gage], pfdata.loc[gage])
    rmse_daily[g] = sqrt(mean_squared_error(usgsdata.loc[gage], pfdata.loc[gage]))
    r2_daily[g] = r2_score(usgsdata.loc[gage], pfdata.loc[gage])
    pf_minus_obs_daily[g] = pfdata.loc[gage].sum()-usgsdata.loc[gage].sum()
   
    # #nse_daily[g] = he.nse(np.sum(pf_flow_matched.loc[gage]), np.sum(usgs_flow_matched.loc[gage]))
    #print(gage)
    # print(usgs_flow_matched.loc[gage])
    
    #import necessary libraries


#calculate RMSE
# sqrt(mean_squared_error(actual, pred)) 



In [16]:
# assign arrays to pandas columns in stats_df
stats_df['pf_minus_obs'] = pf_minus_obs_daily
stats_df['absrelbias'] = absrelbias_daily
stats_df['bias'] = bias_daily
stats_df['pbias'] = pbias_daily
stats_df['srho'] = srho_daily
stats_df['r2'] = r2_daily
stats_df['rmse'] = rmse_daily
stats_df['mse'] = mse_daily




In [17]:
stats_df.head()

Unnamed: 0_level_0,latitude,longitude,conus2_x,conus2_y,huc,HUC2,USGS_mean_mm,PF_mean_mm,pf_minus_obs,absrelbias,bias,pbias,srho,r2,rmse,mse
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000:OR:SNTL,42.87007,-122.16518,258,2286,180102030101,180102,407.172478,261.951316,-53005.723969,0.356658,-0.356658,-35.665761,0.984528,0.766965,207.868536,43209.328082
1005:CO:SNTL,37.6148,-105.37322,1490,1448,130100020604,130100,116.130234,57.069518,-21557.161585,0.508573,-0.508573,-50.857313,0.987093,0.607546,84.807882,7192.376818
1006:NV:SNTL,40.3572,-116.8647,592,1908,160401070903,160401,23.082743,0.875038,-8105.81247,0.962091,-0.962091,-96.209125,0.484683,-0.434445,39.722155,1577.849576
1008:MT:SNTL,46.91348,-110.8536,1196,2499,100301030801,100301,119.818445,9.1747,-40384.967173,0.923428,-0.923428,-92.342832,0.706607,-0.579561,162.675299,26463.252787
1009:MT:SNTL,46.9269,-110.90198,1193,2501,100301030801,100301,78.378178,7.813375,-25756.153293,0.900312,-0.900312,-90.031186,0.771845,-0.368998,107.264012,11505.568329


In [18]:
stats_df.to_csv(f'{organized_dir}/Summary_SNOTEL_CONUS2_WY2003.csv')