# Compare sites

**Purpose:** We want to compare the metrics from the RNN maps against the predictions from the transformer model. The comparison is restricted to sites that neither model was trained on; this script determines those sites.

**Date:** Aug 1, 2024


In [59]:
from hydroDL import kPath

import pandas as pd
import json
import os

In [77]:
# RNN train sites: the distinct values of the 'site' column in the RNN input table.
# NOTE(review): read_pickle can run arbitrary code while loading, so only point
# this at a file you trust.
rnn_df_path = '/Users/andyhuynh/Documents/lfmc/data/predictions/lstm_input_data_pure+all_same_28_may_2019_res_SM_gap_3M'
rnn_df = pd.read_pickle(rnn_df_path)
rnn_train_sites = pd.unique(rnn_df['site']).tolist()

print("number of sites:", len(rnn_train_sites))


number of sites: 150


In [80]:
# Transformer train/test sites: one table holding every site the transformer
# data set knows about.
transformer_df_path = '/Users/andyhuynh/Documents/lfmc/data/model/data/singleDaily-nadgrid/site.csv'
transformer_df = pd.read_csv(transformer_df_path)

# Locate and parse the JSON file that stores the train/test splits.
splits_path = os.path.join(kPath.dirVeg, 'model', 'attention', 'dataset')
splits_json = os.path.join(splits_path, 'subset.json')

with open(splits_json) as json_file:
    splits = json.loads(json_file.read())

# Two test subsets are stored separately; the combined list is the quality
# entries followed by the poor ones.
# NOTE(review): these entries are later passed to `.iloc`, so they are presumably
# row positions into site.csv -- confirm.
quality_test_sites = splits['testSite_k05']
poor_test_sites = splits['testSite_underThresh']
test_sites = [*quality_test_sites, *poor_test_sites]

print("number of sites:", len(test_sites))
print("number of quality sites:", len(quality_test_sites))
print("number of poor sites:", len(poor_test_sites))

number of sites: 177
number of quality sites: 32
number of poor sites: 145


In [70]:
# All test sites: take the site-table rows at the positions listed in
# `test_sites` and collect their distinct site names.
transformer_test_df = transformer_df.iloc[test_sites]
transformer_test_sites = pd.unique(transformer_test_df['siteName']).tolist()

# Drop every name that also appears among the RNN train sites, then look up the
# siteId of each remaining test row.
non_train_siteNames_both_models = set(transformer_test_sites).difference(rnn_train_sites)
non_train_siteIds_both_models = transformer_test_df.loc[
    transformer_test_df['siteName'].isin(non_train_siteNames_both_models),
    'siteId',
]

# Save the ids as a single-column CSV (row index omitted).
out_path = os.path.join(kPath.dirVeg, 'predictions/non_train_siteIds_both_models_all.csv')
non_train_siteIds_both_models.to_csv(out_path, index=False)

print("number of sites:", non_train_siteIds_both_models.shape[0])


number of sites:  161


In [76]:
# Quality test sites: take the site-table rows at the positions listed in
# `quality_test_sites` and collect their distinct site names.
transformer_test_df = transformer_df.iloc[quality_test_sites]
transformer_test_sites = pd.unique(transformer_test_df['siteName']).tolist()

# Drop every name that also appears among the RNN train sites, then look up the
# siteId of each remaining test row.
non_train_siteNames_both_models = set(transformer_test_sites).difference(rnn_train_sites)
non_train_siteIds_both_models = transformer_test_df.loc[
    transformer_test_df['siteName'].isin(non_train_siteNames_both_models),
    'siteId',
]

# Save the ids as a single-column CSV (row index omitted).
out_path = os.path.join(kPath.dirVeg, 'predictions/non_train_siteIds_both_models_quality.csv')
non_train_siteIds_both_models.to_csv(out_path, index=False)

print("number of sites:", non_train_siteIds_both_models.shape[0])


number of sites: 20


In [71]:
# Poor test sites: take the site-table rows at the positions listed in
# `poor_test_sites` and collect their distinct site names.
transformer_test_df = transformer_df.iloc[poor_test_sites]
transformer_test_sites = pd.unique(transformer_test_df['siteName']).tolist()

# Drop every name that also appears among the RNN train sites, then look up the
# siteId of each remaining test row.
non_train_siteNames_both_models = set(transformer_test_sites).difference(rnn_train_sites)
non_train_siteIds_both_models = transformer_test_df.loc[
    transformer_test_df['siteName'].isin(non_train_siteNames_both_models),
    'siteId',
]

# Save the ids as a single-column CSV (row index omitted).
out_path = os.path.join(kPath.dirVeg, 'predictions/non_train_siteIds_both_models_poor.csv')
non_train_siteIds_both_models.to_csv(out_path, index=False)

print("number of sites:", non_train_siteIds_both_models.shape[0])


number of sites: 141
