### Investigating sites that are never used in training
Created: Tue, Sep 24, 2024

Description: On Mon, Sep 23, 2024, Kuai messaged me that some sites are never used in training. See image he provided (Slack). Here, I try to determine why this is the case.

In [1]:
from hydroDL import kPath # package by Kuai Fang, kPath contains req paths

import pandas as pd
import json
import os

loading package hydroDL


In [2]:
# sites
sites_path = os.path.join(kPath.dirVeg, "model/data/singleDaily-modisgrid-new-const/site.csv")
sites_df = pd.read_csv(sites_path)

In [3]:
def check_site(split_version, site_id):
    splits_path = os.path.join(kPath.dirVeg, 'model', 'attention', split_version, 'subset.json')
    with open(splits_path) as f:
        splits_dict = json.load(f)

    site_idx = sites_df[sites_df.siteId == site_id].index[0] # get site index

    for i in range(5):
        if site_idx in splits_dict[f'trainSite_k{i}5']:
            print(f'Site {site_idx} is in trainSite_k{i}5')
        if site_idx in splits_dict[f'testSite_k{i}5']:
            print(f'Site {site_idx} is in testSite_k{i}5')

    if site_idx in splits_dict['testSite_underThresh']:
        print(f'Site {site_idx} is in testSite_underThresh')

In [4]:
check_site('stratified', 'N0881')

Site 189 is in testSite_underThresh


In [5]:
check_site('stratified', 'N0669')

Site 159 is in testSite_underThresh


In [6]:
def breakdown(split_version):
    splits_path = os.path.join(kPath.dirVeg, 'model', 'attention', split_version, 'subset.json')
    with open(splits_path) as f:
        splits_dict = json.load(f)

    data = {i : [] for i in range(5)}    
    for site_idx in range(len(sites_df)):
        for i in range(5):
            if site_idx in splits_dict[f'trainSite_k{i}5']:
                data[i].append('train')
            elif site_idx in splits_dict[f'testSite_k{i}5']:
                data[i].append('test,qual')
            elif site_idx in splits_dict['testSite_underThresh']:
                data[i].append('test,poor')
            else:
                data[i].append('bad')

    return pd.DataFrame(data)

In [7]:
breakdown('dataset')

Unnamed: 0,0,1,2,3,4
0,train,train,train,train,"test,qual"
1,train,train,"test,qual",train,train
2,train,train,train,"test,qual",train
3,train,train,train,"test,qual",train
4,"test,qual",train,train,train,train
...,...,...,...,...,...
330,bad,bad,bad,bad,bad
331,bad,bad,bad,bad,bad
332,bad,bad,bad,bad,bad
333,bad,bad,bad,bad,bad


In [50]:
df1 = breakdown('dataset')
dataset_poor_sites = set(df1[df1[0] == 'test,poor'].index)
len(dataset_poor_sites )

145

In [51]:
df2 = breakdown('stratified')
stratified_poor_sites = set(df2[df2[0] == 'test,poor'].index)
len(stratified_poor_sites)

146

In [52]:
len(dataset_poor_sites.intersection(stratified_poor_sites))

145

In [54]:
stratified_poor_sites - dataset_poor_sites

{306}

In [56]:
sites_df.iloc[list(stratified_poor_sites - dataset_poor_sites)]

Unnamed: 0,siteId,siteName,state,fuel,gacc,lat,lon
306,N1063,Clearlake,CA,"Manzanita, Greenleaf",NOCC,38.96,-122.6325


In [59]:
sites_df.iloc[list(dataset_poor_sites)]

Unnamed: 0,siteId,siteName,state,fuel,gacc,lat,lon
10,N0045,D10 - Sledgehammer North,CO,"Douglas-Fir, Rocky Mountain",RMCC,38.918611,-105.396667
11,N0047,D10_Dicks Peak,CO,"Douglas-Fir, Rocky Mountain",RMCC,38.818333,-105.649722
16,N0061,Kawuneeche,CO,"Pine, Lodgepole",RMCC,40.267222,-105.832500
19,N0096,Black Canyon,CO,"Oak, Gambel",RMCC,38.541944,-107.687222
20,N0098,Davewood,CO,"Pine, Ponderosa",RMCC,38.305000,-107.963889
...,...,...,...,...,...,...,...
318,N1082,Shasta Dam,CA,"Manzanita, Greenleaf",NOCC,40.693611,-122.427222
319,N1083,Shingletown,CA,"Manzanita, Greenleaf",NOCC,40.527222,-121.910000
320,N1084,Sweetland,CA,"Manzanita, Greenleaf",NOCC,39.326944,-121.111111
322,N1098,Old Man CG,CA,"Manzanita, Greenleaf",NOCC,40.853333,-122.407778


In [60]:
sites_df.iloc[list(dataset_poor_sites)][sites_df.siteId.isin(['N0881', 'N0669'])]

  sites_df.iloc[list(dataset_poor_sites)][sites_df.siteId.isin(['N0881', 'N0669'])]


Unnamed: 0,siteId,siteName,state,fuel,gacc,lat,lon
159,N0669,COF Cow Trap,AZ,"Juniper, Utah",SWCC,34.72,-111.090833
189,N0881,Blizzard Gap Bottom,OR,"Sagebrush, Wyoming Big",NWCC,42.107778,-119.747222


In [63]:
189 in stratified_poor_sites

True