In [188]:
import os
import json
import numpy as np
import pandas as pd

# hydroDL module by Kuai Fang
from hydroDL.data import dbVeg
from hydroDL.data import DataModel
from hydroDL.master import dataTs2Range
from hydroDL import kPath

In [189]:
# Load the vegetation dataset, normalize it, and cut it into fixed-length
# windows; the per-sample index arrays (iInd, jInd) are kept for later cells.
rho = 45 # window parameter forwarded to dataTs2Range
dataName = 'singleDaily-modisgrid-new-const' # dataset identifier passed to DataFrameVeg
# importlib.reload(hydroDL.data.dbVeg) # (disabled) re-import the library after editing it
df = dbVeg.DataFrameVeg(dataName) # dataset object; this notebook reads df.x, df.xc, df.y, df.siteIdLst, df.varXC
# Wrap the three arrays in a DataModel so they can be transformed together.
# NOTE(review): presumably X = time-varying inputs, XC = per-site constants,
# Y = targets — confirm against hydroDL.data.DataModel.
dm = DataModel(X=df.x, XC=df.xc, Y=df.y)
siteIdLst = df.siteIdLst # site list as stored on the dataset object
# Must run before getData(): trans() is called for its effect on dm (no return value is used).
# NOTE(review): 'minmax' presumably selects min-max scaling as the default method — confirm.
dm.trans(mtdDefault='minmax')
dataTup = dm.getData() # tuple later unpacked as (x, xc, y, yc)
# Window the data with rho and also return the index arrays of each window.
# NOTE(review): the original note says the result is shaped
# (# LFMC samples, 91-day window, varX), i.e. presumably 2*rho+1 days — TODO confirm.
dataEnd, (iInd, jInd) = dataTs2Range(dataTup, rho, returnInd=True)
x, xc, y, yc = dataEnd # windowed arrays returned by dataTs2Range
# Convert the index lists to arrays so they support vectorized comparison;
# jInd is compared against site indices later in this notebook.
iInd = np.array(iInd)
jInd = np.array(jInd)

In [190]:
# quality threshold
thresh = 0.4

# Positions within df.varXC of its last six entries, used to slice df.xc.
# NOTE(review): assumes these six constants are the land-cover fraction
# columns — confirm against the dataset definition.
lc_idx = list(map(df.varXC.index, df.varXC[-6:]))
lc_pct = df.xc[:, lc_idx]
# Per row of df.xc: index (0-5) of the largest of the six selected columns.
top_lc = lc_pct.argmax(axis=1)

In [224]:
# Sub-directory of <dirVeg>/model/attention/ whose subset.json is summarized below.
split_version = 'dataset'

In [225]:
# Read the k-fold split definition for the selected split version.
splits_path = os.path.join(kPath.dirVeg, 'model', 'attention', split_version, 'subset.json')
with open(splits_path) as f:
    splits_dict = json.load(f)

# Summary table, one entry per fold (0-4). Split keys in the JSON follow the
# pattern '<train|test><Ind|Site>_k<fold>5'; each value's length is the count.
# Column order: examples (train, test) first, then sites (train, test).
data = {
    'fold' : range(5),
    **{
        f'# {split} {unit}' : [
            len(splits_dict[f'{split}{suffix}_k{fold}5']) for fold in range(5)
        ]
        for unit, suffix in (('examples', 'Ind'), ('sites', 'Site'))
        for split in ('train', 'test')
    },
}

In [203]:
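# NOTE: superseded loop-based draft, kept for reference only; it appears to
# compute the same per-land-cover site counts as the vectorized cell that follows.
# for i in range(6):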
#     data[f'# train sites lc={i}'] = [0] * 5
#     data[f'# test sites lc={i}'] = [0] * 5
#     # data[f'# train examples lc={i}'] = [0] * 5
#     # data[f'# test examples lc={i}'] = [0] * 5

# for fold in range(5):
#     for land_cover_id in range(6):
#         for site in splits_dict[f'trainSite_k{fold}5']: 
#             if top_lc[site] != land_cover_id:
#                 continue
#             data[f'# train sites lc={top_lc[site]}'][fold] += 1
#             # data[f'# train examples lc={top_lc[site]}'][fold] += len(np.where(np.isin(jInd, site))[0])
#         for site in splits_dict[f'testSite_k{fold}5']: 
#             if top_lc[site] != land_cover_id:
#                 continue
#             data[f'# test sites lc={top_lc[site]}'][fold] += 1
#             # data[f'# test examples lc={top_lc[site]}'][fold] += len(np.where(np.isin(jInd, site))[0])


In [226]:
# For every land-cover class (0-5) add two columns to `data`: the number of
# train sites and of test sites in each fold whose dominant class
# (top_lc[site]) equals that class.
# Insertion order is train-then-test within each class, class by class.
for land_cover_id in range(6):
    for split in ('train', 'test'):
        data[f'# {split} sites lc={land_cover_id}'] = [
            # site lists index top_lc directly; count the matches for this fold
            np.sum(top_lc[splits_dict[f'{split}Site_k{fold}5']] == land_cover_id)
            for fold in range(5)
        ]

In [227]:
# For every land-cover class (0-5) add two columns to `data`: the number of
# windowed examples belonging to train sites and to test sites in each fold.
for i in range(6):
    data[f'# train examples lc={i}'] = [0] * 5
    data[f'# test examples lc={i}'] = [0] * 5

# Site membership per fold as sets, so the inner-loop lookups are O(1)
# instead of scanning the JSON lists once per site.
train_sites_by_fold = [set(splits_dict[f'trainSite_k{fold}5']) for fold in range(5)]
test_sites_by_fold = [set(splits_dict[f'testSite_k{fold}5']) for fold in range(5)]

# Iterate over every site that has a dominant land-cover class rather than a
# hard-coded site count, so the loop stays in step with the loaded dataset.
for site in range(len(top_lc)):
    lc = top_lc[site]
    # Number of examples whose site index (jInd) is this site; it does not
    # depend on the fold, so compute it once per site.
    n_examples = int(np.count_nonzero(jInd == site))
    for fold in range(5):
        if site in train_sites_by_fold[fold]:
            data[f'# train examples lc={lc}'][fold] += n_examples
        if site in test_sites_by_fold[fold]:
            data[f'# test examples lc={lc}'][fold] += n_examples

In [229]:
# Export the per-fold summary as CSV inside kPath.dirVeg. os.path.join is used
# (as for subset.json above) so the result does not depend on whether
# kPath.dirVeg ends with a path separator.
pd.DataFrame(data).to_csv(os.path.join(kPath.dirVeg, 'random_summary.csv'), index=False)

In [223]:
# Export the per-fold summary as CSV inside kPath.dirVeg. os.path.join is used
# so the result does not depend on whether kPath.dirVeg ends with a separator.
# NOTE(review): this writes the same `data` dict as the random_summary export;
# presumably it is meant to run only after `data` has been rebuilt with the
# stratified split_version — confirm before running the notebook top to bottom.
pd.DataFrame(data).to_csv(os.path.join(kPath.dirVeg, 'stratified_summary.csv'), index=False)

In [228]:
# Display the assembled summary dict (one list entry per fold for each column).
data

{'fold': range(0, 5),
 '# train examples': [6282, 6257, 6249, 6341, 6323],
 '# test examples': [1581, 1606, 1614, 1522, 1540],
 '# train sites': [126, 126, 126, 127, 127],
 '# test sites': [32, 32, 32, 31, 31],
 '# train sites lc=0': [3, 3, 3, 2, 1],
 '# test sites lc=0': [0, 0, 0, 1, 2],
 '# train sites lc=1': [36, 34, 39, 42, 41],
 '# test sites lc=1': [12, 14, 9, 6, 7],
 '# train sites lc=2': [3, 4, 4, 2, 3],
 '# test sites lc=2': [1, 0, 0, 2, 1],
 '# train sites lc=3': [60, 63, 63, 58, 60],
 '# test sites lc=3': [16, 13, 13, 18, 16],
 '# train sites lc=4': [20, 19, 14, 18, 17],
 '# test sites lc=4': [2, 3, 8, 4, 5],
 '# train sites lc=5': [4, 3, 3, 5, 5],
 '# test sites lc=5': [1, 2, 2, 0, 0],
 '# train examples lc=0': [110, 110, 110, 73, 37],
 '# test examples lc=0': [0, 0, 0, 37, 73],
 '# train examples lc=1': [1668, 1544, 1855, 1953, 1932],
 '# test examples lc=1': [570, 694, 383, 285, 306],
 '# train examples lc=2': [123, 165, 165, 82, 125],
 '# test examples lc=2': [42, 0, 0, 