In [78]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt

# Silence pandas' chained-assignment (SettingWithCopy) warning; the pandas default is 'warn'
pd.set_option('mode.chained_assignment', None)

In [79]:
from utils import REPO_PATH, DATA_PATH, DATA_CLEAN_PATH

# Read in the MODIS and forest biomass datasets

In [130]:
# Load the cleaned MODIS county-level summary table
modis = pd.read_csv(DATA_CLEAN_PATH / 'modis_cln.csv')

# Build the county FIPS code: state code followed by the county code left-padded to 3 digits.
# The column is assigned directly (not through .loc[:, 'fips']) so the integer dtype really
# takes effect; under pandas >= 2.0, `.loc[:, col] = values` writes into the existing object
# column in place and keeps its dtype.
county_code = modis.countyfp.astype(str).str.pad(width=3, side='left', fillchar='0')
modis['fips'] = (modis.statefp.astype(str) + county_code).astype(int)
modis.tail()

Unnamed: 0,year,statefp,countyfp,name,prop_mean_mean,prop_mean_std,prop_std_mean,prop_std_std,prop_min_mean,prop_min_std,prop_max_mean,prop_max_std,mbm_sum_mean,mbm_sum_std,dbm_sum_mean,dbm_sum_std,consec_mbm_max_mean,consec_mbm_max_std,value,fips
136615,2021,56,37,Sweetwater,14623.478153,64.647533,893.483716,35.429396,13414.524338,95.786149,15843.354398,82.686401,6.031597,0.218446,1.0,0.0,3.052092,0.222308,lst,56037
136616,2021,56,39,Teton,13977.039642,104.074417,725.042605,61.096827,13021.09919,65.690499,15011.052632,146.43981,6.020243,0.431511,1.0,0.0,4.006073,0.410268,lst,56039
136617,2021,56,41,Uinta,14541.327273,137.666784,842.373837,62.005852,13405.263636,102.861851,15659.063636,215.002953,6.118182,0.442736,1.0,0.0,3.186364,0.390288,lst,56041
136618,2021,56,43,Washakie,14676.46139,212.763141,902.644565,53.118498,13378.034749,127.706524,15916.362934,243.948732,5.818533,0.605117,1.0,0.0,3.262548,0.440871,lst,56043
136619,2021,56,45,Weston,14660.644231,111.142083,857.347346,47.260968,13425.915385,110.596176,15818.973077,189.323861,5.346154,0.47666,1.0,0.0,3.011538,0.107002,lst,56045


In [139]:
# Load the cleaned biomass table, give the key columns readable names and drop 'report'
fis = (
    pd.read_csv(DATA_CLEAN_PATH / 'biomass_cln.csv')
    .rename(columns={'ord': 'fips', 'c1': 'county'})
    .drop(columns='report')
)

# Title-case the state names
fis.loc[:, 'state'] = fis['state'].str.title()

# NOTE(review): year_start is set from year_end (cast to float), so both columns hold the
# same value for every row — confirm this is intended and not a typo for year_start.
fis.loc[:, 'year_start'] = fis['year_end'].astype(float)

# State code = FIPS string with its last three characters removed; a blank remainder becomes -1
fis.loc[:, 'statefp'] = (
    fis['fips'].astype(str)
    .str[:-3]
    .str.replace(r'^\s*$', '-1', regex=True)
    .astype(int)
)

# Keep only rows with a positive state code
fis = fis.loc[fis['statefp'] > 0]
fis.head()

Unnamed: 0,county,fips,total,variance,sampling_error,sampling_error_percent,total_plots,domain_plots,non_zero_plots,r0,c0,state,year_start,year_end,statefp
0,Weston,56045,2301287000.0,2.314102e+17,481051100.0,20.90357,224044.0,20.0,20.0,561801,,Wyoming,2020.0,2020.0,56
1,Washakie,56043,3086214000.0,1.49082e+18,1220991000.0,39.562765,224044.0,15.0,15.0,561801,,Wyoming,2020.0,2020.0,56
2,Uinta,56041,2126152000.0,5.685005e+17,753989700.0,35.462635,224044.0,16.0,16.0,561801,,Wyoming,2020.0,2020.0,56
3,Teton,56039,80048160000.0,1.651294e+19,4063612000.0,5.076459,224044.0,263.0,262.0,561801,,Wyoming,2020.0,2020.0,56
4,Sweetwater,56037,1935946000.0,3.164663e+17,562553300.0,29.058317,224044.0,25.0,25.0,561801,,Wyoming,2020.0,2020.0,56


## Collapse each state's MODIS data to the biomass report year ranges

In [140]:
# Distinct state codes present in the biomass table, in ascending order
statefps = fis.statefp.sort_values().unique()

# Columns that identify one county / variable / report range; every remaining column is averaged
groupcols = ['name', 'statefp', 'countyfp', 'fips', 'value', 'report', 'year_start', 'year_end']

# Collect one collapsed frame per state and concatenate once at the end
# (concatenating inside the loop re-copies the accumulated frame on every iteration).
state_frames = []

# NOTE(review): the [1:] slice skips the lowest state code. Rows with statefp <= 0 are already
# removed from `fis` when it is loaded, so this may be dropping a real state — confirm.
for statefp in statefps[1:]:
    # Take an explicit copy so the label columns below are added to this state's frame only,
    # not to a slice of `modis`.
    m = modis.loc[modis.statefp == statefp].copy()

    # Unique (year_start, year_end) pairs reported for this state
    report_ranges = (
        fis.loc[fis.statefp == statefp, ['year_start', 'year_end']]
        .drop_duplicates()
        .to_numpy()
    )

    # Label every MODIS row whose year falls inside a report range (bounds inclusive).
    # If ranges overlap, the later range overwrites the earlier label.
    for i, (start, end) in enumerate(report_ranges):
        in_range = m.year.between(start, end)
        m.loc[in_range, ['report', 'year_start', 'year_end']] = (str(i), start, end)

    # Average within each report range; rows that received no label (NaN keys) are dropped by groupby
    m = m.groupby(groupcols).mean().reset_index()
    state_frames.append(m)

# Reverse so the row order matches the previous prepend-style accumulation (last state first)
modis_r = pd.concat(state_frames[::-1]) if state_frames else pd.DataFrame()


## Merge datasets

In [143]:
# Left-join the collapsed MODIS summaries onto the biomass rows, so every biomass row is kept
merge_keys = ['fips', 'statefp', 'year_start', 'year_end']
df = fis.merge(modis_r, how='left', on=merge_keys)
df.head()

Unnamed: 0,county,fips,total,variance,sampling_error,sampling_error_percent,total_plots,domain_plots,non_zero_plots,r0,...,prop_min_mean,prop_min_std,prop_max_mean,prop_max_std,mbm_sum_mean,mbm_sum_std,dbm_sum_mean,dbm_sum_std,consec_mbm_max_mean,consec_mbm_max_std
0,Weston,56045,2301287000.0,2.314102e+17,481051100.0,20.90357,224044.0,20.0,20.0,561801,...,13589.111538,92.9474,15707.876923,148.754184,5.930769,0.46811,1.0,0.0,3.157692,0.365155
1,Weston,56045,2301287000.0,2.314102e+17,481051100.0,20.90357,224044.0,20.0,20.0,561801,...,1428.365385,752.247198,4173.365385,807.910672,6.915385,1.755453,1.034615,0.424575,3.934615,0.645413
2,Washakie,56043,3086214000.0,1.49082e+18,1220991000.0,39.562765,224044.0,15.0,15.0,561801,...,13380.857143,88.94689,15902.316602,244.705487,5.281853,0.507405,1.0,0.0,3.208494,0.407018
3,Washakie,56043,3086214000.0,1.49082e+18,1220991000.0,39.562765,224044.0,15.0,15.0,561801,...,381.150579,578.623826,3656.791506,1409.754381,5.057915,1.825881,1.397683,0.747188,3.891892,1.469289
4,Uinta,56041,2126152000.0,5.685005e+17,753989700.0,35.462635,224044.0,16.0,16.0,561801,...,13278.3,57.114569,15669.618182,188.66982,5.481818,0.500809,1.0,0.0,3.481818,0.500809


In [146]:
# Save the merged analysis table as CSV in the clean-data directory
filename = 'analysis_df.csv'
filepath = DATA_CLEAN_PATH.joinpath(filename)

# Create the output directory (and any missing parents) before writing
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(path_or_buf=filepath, index=False)