In [38]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import os
# import datetime as dt
# from shapely import wkt
# from shapely.geometry import Point, Polygon
# import geopandas as gpd

# Turn off pandas' chained-assignment check for this session (pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)


In [3]:
from import_functions import readin_fis_biomass
# Enable autoreload so edits to imported modules (e.g. import_functions) are picked up before each cell runs
%load_ext autoreload
%autoreload 2

In [4]:
# Project locations. The repository root defaults to the original checkout
# location but can be overridden with the FOREST_BIOMASS_REPO environment
# variable, so the notebook can run on another machine without editing this cell.
repo_path = Path(os.environ.get('FOREST_BIOMASS_REPO',
                                '/Users/etriesch/dev/forest-biomass-modeling/'))
data_path = repo_path / 'data'         # input data (the biomass files are read from data/biomass)
data_clean_path = data_path / 'clean'  # cleaned output is written to data/clean
# Coordinate reference system identifiers (an EPSG code and a PROJ string);
# neither is used by the cells visible in this notebook section.
geo_crs = 'epsg:4326'
proj_crs = '+proj=cea'

# Read in the report-to-year mapping

In [18]:
# Load the report-label table, keep only the rows labelled 'CURRENT', and
# reduce it to the report id, state, and start/end year columns.
filepath = data_path / 'biomass/fis_biomass_reportlabels.csv'
labs = pd.read_csv(filepath)
# .copy() makes the filtered rows an independent frame, so the column
# assignment below does not act on a slice of the table that was just read.
labs = labs.loc[labs.label_part1 == 'CURRENT'].copy()
# Plain column assignment replaces the column together with its dtype.
# `labs.loc[:, 'report'] = ...` writes into the existing column in place on
# pandas >= 2.0 and can therefore leave the previous dtype unchanged.
labs['report'] = labs['report'].astype(int)
# Select the columns with a list; a tuple key is ambiguous with MultiIndex lookups.
labs = labs[['report', 'state', 'year_start', 'year_end']]

# read biomass data

In [9]:
# get filenames: names of the .xml entries in the biomass folder, sorted
path = data_path / 'biomass'
files = sorted(name for name in os.listdir(path) if name.endswith('.xml'))
print('files to read:', len(files))

files to read: 6


In [10]:
# Parse every biomass XML file and combine the pieces with a single concat.
# Calling pd.concat inside the loop re-copies all previously read rows on each
# iteration, so the chunks are collected in a list first.
chunks = []
for f in tqdm(files):
    filepath = str(data_path/'biomass'/ f)
    chunk = readin_fis_biomass(filepath)
    chunks.append(chunk)
# The earlier loop prepended each new chunk to the accumulated frame, so rows
# from the last file came first; reversing the list keeps that row order.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
biodf_raw = pd.concat(chunks[::-1]) if chunks else pd.DataFrame()

100%|█████████████████████████████████████████████| 6/6 [00:18<00:00,  3.12s/it]


# merge, clean, and save

In [39]:
# Drop the 'COL_SUBTOTAL' rows, derive an integer report id from r0, and
# left-join the report labels so rows without a matching label are kept.
is_subtotal = biodf_raw.r0 == 'COL_SUBTOTAL'
biodf = (
    biodf_raw.loc[~is_subtotal]
    .assign(report=lambda frame: frame.r0.astype(int))
    .merge(labs, how='left', on='report')
)

In [40]:
# Cast the listed columns to float in one astype call.
cols_to_float = [
    'total', 'variance', 'sampling_error', 'sampling_error_percent',
    'total_plots', 'domain_plots', 'non_zero_plots',
]
biodf = biodf.astype({col: float for col in cols_to_float})

In [41]:
# overall: write the merged table to the clean-data folder as CSV,
# creating the folder (and any missing parents) if it does not exist yet
filename = 'biomass_cln.csv'
filepath = data_clean_path.joinpath(filename)
os.makedirs(filepath.parent, exist_ok=True)
biodf.to_csv(filepath, index=False)