# Land Cover

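Downloads the NLCD land cover and MODIS LAI/land cover data, maps NLCD onto the mesh, and computes the crosswalk between the MODIS and NLCD land cover classes.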

In [None]:
%matplotlib ipympl

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# setting up logging first or else it gets preempted by another package
import watershed_workflow.ui
watershed_workflow.ui.setup_logging(1)

In [None]:
import os,sys
import logging
import numpy as np
from matplotlib import pyplot as plt
import pickle
import shapely
import pandas as pd
import geopandas as gpd
import xarray as xr
pd.options.display.max_columns = None

import cftime
import datetime

import watershed_workflow 
import watershed_workflow.config
import watershed_workflow.sources
import watershed_workflow.mesh
import watershed_workflow.regions
import watershed_workflow.land_cover_properties
import watershed_workflow.io

# set the default figure size for notebooks
plt.rcParams["figure.figsize"] = (8, 6)

## Input: Parameters and other source data

In [None]:
# Force Watershed Workflow to pull data from this directory rather than a shared data directory.
# This picks up the Coweeta-specific datasets set up here to avoid large file downloads for 
# demonstration purposes.
#
def splitPathFull(path):
    """
    Breaks a path into its individual components.

    The returned list satisfies
    os.path.join(*splitPathFull(path)) == path
    """
    # components are collected leaf-first, then reversed at the end
    leaf_first = []
    remaining = path
    while True:
        head, tail = os.path.split(remaining)
        if head == remaining or tail == remaining:
            # nothing left to split: either a root (e.g. '/' on Unix or a
            # drive such as C:\ on Windows) or a single bare name
            leaf_first.append(remaining)
            break
        leaf_first.append(tail)
        remaining = head
    return leaf_first[::-1]

# This notebook must be run from a directory named 'workflow'; the input and
# output data directories are siblings of that directory.
cwd = splitPathFull(os.getcwd())
assert cwd[-1] == 'workflow'
cwd = cwd[:-1]

# Note, this directory is where downloaded data will be put as well
data_dir = os.path.join(*cwd, 'input_data')
output_dir = os.path.join(*cwd, 'output_data')

# role -> filename of every output registered through toOutput()
output_filenames = dict()

def toInput(filename):
    """Returns the path to filename inside the input data directory."""
    return os.path.join(data_dir, filename)

def fromOutput(filename):
    """Returns the path to filename inside the output data directory."""
    return os.path.join(output_dir, filename)

def toOutput(role, filename):
    """Registers filename under role in output_filenames and returns its output path."""
    output_filenames[role] = filename
    return fromOutput(filename)

# make sure both the input and output directories exist
for directory in (data_dir, output_dir):
    os.makedirs(directory, exist_ok=True)
       

In [None]:
# Set the data directory to the local space to get the locally downloaded files
# REMOVE THIS CELL for general use outside of Coweeta
watershed_workflow.config.setDataDirectory(data_dir)


In [None]:
## Parameters cell -- this provides all parameters that can be changed via pipelining to generate a new watershed. 
# Keep each parameter as a simple, one-per-line assignment so that it can be overridden.
name = 'RussianRiver'
hucs = ['18010110'] # a list of HUCs to run

# Geometric parameters
# -- parameters to clean and reduce the river network prior to meshing
prune_by_area = 10               # km^2
simplify = 125                   # length scale to target average edge 

# -- mesh triangle refinement control
# NOTE(review): presumably d0/d1 are distances and L0/L1 are edge lengths at those distances -- confirm units
refine_d0 = 200
refine_d1 = 600

refine_L0 = 125
refine_L1 = 300

# areas derived from the two length scales: half the square of each length
refine_A0 = refine_L0**2 / 2
refine_A1 = refine_L1**2 / 2

# Refine triangles if they get too acute
min_angle = 20 # degrees

# width of reach by stream order (order:width)
river_widths = dict({1:10, 2:10, 3:20, 4:30, 5:30}) 


# Note that, by default, we tend to work in the DayMet CRS because this allows us to avoid
# reprojecting meteorological forcing datasets.
crs = watershed_workflow.crs.default_crs


# start and stop time for simulation
# note that this is the overlap of AORC and MODIS
start = cftime.DatetimeGregorian(2007, 8, 1)
end = cftime.DatetimeGregorian(2020, 7, 31)

# the same start and stop dates, expressed in a no-leap calendar
start_noleap = cftime.DatetimeNoLeap(2007, 8, 1)
end_noleap = cftime.DatetimeNoLeap(2020, 7, 31)
# number of years used for the cyclic "typical year" products computed below
cyclic_nyears = 10


In [None]:
# set up a dictionary of source objects, keyed by the kind of data they provide
# (this notebook uses the 'land cover' and 'LAI' entries)
#
# Data sources, also called managers, deal with downloading and parsing data files from a variety of online APIs.
sources = watershed_workflow.sources.getDefaultSources()

# log the sources that will be used here
watershed_workflow.sources.logSources(sources)


## Reload data

In [None]:
# reload the watersheds object from the output directory (presumably written
# by an earlier notebook in this workflow).
# NOTE: pickle.load executes arbitrary code -- only load files you generated yourself.
with open(fromOutput('02_watersheds.pickle'), 'rb') as fid:
    watersheds = pickle.load(fid)

In [None]:
# reload the 2D mesh from the output directory (presumably written by an
# earlier notebook in this workflow).
# NOTE: pickle.load executes arbitrary code -- only load files you generated yourself.
with open(fromOutput('03_m2.pickle'), 'rb') as fid:
    m2 = pickle.load(fid)


In [None]:
# wrap the watershed exterior in a one-row GeoDataFrame
# NOTE(review): tmp_df does not appear to be used anywhere else in this notebook -- confirm it is still needed
tmp_df = gpd.GeoDataFrame(geometry=[watersheds.exterior,], crs=watersheds.crs) 

In [None]:
# list the labeled sets on the mesh as reloaded, before any land cover sets are added
for ls in m2.labeled_sets:
    print(f'"{ls.name}" : {ls.setid} consists of {len(ls.ent_ids)} {ls.entity}s')

## Get NLCD data

In [None]:
# download the land cover dataset on the watershed exterior and keep its 'cover' raster
land_cover_dataset = sources['land cover'].getDataset(watersheds.exterior, watersheds.crs)
nlcd = land_cover_dataset['cover']

# report the raster's data type...
logging.info('Found land cover dtypes: {}'.format(nlcd.dtype))

# ...and the distinct land cover values that appear in it
nlcd_values_found = set(nlcd.values.ravel())
logging.info('Found land cover types: {}'.format(nlcd_values_found))

In [None]:
# build the colormap pieces for the land cover values present in the raster
nlcd_colormap_parts = watershed_workflow.colors.createNLCDColormap(np.unique(nlcd))
nlcd_indices, nlcd_cmap, nlcd_norm, nlcd_ticks, nlcd_labels = nlcd_colormap_parts

# leave room to the right of the axes for the indexed colorbar
fig = plt.figure(figsize=(12,6))
ax = fig.add_axes([0.1, 0.1, 0.55, 0.85])

# draw the raster without xarray's default colorbar, then add a labeled one
nlcd.plot.imshow(ax=ax, cmap=nlcd_cmap, norm=nlcd_norm, add_colorbar=False)
watershed_workflow.colors.createIndexedColorbar(
    ncolors=len(nlcd_indices),
    cmap=nlcd_cmap,
    labels=nlcd_labels,
    norm=nlcd_norm,
    ax=ax,
)
ax.set_title('Land Cover')
plt.show()


In [None]:
# map nlcd onto the mesh using the 'nearest' method, and store the result
# on the mesh as the 'land_cover' cell data
m2_nlcd = watershed_workflow.getDatasetOnMesh(m2, nlcd, method='nearest')
m2.cell_data['land_cover'] = m2_nlcd


In [None]:
# double-check that no missing-data value made it onto the mesh
# NOTE(review): 127 is presumably the raster's nodata/fill value -- confirm
assert 127 not in m2_nlcd

# create a new set of labels and indices with only those that actually appear on the mesh
# (this overwrites the raster-wide values of these names computed for the plot above)
nlcd_indices, nlcd_cmap, nlcd_norm, nlcd_ticks, nlcd_labels = \
      watershed_workflow.colors.createNLCDColormap(np.unique(m2_nlcd))

In [None]:
# add labeled sets to the mesh for NLCD
# -- pair each land cover index that appears on the mesh with its label
nlcd_labels_dict = dict(zip(nlcd_indices, nlcd_labels))
watershed_workflow.regions.addSurfaceRegions(m2, names=nlcd_labels_dict)

In [None]:
# list the labeled sets again, now that the land cover regions have been added
for ls in m2.labeled_sets:
    print(f'{ls.setid} : {ls.entity} : {len(ls.ent_ids)} : "{ls.name}"')

In [None]:
# write the mesh to disk with new NLCD labels
# (toOutput also records the filename under the 'm2' role in output_filenames)
with open(toOutput('m2', '04_m2.pickle'), 'wb') as fid:
    pickle.dump(m2, fid)


## MODIS LAI

Leaf area index (LAI) is needed for each land cover type -- it is used in the evapotranspiration calculation.

### Download

In [None]:
# download LAI and corresponding LULC datasets -- these are actually already downloaded, 
# as the MODIS AppEEARS API is quite slow
#
# get the full LAI record
# NOTE(review): the commented-out call presumably re-uses an existing request via its task_id
# instead of submitting a new one -- confirm before switching between the two
#req = sources['LAI'].requestDataset(watersheds.exterior, crs, task_id='29411e08-5863-48c0-8b86-84e44334b846')
req = sources['LAI'].requestDataset(watersheds.exterior, crs)

In [None]:
# check on the request; the result is displayed as the cell output
# NOTE(review): presumably this must report ready before the request can be fetched -- confirm
sources['LAI'].isReady(req)

In [None]:
# fetch the requested data; the cells below use its 'LAI' and 'LULC' variables
modis_data = sources['LAI'].fetchRequest(req)

### Process

In [None]:
# MODIS data comes with time-dependent LAI AND time-dependent LULC -- just take the mode to find the most common LULC
# (the mode is taken along the 'time_LULC' dimension)
modis_data['LULC'] = watershed_workflow.data.computeMode(modis_data['LULC'], 'time_LULC')

# now it is safe to have only one time
modis_data = modis_data.rename({'time_LAI':'time'})

# remove leap day (366th day of any leap year) to use a Noleap Calendar
modis_data = watershed_workflow.data.filterLeapDay(modis_data)

In [None]:
# plot the MODIS land cover classification (the mode computed above)
modis_data['LULC'].plot.imshow()

In [None]:
# plot the raw LAI time series at a single pixel
fig, ax = plt.subplots(figsize=(14,5))

# x-axis: time elapsed since the start of the simulation window
time_xax = modis_data['time'] - start_noleap
# NOTE(review): (18, 21) is a hard-coded pixel -- presumably (lat, lon) indices; confirm it lies in the domain
ax.plot(time_xax, modis_data['LAI'][:, 18, 21])

#ax.set_xlim(0, 365 * 86400 * 1e9) # to zoom in to one year, convert time to nano-seconds
ax.set_ylim(0,4)
plt.show()




In [None]:
# compute the transient time series by class
# (LAI grouped by the LULC classification, restricted to the watershed exterior polygon)
modis_lai_ts = watershed_workflow.land_cover_properties.computeTimeSeries(modis_data['LAI'], modis_data['LULC'], 
                                                                          polygon=watersheds.exterior, polygon_crs=watersheds.crs)

# also just compute the mean value time series
# (averaged over both spatial dimensions, leaving one value per time)
modis_lai_domain_avg_ts = modis_data['LAI'].mean(dim=('lat', 'lon'))


In [None]:
# plot the raw, per-class LAI time series
fig, ax = plt.subplots(figsize=(14,6))

watershed_workflow.land_cover_properties.plotLAI(modis_lai_ts, indices='MODIS', ax=ax)
ax.set_ylim(0,5)
# NOTE(review): hard-coded x-limits that zoom the plot to a short window; units depend on how
# plotLAI draws time (presumably days) -- confirm
ax.set_xlim(14534, 15000)
# display the table itself as the cell output
modis_lai_ts

### Evaluate simplified models of LAI relative to full spatial-temporal data

So the first question -- is the LAI well represented by just a time series and a LULC class?

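Let's compute the class-averaged LAI time series, then compare each pixel to its class-average time series, and look at the spatial and temporal patterns of the error: a map of the mean absolute error, and the fraction of the total variance that is explained by class through time.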

In [None]:
# compute the error made by assuming LAI is represented by the class average
lai = modis_data['LAI']
lulc = modis_data['LULC']
modis_colors = watershed_workflow.sources.manager_modis_appeears.colors

def classAverageLAI(i, j):
    """Returns the class-average LAI time series for the LULC class found at pixel (i, j)."""
    # the first entry of the class's color record, plus ' LAI [-]', is the column name
    class_column = modis_colors[int(lulc[i,j])][0] + ' LAI [-]'
    return modis_lai_ts[class_column]

# error[i][j] is the pixel's LAI time series minus its class-average time series
error = np.array([[lai[:,i,j] - classAverageLAI(i, j)
                   for j in range(lulc.shape[1])]
                  for i in range(lulc.shape[0])])


In [None]:
# Plot spatial MAE map
fig, ax = plt.subplots(figsize=(7, 5))
# the L1 norm along the last axis of error, divided by the number of rows in the
# class time series table, gives the per-pixel mean absolute error
# (assumes the last axis of error is time and has len(modis_lai_ts) entries -- TODO confirm)
im = ax.imshow(np.linalg.norm(error, ord=1, axis=2) / len(modis_lai_ts), cmap="viridis", vmax=5)
plt.colorbar(im)
ax.set_title("Mean Absolute Error")
plt.tight_layout()
plt.show()

In [None]:
# compute the error made by assuming LAI is constant across the domain
# (each pixel's LAI minus the domain-average LAI at the same time)
error_total = modis_data['LAI'] - modis_lai_domain_avg_ts

# this is the total variance of the LAI
# (mean squared deviation over both spatial dimensions, one value per time)
V_total = (error_total**2).mean(dim=('lat','lon'))


In [None]:
# now compute the within-class variance: the pixel-fraction-weighted sum, over
# LULC classes, of the spatial variance of LAI within each class
V_within  = xr.zeros_like(V_total)  # will accumulate within-class variance

lulc = modis_data['LULC']
lai = modis_data['LAI']

# Compute within-class variance at each time
for c in np.unique(lulc):
    mask = (lulc == c)
    # mask LAI to only class c
    lai_c = lai.where(mask)
    # compute variance over spatial dims (ignoring NaNs for masked pixels)
    var_c = lai_c.var(dim=('lat', 'lon'), skipna=True)
    # weight by the fraction of all pixels that belong to class c
    frac_c = mask.sum() / mask.size
    V_within += frac_c * var_c

In [None]:
# plot, through time, the fraction of the total LAI variance that is explained
# by LULC class: R2 = 1 - (within-class variance / total variance).  Values
# near 1 mean the within-class variance is small relative to the total variance.
r2 = 1 - V_within / V_total

fig,ax = plt.subplots()
# only y-values are given to plot(), so the x-axis is the index of each time sample
ax.plot(r2)
ax.set_xlabel('time index')
ax.set_ylabel('R2: 1 - within-class variance / total variance')
plt.tight_layout()
plt.show()

It looks like a class-based LAI is a reasonable choice for this domain -- the within-class variance is MUCH smaller than the total variance.

This convinces us that class-based LAI is good enough.

### Continue to process and write to disk

In [None]:
# smooth the data in time
# (smoothing is applied along the 'time' column with a window length of 31 samples)
modis_lai_smoothed = watershed_workflow.data.smoothTimeSeries(modis_lai_ts, 'time', window_length=31)

# plot the smoothed series with the same axis limits as the raw-data plot above
fig, ax = plt.subplots(figsize=(14,6))
watershed_workflow.land_cover_properties.plotLAI(modis_lai_smoothed, indices='MODIS', ax=ax)
ax.set_ylim(0,5)
ax.set_xlim(14534, 15000)

In [None]:
# compute a typical year
# -- the output starts cyclic_nyears * 365 days before the simulation start
#    and spans cyclic_nyears years
modis_lai_typical = watershed_workflow.data.computeAverageYear(modis_lai_smoothed,
                                                              start_date = start_noleap - datetime.timedelta(days=365*cyclic_nyears),
                                                              output_nyears=cyclic_nyears, 
                                                              )

In [None]:
# plot the typical-year LAI by class, and display the table as the cell output
fig, ax = plt.subplots(figsize=(14,6))
watershed_workflow.land_cover_properties.plotLAI(modis_lai_typical, indices='MODIS', ax=ax)
modis_lai_typical

In [None]:
# limit the raw data to the window we want
#
# The mask is built from the raw table's own 'time' column, so that it always
# matches the rows of the table it selects from.
in_window = (modis_lai_ts["time"] >= start_noleap) & (modis_lai_ts["time"] <= end_noleap)
modis_lai_ts_limited = modis_lai_ts[in_window]

In [None]:
# display the last time of the typical-year series as a sanity check
modis_lai_typical['time'].iloc[-1]

In [None]:
# save the two time series files -- modis data
#
# The 'time' columns are pulled out before building the filenames: re-using
# the f-string's own quote character inside a replacement field is only valid
# syntax on Python >= 3.12.  The filename prefix comes from the `name`
# parameter rather than a hard-coded watershed name, so that runs for other
# watersheds label their outputs correctly.
typical_time = modis_lai_typical['time']
fname_modis_lai_typical = toOutput(
    'modis_lai_typical',
    f'{name}_LAI_MODIS_CyclicSteadystate_{typical_time.iloc[0].year}_{typical_time.iloc[-1].year}.h5')
watershed_workflow.io.writeTimeseriesToHDF5(fname_modis_lai_typical, modis_lai_typical)

transient_time = modis_lai_ts_limited['time']
fname_modis_lai_ts = toOutput(
    'modis_lai_ts',
    f'{name}_LAI_MODIS_Transient_{transient_time.iloc[0].year}_{transient_time.iloc[-1].year}.h5')
watershed_workflow.io.writeTimeseriesToHDF5(fname_modis_lai_ts, modis_lai_ts_limited)


## Compute the crosswalk of NLCD and MODIS

This tells us how to map MODIS LAI onto NLCD classes.

In [None]:
# compute the crosswalk between the MODIS LULC classification and the NLCD raster,
# using the 'fractional area' method
crosswalk = watershed_workflow.land_cover_properties.computeCrosswalk(modis_data['LULC'], nlcd, method='fractional area')

In [None]:
# Compute the NLCD-based time series
# -- first for the typical (cyclic) year
nlcd_lai_typical = watershed_workflow.land_cover_properties.applyCrosswalk(crosswalk, modis_lai_typical)
# NOTE(review): the return value is not used, so removeNullLAI presumably modifies its argument in place -- confirm
watershed_workflow.land_cover_properties.removeNullLAI(nlcd_lai_typical)


# -- then for the transient data, limited to the simulation window
nlcd_lai_ts = watershed_workflow.land_cover_properties.applyCrosswalk(crosswalk, modis_lai_ts_limited)
watershed_workflow.land_cover_properties.removeNullLAI(nlcd_lai_ts)

# display the transient table as the cell output
nlcd_lai_ts

In [None]:
# save the two time series files -- NLCD data
#
# The 'time' columns are pulled out before building the filenames: re-using
# the f-string's own quote character inside a replacement field is only valid
# syntax on Python >= 3.12.  The filename prefix comes from the `name`
# parameter rather than a hard-coded watershed name, so that runs for other
# watersheds label their outputs correctly.
nlcd_typical_time = nlcd_lai_typical['time']
fname_nlcd_lai_typical = toOutput(
    'nlcd_lai_typical',
    f'{name}_LAI_NLCD_CyclicSteadystate_{nlcd_typical_time.iloc[0].year}_{nlcd_typical_time.iloc[-1].year}.h5')
watershed_workflow.io.writeTimeseriesToHDF5(fname_nlcd_lai_typical, nlcd_lai_typical)


nlcd_transient_time = nlcd_lai_ts['time']
fname_nlcd_lai_ts = toOutput(
    'nlcd_lai_ts',
    f'{name}_LAI_NLCD_Transient_{nlcd_transient_time.iloc[0].year}_{nlcd_transient_time.iloc[-1].year}.h5')
watershed_workflow.io.writeTimeseriesToHDF5(fname_nlcd_lai_ts, nlcd_lai_ts)

In [None]:
# lastly, write the role -> filename map of the outputs registered in this notebook
# NOTE(review): only a write happens here -- nothing is reread or merged with an existing file.
# NOTE(review): despite the .txt extension, this file is a binary pickle.
with open(toOutput('04_output_filenames', '04_output_filenames.txt'), 'wb') as fid:
    pickle.dump(output_filenames, fid)
