# 1: Setup and Data Exploration

Author: Daniel Lusk

## Imports

In [None]:
import os

import rioxarray as riox
import utils.geodata as geodata
from PreprocessingConfig import PreprocessingConfig
from utils.datasets import Dataset, Unit, resample_dataset, CollectionName

from utils.visualize import plot_traits

%load_ext autoreload
%autoreload 2

# Load configuration
# `config` is the shared PreprocessingConfig instance that the cells below read
# for input paths/file lists and for the plot/resample/export toggles.
config = PreprocessingConfig()

## TRY/iNaturalist trait maps

### Visualize TRY/iNaturalist trait maps

First, let's look at the trait maps generated from TRY/iNaturalist observations by [Wolf et al. (2022)](https://doi.org/10.1038/s41559-022-01904-x).

In [None]:
# Optional preview of the trait maps; skipped unless the config toggle is set.
if config.plot_traits:
    # "2" is forwarded as-is to config.iNat_fns — presumably the map resolution
    # in degrees; the meaning of the trailing 4 is defined by plot_traits.
    # TODO confirm both against their definitions.
    plot_traits(config.iNat_fns("2"), 4)

### Convert to GeoDataFrames and merge

In [None]:
# NOTE(review): the plotting cell calls config.iNat_fns("2") as a method, while
# this cell iterates config.iNat_fns directly — confirm which form is intended.
trait_fns = config.iNat_fns

# Convert every trait raster with tif2gdf, then hand the list to merge_gdfs.
trait_gdfs = geodata.merge_gdfs([geodata.tif2gdf(fn) for fn in trait_fns])

# Preview the first rows of the merged result.
trait_gdfs.head(5)

## WorldClim data

Load the tifs and resample to all resolutions needed

In [None]:
# Handle for the WorldClim collection at 0.5 degrees; the directory and
# collection name both come from the configuration.
wc = Dataset(
    collection_name=config.WC_name,
    parent_dir=config.WC_dir,
    res=0.5,
    unit=Unit.DEGREE,
    # bio_ids=config.WC_bio_ids,
)

# Producing the 2-degree version is opt-in via the config toggle.
if config.resamp_to_disk:
    resample_dataset(dataset=wc, unit=Unit.DEGREE, resolution=2)

Convert to GeoDataFrames and merge

In [None]:
bio_fns = config.WC_fns

# Open each WorldClim raster (masked=True) and name the resulting array after
# its file stem before collecting it.
bios = []
for bio_fn in bio_fns:
    raster = riox.open_rasterio(bio_fn, masked=True)
    stem, _ext = os.path.splitext(os.path.basename(bio_fn))
    raster.name = stem
    bios.append(raster)

# Convert each named raster with tif2gdf, merge, and preview the first rows.
bio_gdfs = geodata.merge_gdfs(list(map(geodata.tif2gdf, bios)))
bio_gdfs.head(5)

Compute Precipitation Annual Range by subtracting BIO14 (precipitation of the driest month) from BIO13 (precipitation of the wettest month)

In [None]:
# Pull the values of every column whose name contains each BIO id, using a
# boolean column mask built from the column names.
bio_13, bio_14 = (
    bio_gdfs.loc[:, [bio_id in col for col in bio_gdfs.columns]].values
    for bio_id in ("bio_13", "bio_14")
)

# Store the element-wise difference (BIO13 - BIO14) as a new column.
bio_gdfs["wc2.1_10m_bio_13-14"] = bio_13 - bio_14

## MODIS Terra Surface Reflectance bands 1-5

1. Get bands 1-5 of MODIS Terra Surface Reflectance dataset
2. Mask clouds
3. Aggregate into monthly collections
4. export to Google Drive with a target resolution of ~1km at the equator (0.008983152841195, -0.008983152841195)

In [None]:
if config.gee_export:
    # Imports are kept local so Earth Engine is only required when exporting.
    from functools import reduce

    import ee
    import utils.gee as gee

    # ee.Authenticate() # Uncomment if not already authenticated
    ee.Initialize()

    # Get MODIS Terra Surface Reflectance image collection for its first five
    # years of operation. filterDate treats the end date as exclusive, so the
    # window below covers 2000-03-01 through 2005-03-01.
    ds, de = "2000-03-01", "2005-03-02"
    bands = [
        "sur_refl_b01",
        "sur_refl_b02",
        "sur_refl_b03",
        "sur_refl_b04",
        "sur_refl_b05",
    ]
    modis_tsr = ee.ImageCollection("MODIS/061/MOD09GA").filterDate(ds, de)

    # Mask clouds
    qa_band = "state_1km"
    modis_tsr_masked = gee.mask_clouds(modis_tsr, qa_band)

    # Aggregate the image collection into monthly averages for each band.
    # NOTE(review): each band goes through gee.aggregate_ic and then
    # gee.aggregate_ic_monthly over the same date window — see utils.gee for
    # what each pass contributes.
    tsr_bands_monthly = [
        gee.aggregate_ic(modis_tsr_masked.select(band), ds, de) for band in bands
    ]
    tsr_monthly_means = [
        gee.aggregate_ic_monthly(band_ic, ds, de) for band_ic in tsr_bands_monthly
    ]

    # Combine the per-band collections into a single image collection. Folding
    # with reduce keeps this in step with `bands`, so adding or removing a band
    # above needs no change here.
    tsr_monthly_means = reduce(
        lambda merged, band_ic: merged.combine(band_ic), tsr_monthly_means
    )

    # Reproject the image collection to EPSG:4326 with a scale of 1km
    tsr_monthly_means = tsr_monthly_means.map(
        lambda x: x.reproject("EPSG:4326", None, 1000)
    )

    # Unmask the image collection and convert to int16 (because NoData values are
    # replaced with 0 when converting to int16); masked pixels are filled with
    # -32768 first so they stay distinguishable.
    tsr_monthly_means = tsr_monthly_means.map(lambda x: x.unmask(-32768))
    tsr_monthly_means = tsr_monthly_means.map(lambda x: x.toInt16())

    # Export images to Google Drive
    gee.export_collection(
        collection=tsr_monthly_means, folder="multiband_monthly_averages"
    )

5. Merge semi-global observations for each band/month (See `scripts/merge_files.py`)

6. Downsample to match resolution of trait maps (0.5 deg)

In [None]:
# Handle for the MODIS collection at 0.5 degrees.
modis = Dataset(
    collection_name=CollectionName.MODIS,
    res=0.5,
    unit=Unit.DEGREE,
)

# Uncomment to resample the MODIS dataset to another resolution
# resample_dataset(dataset=modis, resolution=2, unit=Unit.DEGREE)

## ISRIC soil data

1. Download soil data from [ISRIC](https://files.isric.org/soilgrids/latest/data/)
2. Reproject to WGS84, and resample to ~1km resolution

See `get_soil_data_multi.py`

3. Reproject to the CRS of the Wolf trait maps and downsample to their resolution

In [None]:
# Handle for the ISRIC soil collection at 0.5 degrees; the collection name
# comes from the configuration.
soil = Dataset(
    collection_name=config.soil_name,
    res=0.5,
    unit=Unit.DEGREE,
)

# Uncomment to resample the ISRIC dataset to another resolution
# resample_dataset(dataset=soil, resolution=2, unit=Unit.DEGREE)

## VODCA

Downsample all three bands spatially, resample to monthly, and save to disk

In [None]:
import xarray as xr


def daily_to_multiyear_monthly(
    ds: xr.Dataset, band_name: str, out_name: str, p_dir: str
) -> None:
    """Convert daily data to multiyear monthly means and save them to disk.

    The ``sensor_flag`` and ``processing_flag`` variables are dropped and the
    ``vod`` variable is renamed to ``band_name``. Two products are written:

    * ``{p_dir}/0.5_deg/{out_name}_multiyear_monthly_0.5_deg.nc`` — the data
      averaged over 2x2 lon/lat windows before the temporal reduction.
    * ``{p_dir}/0.25_deg/{out_name}_multiyear_monthly_0.25_deg.nc`` — the data
      on its original grid.

    In both cases the time axis is first averaged into month-start bins and
    then averaged across years per calendar month.

    Args:
        ds: Dataset with ``time``, ``lon`` and ``lat`` dimensions and the
            variables named above. The output labels assume a 0.25-degree
            input grid — TODO confirm against the source files.
        band_name: New name for the ``vod`` variable.
        out_name: Prefix of the output file names.
        p_dir: Parent directory that receives the ``0.5_deg`` and ``0.25_deg``
            output folders; they are created if missing.

    Raises:
        ValueError: Raised by xarray if the ``lon``/``lat`` sizes are not
            multiples of 2 (``boundary="exact"``).
    """
    import os

    def _monthly_climatology(data: xr.Dataset) -> xr.Dataset:
        """Average into monthly bins, then across years per calendar month."""
        return data.resample(time="1MS").mean().groupby("time.month").mean("time")

    ds = ds.drop_vars(["sensor_flag", "processing_flag"])
    ds = ds.rename({"vod": band_name})

    # Spatial 2x2 averaging happens before the temporal reduction.
    ds_05 = _monthly_climatology(ds.coarsen(lon=2, lat=2, boundary="exact").mean())
    ds_05.attrs["geospatiallatresolution"] = "0.5 degree"
    ds_05.attrs["geospatiallonresolution"] = "0.5 degree"

    ds_025 = _monthly_climatology(ds)

    # to_netcdf does not create missing parent folders, so make sure both
    # output directories exist before writing.
    os.makedirs(f"{p_dir}/0.5_deg", exist_ok=True)
    os.makedirs(f"{p_dir}/0.25_deg", exist_ok=True)

    ds_05.to_netcdf(f"{p_dir}/0.5_deg/{out_name}_multiyear_monthly_0.5_deg.nc")
    ds_025.to_netcdf(f"{p_dir}/0.25_deg/{out_name}_multiyear_monthly_0.25_deg.nc")

## C-band (2002-2004)

In [None]:
# Only the 2002-2004 year directories are read so the inputs match the
# "C_2002-2004" output label; a bare "200*" would also pick up 2005-2009
# directories if they exist under the source folder.
# NOTE(review): this path spells the folder "C-Band" while the other bands use
# "-band" — confirm the directory name on case-sensitive filesystems.
c_band = xr.open_mfdataset(
    "./data/vodca/source/C-Band/200[2-4]*/*.nc",
    engine="h5netcdf",
    parallel=True,
    chunks={"time": 15},
)
daily_to_multiyear_monthly(c_band, "c_band", "C_2002-2004", "./data/vodca")

## Ku-band (2000-2004)

In [None]:
# Only the 2000-2004 year directories are read so the inputs match the
# "Ku_2000-2004" output label; a bare "200*" would also pick up 2005-2009
# directories if they exist under the source folder.
ku_band = xr.open_mfdataset(
    "./data/vodca/source/Ku-band/200[0-4]*/*.nc",
    engine="h5netcdf",
    parallel=True,
    chunks={"time": 15},
)
daily_to_multiyear_monthly(ku_band, "ku_band", "Ku_2000-2004", "./data/vodca")

## X-band (2000-2004)

In [None]:
# Only the 2000-2004 year directories are read so the inputs match the
# "X_2000-2004" output label; a bare "200*" would also pick up 2005-2009
# directories if they exist under the source folder.
x_band = xr.open_mfdataset(
    "./data/vodca/source/X-band/200[0-4]*/*.nc",
    engine="h5netcdf",
    parallel=True,
    chunks={"time": 15},
)
daily_to_multiyear_monthly(x_band, "x_band", "X_2000-2004", "./data/vodca")