# 2: Data Exploration
Author: Daniel Lusk

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'
import json

from pathlib import Path

from utils.datasets import CollectionName, Dataset, GBIFBand
from utils.visualize import plot_distributions

%load_ext autoreload
%autoreload 2

## TRY + GBIF/iNaturalist trait maps

### Visualize TRY + GBIF/iNaturalist trait maps

First, let's look at the trait maps generated from TRY/iNaturalist observations by [Wolf et al. (2022)](https://doi.org/10.1038/s41559-022-01904-x).

In [None]:
# Build the four trait-map datasets at res=0.5 with the MEAN band, in the same
# order as they are named on the left-hand side: GBIF, sPlot, and their `_LN`
# counterparts.
# NOTE(review): the sPlot collections are also read with GBIFBand.MEAN —
# confirm that this band enum is intended to be shared across collections.
gbif, splot, gbif_ln, splot_ln = (
    Dataset(res=0.5, collection_name=collection, band=GBIFBand.MEAN)
    for collection in (
        CollectionName.GBIF,
        CollectionName.SPLOT,
        CollectionName.GBIF_LN,
        CollectionName.SPLOT_LN,
    )
)

In [None]:
# Map trait IDs to trait names

# Load the trait id -> trait name lookup. The encoding is given explicitly
# because JSON is UTF-8 while the platform default encoding may not be.
with open("./trait_id_to_trait_name.json", "r", encoding="utf-8") as f:
    mapping = json.load(f)

# (dataset, column prefix, column suffix) for every trait-map dataset whose
# columns follow the pattern "{prefix}_TRYgapfilled_X{trait_id}_{suffix}".
rename_targets = (
    (gbif, "GBIF", "05deg_mean"),
    (splot, "sPlot", "05deg_mean"),
    (gbif_ln, "GBIF", "05deg_mean_ln"),
    (splot_ln, "sPlot", "05deg_mean_ln"),
)

# Rename all trait columns of a dataset in a single `rename` call, so each
# DataFrame is copied once rather than once per trait. Columns missing from a
# frame are ignored by `rename`, as they were in the per-trait version.
for dataset, prefix, suffix in rename_targets:
    dataset.df = dataset.df.rename(
        columns={
            f"{prefix}_TRYgapfilled_X{trait_id}_{suffix}": f"{prefix}_{trait_name}_{suffix}"
            for trait_id, trait_name in mapping.items()
        }
    )


### GBIF trait distributions

In [None]:
# Plot the GBIF trait distributions; the "geometry" column is dropped first so
# only the remaining columns are passed to the plot.
# NOTE(review): `pdf=True` presumably selects a probability-density view — confirm in utils.visualize.
plot_distributions(gbif.df.drop(columns=["geometry"]), pdf=True)


In [None]:
# Same plot for the GBIF_LN dataset, again with the "geometry" column dropped
# before plotting.
plot_distributions(gbif_ln.df.drop(columns=["geometry"]), pdf=True)

### sPlotOpen trait distributions

In [None]:
# Plot the sPlot trait distributions; the "geometry" column is dropped first so
# only the remaining columns are passed to the plot.
plot_distributions(splot.df.drop(columns=["geometry"]), pdf=True)

## Predictor data

### MODIS surface reflectances

In [None]:
# MODIS surface-reflectance predictor collection (no res/band given here).
modis = Dataset(collection_name=CollectionName.MODIS)

Inspect each feature for anomalies

In [None]:
# Plot the MODIS rasters so each feature can be checked visually for anomalies.
modis.plot_rasters()

Inspect feature distributions

In [None]:
# Plot the distributions of the MODIS features.
# NOTE(review): `pdf=True` presumably selects a probability-density view — confirm in Dataset.plot_distributions.
modis.plot_distributions(pdf=True)

### WorldClim bio variables

In [None]:
# WorldClim bio-variable predictor collection (no res/band given here).
wc = Dataset(collection_name=CollectionName.WORLDCLIM)

Inspect features for anomalies

In [None]:
# Plot the WorldClim rasters so the features can be checked visually for anomalies.
wc.plot_rasters()

Inspect feature distributions

In [None]:
# Plot the distributions of the WorldClim features.
# NOTE(review): `pdf=True` presumably selects a probability-density view — confirm in Dataset.plot_distributions.
wc.plot_distributions(pdf=True)

### ISRIC SoilGrids

In [None]:
# ISRIC SoilGrids predictor collection (no res/band given here).
soil = Dataset(collection_name=CollectionName.SOIL)

In [None]:
# Plot the SoilGrids rasters for visual inspection.
soil.plot_rasters()

In [None]:
# Plot the distributions of the SoilGrids features.
# NOTE(review): `pdf=True` presumably selects a probability-density view — confirm in Dataset.plot_distributions.
soil.plot_distributions(pdf=True)

### VODCA

In [None]:
# VODCA predictor collection (no res/band given here).
vodca = Dataset(collection_name=CollectionName.VODCA)

In [None]:
# Plot the VODCA rasters for visual inspection.
vodca.plot_rasters()

In [None]:
# Plot the distributions of the VODCA features.
# NOTE(review): `pdf=True` presumably selects a probability-density view — confirm in Dataset.plot_distributions.
vodca.plot_distributions(pdf=True)