# 4: Explore results
Author: Daniel Lusk

Imports and functions

In [None]:
import ast
import json
from pathlib import Path

import geopandas as gpd
import dask_geopandas as dgpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rioxarray as riox
import seaborn as sns

from geocube.api.core import make_geocube


from utils.geodata import compare_grids, compare_gdf_to_grid, pad_ds
from utils.geodata import read_001_predictions, splot_correlation_old

from utils.visualize import plot_all_trait_obs_pred, plot_pred_cov

%load_ext autoreload
%autoreload 2

## Overall training results

Load the results, map trait IDs to trait names, and remove unneeded columns

In [None]:
# Load the training results table
results = pd.read_csv("results/training_results.csv")

# Load the trait id -> trait name mapping
with open("./trait_id_to_trait_name.json", "r") as f:
    mapping = json.load(f)

# Add a "Trait name" column. Match the full "X{trait_id}_" token (from the
# "TRYgapfilled_X{trait_id}_" naming pattern) instead of the bare id, so that e.g.
# id "4" does not also label the "X14" or "X144" response variables.
for trait_id, trait in mapping.items():
    is_trait = results["Response variable"].str.contains(
        f"X{trait_id}_", regex=False, na=False
    )
    results.loc[is_trait, "Trait name"] = trait

# Rename "Test r-squared" to "Full r-squared"
results = results.rename(columns={"Test r-squared": "Full r-squared"})

run_id_mapping = {
    "2023-09-23_11-44-08": "Original",
    "2023-09-23_12-39-35": "Original + Imputed",
    "2023-09-23_13-30-09": "Log-transformed",
    "2023-09-23_14-19-41": "Log + Imputed",
}

# Only select four most recent run IDs. Take an explicit copy so that the column
# added below is written to an independent frame rather than to a slice.
results = results[results["Run ID"].isin(run_id_mapping.keys())].copy()

# Add new column "Run type" which maps each run ID to its corresponding run type
results["Run type"] = results["Run ID"].map(run_id_mapping)

# Isolate the predictor-importance columns into their own dataframe (still retaining
# the identifying columns). Copy so later column additions do not target a slice.
PI = results[
    [
        "Run ID",
        "Run type",
        "Response variable",
        "Trait name",
        "Predictor importance",
        "CV predictor importance",
    ]
].copy()

# Keep only the identifying columns and the performance metrics
results = results[
    [
        "Run ID",
        "Run type",
        "Response variable",
        "Trait name",
        "N observations",
        "CV nRMSE",
        "CV nRMSE STD",
        "CV r-squared",
        "CV r-squared STD",
        "Full r-squared",
    ]
]

Look at the top ten and bottom ten models from overall results

In [None]:
# Ten models with the highest cross-validated r-squared
results.sort_values("CV r-squared", ascending=False).head(10)

In [None]:
# Ten models with the lowest cross-validated r-squared
results.sort_values("CV r-squared").head(10)

**Some quick takeaways:**
- The sPlot traits resulted in the best models. This is likely due to the smaller sample size and (likely) reduced variance of the sPlot data.
- It appears that log-transforming the trait data resulted in models that were unable to be fit to the corresponding predictor variables.

Let's set a CV R^2 threshold of 0.05 to remove these outliers and explore the overall performance of each training suite.

In [None]:
# Keep only models whose CV r-squared exceeds 0.05
results = results.loc[results["CV r-squared"].gt(0.05)]

Plot overall results CV r-squared as box plot

In [None]:
# Configure seaborn in a single call. Each separate sns.set(...) call re-applies the
# default style, so the original sequence silently discarded "whitegrid".
sns.set_theme(style="whitegrid", font_scale=1, rc={"figure.figsize": (15, 10)})

fig, axs = plt.subplots(nrows=2, ncols=2)
axs = axs.flatten()

models = ["Original", "Original + Imputed", "Log-transformed", "Log + Imputed"]
metrics = ["CV r-squared", "CV r-squared STD", "CV nRMSE", "Full r-squared"]

# One panel per metric; each panel shows one box per run type, in the order of `models`.
# A single boxplot call per axis is enough: `order=models` already covers every run type.
for ax, metric in zip(axs, metrics):
    sns.boxplot(x="Run type", y=metric, data=results, order=models, ax=ax)
    ax.set_xlabel("Data treatment", fontweight="bold")
    ax.set_ylabel(metric, fontweight="bold")

plt.show()

After dropping outlier models from the log-transformed suite of models, the log-transformed models appear to perform best in terms of CV R^2 and Overall R^2 and with a generally lower CV nRMSE, but greater variance in their CV R^2 standard deviation. This makes sense, as many of the untransformed traits were already normally distributed, and so log-transforming them would likely result in poorer model fitting.

In general, it appears that there is not a big difference between predictor datasets with missing values and imputed datasets, though the CV R^2 STD does increase slightly for models trained on non-imputed datasets.

**Now let's isolate the best performing models for each trait**

In [None]:
# Display the trait id -> trait name mapping loaded from trait_id_to_trait_name.json
mapping

In [None]:
# First filter out the imputed runs (rows whose "Run type" contains "Imputed")
results_mvs = results[~results["Run type"].str.contains("Imputed")]

# Next split results into GBIF and sPlot dataframes
results_gbif = results_mvs[results_mvs["Response variable"].str.contains("GBIF")]
results_splot = results_mvs[results_mvs["Response variable"].str.contains("sPlot")]


def _best_label(candidates):
    """Return the index label of the row with the highest CV r-squared.

    Returns None when `candidates` has no rows, e.g. because every model for the
    trait was removed by the CV r-squared threshold applied to `results`.
    """
    if candidates.empty:
        return None
    return candidates["CV r-squared"].idxmax()


# Collect the index labels of the best rows and select them in one go afterwards.
# This keeps the original index labels and column dtypes of `results`.
best_gbif_labels = []
best_splot_labels = []

for trait in mapping.values():
    # Exact match on the trait name, so a name that is a substring of another trait
    # name cannot pull in the wrong models
    gbif_label = _best_label(results_gbif[results_gbif["Trait name"] == trait])
    if gbif_label is None:
        print(f"No GBIF model available for trait '{trait}'; skipping")
        continue
    best_gbif_labels.append(gbif_label)

    # The best GBIF run types don't necessarily correspond to the best sPlot run types
    # for the same traits, but for an apples-to-apples comparison we should use the same
    # run types for both datasets
    best_gbif_trait = results_gbif.loc[gbif_label, "Response variable"].split("GBIF_")[1]

    # Suffix match instead of substring match: "..._mean" must not also select the
    # "..._mean_ln" (log-transformed) sPlot model
    splot_label = _best_label(
        results_splot[results_splot["Response variable"].str.endswith(best_gbif_trait)]
    )
    if splot_label is None:
        print(f"No sPlot model available for '{best_gbif_trait}'; skipping")
        continue
    best_splot_labels.append(splot_label)

best_models_gbif = results_gbif.loc[best_gbif_labels].sort_values(
    by=["CV r-squared"], ascending=False
)
best_models_splot = results_splot.loc[best_splot_labels].sort_values(
    by=["CV r-squared"], ascending=False
)

In [None]:
# Display the best GBIF model per trait, sorted by CV r-squared (descending)
best_models_gbif

In [None]:
# Markdown table of the best GBIF models, floats shown with three decimals
gbif_table_md = best_models_gbif.to_markdown(index=False, floatfmt=".3f")
print(gbif_table_md)

In [None]:
# Markdown table of the best sPlot models, floats shown with three decimals
splot_table_md = best_models_splot.to_markdown(index=False, floatfmt=".3f")
print(splot_table_md)

Figure with scatterplots of CV-predictions vs observed for each of a subset of traits, along with R

In [None]:
results_dir = Path("results/training")

# (Run ID, Response variable) pairs identifying the best model of every trait,
# one list for GBIF and one for sPlot
gbif_id_rvs = list(
    best_models_gbif[["Run ID", "Response variable"]].itertuples(index=False, name=None)
)
splot_id_rvs = list(
    best_models_splot[["Run ID", "Response variable"]].itertuples(index=False, name=None)
)

# Each pair maps to the directory <results_dir>/<run id>/<response variable>
gbif_trait_dirs = [results_dir / run_id / rv for run_id, rv in gbif_id_rvs]
splot_trait_dirs = [results_dir / run_id / rv for run_id, rv in splot_id_rvs]

In [None]:
# Plot observed vs. predicted for the best GBIF model of each trait
# (presumably one panel per directory in gbif_trait_dirs — see utils.visualize)
plot_all_trait_obs_pred(gbif_trait_dirs, mapping)

In [None]:
# Plot observed vs. predicted for the best sPlot model of each trait
# (presumably one panel per directory in splot_trait_dirs — see utils.visualize)
plot_all_trait_obs_pred(splot_trait_dirs, mapping)

## Crowd-sourced vs sPlotOpen

Figure: Box plot of GBIF and sPlot CV-R2 (y-axis) for each trait (x-axis). I.e. two boxes for each trait, one for GBIF and one for sPlot. Could print RMSE ± STD for each one, too?

In [None]:
# For each matching Response variable in best_models_gbif and best_models_splot, plot
# the CV r-squared values as a bar chart, with the corresponding CV r-squared STD
# representing the error bars.

# Strip the GBIF_ / sPlot_ prefixes on sorted copies. best_models_gbif and
# best_models_splot themselves are left untouched so that cells which build paths from
# their "Response variable" values still work when re-run after this one.
gbif_best = best_models_gbif.sort_values(by=["CV r-squared"], ascending=False)
gbif_best = gbif_best.assign(
    **{
        "Response variable": gbif_best["Response variable"].str.replace(
            "GBIF_", "", regex=False
        )
    }
)
splot_best = best_models_splot.sort_values(by=["CV r-squared"], ascending=False)
splot_best = splot_best.assign(
    **{
        "Response variable": splot_best["Response variable"].str.replace(
            "sPlot_", "", regex=False
        )
    }
)

# Merge the two dataframes on Response variable (inner join: only traits with both models)
best_models_gbif_splot = pd.merge(
    gbif_best, splot_best, on="Response variable", suffixes=("_gbif", "_splot")
)

# x-axis labels, bar geometry and bar positions
x_labels = best_models_gbif_splot["Trait name_gbif"]
bar_width = 0.35
x_pos = np.arange(len(x_labels))

# Cast to float: the best-model frames may carry object-dtype metric columns
gbif_r2 = best_models_gbif_splot["CV r-squared_gbif"].astype(float)
gbif_std = best_models_gbif_splot["CV r-squared STD_gbif"].astype(float)
splot_r2 = best_models_gbif_splot["CV r-squared_splot"].astype(float)
splot_std = best_models_gbif_splot["CV r-squared STD_splot"].astype(float)

# Apply the ggplot style to this figure only, so it does not leak into later plots
with plt.style.context("ggplot"):
    fig, ax = plt.subplots(figsize=(25, 9))

    # GBIF bars on the left, sPlot bars shifted right by one bar width
    ax.bar(x_pos, gbif_r2, width=bar_width, yerr=gbif_std, label="GBIF")
    ax.bar(x_pos + bar_width, splot_r2, width=bar_width, yerr=splot_std, label="sPlot")

    ax.set_xlabel("Response variable")
    # Y label as "CV R2" with the 2 in superscript
    ax.set_ylabel("CV R$^2$")

    # Centre each tick between the two bars of a trait
    ax.set_xticks(x_pos + bar_width / 2)
    ax.set_xticklabels(x_labels, rotation=90)

    ax.legend()
    plt.show()


Table (or bar chart) showing global cover % for GBIF vs sPlotOpen traits (should include CV R2 for each model, too, since GBIF generally has lower R2).

In [None]:
predict_name = "MOD09GA.061_ISRIC_soil_WC_BIO_VODCA_0.5_deg_nan-strat=any_thr=0.5"
# Sort the entries: Path.glob yields them in arbitrary order, and the order determines
# the order of the prediction lists built below.
predicted_traits = sorted(Path(f"results/predictions/{predict_name}/").glob("*"))

gbif_predictions = []
splot_predictions = []
aoa_records = []

for trait in predicted_traits:
    if not trait.is_dir():
        continue
    gbif_trait_df = gpd.read_parquet(trait / f"GBIF/{predict_name}_predict.parq")
    splot_trait_df = gpd.read_parquet(trait / f"sPlot/{predict_name}_predict.parq")

    gbif_predictions.append(gbif_trait_df)
    splot_predictions.append(splot_trait_df)

    # Mean of the non-null "AOA" values (same as sum / count)
    # NOTE(review): presumably AOA is a 0/1 flag, making this the covered fraction — confirm
    gbif_aoa = gbif_trait_df["AOA"].mean()
    splot_aoa = splot_trait_df["AOA"].mean()

    # Map the trait id embedded in the directory name ("..._X<id>_...") to its trait name
    trait_id = trait.name.split("_")[1].split("X")[-1]
    trait_name = mapping[trait_id]

    aoa_records.append(
        {
            "Response variable": trait_name,
            "GBIF AOA": gbif_aoa,
            "sPlot AOA": splot_aoa,
            # Difference GBIF - sPlot, scaled by 100
            "Pct change": (gbif_aoa - splot_aoa) * 100,
        }
    )

# Build the table once from the collected records (numeric columns keep a numeric dtype)
aoa_df = (
    pd.DataFrame(
        aoa_records,
        columns=["Response variable", "GBIF AOA", "sPlot AOA", "Pct change"],
    )
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Markdown rendering of the AOA table with two-decimal floats
aoa_table_md = aoa_df.to_markdown(index=False, floatfmt=".2f")
print(aoa_table_md)

In [None]:
# Average GBIF-minus-sPlot AOA difference ("Pct change") across all traits
aoa_df["Pct change"].mean()

## Global trait maps visualization

Figure of global trait maps for selected traits with two columns—left column containing trait predictions masked by AoA, right column containing trait prediction CoV (also masked by AoA?)

First, back-transform trait values predicted by models trained in log-space.

In [None]:
# Undo the log transform for every prediction frame whose first column name ends in
# "_ln", then drop the "_ln" marker from the names of columns 0 and 4.
for i, frame_pair in enumerate(zip(gbif_predictions, splot_predictions)):
    restored = []
    for frame in frame_pair:
        first_col = frame.columns[0]

        if first_col.endswith("_ln"):
            # back-transform columns 0 and 4 in place
            for col_pos in (0, 4):
                frame.iloc[:, col_pos] = np.exp(frame.iloc[:, col_pos])

        # rename column 0, then column 4, by replacing "_ln" with nothing
        frame = frame.rename(columns={first_col: first_col.replace("_ln", "")})
        fifth_col = frame.columns[4]
        frame = frame.rename(columns={fifth_col: fifth_col.replace("_ln", "")})

        restored.append(frame)

    gbif_predictions[i], splot_predictions[i] = restored

In [None]:
# Convert every prediction frame to a gridded dataset at 0.5 degree resolution and pad it
grid_resolution = (-0.5, 0.5)

for i, (gbif, splot) in enumerate(zip(gbif_predictions, splot_predictions)):
    gbif_cube = make_geocube(vector_data=gbif, resolution=grid_resolution)
    gbif_predictions[i] = pad_ds(gbif_cube)

    splot_cube = make_geocube(vector_data=splot, resolution=grid_resolution)
    splot_predictions[i] = pad_ds(splot_cube)

In [None]:
# Plot the gridded GBIF predictions (presumably prediction + CoV maps — see utils.visualize)
plot_pred_cov(gbif_predictions)

Figure (appendix): All remaining global trait maps with > R2-THRESHOLD

## Feature importances

Filter out models with a CV r^2 of 0.2 or below (i.e. keep only models with CV r^2 > 0.2)

In [None]:
# Keep the predictor importances of the best models whose CV r-squared exceeds 0.2.
# Rows are looked up by index label, which the best-model frames share with PI.
# .copy() gives independent frames, so adding columns later does not raise
# SettingWithCopyWarning.
pi_gbif = PI.loc[best_models_gbif[best_models_gbif["CV r-squared"] > 0.2].index].copy()
pi_splot = PI.loc[best_models_splot[best_models_splot["CV r-squared"] > 0.2].index].copy()

In [None]:
def ds_importance(PIs):
    """Group per-feature predictor importances by the dataset each feature belongs to.

    Parameters
    ----------
    PIs : str or dict
        Mapping of feature name -> sequence of importance values, either as a dict or
        as its string representation (parsed with ``ast.literal_eval``).

    Returns
    -------
    dict
        Maps a dataset name ("MODIS", "WorldClim", "Soil", "VODCA") to a list of
        ``[mean, std]`` pairs, one per feature assigned to that dataset. Datasets
        without any matching feature are absent from the result.
    """
    # Accept an already-parsed mapping as well as its string form
    if isinstance(PIs, str):
        PIs = ast.literal_eval(PIs)

    # Substrings that identify the features of each predictor dataset
    ds_keys = {
        "MODIS": ["sur_refl"],
        "WorldClim": ["wc2.1"],
        "Soil": ["0-5cm", "0-30cm", "5-15cm", "15-30cm", "30-60cm", "60-100cm", "100-200cm"],
        "VODCA": ["C_2", "Ku_2", "X_2"],
    }

    # Mean and standard deviation of the importance values of every feature
    feature_stats = {
        feature: [np.mean(values), np.std(values)] for feature, values in PIs.items()
    }

    grouped = {}
    for ds, keys in ds_keys.items():
        for feature, stats in feature_stats.items():
            # any() counts a feature once per dataset even if several keys match it
            if any(key in feature for key in keys):
                grouped.setdefault(ds, []).append(stats)

    return grouped


In [None]:
# Compute the per-dataset importance for every model in pi_gbif and pi_splot.
# assign() returns new frames, so no column is written onto a slice of PI and the
# cell can be re-run safely.
pi_gbif = pi_gbif.assign(
    **{"DS importance": pi_gbif["CV predictor importance"].apply(ds_importance)}
)
pi_splot = pi_splot.assign(
    **{"DS importance": pi_splot["CV predictor importance"].apply(ds_importance)}
)

Figure: box and whisker plot of predictor datasets (x-axis) and their average importances across all traits for which models had a > R2-THRESHOLD score.

In [None]:
def _dataset_importance_frame(pi):
    """Build a long-format table of per-dataset importances for the models in `pi`.

    `pi` needs a "DS importance" column holding dicts that map a dataset name to a
    list of [mean, std] pairs. The result has one row per (dataset, model) with the
    columns "Dataset", "Mean importance" and "STD importance", where the last two are
    the averages of the per-feature means and standard deviations. Models that lack a
    dataset contribute no row for it.
    """
    ds_imps = pi["DS importance"]
    # Union of the dataset names over all models, in first-seen order
    datasets = list(dict.fromkeys(ds for imp in ds_imps for ds in imp))

    records = []
    for ds in datasets:
        for imp in ds_imps:
            if ds not in imp:
                continue
            stats = np.asarray(imp[ds])  # rows: features, columns: [mean, std]
            records.append(
                {
                    "Dataset": ds,
                    "Mean importance": stats[:, 0].mean(),
                    "STD importance": stats[:, 1].mean(),
                }
            )

    return pd.DataFrame(
        records, columns=["Dataset", "Mean importance", "STD importance"]
    )


# One long-format table per data source
df_gbif = _dataset_importance_frame(pi_gbif)
df_splot = _dataset_importance_frame(pi_splot)

# set the plot style
sns.set_theme()

# create the figure and subplots (shared y-axis so both panels are comparable)
fig, axs = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

panels = [
    (axs[0], df_gbif, "Mean importance", "Predictor dataset importances for GBIF\nmodels with >0.2 CV r-squared"),
    (axs[1], df_splot, "", "Predictor dataset importances for sPlot\nmodels with >0.2 CV r-squared"),
]

for ax, panel_df, ylabel, title in panels:
    sns.boxplot(x="Dataset", y="Mean importance", data=panel_df, ax=ax)
    ax.set_xlabel("Dataset")
    ax.set_ylabel(ylabel)
    ax.set_title(title)

# adjust the layout and spacing
plt.tight_layout()

# show the plot
plt.show()

Figure (appendix): Feature importance of individual predictors across all traits for which models had a > R2-THRESHOLD score.

## Comparison of GBIF (and other products) with sPlotOpen grids

### 0.5 degree grids

In [None]:
# --- Other products (0.5 degree): one multi-band raster stack per trait ---
product_dir = Path("data/other-products/all-prods_stacks_sla-nit-nita_05D_2022-02-14")


def _open_product_stack(trait_tag):
    """Open the all-products raster stack for `trait_tag` ("nit", "nita" or "sla")."""
    stack_fn = f"all-prods_{trait_tag}_stack_all-maps_05D_2022-02-14.grd"
    return riox.open_rasterio(product_dir / stack_fn, masked=True)


N_mass = _open_product_stack("nit")
N_area = _open_product_stack("nita")
SLA = _open_product_stack("sla")

# --- Extrapolations (0.5 degree) ---
pred_05_dir = Path("results/predictions/05deg_models/MOD09GA.061_ISRIC_soil_WC_BIO_VODCA_0.5_deg_nan-strat=any_thr=0.5", "Shrub_Tree_Grass")
pred_05_fn = "MOD09GA.061_ISRIC_soil_WC_BIO_VODCA_0.5_deg_nan-strat=any_thr=0.5_predict.parq"


def _read_pred_05(trait_dir, source):
    """Read the 0.5 degree predictions of `source` ("GBIF" or "sPlot") for `trait_dir`."""
    return gpd.read_parquet(pred_05_dir / f"{trait_dir}/{source}" / pred_05_fn)


# GBIF extrapolations
gbif_N_mass_05 = _read_pred_05("TRYgapfilled_X14_05deg_mean", "GBIF")
gbif_N_area_05 = _read_pred_05("TRYgapfilled_X50_05deg_mean_ln", "GBIF")
gbif_SLA_05 = _read_pred_05("TRYgapfilled_X11_05deg_mean", "GBIF")

# sPlot extrapolations
sPlot_ext_N_mass_05 = _read_pred_05("TRYgapfilled_X14_05deg_mean", "sPlot")
sPlot_ext_N_area_05 = _read_pred_05("TRYgapfilled_X50_05deg_mean_ln", "sPlot")
sPlot_ext_SLA_05 = _read_pred_05("TRYgapfilled_X11_05deg_mean", "sPlot")

# --- sPlot maps (0.5 degree): band 2 of each trait raster ---
splot_map_dir_05 = "./GBIF_trait_maps/global_maps/Shrub_Tree_Grass/05deg"


def _open_splot_map_05(trait_id):
    """Open band 2 of the 0.5 degree sPlot raster for `trait_id` (e.g. "X14")."""
    raster = riox.open_rasterio(
        f"{splot_map_dir_05}/sPlot_TRYgapfilled_{trait_id}_05deg.grd", masked=True
    )
    return raster.sel(band=2)


sPlot_N_mass_05 = _open_splot_map_05("X14")
sPlot_N_area_05 = _open_splot_map_05("X50")
sPlot_SLA_05 = _open_splot_map_05("X11")

Back-transform `gbif_N_area` and `sPlot_N_area` as they were trained on log-transformed trait values

In [None]:
# The X50 (N area) models were trained on log-transformed trait values: add a
# back-transformed column (same name without the "_ln" suffix) to both frames
for frame, ln_col in (
    (gbif_N_area_05, "GBIF_TRYgapfilled_X50_05deg_mean_ln"),
    (sPlot_ext_N_area_05, "sPlot_TRYgapfilled_X50_05deg_mean_ln"),
):
    frame[ln_col[: -len("_ln")]] = np.exp(frame[ln_col])

In [None]:
# Correlations with the sPlot Leaf N mass grid for the two extrapolations ...
splot_corr = compare_gdf_to_grid(sPlot_ext_N_mass_05, sPlot_N_mass_05, "sPlot_TRYgapfilled_X14_05deg_mean", "sPlot_N_mass")
gbif_corr = compare_gdf_to_grid(gbif_N_mass_05, sPlot_N_mass_05, "GBIF_TRYgapfilled_X14_05deg_mean", "sPlot_N_mass")

n_mass_corrs = {"sPlot (extrap.)": splot_corr, "GBIF": gbif_corr}

# ... and for every band of the other-products stack. Iterating the stack yields one
# band at a time; `long_name` holds the names of all bands, hence the index lookup.
for i, band in enumerate(N_mass):
    band_name = band.long_name[i]
    n_mass_corrs[band_name] = compare_grids(band, sPlot_N_mass_05, band_name, "sPlot_N_mass")

# Build the table once instead of growing it by repeated concatenation from an empty frame
corr_table = pd.DataFrame({"Leaf N mass": pd.Series(n_mass_corrs)})

In [None]:
# Correlations of both extrapolations with the sPlot Leaf N area grid
splot_corr = compare_gdf_to_grid(sPlot_ext_N_area_05, sPlot_N_area_05, "sPlot_TRYgapfilled_X50_05deg_mean", "sPlot_N_area")
gbif_corr = compare_gdf_to_grid(gbif_N_area_05, sPlot_N_area_05, "GBIF_TRYgapfilled_X50_05deg_mean", "sPlot_N_area")

for row_label, row_corr in (("GBIF", gbif_corr), ("sPlot (extrap.)", splot_corr)):
    corr_table.loc[row_label, "Leaf N area"] = row_corr

# Correlation of every other-product band; products not yet in the table get a new row
for band_idx, product_band in enumerate(N_area):
    band_name = product_band.long_name[band_idx]
    corr = compare_grids(product_band, sPlot_N_area_05, band_name, "sPlot_N_area")
    if band_name not in corr_table.index:
        new_row = pd.DataFrame({"Leaf N area": corr}, index=[band_name])
        corr_table = pd.concat([corr_table, new_row])
    else:
        corr_table.loc[band_name, "Leaf N area"] = corr

In [None]:
# Correlations of both extrapolations with the sPlot SLA grid
splot_corr = compare_gdf_to_grid(sPlot_ext_SLA_05, sPlot_SLA_05, "sPlot_TRYgapfilled_X11_05deg_mean", "sPlot_SLA")
gbif_corr = compare_gdf_to_grid(gbif_SLA_05, sPlot_SLA_05, "GBIF_TRYgapfilled_X11_05deg_mean", "sPlot_SLA")

for row_label, row_corr in (("GBIF", gbif_corr), ("sPlot (extrap.)", splot_corr)):
    corr_table.loc[row_label, "Leaf SLA"] = row_corr

# Correlation of every other-product band; products not yet in the table get a new row
for band_idx, product_band in enumerate(SLA):
    band_name = product_band.long_name[band_idx]
    corr = compare_grids(product_band, sPlot_SLA_05, band_name, "sPlot_SLA")
    if band_name not in corr_table.index:
        new_row = pd.DataFrame({"Leaf SLA": corr}, index=[band_name])
        corr_table = pd.concat([corr_table, new_row])
    else:
        corr_table.loc[band_name, "Leaf SLA"] = corr

Pearson's correlation coefficient (r) between extrapolations and sPlotOpen grids at 0.5 degrees.

In [None]:
# Markdown rendering of the 0.5 degree correlation table, three decimals
corr_table_md = corr_table.to_markdown(floatfmt=".3f")
print(corr_table_md)

### 0.01 degree grids

#### Old method

In [None]:
# --- Extrapolations (0.01 degree), merged from the 5x5 degree tiles ---
pred_001_dir = Path("results/predictions/tiled_5x5_deg_MOD09GA.061_ISRIC_soil_WC_BIO_VODCA_0.01_deg_nan-strat=any_thr=0.5")
pred_001_fn = "merged_predictions.parq"


def _read_pred_001(trait_dir, source):
    """Read the merged 0.01 degree predictions of `source` ("GBIF"/"sPlot") for `trait_dir`."""
    return gpd.read_parquet(pred_001_dir / f"{trait_dir}/{source}" / pred_001_fn)


# GBIF extrapolations
gbif_N_mass_001 = _read_pred_001("TRYgapfilled_X14_05deg_mean", "GBIF")
gbif_N_area_001 = _read_pred_001("TRYgapfilled_X50_05deg_mean_ln", "GBIF")
gbif_SLA_001 = _read_pred_001("TRYgapfilled_X11_05deg_mean", "GBIF")

# sPlot extrapolations
sPlot_ext_N_mass_001 = _read_pred_001("TRYgapfilled_X14_05deg_mean", "sPlot")
sPlot_ext_N_area_001 = _read_pred_001("TRYgapfilled_X50_05deg_mean_ln", "sPlot")
sPlot_ext_SLA_001 = _read_pred_001("TRYgapfilled_X11_05deg_mean", "sPlot")

# --- sPlot maps (0.01 degree) ---
splot_map_dir_001 = "data/splot/0.01_deg"


def _open_splot_map_001(trait_id):
    """Open the 0.01 degree sPlot raster for `trait_id` (e.g. "X14")."""
    return riox.open_rasterio(
        f"{splot_map_dir_001}/sPlot_TRYgapfilled_{trait_id}_0.01deg.tif", masked=True
    )


sPlot_N_mass_001 = _open_splot_map_001("X14")
sPlot_N_area_001 = _open_splot_map_001("X50")
sPlot_SLA_001 = _open_splot_map_001("X11")

Back-transform `gbif_N_area` and `sPlot_N_area` as they were trained on log-transformed trait values

In [None]:
# The X50 (N area) models were trained on log-transformed trait values: add a
# back-transformed column (same name without the "_ln" suffix) to both frames
for frame, ln_col in (
    (gbif_N_area_001, "GBIF_TRYgapfilled_X50_05deg_mean_ln"),
    (sPlot_ext_N_area_001, "sPlot_TRYgapfilled_X50_05deg_mean_ln"),
):
    frame[ln_col[: -len("_ln")]] = np.exp(frame[ln_col])

In [None]:
# Traits to compare, given as parallel lists: id, display name, sPlot grid name
trait_ids = ["X14", "X50", "X11"]
trait_names = ["Leaf N mass", "Leaf N area", "Leaf SLA"]
splot_trait_names = ["sPlot_N_mass", "sPlot_N_area", "sPlot_SLA"]

# Rows: model source, columns: trait
corr_table = pd.DataFrame(index=["sPlot", "GBIF"], columns=trait_names)

for trait_id, splot_trait_name, trait_name in zip(trait_ids, splot_trait_names, trait_names):
    for model in ("GBIF", "sPlot"):
        print(f"Processing {model} {trait_id}...")

        # Read the 0.01 degree predictions and correlate them with the sPlot grid
        corr_table.at[model, trait_name] = splot_correlation_old(
            read_001_predictions(trait_id, model),
            trait_id,
            splot_trait_name,
            "05_range",
        )

corr_table_md = corr_table.to_markdown(floatfmt=".3f")
print(corr_table_md)

#### Updated method

See `scripts/splot_correlations.py`

In [29]:
import pandas as pd

# Load the precomputed correlation table and show it rounded to two decimals
corr_table = pd.read_parquet(path="results/trait_correlations.parquet")
corr_table.round(decimals=2)

Unnamed: 0_level_0,Unnamed: 1_level_0,2,2,0.5,0.5,0.2,0.2,0.01,0.01
Unnamed: 0_level_1,Unnamed: 1_level_1,GBIF,sPlot,GBIF,sPlot,GBIF,sPlot,GBIF,sPlot
X1080,Grass,0.51,0.71,0.48,0.82,0.43,0.57,0.36,0.44
X1080,Shrub-Tree,0.39,0.63,0.39,0.83,0.37,0.58,0.32,0.46
X1080,Shrub-Tree-Grass,0.60,0.77,0.57,0.85,0.51,0.64,0.44,0.53
X11,Grass,0.52,0.74,0.77,0.35,0.55,0.67,0.53,0.57
X11,Shrub-Tree,0.65,0.79,0.78,0.46,0.50,0.64,0.37,0.54
...,...,...,...,...,...,...,...,...,...
X78,Shrub-Tree,0.91,0.81,0.76,0.96,0.70,0.80,0.67,0.74
X78,Shrub-Tree-Grass,0.83,0.71,0.67,0.96,0.60,0.74,0.59,0.68
X95,Grass,0.38,0.65,0.27,0.83,0.16,0.40,0.15,0.30
X95,Shrub-Tree,0.59,0.77,0.46,0.88,0.35,0.52,0.32,0.45


In [34]:
# Markdown rendering of the correlation table, rounded to two decimals
rounded_corr_md = corr_table.round(2).to_markdown()
print(rounded_corr_md)

|                               |   ('2', 'GBIF') |   ('2', 'sPlot') |   ('0.5', 'GBIF') |   ('0.5', 'sPlot') |   ('0.2', 'GBIF') |   ('0.2', 'sPlot') |   ('0.01', 'GBIF') |   ('0.01', 'sPlot') |
|:------------------------------|----------------:|-----------------:|------------------:|-------------------:|------------------:|-------------------:|-------------------:|--------------------:|
| ('X1080', 'Grass')            |            0.51 |             0.71 |              0.48 |               0.82 |              0.43 |               0.57 |               0.36 |                0.44 |
| ('X1080', 'Shrub-Tree')       |            0.39 |             0.63 |              0.39 |               0.83 |              0.37 |               0.58 |               0.32 |                0.46 |
| ('X1080', 'Shrub-Tree-Grass') |            0.6  |             0.77 |              0.57 |               0.85 |              0.51 |               0.64 |               0.44 |                0.53 |
| ('X11', 'Grass')  

In [33]:
# Set the precision on the Styler itself: DataFrame.round() changes the values but not
# how the Styler renders floats, so the LaTeX output otherwise shows six decimals
# (e.g. 0.510000).
print(corr_table.style.format(precision=2).to_latex())

\begin{tabular}{llrrrrrrrr}
 &  & \multicolumn{2}{r}{2} & \multicolumn{2}{r}{0.5} & \multicolumn{2}{r}{0.2} & \multicolumn{2}{r}{0.01} \\
 &  & GBIF & sPlot & GBIF & sPlot & GBIF & sPlot & GBIF & sPlot \\
\multirow[c]{3}{*}{X1080} & Grass & 0.510000 & 0.710000 & 0.480000 & 0.820000 & 0.430000 & 0.570000 & 0.360000 & 0.440000 \\
 & Shrub-Tree & 0.390000 & 0.630000 & 0.390000 & 0.830000 & 0.370000 & 0.580000 & 0.320000 & 0.460000 \\
 & Shrub-Tree-Grass & 0.600000 & 0.770000 & 0.570000 & 0.850000 & 0.510000 & 0.640000 & 0.440000 & 0.530000 \\
\multirow[c]{3}{*}{X11} & Grass & 0.520000 & 0.740000 & 0.770000 & 0.350000 & 0.550000 & 0.670000 & 0.530000 & 0.570000 \\
 & Shrub-Tree & 0.650000 & 0.790000 & 0.780000 & 0.460000 & 0.500000 & 0.640000 & 0.370000 & 0.540000 \\
 & Shrub-Tree-Grass & 0.600000 & 0.800000 & 0.800000 & 0.410000 & 0.500000 & 0.640000 & 0.430000 & 0.550000 \\
\multirow[c]{3}{*}{X13} & Grass & 0.530000 & 0.150000 & 0.630000 & 0.110000 & 0.260000 & 0.410000 & 0.230000 & 0.34

Pearson's correlation coefficient (r) between extrapolations and sPlotOpen grids at each grid resolution (2, 0.5, 0.2 and 0.01 degrees).

In [None]:
# Markdown rendering of the correlation table with three-decimal floats
corr_table_md = corr_table.to_markdown(floatfmt=".3f")
print(corr_table_md)