# 0.1.1b: Pythonize sPlotOpen data

## Imports

In [11]:
from pathlib import Path

import numpy as np
import pandas as pd

from src.conf.conf import get_config
from src.conf.environment import log
from IPython.display import display, HTML, Markdown

%load_ext autoreload
%autoreload 2

%load_ext rpy2.ipython

# Display all columns when printing a pandas DataFrame
pd.set_option("display.max_columns", None)

conf = get_config()

splot_raw_dir = Path(conf.raw_dir, conf.datasets.Y.splot)
splot_prep_dir = Path(conf.interim_dir, conf.splot.interim.dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## Load sPlotOpen data and do some exploration

Let's see what's in the raw data export.

In [5]:
for item in splot_raw_dir.iterdir():
    print(item)

data/raw/sPlotOpen_v76/sPlotOpen_Metadata_dataset_3474(5).txt
data/raw/sPlotOpen_v76/sPlotOpen_metadata(2).txt
data/raw/sPlotOpen_v76/sPlotOpen_CWM_CWV(2).txt
data/raw/sPlotOpen_v76/sPlotOpen_DT(2).txt
data/raw/sPlotOpen_v76/3474_76_metadata_iDiv Metadata Vers.6.6.xml
data/raw/sPlotOpen_v76/Schema
data/raw/sPlotOpen_v76/manifest.json
data/raw/sPlotOpen_v76/3474_76_metadata.html
data/raw/sPlotOpen_v76/sPlotOpen.RData
data/raw/sPlotOpen_v76/Demo(2).pdf
data/raw/sPlotOpen_v76/sPlotOpen_references.bib
data/raw/sPlotOpen_v76/sPlotOpen_header(3).txt


Let's get a quick overview of the contents.

sPlotOpen_Metadata_dataset_3474(5).txt

In [13]:
with open(splot_raw_dir / "sPlotOpen_Metadata_dataset_3474(5).txt", "r") as f:
    txt = f.read()

print(txt)

## sPlotOpen – An environmentally-balanced, open-access, global dataset of vegetation plots
## Accompanying the paper of same title, by Sabatini FM, Lenoir J, et al. (2021) - Global Ecology and Biogeography

## Archive content
sPlotOpen_Metadata_dataset3474.txt (this file)
sPlotOpen_CWM_CWV.csv
sPlotOpen_DT.csv
sPlotOpen_header.csv
sPlotOpen_metadata.csv
sPlotOpen.RData
sPlotOpen_references.bib
Demo.pdf


## File description
sPlotOpen_Metadata_dataset3474.txt  - File describing the dataset 
sPlotOpen_CWM_CWV.csv               - Community Weighted Means and Community Weighted Variance of all vegetation plots  
sPlotOpen_DT.csv                    - List of species and relative cover in each vegetation plot  
sPlotOpen_header.csv                - Plot level information
sPlotOpen_metadata.csv              - Plot level metadata
sPlotOpen.RData                     - RData containing all tables above
sPlotOpen_references.bib            - Reference list in BibTeX format
Demo.pdf               

CWM/CWV table

In [14]:
spo_cwm = pd.read_csv(splot_raw_dir / "sPlotOpen_CWM_CWV(2).txt", sep="\t")
spo_cwm.head()

Unnamed: 0,PlotObservationID,TraitCoverage_cover,Species_richness,TraitCoverage_pa,LeafArea_CWM,StemDens_CWM,SLA_CWM,LeafC_perdrymass_CWM,LeafN_CWM,LeafP_CWM,PlantHeight_CWM,SeedMass_CWM,Seed_length_CWM,LDMC_CWM,LeafNperArea_CWM,LeafNPratio_CWM,Leaf_delta_15N_CWM,Seed_num_rep_unit_CWM,Leaffreshmass_CWM,Stem_cond_dens_CWM,Disp_unit_leng_CWM,Wood_vessel_length_CWM,LeafArea_CWV,StemDens_CWV,SLA_CWV,LeafC_perdrymass_CWV,LeafN_CWV,LeafP_CWV,PlantHeight_CWV,SeedMass_CWV,Seed_length_CWV,LDMC_CWV,LeafNperArea_CWV,LeafNPratio_CWV,Leaf_delta_15N_CWV,Seed_num_rep_unit_CWV,Leaffreshmass_CWV,Stem_cond_dens_CWV,Disp_unit_leng_CWV,Wood_vessel_length_CWV
0,16,0.277778,3,0.333333,3.678311,-1.047293,2.890748,6.128157,2.873263,1.114036,-1.514747,-0.897433,1.046333,-1.150683,0.024882,1.874128,-0.26475,5.633945,-4.387745,3.483073,1.470741,5.859517,,,,,,,,,,,,,,,,,,
1,17,0.038462,2,0.5,3.678311,-1.047293,2.890748,6.128157,2.873263,1.114036,-1.514747,-0.897433,1.046333,-1.150683,0.024882,1.874128,-0.26475,5.633945,-4.387745,3.483073,1.470741,5.859517,,,,,,,,,,,,,,,,,,
2,18,0.047619,4,0.25,3.678311,-1.047293,2.890748,6.128157,2.873263,1.114036,-1.514747,-0.897433,1.046333,-1.150683,0.024882,1.874128,-0.26475,5.633945,-4.387745,3.483073,1.470741,5.859517,,,,,,,,,,,,,,,,,,
3,20,0.666667,3,0.333333,3.686063,-0.907135,2.903715,6.136791,2.929729,0.739181,-2.711473,-1.417603,0.126305,-1.325711,0.107715,1.999404,-0.076209,3.036499,-4.124691,5.337666,0.252902,6.570591,,,,,,,,,,,,,,,,,,
4,22,0.538462,7,0.571429,3.899842,-0.900514,2.917708,6.131968,2.955072,0.733698,-2.543448,-1.528987,0.11811,-1.376588,0.122927,2.036662,0.073539,3.451333,-3.968128,5.380727,0.242525,6.558434,0.402247,0.002248,0.010056,0.000636,0.025483,0.016556,0.689812,0.138307,0.011436,0.041385,0.022313,0.017075,0.186384,1.315851,0.306499,0.163156,0.052239,0.002832


Header (plot information)

In [15]:
spo_header = pd.read_csv(splot_raw_dir / "sPlotOpen_header(3).txt", sep="\t")
spo_header.head()

  spo_header = pd.read_csv(splot_raw_dir / "sPlotOpen_header(3).txt", sep="\t")


Unnamed: 0,PlotObservationID,GIVD_ID,Dataset,Continent,Country,Biome,Date_of_recording,Latitude,Longitude,Location_uncertainty,Releve_area,Plant_recorded,Elevation,Aspect,Slope,is_forest,ESY,Naturalness,Forest,Shrubland,Grassland,Wetland,Sparse_vegetation,Cover_total,Cover_tree_layer,Cover_shrub_layer,Cover_herb_layer,Cover_moss_layer,Cover_lichen_layer,Cover_algae_layer,Cover_litter_layer,Cover_bare_rocks,Cover_cryptogams,Cover_bare_soil,Height_trees_highest,Height_trees_lowest,Height_shrubs_highest,Height_shrubs_lowest,Height_herbs_average,Height_herbs_lowest,Height_herbs_highest,SoilClim_PC1,SoilClim_PC2,Resample_1,Resample_2,Resample_3,Resample_1_consensus
0,16,NA-US-014,Aava,North America,United States,Boreal zone,1980-01-01,62.42,-154.18,1000.0,,Not specified,1790.0,,0.0,False,,,,,,,,,,,,,,,15.0,30.0,,15.0,,,,,,,,-3.66,0.546,True,False,False,True
1,17,NA-US-014,Aava,North America,United States,Boreal zone,1980-01-01,62.42,-154.18,1000.0,,Not specified,1750.0,,0.0,False,,,,,,,,,,,,,,,15.0,5.0,,5.0,,,,,,,,-3.66,0.546,True,False,False,True
2,18,NA-US-014,Aava,North America,United States,Boreal zone,1980-01-01,62.42,-154.18,1000.0,,Not specified,1750.0,,0.0,False,,,,,,,,,,,,,,,10.0,5.0,,3.0,,,,,,,,-3.66,0.546,False,False,True,False
3,20,NA-US-014,Aava,North America,United States,Boreal zone,1980-01-01,62.42,-154.18,1000.0,,Not specified,1520.0,,0.0,False,,,,,,,,,,,,,,,5.0,60.0,,2.0,,,,,,,,-3.66,0.546,False,True,False,False
4,22,NA-US-014,Aava,North America,United States,Boreal zone,1980-01-01,62.42,-154.18,1000.0,,Not specified,1550.0,,0.0,False,,,,,,,,,,,,,,,2.0,65.0,,5.0,,,,,,,,-3.66,0.546,True,False,False,True


DT (species abundances per plot)

In [17]:
dpo_dt = pd.read_csv(splot_raw_dir / "sPlotOpen_DT(2).txt", sep="\t", encoding="utf-8")
dpo_dt.head()


Unnamed: 0,PlotObservationID,Species,Original_species,Original_abundance,Abundance_scale,Relative_cover
0,16,Festuca brachyphylla,Festuca brachyphylla,10.0,CoverPerc,0.277778
1,16,Potentilla elegans,Potentilla elegans,25.0,CoverPerc,0.694444
2,16,Saxifraga serpyllifolia,Saxifraga serpyllifolia,1.0,CoverPerc,0.027778
3,17,Festuca brachyphylla,Festuca brachyphylla,1.0,CoverPerc,0.038462
4,17,Potentilla elegans,Potentilla elegans,25.0,CoverPerc,0.961538


Unlike with the full sPlot dataset, it looks like we don't need to access the `RData` file as everything we need appears to be broken out into separate text files.