In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

### Set basic configuration

In [25]:
# Recommended on documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.options.mode.copy_on_write = True

### Define some utilities

In [26]:
def enforce_dir(path: str):
    """Accepts path separated as strings.
    Creates directory in case it doesn't exist"""
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)

### Get the Data
The datasets are located at GitHub.com, inside a public repository.

In [35]:
traits_url = "https://raw.githubusercontent.com/dylancraven/Rasgos-CL/main/Data/RasgosCL_aggregatedspp.csv"
try:
    traits_df = pd.read_csv(traits_url)
except Exception as err:
    print(f"Error when downloading: {err}")

We also need a secondary dataset for geographical references:

In [40]:
geo_url = "https://raw.githubusercontent.com/dylancraven/Rasgos-CL/main/Extra/Chile_spp_distrib.csv"
try:
    geo_df = pd.read_csv(geo_url)
except Exception as err:
    print(f"Error when downloading: {err}")

### First species exploration


In [37]:
traits_df.info(), traits_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8643 entries, 0 to 8642
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accepted_species  8643 non-null   object 
 1   traitValue        8643 non-null   object 
 2   obs               8413 non-null   float64
 3   traitName         8643 non-null   object 
 4   agreement         7256 non-null   float64
 5   traitUnit         8643 non-null   object 
dtypes: float64(2), object(4)
memory usage: 405.3+ KB


(None,
                obs    agreement
 count  8413.000000  7256.000000
 mean      3.020088     0.986776
 std       3.956636     0.057783
 min       1.000000     0.666667
 25%       1.000000     1.000000
 50%       2.000000     1.000000
 75%       4.000000     1.000000
 max     116.000000     1.000000)

In [98]:
traits_df.sample(5)

Unnamed: 0,accepted_species,traitValue,obs,traitName,agreement,traitUnit
5421,Lobelia excelsa,Toothed,4.0,Leaf_margin_type_1,0.75,"Toothed, Untoothed, No_leaves, Special_cases"
6007,Myrceugenia parvifolia,Arbuscular,1.0,Mycorrhizal_type_2,1.0,"Arbuscular, Ectomycorrhiza, Ectomycorrhiza-Arb..."
5132,Larrea divaricata,Untoothed,1.0,Leaf_margin_type_1,1.0,"Toothed, Untoothed, No_leaves, Special_cases"
1556,Baccharis patagonica,Arbuscular,1.0,Mycorrhizal_type_2,1.0,"Arbuscular, Ectomycorrhiza, Ectomycorrhiza-Arb..."
1022,Anarthrophyllum elegans,5,1.0,Seed_mass,,mg


### Reordering Dataframe & filtering relevant data
We want to know how studied the species are, which means we can reduce the size of our dataset.

In [65]:
df_ordered = traits_df[["accepted_species", "traitName", "obs"]]
df_ordered.rename(columns={"accepted_species": "specie", "traitName": "trait_name"}, inplace=True)

Unnamed: 0,specie,trait_name,obs
0,Acrisione cymosa,Max_plant_height,2.0
1,Acrisione cymosa,Seed_mass,1.0
2,Acrisione cymosa,Leaf_margin_type_2,1.0
3,Acrisione cymosa,Leaf_margin_type_1,3.0
4,Acrisione cymosa,Leaf_organization_1,2.0
5,Acrisione cymosa,Fruit_type_3,3.0
6,Acrisione cymosa,Fruit_type_2,3.0
7,Acrisione cymosa,Fruit_type_1,3.0
8,Acrisione cymosa,Dispersal_syndrome_1,1.0
9,Acrisione cymosa,Dispersal_syndrome_2,1.0


In [116]:
observed_species = df_ordered.groupby("specie").agg({"obs": ["sum"]})
observed_species.columns = ["total_observations"]
observed_species

Unnamed: 0_level_0,total_observations
specie,Unnamed: 1_level_1
Acrisione cymosa,27.0
Acrisione denticulata,49.0
Adenopeltis serrata,39.0
Adesmia aphylla,18.0
Adesmia argentea,25.0
...,...
Vestia foetida,35.0
Viviania crenata,19.0
Viviania marifolia,50.0
Viviania ovata,17.0


In [None]:

observed_species.to_csv()

# References:

- Working with groups: https://realpython.com/pandas-groupby/#example-1-us-congress-dataset