In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

### Set basic configuration

In [2]:
# Opt in to copy-on-write behaviour, as recommended by the pandas indexing guide:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
pd.set_option("mode.copy_on_write", True)

### Define some utilities

In [3]:
def enforce_dir(path: str):
    """Make sure something exists at ``path``, creating a directory if needed.

    If ``path`` already exists (as a directory or as a file) nothing is done.
    Otherwise the directory is created, together with any missing parents.
    """
    if os.path.exists(path):
        return
    os.makedirs(path, exist_ok=True)

### Get the Data
The datasets are hosted on GitHub, in the public repository `dylancraven/Rasgos-CL`.

In [4]:
# Species-level trait table, read straight from the raw GitHub URL.
traits_url = "https://raw.githubusercontent.com/dylancraven/Rasgos-CL/main/Data/RasgosCL_aggregatedspp.csv"
try:
    traits_df = pd.read_csv(traits_url)
except Exception as err:
    # Later cells all need traits_df, so report the problem and stop here
    # instead of failing further down with an unrelated NameError.
    print(f"Error when downloading: {err}")
    raise

We also need a secondary dataset for geographical references:

In [5]:
# Per-species regional presence table, read straight from the raw GitHub URL.
geo_url = "https://raw.githubusercontent.com/dylancraven/Rasgos-CL/main/Extra/Chile_spp_distrib.csv"
try:
    geo_df = pd.read_csv(geo_url)
except Exception as err:
    # The geographical cells need geo_df, so report the problem and stop here
    # instead of failing further down with an unrelated NameError.
    print(f"Error when downloading: {err}")
    raise

### First species exploration


In [6]:
# info() prints its report and returns None, so it gets its own statement;
# describe() is left as the last expression so the cell displays it as a table
# rather than as part of a "(None, ...)" tuple.
traits_df.info()
traits_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8643 entries, 0 to 8642
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accepted_species  8643 non-null   object 
 1   traitValue        8643 non-null   object 
 2   obs               8413 non-null   float64
 3   traitName         8643 non-null   object 
 4   agreement         7256 non-null   float64
 5   traitUnit         8643 non-null   object 
dtypes: float64(2), object(4)
memory usage: 405.3+ KB


(None,
                obs    agreement
 count  8413.000000  7256.000000
 mean      3.020088     0.986776
 std       3.956636     0.057783
 min       1.000000     0.666667
 25%       1.000000     1.000000
 50%       2.000000     1.000000
 75%       4.000000     1.000000
 max     116.000000     1.000000)

In [7]:
# Fixed seed so the preview shows the same five rows on every run.
traits_df.sample(5, random_state=42)

Unnamed: 0,accepted_species,traitValue,obs,traitName,agreement,traitUnit
8089,Senna stipulacea,Dry_pericarp,8.0,Fruit_type_2,1.0,"Dry_pericarp, Fleshy"
4406,Haplopappus remyanus,Evergreen,2.0,Leaf_habit,1.0,"Deciduous, Evergreen, Variable"
6051,Myrcianthes coquimbensis,Entomophily,4.0,Pollination_syndrome_2,1.0,"Ambophily, Anemophily, Autophily, Entomophily,..."
6994,Prosopis alba,Simple_fruit,9.0,Fruit_type_1,1.0,"Compound_fruit, None, Pseudo_fruit, Simple_fru..."
2531,Coriaria ruscifolia,Dry_pericarp,7.0,Fruit_type_2,0.857143,"Dry_pericarp, Fleshy"


### Reordering Dataframe & filtering relevant data
We want to know how well studied each species is, so we only need to keep the species name, the trait name and the number of observations.

In [8]:
# Keep only the columns needed to count observations and give them shorter names.
# Chaining returns a new frame, so traits_df is untouched and the cell does not
# depend on the copy-on-write option to avoid a SettingWithCopyWarning.
df_ordered = (
    traits_df[["accepted_species", "traitName", "obs"]]
    .rename(columns={"accepted_species": "specie", "traitName": "trait_name"})
)

In [9]:
# Sum the "obs" column over every row of each species; named aggregation
# produces the flat "total_observations" column directly.
observed_species = df_ordered.groupby("specie").agg(total_observations=("obs", "sum"))
observed_species.head()

Unnamed: 0_level_0,total_observations
specie,Unnamed: 1_level_1
Acrisione cymosa,27.0
Acrisione denticulata,49.0
Adenopeltis serrata,39.0
Adesmia aphylla,18.0
Adesmia argentea,25.0


## Geographical data exploration

In [14]:
# Preview the first five rows of the geographical dataset.
geo_df.head(n=5)

Unnamed: 0,accepted_species,region,presencia
0,Acrisione cymosa,AIS,1
1,Acrisione cymosa,ANT,0
2,Acrisione cymosa,ARA,1
3,Acrisione cymosa,ATA,0
4,Acrisione cymosa,AYP,0


In [17]:
# List the distinct region codes present in the dataset.
geo_df["region"].unique()

array(['AIS', 'ANT', 'ARA', 'ATA', 'AYP', 'BIO', 'COQ', 'IPA', 'JFE',
       'LBO', 'LLA', 'LRI', 'MAG', 'MAU', 'NUB', 'RME', 'TAR', 'VAL'],
      dtype=object)

In [36]:
# Full region names keyed by the three-letter codes found in the "region" column.
new_regions = {
    'AIS': 'Aysén',
    'ANT': 'Antofagasta',
    'ARA': 'Araucanía',
    'ATA': 'Atacama',
    'AYP': 'Arica y Parinacota',
    'BIO': 'Bío-Bío',
    'COQ': 'Coquimbo',
    'IPA': 'Isla de Pascua',
    'JFE': 'Juan Fernández',
    'LBO': 'Libertador Bernardo O\'Higgins',
    'LLA': 'Los Lagos',
    'LRI': 'Los Ríos',
    'MAG': 'Magallanes',
    'MAU': 'Maule',
    'NUB': 'Ñuble',
    'RME': 'Metropolitana',
    'TAR': 'Tarapacá',
    'VAL': 'Valparaíso'
}
# Rename by label rather than by position, so a change in the CSV column order
# cannot silently mislabel the data; re-running the cell is a no-op.
geo_df = geo_df.rename(
    columns={"accepted_species": "specie", "region": "location", "presencia": "is_present"}
)
# Translate the codes in the location column only, leaving every other column as is.
geo_df["location"] = geo_df["location"].replace(new_regions)

In [106]:
# Reshape the long (specie, location, is_present) table into one row per species
# and one column per region. A single pivot replaces building and concatenating
# one single-row frame per species.

# Column order: regions in order of first appearance when species are visited in
# sorted order, which is the order the per-species concatenation used to produce.
region_order = geo_df.sort_values("specie", kind="stable")["location"].unique()

located_species_df = (
    geo_df
    .pivot(index="specie", columns="location", values="is_present")
    .reindex(columns=region_order)
    # Keep the axis names as before: unnamed index, empty-string columns name.
    .rename_axis(index=None, columns="")
)


In [118]:
# A missing species/region combination is treated as "not present" (0) before the
# cast to integers. Chaining leaves located_species_df untouched instead of
# mutating a frame that an earlier cell produced.
located_species_corrected_df = located_species_df.fillna(0).astype("int64")
located_species_corrected_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 718 entries, Acrisione cymosa to Weinmannia trichosperma
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Aysén                          718 non-null    int64
 1   Antofagasta                    718 non-null    int64
 2   Araucanía                      718 non-null    int64
 3   Atacama                        718 non-null    int64
 4   Arica y Parinacota             718 non-null    int64
 5   Bío-Bío                        718 non-null    int64
 6   Coquimbo                       718 non-null    int64
 7   Isla de Pascua                 718 non-null    int64
 8   Juan Fernández                 718 non-null    int64
 9   Libertador Bernardo O'Higgins  718 non-null    int64
 10  Los Lagos                      718 non-null    int64
 11  Los Ríos                       718 non-null    int64
 12  Magallanes                     718 non-null    i

In [122]:
# Combine observation totals with regional presence, matching rows on the species
# index; only species present in both frames are kept (inner join).
merged = observed_species.merge(
    located_species_corrected_df,
    how="inner",
    left_index=True,
    right_index=True,
)
merged.head()

Unnamed: 0_level_0,total_observations,Aysén,Antofagasta,Araucanía,Atacama,Arica y Parinacota,Bío-Bío,Coquimbo,Isla de Pascua,Juan Fernández,Libertador Bernardo O'Higgins,Los Lagos,Los Ríos,Magallanes,Maule,Ñuble,Metropolitana,Tarapacá,Valparaíso
specie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Acrisione cymosa,27.0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1
Acrisione denticulata,49.0,0,0,1,0,0,1,1,0,0,1,1,1,0,1,1,1,0,1
Adenopeltis serrata,39.0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,1,0,1
Adesmia aphylla,18.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Adesmia argentea,25.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1


# References:

- Working with groups: https://realpython.com/pandas-groupby/#example-1-us-congress-dataset