# Genus & Family distributions

In [1]:
import pandas as pd


# Build the path to the species metadata CSV on the shared project storage.
base_path = "~/p-dsgt_clef2025-0/shared/plantclef/data"
file_name = "species_metadata.csv"
input_path = "/".join([base_path, file_name])

# Load the metadata and preview the first rows
# (the preview shows species_id, species, genus and family columns).
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,species_id,species,genus,family
0,1355868,Lactuca virosa L.,Lactuca,Asteraceae
1,1355869,Crepis capillaris (L.) Wallr.,Crepis,Asteraceae
2,1355870,Crepis foetida L.,Crepis,Asteraceae
3,1355871,Hypochaeris glabra L.,Hypochaeris,Asteraceae
4,1355872,Hypochaeris radicata L.,Hypochaeris,Asteraceae


In [2]:
# Number of distinct species within each genus (one row per genus).
genus_df = (
    df.groupby("genus")["species"]
    .nunique()  # distinct species names inside each genus group
    .reset_index(name="species_count")
)
print(f"Number of unique genera: {len(genus_df)}")
genus_df.head()

Number of unique genera: 1446


Unnamed: 0,genus,species_count
0,Abies,3
1,Abutilon,2
2,Acacia,9
3,Acanthoprasium,1
4,Acanthus,1


In [3]:
# Number of distinct species within each family (one row per family).
family_df = (
    df.groupby("family")["species"]
    .nunique()  # distinct species names inside each family group
    .reset_index(name="species_count")
)
print(f"Number of unique families: {len(family_df)}")
family_df.head()

Number of unique families: 181


Unnamed: 0,family,species_count
0,Acanthaceae,1
1,Acoraceae,1
2,Aizoaceae,20
3,Alismataceae,14
4,Altingiaceae,1


In [4]:
# Mean number of distinct species per genus and per family,
# taken over the per-group counts computed in genus_df and family_df.
avg_species_per_genus, avg_species_per_family = (
    counts["species_count"].mean() for counts in (genus_df, family_df)
)
print(f"Average species per genus: {round(avg_species_per_genus, 2)}")
print(f"Average species per family: {round(avg_species_per_family, 2)}")

Average species per genus: 5.4
Average species per family: 43.13
