# Normalize taxa list

Clean up the normalized taxa list provided by the eODP researchers for a given taxon group.

In [1]:
import pandas as pd
import numpy as np

## Import normalized taxa list

In [2]:
# Taxon group to extract from the shared workbook; it is also embedded in the
# names of both output files below.
taxon_group = 'nannofossils'

# Excel workbook that holds the taxa list for every taxon group.
input_file = 'raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes.xlsx'

# Per-group CSV outputs, both written to the same directory.
output_dir = "cleaned_data/taxa"
crosswalk_file = f"{output_dir}/taxa_crosswalk_{taxon_group}.csv"
taxa_list_file = f"{output_dir}/taxa_list_{taxon_group}.csv"

In [3]:
# Read the "taxa list" sheet of the workbook. The first 9 rows of the sheet are
# skipped, so the column names are taken from the row that follows them.
df = pd.read_excel(
    input_file,
    sheet_name="taxa list",
    skiprows=9,
)
df.head()

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,PROBLEMATIC NAMES,,,,,,,,,,
1,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,these need both checking & can't be entered cu...,,benthic_forams,group,"enter ""Euuvigerina miozea"" and reproduce the e...",,,,
2,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,,,benthic_forams,group,"enter ""Euuvigerina miozea"" and reproduce the e...",,,,
3,Phorticium pylonium group,Phorticium pylonium group,,,radiolarians,group,"enter ""Phorticium pylonium"" and reproduce the ...",,,,
4,Skeletonemopsis and Skeletonema grp,Skeletonemopsis and Skeletonema grp,Skelentonematacea indet.,,diatoms,group,deal with later? LL: both are in the family Sk...,,,,


In [4]:
# Keep only the rows whose taxon_group matches the configured group, then drop
# every column that has no values at all for those rows.
#
# The trailing .copy() gives filtered_taxa its own data: a later cell adds a
# column to it, and without the copy pandas may treat the frame as a slice of
# `df` and emit SettingWithCopyWarning on that assignment.
filtered_taxa = (
    df.loc[df['taxon_group'] == taxon_group]
    .dropna(axis=1, how='all')
    .copy()
)
filtered_taxa.head()

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,Unnamed: 6
2263,calcispheres,calcispheres,Dinoflagellata indet.,calcispheres,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ..."
2264,reworked taxa,reworked taxa,Coccolithophyceae indet.,reworked taxa,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ..."
2265,Nannofossil fragments,Nannofossil fragments,Coccolithophyceae indet.,Nannofossil fragments,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en..."
2266,Reworked species,Reworked species,Coccolithophyceae indet.,Reworked species,nannofossils,not a taxa,deal with later; should go in as a taphonomony...
2267,Calcisphere,Calcisphere,Dinoflagellata indet.,Calcisphere,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en..."


In [5]:
# Number of rows that belong to the selected taxon group.
filtered_taxa.shape[0]

817

## Add normalized_name

In [6]:
# normalized_name is the replacement name when one is given, otherwise the
# original name (np.where pattern from https://stackoverflow.com/a/10726275).
override_col = 'name to use (if different from "name")'
has_override = filtered_taxa[override_col].notna()

# .assign returns a new frame with the column appended, so re-running this cell
# gives the same result.
filtered_taxa = filtered_taxa.assign(
    normalized_name=np.where(
        has_override,
        filtered_taxa[override_col],
        filtered_taxa['name'],
    )
)
filtered_taxa.head(8)

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,Unnamed: 6,normalized_name
2263,calcispheres,calcispheres,Dinoflagellata indet.,calcispheres,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ...",Dinoflagellata indet.
2264,reworked taxa,reworked taxa,Coccolithophyceae indet.,reworked taxa,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ...",Coccolithophyceae indet.
2265,Nannofossil fragments,Nannofossil fragments,Coccolithophyceae indet.,Nannofossil fragments,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Coccolithophyceae indet.
2266,Reworked species,Reworked species,Coccolithophyceae indet.,Reworked species,nannofossils,not a taxa,deal with later; should go in as a taphonomony...,Coccolithophyceae indet.
2267,Calcisphere,Calcisphere,Dinoflagellata indet.,Calcisphere,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Dinoflagellata indet.
2268,Calcispheres,Calcispheres,Dinoflagellata indet.,Calcispheres,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Dinoflagellata indet.
2269,Cyclicargolithus cf. floridanus,Cyclicargolithus cf. floridanus,,,nannofossils,,"in PBDB, so it will be classified correctly",Cyclicargolithus cf. floridanus
2270,Ahmuellerella octoradiata,Ahmuellerella octoradiata,,,nannofossils,,"in PBDB, so it will be classified correctly",Ahmuellerella octoradiata


In [7]:
# Row count after adding normalized_name, shown for comparison with the count
# displayed before the column was added.
filtered_taxa.shape[0]

817

## Create crosswalk csv

In [8]:
# Write every filtered row, including the new normalized_name column, to the
# crosswalk CSV; the DataFrame index is not written.
filtered_taxa.to_csv(
    crosswalk_file,
    index=False,
)

## Create taxa list csv

In [9]:
# Build the taxa list: one row per distinct normalized name, in order of first
# appearance, each tagged with the taxon group being processed.
df = (
    filtered_taxa[['normalized_name']]
    .drop_duplicates()
    .rename(columns={'normalized_name': 'name'})
    .reset_index(drop=True)  # fresh 0..n-1 index instead of the source row labels
    .assign(taxon_group=taxon_group)
)
df.head()

Unnamed: 0,name,taxon_group
0,Dinoflagellata indet.,nannofossils
1,Coccolithophyceae indet.,nannofossils
2,Cyclicargolithus cf. floridanus,nannofossils
3,Ahmuellerella octoradiata,nannofossils
4,Algirosphaera robusta,nannofossils


In [10]:
# Write the taxa list to its CSV file; the DataFrame index is not written.
df.to_csv(
    taxa_list_file,
    index=False,
)