# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers for a given taxon group.

In [1]:
import pandas as pd
import numpy as np

## Import normalized taxa list

In [2]:
# Taxon group to select from the spreadsheet, and the date stamp that is
# embedded in the input and output file names.
taxon_group = "nannofossils"
date = "2021-03-01"

# Output files written by this notebook.
taxa_list_file = f"cleaned_data/taxa/taxa_list_{taxon_group}_{date}.csv"
crosswalk_file = f"cleaned_data/taxa/taxa_crosswalk_{taxon_group}_{date}.csv"

# Source workbook. Note that its name uses the short form "nannos" rather
# than the value of taxon_group.
input_file = f"raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes_nannos_{date}.xlsx"

In [3]:
# Read the "taxa list" sheet of the workbook. The first 9 rows are skipped,
# so the column names are taken from the row that follows them.
df = pd.read_excel(io=input_file, sheet_name="taxa list", skiprows=9)
df.head(5)

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,PROBLEMATIC NAMES,,,,,,,,,,...,,,,,,,,,,
1,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,these need both checking & can't be entered cu...,,benthic_forams,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,,...,,,,,,,,,,
2,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,,,benthic_forams,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,,...,,,,,,,,,,
3,Pyrite,Pyrite,,,benthic_forams,not a taxa,Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
4,Skeletonemopsis and Skeletonema grp,Skeletonemopsis and Skeletonema grp,Skelentonematacea indet.,,diatoms,group,Summer 2020: deal with later? LL: both are in ...,,,,...,,,,,,,,,,


Select the taxa for the given taxon group.

In [4]:
# Keep only the rows for the configured taxon group, then drop every column
# that is entirely empty for that group.
# .copy() makes filtered_taxa an independent frame, so the column assignments
# and in-place operations in later cells act on it directly and do not raise
# SettingWithCopyWarning.
filtered_taxa = (
    df.loc[df['taxon_group'] == taxon_group]
    .dropna(axis=1, how='all')
    .copy()
)
filtered_taxa.head()

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,notes,Any taxon above genus,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
2263,calcispheres,calcispheres,Dinoflagellata indet.,calcispheres,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ...",Dinoflagellata indet.,,,,,,,,calcispheres,,,
2264,reworked taxa,reworked taxa,Coccolithophyceae indet.,reworked taxa,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ...",Coccolithophyceae indet.,,,,,,,,reworked taxa,,,
2265,Nannofossil fragments,Nannofossil fragments,Coccolithophyceae indet.,Nannofossil fragments,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Coccolithophyceae indet.,,,,,,,,Nannofossil fragments,,,
2266,Reworked species,Reworked species,Coccolithophyceae indet.,Reworked species,nannofossils,not a taxa,deal with later; should go in as a taphonomony...,Coccolithophyceae indet.,,,,,,,,Reworked species,,,
2267,Calcisphere,Calcisphere,Dinoflagellata indet.,Calcisphere,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Dinoflagellata indet.,,,,,,,,Calcisphere,,,


In [5]:
# Number of rows selected for this taxon group.
filtered_taxa.shape[0]

817

## Add normalized_name

In [None]:
# Set normalized_name to the "name to use" value when one is given, otherwise to "name".

In [6]:
# Conditional column via np.where: https://stackoverflow.com/a/10726275

name_to_use = filtered_taxa['name to use (if different from "name")']
original_name = filtered_taxa['name']

# Use the replacement name wherever one is present, and the original name
# everywhere else; both candidates are whitespace-stripped.
filtered_taxa['normalized_name'] = np.where(
    name_to_use.notnull(),
    name_to_use.str.strip(),
    original_name.str.strip(),
)
filtered_taxa.head(8)

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,notes,Any taxon above genus,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
2263,calcispheres,calcispheres,Dinoflagellata indet.,calcispheres,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ...",Dinoflagellata indet.,,,,,,,,calcispheres,,,,Dinoflagellata indet.
2264,reworked taxa,reworked taxa,Coccolithophyceae indet.,reworked taxa,nannofossils,"group, not a taxa","enter ""Haptophyta"" and reproduce the entry as ...",Coccolithophyceae indet.,,,,,,,,reworked taxa,,,,Coccolithophyceae indet.
2265,Nannofossil fragments,Nannofossil fragments,Coccolithophyceae indet.,Nannofossil fragments,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Coccolithophyceae indet.,,,,,,,,Nannofossil fragments,,,,Coccolithophyceae indet.
2266,Reworked species,Reworked species,Coccolithophyceae indet.,Reworked species,nannofossils,not a taxa,deal with later; should go in as a taphonomony...,Coccolithophyceae indet.,,,,,,,,Reworked species,,,,Coccolithophyceae indet.
2267,Calcisphere,Calcisphere,Dinoflagellata indet.,Calcisphere,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Dinoflagellata indet.,,,,,,,,Calcisphere,,,,Dinoflagellata indet.
2268,Calcispheres,Calcispheres,Dinoflagellata indet.,Calcispheres,nannofossils,not a taxa,"enter ""Coccolithophyceae"" and reproduce the en...",Dinoflagellata indet.,,,,,,,,Calcispheres,,,,Dinoflagellata indet.
2269,Cyclicargolithus cf. floridanus,Cyclicargolithus cf. floridanus,,,nannofossils,,"in PBDB, so it will be classified correctly",,,Cyclicargolithus,cf.,floridanus,,,,,,,,Cyclicargolithus cf. floridanus
2270,Ahmuellerella octoradiata,Ahmuellerella octoradiata,,,nannofossils,,"in PBDB, so it will be classified correctly",,,Ahmuellerella,,octoradiata,,,,,88737.0,Ahmuellerella octoradiata,species,Ahmuellerella octoradiata


In [7]:
# Keep the first row for each (verbatim_name, name) pair.
# The result is reassigned instead of using inplace=True, so the cell has no
# hidden in-place mutation; .copy() gives an independent frame so that later
# column assignments do not trigger SettingWithCopyWarning.
filtered_taxa = filtered_taxa.drop_duplicates(
    subset=['verbatim_name', 'name'],
    keep='first',
).copy()

In [8]:
# Number of rows remaining in filtered_taxa.
filtered_taxa.shape[0]

816

In [10]:
# Number of distinct values in "name"; dropna=False counts a missing value
# (if any) as one distinct value, as len(Series.unique()) would.
filtered_taxa['name'].nunique(dropna=False)

814

Create the crosswalk CSV.

In [12]:
# Write filtered_taxa to the crosswalk CSV, without the index column.
filtered_taxa.to_csv(path_or_buf=crosswalk_file, index=False)

## Create taxa list csv

In [13]:
# Columns of the taxa list CSV, in output order.
fields = [
    'normalized_name',
    'Any taxon above genus',
    # modifier / name column pairs
    'genus modifier', 'genus name',
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name',
    'subspecies modifier', 'subspecies name',
    'non-taxa descriptor',
    'comments',
]

Create blank columns for the fields that are missing from the filtered taxa.

In [14]:
# Fields required for the taxa list that filtered_taxa does not have.
# A list comprehension (rather than a set difference) keeps the result in
# `fields` order, so it is identical on every run instead of depending on
# set iteration order.
missing_fields = [field for field in fields if field not in filtered_taxa.columns]
missing_fields

['subgenera modifier', 'subgenera name']

In [15]:
# Add each missing field to filtered_taxa as an all-NaN column.
filtered_taxa = filtered_taxa.assign(
    **{column: np.nan for column in missing_fields}
)

Create a new dataframe containing only the taxa fields.

In [16]:
# Restrict filtered_taxa to the taxa-list fields, tag every row with the
# taxon group, and drop rows that are identical across all columns
# (keeping the first occurrence).
taxa_df = (
    filtered_taxa
    .reindex(columns=fields)
    .assign(taxon_group=taxon_group)
    .drop_duplicates(keep='first')
)

taxa_df.head()

Unnamed: 0,normalized_name,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,taxon_group
2263,Dinoflagellata indet.,Dinoflagellata indet.,,,,,,,,,,calcispheres,nannofossils
2264,Coccolithophyceae indet.,Coccolithophyceae indet.,,,,,,,,,,reworked taxa,nannofossils
2265,Coccolithophyceae indet.,Coccolithophyceae indet.,,,,,,,,,,Nannofossil fragments,nannofossils
2266,Coccolithophyceae indet.,Coccolithophyceae indet.,,,,,,,,,,Reworked species,nannofossils
2267,Dinoflagellata indet.,Dinoflagellata indet.,,,,,,,,,,Calcisphere,nannofossils


In [17]:
# Number of rows in the taxa list after removing duplicates.
taxa_df.shape[0]

766

In [18]:
# Write taxa_df to the taxa list CSV, without the index column.
taxa_df.to_csv(path_or_buf=taxa_list_file, index=False)