# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers for a given taxon group.

In [1]:
import pandas as pd
import numpy as np

## Import normalized taxa list

In [2]:
# Taxon group to extract, and the date stamp used in the file names below.
taxon_group = 'nannofossils'
date = '2021-03-01'

# Workbook to read the taxa list from.
input_file = f"raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes_nannos_{date}.xlsx"

# Both output CSVs share the same "<taxon group>_<date>" suffix.
output_suffix = f"{taxon_group}_{date}"
crosswalk_file = f"cleaned_data/taxa/taxa_crosswalk_{output_suffix}.csv"
taxa_list_file = f"cleaned_data/taxa/taxa_list_{output_suffix}.csv"

In [3]:
# Read the "taxa list" sheet, skipping the first 9 rows of the sheet
# (presumably notes above the header row -- confirm against the workbook).
df = pd.read_excel(
    io=input_file,
    sheet_name="taxa list",
    skiprows=9,
)
df.head(5)

Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",name comment field,taxon_group,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,PROBLEMATIC NAMES,,,,,,,,,,...,,,,,,,,,,
1,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,these need both checking & can't be entered cu...,,benthic_forams,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,,...,,,,,,,,,,
2,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,,,benthic_forams,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,,...,,,,,,,,,,
3,Pyrite,Pyrite,,,benthic_forams,not a taxa,Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
4,Skeletonemopsis and Skeletonema grp,Skeletonemopsis and Skeletonema grp,Skelentonematacea indet.,,diatoms,group,Summer 2020: deal with later? LL: both are in ...,,,,...,,,,,,,,,,


select taxa for a given taxon group  

In [4]:
# Keep only the rows of the selected taxon group; copy so that later column
# assignments do not touch the full sheet held in `df`.
in_taxon_group = df['taxon_group'].eq(taxon_group)
filtered_taxa = df.loc[in_taxon_group].copy()

select columns for output csvs

In [5]:
# Columns holding the parsed pieces of a taxon name. The order matters:
# normalized_name is built by joining these columns left to right.
taxa_fields = [
    'Any taxon above genus',
    'genus modifier', 'genus name',
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name',
    'subspecies modifier', 'subspecies name',
    'non-taxa descriptor',
]

# Columns carried along next to the name pieces in the output CSVs.
metadata_fields = ['normalized_name', 'taxon_group', 'verbatim_name', 'comments']

# Full column list (and column order) of the output CSVs.
fields = [*taxa_fields, *metadata_fields]

In [6]:
# Add an empty normalized_name placeholder, then keep only the output columns
# in the order given by `fields`.
filtered_taxa = filtered_taxa.assign(normalized_name=np.nan)[fields]
filtered_taxa.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2263,Dinoflagellata indet.,,,,,,,,,,,nannofossils,calcispheres,calcispheres
2264,Coccolithophyceae indet.,,,,,,,,,,,nannofossils,reworked taxa,reworked taxa
2265,Coccolithophyceae indet.,,,,,,,,,,,nannofossils,Nannofossil fragments,Nannofossil fragments
2266,Coccolithophyceae indet.,,,,,,,,,,,nannofossils,Reworked species,Reworked species
2267,Dinoflagellata indet.,,,,,,,,,,,nannofossils,Calcisphere,Calcisphere


In [7]:
# Number of rows selected for this taxon group.
filtered_taxa.shape[0]

817

## Add normalized_name

set normalized_name using the taxa fields

In [8]:
# The name-part columns: everything in taxa_fields between the first entry
# ('Any taxon above genus') and the last one ('non-taxa descriptor').
taxa_fields[1:-1]

['genus modifier',
 'genus name',
 'subgenera modifier',
 'subgenera name',
 'species modifier',
 'species name',
 'subspecies modifier',
 'subspecies name']

In [9]:
# Join the higher-taxon column with the genus-through-subspecies columns,
# separated by single spaces. Missing values become empty strings, which
# leaves runs of spaces that a later cell strips and collapses.
name_part_fields = taxa_fields[1:9]
joined_name = filtered_taxa['Any taxon above genus'].str.cat(
    filtered_taxa[name_part_fields], sep=' ', na_rep='')

# Append " (descriptor)" where a non-taxa descriptor exists.
# The leading space matters: when 'subspecies name' (the last joined column)
# is filled, the joined name does not end in a separator, so without it the
# result would read "...subspecies(descriptor)".
# astype(str) keeps the expression valid even if the column holds no strings
# at all; the text it produces for missing values is discarded by np.where.
descriptor_col = filtered_taxa['non-taxa descriptor']
descriptor = np.where(
    descriptor_col.notnull(),
    ' (' + descriptor_col.astype(str) + ')',
    ''
)
filtered_taxa['normalized_name'] = joined_name + descriptor

In [10]:
# Tidy the whitespace left behind by empty name parts: trim both ends, then
# squeeze every run of two or more spaces down to one.
filtered_taxa['normalized_name'] = (
    filtered_taxa['normalized_name']
    .str.strip()
    .replace(to_replace='  +', value=' ', regex=True)
)

## Create crosswalk csv

In [11]:
# A crosswalk row is identified by its (verbatim_name, normalized_name) pair;
# keep the first occurrence of each pair.
crosswalk_key = ['verbatim_name', 'normalized_name']
filtered_taxa = filtered_taxa.drop_duplicates(subset=crosswalk_key, keep='first')
filtered_taxa.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2263,Dinoflagellata indet.,,,,,,,,,,Dinoflagellata indet.,nannofossils,calcispheres,calcispheres
2264,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils,reworked taxa,reworked taxa
2265,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils,Nannofossil fragments,Nannofossil fragments
2266,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils,Reworked species,Reworked species
2267,Dinoflagellata indet.,,,,,,,,,,Dinoflagellata indet.,nannofossils,Calcisphere,Calcisphere


In [12]:
# Number of crosswalk rows left after dropping duplicates.
filtered_taxa.shape[0]

816

create crosswalk csv

In [13]:
# Write the crosswalk rows to crosswalk_file, without the DataFrame index.
filtered_taxa.to_csv(path_or_buf=crosswalk_file, index=False)

## Create taxa list csv

create taxa list with unique taxa fields

In [14]:
# Build the taxa list: take the output columns, drop the two columns that are
# specific to a verbatim entry, and keep one row per remaining combination.
taxa_df = (
    filtered_taxa
    .reindex(columns=fields)
    .drop(columns=['verbatim_name', 'comments'])
    .drop_duplicates()
)

taxa_df.shape[0]

742

create taxa csv

In [15]:
# Write the de-duplicated taxa list to taxa_list_file, without the DataFrame index.
taxa_df.to_csv(path_or_buf=taxa_list_file, index=False)