# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers for a given taxon group. Create CSVs to import the taxa data into the database.

In [1]:
import sys
import os

import pandas as pd
import numpy as np

sys.path.append('../scripts/')
import normalize_taxa as nt

In [2]:
# Raw spreadsheet export; the file name carries the date stamp from nt.date.
input_file = os.path.join(
    'raw_data', 'taxa',
    f'Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{nt.date}.csv',
)

# Taxon group processed by this notebook (entry 1 of nt.taxon_groups).
taxon_group = nt.taxon_groups[1]

# Both output files live in the same directory and share the
# <taxon_group>_<date> suffix.
output_dir = os.path.join('cleaned_data', 'taxa', 'LIMS')
crosswalk_file = os.path.join(output_dir, f'taxa_crosswalk_{taxon_group}_{nt.date}.csv')
taxa_list_file = os.path.join(output_dir, f'taxa_list_{taxon_group}_{nt.date}.csv')

In [3]:
# Show which taxon group this run will process.
taxon_group

'bolboformids'

## Import normalized taxa list

In [4]:
# Number of lines at the top of the export that are skipped before the
# column-header row is read.
leading_rows_to_skip = 9
df = pd.read_csv(input_file, skiprows=leading_rows_to_skip)
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [5]:
# (rows, columns) of the full taxa list, covering all taxon groups.
df.shape

(4754, 21)

Set up the columns: rename `Comment` to `initial_comments` and add an empty `normalized_name` column.

In [6]:
# Rename the spreadsheet's 'Comment' column so it is distinguishable from the
# separate 'comments' column. The mapping gets a descriptive name rather than
# `dict`, which would shadow the Python builtin for the rest of the kernel
# session and break any later `dict(...)` call.
column_renames = {'Comment': 'initial_comments'}
df = df.rename(columns=column_renames)

# Placeholder so the 'normalized_name' column exists when the taxa fields are
# selected further down; it starts out as all-NaN.
df['normalized_name'] = np.nan

## Filter taxon group

Select the taxa that belong to one taxon group.

In [7]:
# Keep only the rows whose taxon_group matches the group selected above.
in_selected_group = df['taxon_group'].eq(taxon_group)
filtered_taxa = df[in_selected_group]
filtered_taxa.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,initial_comments,notes,Any taxon above genus,genus modifier,genus name,...,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
1495,bolboformids,Bolboforma metzmacheri ornata,Bolboforma metzmacheri ornata,,,,in PBDB ANS entered. PBDB has nothing as of Ju...,,,Bolboforma,...,,metzmacheri,,ornata,,,,,,
1496,bolboformids,Bolboforma subfragoris s.l.,Bolboforma subfragoris s.l.,Bolboforma subfragoris sensu lato,,,"in PBDB, so it will be classified correctly",,,Bolboforma,...,s.l.,subfragoris,,,,,,,,


In [8]:
# (rows, columns) for the selected taxon group only.
filtered_taxa.shape

(2, 22)

## Create crosswalk csv

In [9]:
# Crosswalk columns, in output order: taxonomic rank parts, then the taxa
# fields, then the metadata fields.
fields = nt.taxa_rank_fields + nt.taxa_fields + nt.metadata_fields

# Select and order exactly those columns; any column not present in the
# frame is created and filled with NaN.
filtered_taxa = filtered_taxa.reindex(columns=fields)
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,comments
1495,,,Bolboforma,,,,metzmacheri,,ornata,,,bolboformids,Bolboforma metzmacheri ornata,,
1496,,,Bolboforma,,,s.l.,subfragoris,,,,,bolboformids,Bolboforma subfragoris s.l.,,


Set `normalized_name` using the taxa fields.

In [10]:
# add_normalized_name_column comes from ../scripts/normalize_taxa.py (not shown
# here). Its return value is discarded and the output below shows
# normalized_name populated afterwards, so it appears to modify filtered_taxa
# in place -- TODO confirm against the helper.
# NOTE(review): for verbatim 'Bolboforma subfragoris s.l.' the displayed result
# is 'Bolboforma s.l. subfragoris' (species modifier placed before the species
# name); confirm this ordering is intended for suffix-style modifiers.
nt.add_normalized_name_column(filtered_taxa)
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,comments
1495,,,Bolboforma,,,,metzmacheri,,ornata,,Bolboforma metzmacheri ornata,bolboformids,Bolboforma metzmacheri ornata,,
1496,,,Bolboforma,,,s.l.,subfragoris,,,,Bolboforma s.l. subfragoris,bolboformids,Bolboforma subfragoris s.l.,,


In [11]:
# Row count before rows with an empty normalized_name are dropped in the next cell.
filtered_taxa.shape

(2, 15)

In [12]:
# Rows whose normalized_name is the empty string carry no usable taxon name;
# look up their index labels, then drop them.
empty_name_index = filtered_taxa.index[filtered_taxa['normalized_name'] == '']
filtered_taxa = filtered_taxa.drop(index=empty_name_index)
filtered_taxa.shape

(2, 15)

In [13]:
# Each (verbatim_name, normalized_name) pair should appear once in the
# crosswalk; keep the first occurrence of every pair.
crosswalk_key = ['verbatim_name', 'normalized_name']
filtered_taxa = filtered_taxa.drop_duplicates(subset=crosswalk_key, keep='first')
filtered_taxa.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,comments
1495,,,Bolboforma,,,,metzmacheri,,ornata,,Bolboforma metzmacheri ornata,bolboformids,Bolboforma metzmacheri ornata,,
1496,,,Bolboforma,,,s.l.,subfragoris,,,,Bolboforma s.l. subfragoris,bolboformids,Bolboforma subfragoris s.l.,,


In [14]:
# Row count after removing duplicate (verbatim_name, normalized_name) pairs.
filtered_taxa.shape

(2, 15)

Write the crosswalk CSV.

In [15]:
# Write the crosswalk without the pandas index column. The output directory
# is created first so that a checkout lacking the cleaned_data tree does not
# fail with a missing-directory error.
os.makedirs(os.path.dirname(crosswalk_file), exist_ok=True)
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

Create the taxa list: keep only the taxa fields and drop duplicate rows so each taxon appears once.

In [16]:
# The taxa list keeps only the rank and taxa columns (no metadata fields), so
# rows that differed only in metadata collapse into a single row once exact
# duplicates are removed.
fields = nt.taxa_rank_fields + nt.taxa_fields
taxa_df = filtered_taxa.reindex(columns=fields).drop_duplicates()

taxa_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
1495,,,Bolboforma,,,,metzmacheri,,ornata,,Bolboforma metzmacheri ornata,bolboformids
1496,,,Bolboforma,,,s.l.,subfragoris,,,,Bolboforma s.l. subfragoris,bolboformids


In [17]:
# (rows, columns) of the de-duplicated taxa list.
taxa_df.shape

(2, 12)

### Create taxa list csv

In [18]:
# Write the taxa list without the pandas index column. The output directory
# is created first so that a checkout lacking the cleaned_data tree does not
# fail with a missing-directory error.
os.makedirs(os.path.dirname(taxa_list_file), exist_ok=True)
taxa_df.to_csv(taxa_list_file, index=False)