# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers for a given taxon group, then create CSVs for importing the taxa data into the database.

In [1]:
import sys

import pandas as pd
import numpy as np

# Extend the import path with ../scripts/ before importing the project helper
# module (presumably normalize_taxa lives in that directory — confirm).
sys.path.append('../scripts/')
import normalize_taxa as nt

In [2]:
# Raw taxa sheet to read; the file name embeds nt.date.
input_file = f'raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{nt.date}.csv'

# Taxon group processed by this run: the first entry of nt.taxon_groups.
taxon_group = nt.taxon_groups[0]
# Output CSV paths, both named after the taxon group and nt.date.
crosswalk_file = f"cleaned_data/taxa/taxa_crosswalk_{taxon_group}_{nt.date}.csv"
taxa_list_file = f"cleaned_data/taxa/taxa_list_{taxon_group}_{nt.date}.csv"

## Import normalized taxa list

In [3]:
# Read the raw sheet, skipping the first 9 lines of the file so that the next
# line is used as the column header row.
df = pd.read_csv(input_file, skiprows = 9)
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [4]:
# (rows, columns) of the frame as imported, before any rows are removed.
df.shape

(4754, 21)

Drop the rows with problematic taxa.

In [5]:
# The first rows of the sheet (index labels 0-27) hold the problematic taxa and
# are removed before any further processing.
PROBLEM_ROW_COUNT = 28
# errors='ignore' keeps this cell safe to re-run: on a second run the labels are
# already gone, and a plain drop would raise KeyError.
df = df.drop(index=range(PROBLEM_ROW_COUNT), errors='ignore')
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
28,planktic_forams,Globigerina bulloides,Globigerina bulloides,,,,"in PBDB, so it will be classified correctly; t...",,,Globigerina,...,,,bulloides,,,,,,,
29,planktic_forams,Globigerina falconensis,Globigerina falconensis,,,,"in PBDB, so it will be classified correctly; t...",,,Globigerina,...,,,falconensis,,,,,,,


Set up the columns.

In [6]:
# Map the sheet's column headers to the names used downstream. The mapping gets
# its own name so the built-in `dict` is not shadowed for the rest of the session.
column_renames = {'Comment': 'initial_comments', 'notes': 'processing_notes'}
df = df.rename(columns=column_renames)

# Placeholder column; it is populated later from the taxa fields.
df['normalized_name'] = np.nan

## Filter taxon group

Select the taxa for one taxon group.

In [7]:
# Keep only the rows whose taxon_group equals the group chosen for this run.
in_selected_group = df['taxon_group'] == taxon_group
filtered_taxa = df[in_selected_group]
filtered_taxa.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,initial_comments,processing_notes,Any taxon above genus,genus modifier,genus name,...,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
1509,nannofossils,Reticulofenestra circus,Reticulofenestra circus,,,,"in PBDB, so it will be classified correctly; t...",,,Reticulofenestra,...,,circus,,,,,,,,
1510,nannofossils,Amaurolithus delicatus,Amaurolithus delicatus,,,,"in PBDB, so it will be classified correctly; t...",,,Amaurolithus,...,,delicatus,,,,,,,,


In [8]:
# (rows, columns) after restricting to the selected taxon group.
filtered_taxa.shape

(903, 22)

## Create crosswalk csv

In [9]:
# Crosswalk column order: taxa rank fields, then taxa fields, then metadata fields.
fields = nt.taxa_rank_fields + nt.taxa_fields + nt.metadata_fields
# Rebuild the frame with only the columns named in `fields`, in that order.
filtered_taxa = pd.DataFrame(filtered_taxa, columns=fields)
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
1509,,,Reticulofenestra,,,,circus,,,,,nannofossils,Reticulofenestra circus,,"in PBDB, so it will be classified correctly; t...",
1510,,,Amaurolithus,,,,delicatus,,,,,nannofossils,Amaurolithus delicatus,,"in PBDB, so it will be classified correctly; t...",


Set `normalized_name` using the taxa fields.

In [10]:
# Fill normalized_name via the project helper. The return value is not used and
# the displayed frame shows the column populated, so the helper appears to
# modify filtered_taxa in place — TODO confirm against normalize_taxa.
nt.add_normalized_name_column(filtered_taxa)
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
1509,,,Reticulofenestra,,,,circus,,,,Reticulofenestra circus,nannofossils,Reticulofenestra circus,,"in PBDB, so it will be classified correctly; t...",
1510,,,Amaurolithus,,,,delicatus,,,,Amaurolithus delicatus,nannofossils,Amaurolithus delicatus,,"in PBDB, so it will be classified correctly; t...",


In [11]:
# (rows, columns) after selecting the crosswalk columns and setting normalized_name.
filtered_taxa.shape

(903, 16)

In [12]:
# Find the index labels of rows whose normalized_name is the empty string,
# then remove those rows by label.
empty_name_labels = filtered_taxa[filtered_taxa['normalized_name'] == ''].index
filtered_taxa = filtered_taxa.drop(empty_name_labels)
filtered_taxa.shape

(903, 16)

In [13]:
# Collapse rows that repeat the same (verbatim_name, normalized_name) pair,
# keeping the first occurrence of each.
filtered_taxa = filtered_taxa.drop_duplicates(
    subset=['verbatim_name', 'normalized_name'],
    keep='first',
)
filtered_taxa.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
1509,,,Reticulofenestra,,,,circus,,,,Reticulofenestra circus,nannofossils,Reticulofenestra circus,,"in PBDB, so it will be classified correctly; t...",
1510,,,Amaurolithus,,,,delicatus,,,,Amaurolithus delicatus,nannofossils,Amaurolithus delicatus,,"in PBDB, so it will be classified correctly; t...",
1511,,,Amaurolithus,,,,primus,,,,Amaurolithus primus,nannofossils,Amaurolithus primus,,"in PBDB, so it will be classified correctly; t...",
1512,,,Amaurolithus,,,,tricorniculatus,,,,Amaurolithus tricorniculatus,nannofossils,Amaurolithus tricorniculatus,,"in PBDB, so it will be classified correctly; t...",
1513,,,Braarudosphaera,,,,bigelowii,,,,Braarudosphaera bigelowii,nannofossils,Braarudosphaera bigelowii,,"in PBDB, so it will be classified correctly; t...",


In [14]:
# (rows, columns) after removing duplicate verbatim/normalized name pairs.
filtered_taxa.shape

(822, 16)

create crosswalk csv

In [15]:
# Write the crosswalk to crosswalk_file; index=False leaves the row index out of the CSV.
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

create taxa list with unique taxa fields

In [16]:
# Restrict the crosswalk to the rank and taxa columns, then keep a single row
# for each distinct combination of those values.
fields = nt.taxa_rank_fields + nt.taxa_fields
taxa_df = pd.DataFrame(filtered_taxa, columns=fields).drop_duplicates()

taxa_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
1509,,,Reticulofenestra,,,,circus,,,,Reticulofenestra circus,nannofossils
1510,,,Amaurolithus,,,,delicatus,,,,Amaurolithus delicatus,nannofossils
1511,,,Amaurolithus,,,,primus,,,,Amaurolithus primus,nannofossils
1512,,,Amaurolithus,,,,tricorniculatus,,,,Amaurolithus tricorniculatus,nannofossils
1513,,,Braarudosphaera,,,,bigelowii,,,,Braarudosphaera bigelowii,nannofossils


In [17]:
# (rows, columns) of the de-duplicated taxa list.
taxa_df.shape

(746, 12)

### create taxa csv

In [18]:
# Write the taxa list to taxa_list_file; index=False leaves the row index out of the CSV.
taxa_df.to_csv(taxa_list_file, index=False)