# Normalize taxa list

Clean up the normalized taxa list provided by the eODP researchers. The list covers all taxon groups, and this notebook creates the CSV files used to import the taxa data into the database.

In [1]:
import sys
import os

import pandas as pd
import numpy as np

sys.path.append('../scripts/')
import normalize_taxa as nt

In [2]:
# Date stamp shared by the raw export and both generated CSV files.
date = '2021-07-28'

# Raw export of the researchers' normalized taxa list.
input_file = os.path.join(
    'raw_data', 'taxa',
    f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv',
)

# Both output files are written to the same cleaned-data folder.
cleaned_taxa_dir = os.path.join('cleaned_data', 'taxa', 'LIMS')
crosswalk_file = os.path.join(cleaned_taxa_dir, f'taxa_crosswalk_all_taxa_{date}.csv')
taxa_list_file = os.path.join(cleaned_taxa_dir, f'taxa_list_all_taxa_{date}.csv')

## Import normalized taxa list

In [3]:
# Row 9 (0-based) of the export holds the column names; the rows above it are
# not read as data.
header_row = 9
df = pd.read_csv(input_file, header=header_row)
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [4]:
# (rows, columns) of the raw import, before any cleanup.
df.shape

(4754, 21)

In [5]:
# Discard rows in which every column is empty, then check the new size.
df = df.dropna(axis=0, how='all')
df.shape

(4744, 21)

In [6]:
# The first two rows of the import are not taxa records: row 0 is the
# "RESOLVED NAMES BY TAXONOMIC GROUP" banner and row 1 is the "Pyrite" entry
# flagged as "not a taxa". Drop them by index label rather than by position,
# and ignore labels that are already gone, so re-running this cell cannot
# remove two further (real) rows.
df = df.drop(index=[0, 1], errors='ignore')
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,>100 m group,,,
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,rodleyi,,,,>50 m group,,,


In [7]:
# Size after dropping the first two rows.
df.shape

(4742, 21)

In [30]:
# Keep only the rows that have a value in at least one taxa rank column.
rank_values = df.loc[:, list(nt.taxa_rank_fields)]
non_blank_df = df[rank_values.notna().any(axis=1)]
non_blank_df.shape

(4735, 21)

In [32]:
# Distinct taxon groups, in order of first appearance.
taxon_groups = non_blank_df['taxon_group'].unique()
list(taxon_groups)

['benthic_forams',
 'planktic_forams',
 'radiolarians',
 'bolboformids',
 'chrysophyte_cysts',
 'ebridians',
 'diatoms',
 'nannofossils',
 'dinoflagellates',
 'silicoflagellates',
 'ostracods',
 'palynology']

In [33]:
# The filtered rows were stored in non_blank_df, so df keeps its own size.
df.shape

(4742, 21)

## Create crosswalk csv

In [8]:
# Crosswalk column layout: rank columns, then taxa columns, then metadata.
fields = nt.taxa_rank_fields + nt.taxa_fields + nt.metadata_fields
# reindex keeps the listed columns in this order; a listed column that df does
# not have is added and filled with NaN.
crosswalk_df = df.reindex(columns=fields)
crosswalk_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,,benthic_forams,Ostracoda spp.,


In [9]:
# Size of the crosswalk frame right after selecting its columns.
crosswalk_df.shape

(4742, 14)

Set `normalized_name` by combining the taxa rank fields (for example, genus name and species name become "Euuvigerina miozea").

In [10]:
# Fill the normalized_name column. The return value is not used, so the helper
# presumably modifies crosswalk_df in place -- TODO confirm in normalize_taxa.
nt.add_normalized_name_column(crosswalk_df)
crosswalk_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


In [11]:
# Size check after filling in the normalized names.
crosswalk_df.shape

(4742, 14)

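Drop rows with no taxa info: first list the rows whose `normalized_name` is empty, then remove the rows in which every taxa rank field is blank.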

In [12]:
# Show the rows whose normalized_name is an empty string.
has_no_name = crosswalk_df['normalized_name'] == ''
crosswalk_df[has_no_name]

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
21,,,,,,,,,,,,benthic_forams,fossil,
22,,,,,,,,,,,,benthic_forams,fossil_group,
3068,,,,,,,,,,,,palynology,Preservation palynofacies,
3082,,,,,,,,,,,,palynology,Exotic,
3196,,,,,,,,,,,,planktic_forams,Organic matter,
3197,,,,,,,,,,,,planktic_forams,Terrestrial organic matter,
3213,,,,,,,,,,,,planktic_forams,Pyrite,


In [16]:
# Remove every row whose taxa rank columns are all empty.
crosswalk_df = crosswalk_df.dropna(subset=nt.taxa_rank_fields, how='all', axis=0)
crosswalk_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


In [17]:
# Size after removing the rows without any taxa rank value.
crosswalk_df.shape

(4735, 14)

In [19]:
# Keep the first row for each (verbatim_name, normalized_name) pair. The result
# is assigned back instead of using inplace=True, so the frame produced by the
# earlier cells is not mutated in place.
crosswalk_df = crosswalk_df.drop_duplicates(
    subset=['verbatim_name', 'normalized_name'], keep='first'
)
crosswalk_df.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


In [20]:
# Size after removing duplicate verbatim/normalized name pairs.
crosswalk_df.shape

(4594, 14)

Write the crosswalk to `crosswalk_file` as a CSV.

In [21]:
# index=False leaves the DataFrame's row labels out of the CSV.
crosswalk_df.to_csv(crosswalk_file, index=False)

## Create taxa list csv

Create the taxa list: keep only the taxa rank and taxa columns of the crosswalk, then drop duplicate rows so that each taxon appears once.

In [25]:
# The taxa list uses the rank and taxa columns only; the metadata columns
# (verbatim name, comments) stay in the crosswalk.
fields = nt.taxa_rank_fields + nt.taxa_fields
# Project onto those columns, then collapse rows that are now identical.
taxa_df = crosswalk_df.reindex(columns=fields).drop_duplicates()
taxa_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


In [26]:
# Number of unique taxa rows and the number of taxa columns.
taxa_df.shape

(4209, 12)

### Create taxa CSV

In [27]:
# index=False leaves the DataFrame's row labels out of the CSV.
taxa_df.to_csv(taxa_list_file, index=False)