# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers for a given taxon group. Create CSVs to import the taxa data into the database.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Taxon groups this notebook can be run for; one is selected per run below.
taxon_groups = [
    'nannofossils',
    'silicoflagellates',
    'ostracods',
    'ebridians',
    'chrysophyte_cysts',
    'bolboformids',
    'diatoms',
]
# Change this index to process a different entry of `taxon_groups`.
selected_group_index = 0
taxon_group = taxon_groups[selected_group_index]

# Date stamp embedded in the input file name and in both output file names.
date = '2021-05-03'

# Input csv, plus the two csvs this notebook writes for the selected group.
input_file = f'raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'
crosswalk_file = f"cleaned_data/taxa/taxa_crosswalk_{taxon_group}_{date}.csv"
taxa_list_file = f"cleaned_data/taxa/taxa_list_{taxon_group}_{date}.csv"

taxon_group

'nannofossils'

# 1. QA Micropal_headers_PBDB_Taxonomy_notes_taxa_list

Check whether the normalized taxa from the Google Sheet match the taxa in the LIMS taxa_list.csv.

In [3]:
# Path of the taxa list csv that the normalized sheet is checked against.
all_taxa_file = 'cleaned_data/taxa/taxa_list.csv'

In [4]:
# The first 9 lines of the file are skipped; the line after them supplies the column headers.
# NOTE(review): presumably those lines are a preamble above the header row - confirm against the sheet.
leading_lines_to_skip = 9
normalized_df = pd.read_csv(input_file, skiprows=leading_lines_to_skip)
normalized_df.shape

(4749, 21)

In [5]:
# Preview the first two rows of the normalized sheet.
normalized_df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,PROBLEMATIC NAMES,,,,,,,,,...,,,,,,,,,,
1,dinoflagellates,Amorphous organic matter,Amorphous organic matter,,,not a taxa name,Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [6]:
# Load the existing taxa list csv and show its (rows, columns).
taxa_df = pd.read_csv(all_taxa_file)
taxa_df.shape

(4738, 4)

In [7]:
# Preview the first two rows of the taxa list.
taxa_df.head(2)

Unnamed: 0,verbatim_name,name,taxon_group,genera
0,"""Globigerina"" angulisuturalis _T","""Globigerina"" angulisuturalis _T",planktic_forams,
1,"""Globigerina"" angulisuturalis _T_","""Globigerina"" angulisuturalis _T_",planktic_forams,


In [8]:
# Collect the verbatim names of each source into a set so the two can be differenced.
normalized_names, taxa_names = (
    set(frame['verbatim_name']) for frame in (normalized_df, taxa_df)
)

Get the taxa in the normalized taxa file that aren't in the LIMS taxa file.

In [9]:
# Count of verbatim names present in the normalized sheet but absent from the taxa list.
len(normalized_names - taxa_names)

3

In [10]:
# The names themselves (set difference: normalized sheet minus taxa list).
normalized_names - taxa_names

{'PROBLEMATIC NAMES', 'RESOLVED NAMES BY TAXONOMIC GROUP', nan}

Get the taxa in the LIMS taxa file that aren't in the normalized taxa file.

In [11]:
# Count of verbatim names present in the taxa list but absent from the normalized sheet.
len(taxa_names - normalized_names)

1

In [12]:
# The names themselves (set difference: taxa list minus normalized sheet).
taxa_names - normalized_names

{'Sponge spicules'}

# 2. Process normalized taxa list

Create csvs to import the taxa data into the database.   

## Import normalized taxa list

In [13]:
# Read the sheet into a fresh frame for processing; the first 9 lines of the file are skipped.
# NOTE(review): presumably those lines are a preamble above the header row - confirm against the sheet.
lines_before_header = 9
df = pd.read_csv(input_file, skiprows=lines_before_header)
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,PROBLEMATIC NAMES,,,,,,,,,...,,,,,,,,,,
1,dinoflagellates,Amorphous organic matter,Amorphous organic matter,,,not a taxa name,Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [14]:
# (rows, columns) before any rows are dropped.
df.shape

(4749, 21)

drop rows with problematic taxa

In [15]:
# Remove the rows labelled 0-27. The preview of this frame shows row 0 is the
# "PROBLEMATIC NAMES" heading.
# NOTE(review): the count is tied to the layout of this particular sheet export -
# re-check it whenever `input_file` changes.
problematic_row_count = 28

# errors='ignore' lets the cell be re-run without raising KeyError once the
# labels have already been removed.
df = df.drop(index=range(problematic_row_count), errors='ignore')
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
28,benthic_forams,Pyrite,Pyrite,,,not a taxa,Summer 2020: deal with later; should go in as ...,"Going into Macrostrat, not PBDB, Shanan will t...",,,...,,,,,,,,,,
29,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,,,,


rename columns

In [16]:
# Map the sheet's column headers to the column names used in the output csvs.
# The mapping is deliberately not called `dict`, so the builtin `dict` stays
# usable for the rest of the notebook.
column_renames = {'Comment': 'initial_comments', 'notes': 'processing_notes'}

# rename() returns a new frame; headers missing from the frame are ignored, so
# re-running this cell is harmless.
df = df.rename(columns=column_renames)

## Filter taxon group

select taxa for a given taxon group  

In [17]:
# Boolean mask of the rows belonging to the selected group; .copy() detaches the
# subset from `df` so later edits do not touch the full frame.
in_selected_group = df['taxon_group'].eq(taxon_group)
filtered_taxa = df.loc[in_selected_group].copy()
filtered_taxa.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,initial_comments,processing_notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
2252,nannofossils,calcispheres,calcispheres,Dinoflagellata indet.,calcispheres,"group, not a taxa","in PBDB, so it will be classified correctly",Dinoflagellata indet.,,,...,,,,,,,calcispheres,,,
2253,nannofossils,reworked taxa,reworked taxa,Coccolithophyceae indet.,reworked taxa,"group, not a taxa","in PBDB, so it will be classified correctly",Coccolithophyceae indet.,,,...,,,,,,,reworked taxa,,,


In [18]:
# (rows, columns) of the subset for the selected taxon group.
filtered_taxa.shape

(817, 21)

select columns for output csvs

In [19]:
# Rank columns: the above-genus column first, then a modifier/name pair per rank.
# The first entry is used as the starting column when the name is concatenated.
taxa_rank_fields = [
    'Any taxon above genus',
    'genus modifier', 'genus name',
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name',
    'subspecies modifier', 'subspecies name',
]

# Remaining columns that, together with the rank columns, make up the taxa list.
taxa_fields = ['non-taxa descriptor', 'normalized_name', 'taxon_group']

# Extra per-row columns kept in the crosswalk.
metadata_fields = ['verbatim_name', 'initial_comments', 'processing_notes', 'comments']

# Column order of the output frame: rank columns, taxa columns, then metadata.
fields = [*taxa_rank_fields, *taxa_fields, *metadata_fields]

In [20]:
# Add an empty `normalized_name` placeholder so every name in `fields` exists,
# then keep only the output columns in that order.
# The explicit .copy() makes the result an independent frame, so the column
# assignments and in-place de-duplication in later cells do not operate on a
# slice of the wider frame (which can raise SettingWithCopyWarning).
filtered_taxa = filtered_taxa.assign(normalized_name=np.nan)[fields].copy()
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
2252,Dinoflagellata indet.,,,,,,,,,,,nannofossils,calcispheres,"group, not a taxa","in PBDB, so it will be classified correctly",calcispheres
2253,Coccolithophyceae indet.,,,,,,,,,,,nannofossils,reworked taxa,"group, not a taxa","in PBDB, so it will be classified correctly",reworked taxa


## Add normalized_name

set normalized_name using the taxa fields

In [21]:
# Join the rank columns into one space-separated name. Missing values become ''
# so they contribute only the separator; the surplus spaces are stripped and
# collapsed by the whitespace clean-up cell that follows.
filtered_taxa['normalized_name'] = filtered_taxa['Any taxon above genus'].str.cat(
    filtered_taxa[taxa_rank_fields[1:]], sep=' ', na_rep='')

# Append " (descriptor)" where a non-taxa descriptor exists. The leading space
# keeps the parenthesis from being glued onto the last name part when the final
# rank column ('subspecies name') is populated.
descriptor = np.where(
    filtered_taxa['non-taxa descriptor'].notnull(),
    ' (' + filtered_taxa['non-taxa descriptor'] + ')',
    ''
)
filtered_taxa['normalized_name'] = filtered_taxa['normalized_name'] + descriptor

In [22]:
# Tidy whitespace: trim both ends, then squeeze any run of two or more spaces
# down to a single space.
filtered_taxa['normalized_name'] = (
    filtered_taxa['normalized_name']
    .str.strip()
    .replace(to_replace='  +', value=' ', regex=True)
)

## Create crosswalk csv

In [23]:
# Keep only the first row for each (verbatim_name, normalized_name) pair.
crosswalk_key = ['verbatim_name', 'normalized_name']
filtered_taxa = filtered_taxa.drop_duplicates(subset=crosswalk_key, keep='first')
filtered_taxa.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
2252,Dinoflagellata indet.,,,,,,,,,,Dinoflagellata indet.,nannofossils,calcispheres,"group, not a taxa","in PBDB, so it will be classified correctly",calcispheres
2253,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils,reworked taxa,"group, not a taxa","in PBDB, so it will be classified correctly",reworked taxa
2254,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils,Nannofossil fragments,not a taxa,"in PBDB, so it will be classified correctly",Nannofossil fragments
2255,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils,Reworked species,not a taxa,"in PBDB, so it will be classified correctly",Reworked species
2256,Dinoflagellata indet.,,,,,,,,,,Dinoflagellata indet.,nannofossils,Calcisphere,not a taxa,"in PBDB, so it will be classified correctly",Calcisphere


In [24]:
# (rows, columns) of the crosswalk after de-duplication.
filtered_taxa.shape

(817, 16)

create crosswalk csv

In [25]:
# Write the crosswalk for the selected taxon group; index=False leaves the row labels out of the csv.
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

create taxa list with unique taxa fields

In [26]:
# Restrict the crosswalk to the rank and taxa columns (leaving out the metadata
# columns), then keep one row per distinct combination of those values.
taxa_columns = taxa_rank_fields + taxa_fields
taxa_df = filtered_taxa.reindex(columns=taxa_columns).drop_duplicates()

taxa_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
2252,Dinoflagellata indet.,,,,,,,,,,Dinoflagellata indet.,nannofossils
2253,Coccolithophyceae indet.,,,,,,,,,,Coccolithophyceae indet.,nannofossils
2258,,,Cyclicargolithus,,,cf.,floridanus,,,,Cyclicargolithus cf. floridanus,nannofossils
2259,,,Ahmuellerella,,,,octoradiata,,,,Ahmuellerella octoradiata,nannofossils
2260,,,Algirosphaera,,,,robusta,,,,Algirosphaera robusta,nannofossils


In [27]:
# (rows, columns) of the de-duplicated taxa list.
taxa_df.shape

(741, 12)

### create taxa csv

In [28]:
# Write the unique taxa list for the selected taxon group; index=False leaves the row labels out of the csv.
taxa_df.to_csv(taxa_list_file, index=False)