# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers for a given taxon group. Create CSVs to import the taxa data into the database.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Taxon groups this notebook knows about; a later cell picks one of them by index.
taxon_groups = [
    "nannofossils",
    "silicoflagellates",
    "ostracods",
    "ebridians",
    "chrysophyte_cysts",
    "bolboformids",
    "diatoms",
    "planktic_forams",
    "radiolarians",
]

# Rank columns: the supra-generic column followed by a "<rank> modifier" /
# "<rank> name" pair for every rank, ordered from genus down to subspecies.
taxa_rank_fields = ["Any taxon above genus"] + [
    f"{rank} {part}"
    for rank in ("genus", "subgenera", "species", "subspecies")
    for part in ("modifier", "name")
]

# Non-rank columns that are written to both output files.
taxa_fields = ["non-taxa descriptor", "normalized_name", "taxon_group"]

# Columns that are only written to the crosswalk file.
metadata_fields = [
    "verbatim_name",
    "initial_comments",
    "processing_notes",
    "comments",
]

In [3]:
# Date stamp shared by the raw input file name and both output file names.
date = '2021-05-05'
input_file = f'raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'

# Taxon group processed by this run; change the index to process another group.
taxon_group = taxon_groups[0]
# Output paths: the crosswalk file and the taxa list file for the selected group.
crosswalk_file = f"cleaned_data/taxa/taxa_crosswalk_{taxon_group}_{date}.csv"
taxa_list_file = f"cleaned_data/taxa/taxa_list_{taxon_group}_{date}.csv"

In [4]:
def add_normalized_name_column(df):
    """Build the ``normalized_name`` column from the taxon rank columns.

    The rank columns (the supra-generic taxon, then the modifier/name pairs
    from genus down to subspecies) are joined with single spaces, skipping
    missing values. When "non-taxa descriptor" is present it is appended as
    " (descriptor)". Leading/trailing whitespace is removed and runs of
    spaces are collapsed to one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Any taxon above genus", the eight modifier/name columns
        listed in ``rank_columns`` below, and "non-taxa descriptor".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``normalized_name`` is added (or overwritten)
        in place. A row with no rank values and no descriptor gets "".
    """
    rank_columns = [
        "Any taxon above genus",
        "genus modifier",
        "genus name",
        "subgenera modifier",
        "subgenera name",
        "species modifier",
        "species name",
        "subspecies modifier",
        "subspecies name",
    ]

    # Cast to str before concatenating: a column whose values are all missing
    # is inferred as float by pandas, and the .str accessor rejects it.
    present = df[rank_columns].notna()
    parts = df[rank_columns].astype(str).where(present, "")

    # concatenate taxa fields into a string
    name = parts[rank_columns[0]].str.cat(parts[rank_columns[1:]], sep=" ")

    # add "(descriptor)" if it exists; the leading space keeps it separated
    # from the name even when the last rank column (subspecies name) is set
    descriptor = df["non-taxa descriptor"]
    suffix = np.where(descriptor.notna(), " (" + descriptor.astype(str) + ")", "")

    # get rid of extra spaces
    normalized = (name + suffix).str.replace(r" {2,}", " ", regex=True).str.strip()

    df["normalized_name"] = normalized
    return df

## Import normalized taxa list

In [5]:
# skiprows=9 discards the first nine lines of the file, so the column header is
# read from the tenth line (presumably notes precede the table -- confirm
# against the raw file).
df = pd.read_csv(input_file, skiprows = 9)
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,PROBLEMATIC NAMES,,,,,,,,,...,,,,,,,,,,
1,dinoflagellates,Amorphous organic matter,Amorphous organic matter,,,not a taxa name,Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [6]:
# Row and column counts of the raw import.
df.shape

(4756, 21)

Drop the rows with problematic taxa.

In [7]:
# Drop the rows labelled 0-27 (the block that starts with the
# "PROBLEMATIC NAMES" marker row; assumes that block is exactly 28 rows long --
# re-check when the input file changes).
# errors="ignore" lets this cell be re-run after the rows are already gone.
df = df.drop(index=list(range(28)), errors="ignore")
df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
28,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
29,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,,,,


Set up the columns.

In [8]:
# Rename the free-text columns to the names listed in metadata_fields.
# (The mapping is deliberately not called `dict`: that would shadow the builtin
# for every cell run afterwards in this kernel.)
column_renames = {'Comment': 'initial_comments', 'notes': 'processing_notes'}
df = df.rename(columns=column_renames)

# Placeholder column; the values are filled in by add_normalized_name_column().
df['normalized_name'] = np.nan

## Filter taxon group

Select the taxa for one taxon group.

In [9]:
# Keep only the rows whose taxon_group matches the group selected above.
filtered_taxa = df.loc[df["taxon_group"].eq(taxon_group)]
filtered_taxa.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,initial_comments,processing_notes,Any taxon above genus,genus modifier,genus name,...,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
1536,nannofossils,Reticulofenestra circus,Reticulofenestra circus,,,,"in PBDB, so it will be classified correctly; t...",,,Reticulofenestra,...,,circus,,,,,,,,
1537,nannofossils,Amaurolithus delicatus,Amaurolithus delicatus,,,,"in PBDB, so it will be classified correctly; t...",,,Amaurolithus,...,,delicatus,,,,,,,,


In [10]:
# Number of rows for the selected taxon group (all columns still present).
filtered_taxa.shape

(903, 22)

## Create crosswalk csv

In [11]:
# Crosswalk columns: rank fields, then taxa fields, then metadata fields.
fields = taxa_rank_fields + taxa_fields + metadata_fields
# Rebuilding the frame with `columns=` keeps only these columns, in this order.
filtered_taxa = pd.DataFrame(filtered_taxa, columns=fields)
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
1536,,,Reticulofenestra,,,,circus,,,,,nannofossils,Reticulofenestra circus,,"in PBDB, so it will be classified correctly; t...",
1537,,,Amaurolithus,,,,delicatus,,,,,nannofossils,Amaurolithus delicatus,,"in PBDB, so it will be classified correctly; t...",


set normalized_name using the taxa fields

In [12]:
# The helper fills normalized_name on the frame it is given and returns that
# same frame, so binding the result keeps the data flow explicit.
filtered_taxa = add_normalized_name_column(filtered_taxa)
filtered_taxa.head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
1536,,,Reticulofenestra,,,,circus,,,,Reticulofenestra circus,nannofossils,Reticulofenestra circus,,"in PBDB, so it will be classified correctly; t...",
1537,,,Amaurolithus,,,,delicatus,,,,Amaurolithus delicatus,nannofossils,Amaurolithus delicatus,,"in PBDB, so it will be classified correctly; t...",


In [13]:
# Shape after reducing the frame to the crosswalk columns.
filtered_taxa.shape

(903, 16)

In [14]:
# Remove rows whose normalized name came out empty.
filtered_taxa = filtered_taxa.drop(
    index=filtered_taxa.index[filtered_taxa["normalized_name"].eq("")]
)
filtered_taxa.shape

(903, 16)

In [15]:
# Keep one row per (verbatim_name, normalized_name) pair: the first occurrence.
filtered_taxa = filtered_taxa.drop_duplicates(
    subset=["verbatim_name", "normalized_name"], keep="first"
)
filtered_taxa.head(5)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,initial_comments,processing_notes,comments
1536,,,Reticulofenestra,,,,circus,,,,Reticulofenestra circus,nannofossils,Reticulofenestra circus,,"in PBDB, so it will be classified correctly; t...",
1537,,,Amaurolithus,,,,delicatus,,,,Amaurolithus delicatus,nannofossils,Amaurolithus delicatus,,"in PBDB, so it will be classified correctly; t...",
1538,,,Amaurolithus,,,,primus,,,,Amaurolithus primus,nannofossils,Amaurolithus primus,,"in PBDB, so it will be classified correctly; t...",
1539,,,Amaurolithus,,,,tricorniculatus,,,,Amaurolithus tricorniculatus,nannofossils,Amaurolithus tricorniculatus,,"in PBDB, so it will be classified correctly; t...",
1540,,,Braarudosphaera,,,,bigelowii,,,,Braarudosphaera bigelowii,nannofossils,Braarudosphaera bigelowii,,"in PBDB, so it will be classified correctly; t...",


In [16]:
# Shape after removing duplicate (verbatim_name, normalized_name) pairs.
filtered_taxa.shape

(822, 16)

create crosswalk csv

In [17]:
# index=False: the DataFrame index (the original row labels) is not written.
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

create taxa list with unique taxa fields

In [18]:
fields = taxa_rank_fields + taxa_fields

# Leave out the metadata columns, then collapse rows that are identical across
# the remaining rank/taxa columns.
taxa_df = filtered_taxa[fields].drop_duplicates()

taxa_df.head()

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
1536,,,Reticulofenestra,,,,circus,,,,Reticulofenestra circus,nannofossils
1537,,,Amaurolithus,,,,delicatus,,,,Amaurolithus delicatus,nannofossils
1538,,,Amaurolithus,,,,primus,,,,Amaurolithus primus,nannofossils
1539,,,Amaurolithus,,,,tricorniculatus,,,,Amaurolithus tricorniculatus,nannofossils
1540,,,Braarudosphaera,,,,bigelowii,,,,Braarudosphaera bigelowii,nannofossils


In [19]:
# Number of unique taxa rows that will be written to the taxa list.
taxa_df.shape

(746, 12)

### create taxa csv

In [20]:
# index=False: the DataFrame index (the original row labels) is not written.
taxa_df.to_csv(taxa_list_file, index=False)