# Normalize taxa list

Clean up the normalized taxa list provided by the eODP researchers for a given taxon group, then create CSVs for importing the taxa data into the database.

In [3]:
import sys

import pandas as pd
import numpy as np
import requests

sys.path.append('../../')
import scripts.normalize_taxa as nt

In [4]:
# Date stamp shared by the input file name and every output file name.
date = "2021-07-28"

# Directory that receives the generated taxa CSVs.
taxa_output_dir = "../../output/taxa/LIMS"

input_file = (
    "../../raw_data/PI_processed_files/"
    f"LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv"
)

crosswalk_file = f"{taxa_output_dir}/taxa_crosswalk_{date}.csv"
taxa_list_file = f"{taxa_output_dir}/taxa_list_{date}.csv"

In [5]:
def log_df(df, row_count=5):
    """Print the shape of ``df`` and return its first ``row_count`` rows.

    Args:
        df: DataFrame to summarise.
        row_count: Number of leading rows to return (default 5).

    Returns:
        The result of ``df.head(row_count)``.
    """
    shape = df.shape
    print(shape)
    preview = df.head(row_count)
    return preview


## Import normalized taxa list

In [6]:
# Read the PI-processed list, skipping the first 9 lines of the file, then
# discard the first two rows of the result and any row that is empty in every column.
df = (
    pd.read_csv(input_file, skiprows=9)
    .pipe(lambda frame: frame.drop(frame.index[[0, 1]]))
    .dropna(how="all", axis="index")
)

log_df(df)

(4742, 21)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,>100 m group,,,
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,rodleyi,,,,>50 m group,,,
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,other benthic foraminifera,,,
5,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,,,,
6,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,,,,


In [7]:
# Show (rows, columns) of the cleaned input frame.
df.shape

(4742, 21)

## Create crosswalk csv

In [8]:
# Columns kept for the crosswalk: the three field lists exported by
# scripts.normalize_taxa, concatenated in this order.
fields = nt.taxa_rank_fields + nt.taxa_fields + nt.metadata_fields
# New frame built from df, restricted to (and ordered by) those columns.
filtered_taxa = pd.DataFrame(df, columns=fields)

Set `normalized_name` using the taxa fields.

In [9]:
# Called for its effect on filtered_taxa; the return value is not used.
# NOTE(review): presumably fills the normalized_name column in place from the
# taxa rank fields -- confirm in scripts/normalize_taxa.py.
nt.add_normalized_name_column(filtered_taxa)
log_df(filtered_taxa)

(4742, 14)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


In [49]:
# Remove rows whose normalized_name is the empty string.
blank_name_mask = filtered_taxa['normalized_name'] == ''
blank_name_labels = filtered_taxa.index[blank_name_mask]
filtered_taxa = filtered_taxa.drop(index=blank_name_labels)
log_df(filtered_taxa)

(4735, 14)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


In [50]:
# A crosswalk row is identified by its (verbatim_name, normalized_name) pair;
# keep the first occurrence of each pair.
crosswalk_key = ['verbatim_name', 'normalized_name']
filtered_taxa.drop_duplicates(subset=crosswalk_key, keep='first', inplace=True)
log_df(filtered_taxa)

(4594, 14)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


Create the crosswalk CSV.

In [15]:
# Write the de-duplicated crosswalk to crosswalk_file, without the DataFrame index.
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

Create the taxa list with unique taxa fields.

In [51]:
# Keep only the rank and taxa columns (no metadata columns), then collapse
# rows that are identical across all of them.
fields = nt.taxa_rank_fields + nt.taxa_fields
taxa_df = pd.DataFrame(filtered_taxa, columns=fields).drop_duplicates()

log_df(taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
2,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
3,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
4,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
5,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
6,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


### Create taxa csv

In [18]:
# Write the unique taxa list to taxa_list_file, without the DataFrame index.
taxa_df.to_csv(taxa_list_file, index=False)

## Create genera csv

In [272]:
# Reload the taxa list from taxa_list_file instead of reusing the in-memory frame.
taxa_df = pd.read_csv(taxa_list_file)
log_df(taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


In [277]:
# One row per distinct genus name: trim surrounding whitespace, discard
# missing values, then de-duplicate.
genus_column = taxa_df['genus name']
trimmed_genera = genus_column.str.strip()
unique_genera = trimmed_genera.dropna().unique()
genus_df = pd.DataFrame(unique_genera, columns=['genus name'])

log_df(genus_df)

(1015, 1)


Unnamed: 0,genus name
0,Euuvigerina
1,Nodosaria
2,Cibicides
3,Brizalina
4,Candeina


In [278]:
# Base URL of the Paleobiology Database (PBDB) data service, version 1.2.
PBDB_API = "https://paleobiodb.org/data1.2/"

# Single-taxon JSON endpoint using the "pbdb" vocabulary; the two lookup URLs
# differ only in the final query parameter, whose value the caller appends.
_PBDB_SINGLE_TAXON = PBDB_API + "taxa/single.json?vocab=pbdb"
PBDB_TAXA_NAME = _PBDB_SINGLE_TAXON + "&name="
PBDB_TAXA_ID = _PBDB_SINGLE_TAXON + "&id="

In [279]:
def fill_taxon(df, index, data, taxon_rank):
    """Copy the id and name of the first record in ``data`` into row ``index`` of ``df``.

    Args:
        df: DataFrame to update in place.
        index: Row label to write to.
        data: Sequence of record dicts; only ``data[0]`` is read, via its
            ``"taxon_no"`` and ``"taxon_name"`` keys.
        taxon_rank: Rank name used as the column prefix, giving
            ``<taxon_rank>_taxon_id`` and ``<taxon_rank>_taxon_name``.
    """
    record = data[0]
    id_column = f'{taxon_rank}_taxon_id'
    name_column = f'{taxon_rank}_taxon_name'
    # The id is stored as a string so that pandas does not turn it into a float.
    df.at[index, id_column] = str(record["taxon_no"])
    df.at[index, name_column] = record["taxon_name"]
                

def get_parent_taxa(df, parent_id, taxon_rank, round, data):
    """Walk up the PBDB hierarchy from ``parent_id`` and record higher ranks in ``df``.

    Each call fetches one taxon by id from the ``PBDB_TAXA_ID`` endpoint. When
    that taxon's rank is family, order, class, phylum or kingdom, its id and
    name are written with ``fill_taxon`` into the ``<rank>_taxon_id`` and
    ``<rank>_taxon_name`` columns. The walk then continues with that taxon's
    parent.

    The row that is written is taken from the module-level ``index`` variable
    (the loop variable of the calling ``for index, row in ...`` loop), not
    from an argument, so this function must be called from inside that loop.

    Args:
        df: DataFrame that receives the rank columns.
        parent_id: PBDB taxon number to look up next.
        taxon_rank: Rank of the taxon handled by the previous step;
            ``'kingdom'`` ends the walk.
        round: Number of lookups made so far; the walk stops once this
            exceeds 20.
        data: Records from the previous lookup, returned when the walk stops.

    Returns:
        ``data`` as last fetched when the walk reaches a kingdom or the round
        limit; ``None`` when a request does not return HTTP 200 or does not
        yield exactly one record.
    """
    if taxon_rank == 'kingdom' or round > 20:
        return data

    # str() keeps the URL concatenation valid whether the id arrives as an
    # int or as a string.
    response = requests.get(PBDB_TAXA_ID + str(parent_id))
    if response.status_code != 200:
        return None

    data = response.json()["records"]
    if len(data) != 1:
        return None

    taxon_rank = data[0]["taxon_rank"]
    parent_id = data[0]["parent_no"]
    if taxon_rank in ('family', 'order', 'class', 'phylum', 'kingdom'):
        fill_taxon(df, index, data, taxon_rank)

    # Continue with the parent taxon by calling this same function again.
    return get_parent_taxa(df, parent_id, taxon_rank, round + 1, data)



# Genus spellings in the source list that differ from the spelling used for
# the PBDB lookup, mapped to the spelling to query instead.
GENUS_SPELLING_FIXES = {'Martinotiella': 'Martinottiella'}

# Look up every genus by name in PBDB and record its id, name and rank, then
# walk up its parents to fill the higher-rank columns.
# NOTE: the loop variable must stay named `index`; get_parent_taxa reads it as
# a module-level name to decide which row to write.
for index, row in genus_df.iterrows():
    # Lightweight progress indicator: print every 50th row number.
    if index % 50 == 0:
        print(index, end=' ')

    genus_name = row['genus name']
    url = PBDB_TAXA_NAME + GENUS_SPELLING_FIXES.get(genus_name, genus_name)

    response = requests.get(url)
    if response.status_code != 200:
        continue

    data = response.json()["records"]
    # Only an unambiguous single match is recorded.
    if len(data) != 1:
        continue

    match = data[0]
    # The id is stored as a string so that pandas does not turn it into a float.
    genus_df.at[index, 'pbdb_taxon_id'] = str(match["taxon_no"])
    genus_df.at[index, 'pbdb_taxon_name'] = match["taxon_name"]
    genus_df.at[index, 'pbdb_taxon_rank'] = match["taxon_rank"]

    # The lookup counter starts at 0 and is passed as a literal so that no
    # module-level name `round` is created to shadow the builtin round().
    get_parent_taxa(genus_df, match["parent_no"], match["taxon_rank"], 0, None)

                

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 

In [284]:


# Fix the column order: genus name, the PBDB match for the genus, then an
# id/name pair for each higher rank from family up to kingdom.
lineage_columns = [
    f'{rank}_taxon_{suffix}'
    for rank in ('family', 'order', 'class', 'phylum', 'kingdom')
    for suffix in ('id', 'name')
]
genus_columns = [
    'genus name',
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    *lineage_columns,
]
genus_df = genus_df.reindex(columns=genus_columns)

In [285]:
# Preview the first 20 genera with their PBDB lineage columns.
genus_df.head(20)

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
1,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
2,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
3,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria
5,Dentoglobigerina,1264,Dentoglobigerina,genus,82191.0,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
6,Globigerina,1498,Globigerina,genus,82191.0,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
7,Globigerinella,1501,Globigerinella,genus,82191.0,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
8,Globigerinita,1503,Globigerinita,genus,422373.0,Globigerinitidae,,,,,288974,Foraminifera,212476,Rhizaria
9,Globigerinoides,1504,Globigerinoides,genus,82191.0,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [286]:
# Show (rows, columns) of the genus lookup table.
genus_df.shape

(1015, 14)

In [287]:
# Write the genus lookup table to a date-stamped CSV, without the DataFrame index.
genus_df.to_csv(f"../../output/taxa/LIMS/genera_phylum_{date}.csv", index=False)

## Add pbdb to input_file

In [288]:
# Re-read the source list with every column loaded as a string (dtype=str).
# Unlike the first load of input_file, no rows are dropped after reading, so
# the rows that precede the taxa entries are still present in this frame.
df = pd.read_csv(input_file, skiprows = 9, dtype=str)
log_df(df)


(4754, 21)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,>100 m group,,,
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,rodleyi,,,,>50 m group,,,
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,other benthic foraminifera,,,


In [269]:
# Reload the genus lookup table from its date-stamped CSV, with every column
# loaded as a string (dtype=str).
genus_df= pd.read_csv(f"../../output/taxa/LIMS/genera_phylum_{date}.csv", dtype=str)

log_df(genus_df)

(1015, 14)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
1,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
2,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
3,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [289]:
# Trim whitespace from the join key on both sides so that names match exactly.
for frame in (df, genus_df):
    frame['genus name'] = frame['genus name'].str.strip()

# Left join: every input row is kept; rows whose genus has no match in the
# lookup table get missing values in the columns coming from genus_df.
merged_df = pd.merge(df, genus_df, how="left", on="genus name")
log_df(merged_df)

(4754, 34)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,,


In [290]:
# Write the input list joined with the PBDB lineage columns to a date-stamped CSV,
# without the DataFrame index.
merged_df.to_csv(f"../../output/taxa/LIMS/PI_normalized_taxa_list_with_pbdb_{date}.csv", index=False)