#  normalize NOAA DSDP taxa list
## 1-96 taxa

Cleanup the normalized taxa list from the eODP researchers.

In [128]:
import sys
import csv
import glob
import os
import requests
import re
import time 

sys.path.append('../../')
import pandas as pd
import numpy as np

# import db 
import scripts.normalize_taxa as nt
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR
from scripts.pbdb import get_parent_taxa, PBDB_TAXA_ID, PBDB_TAXA_NAME
from scripts.shared_utils import (
    log_df
)

In [129]:
# date='2021-11-29'
date = '2022-09-12'

base_dir = CLEAN_DATA_DIR

initial_input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_{date}.csv'
input_file = OUTPUT_DIR/'taxa'/'NOAA'/f"PI_normalized_taxa_list_with_pbdb_{date}.csv"
crosswalk_file = OUTPUT_DIR/'taxa'/'NOAA'/f"taxa_crosswalk_{date}.csv"
taxa_file = OUTPUT_DIR/'taxa'/'NOAA'/f"taxa_list_{date}.csv"

genus_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'genus_pbdb_{date}.csv'
higher_taxa_pbdb_file = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'higher_taxa_pbdb_{date}.csv'  

taxa_pbdb_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_pbdb_{date}.csv'


## remove duplicate columns

remove duplicate columns

In [130]:
PI_df = pd.read_csv(initial_input_file, dtype=str)

log_df(PI_df)

(7763, 38)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [131]:
PI_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',
       'pbdb_taxon_id.1', 'pbdb_taxon_name.1', 'pbdb_taxon_rank.1',
       'Corrections to pbdb_taxon', 'family_taxon_id', 'family_taxon_name',
       'superfamily_taxon_id', 'superfamily_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [132]:

del PI_df['pbdb_taxon_id']
del PI_df['pbdb_taxon_name']
del PI_df['pbdb_taxon_rank']

In [134]:
PI_df = PI_df.rename(columns={ 
    'pbdb_taxon_id.1': 'pbdb_taxon_id',
    'pbdb_taxon_name.1': 'pbdb_taxon_name',
    'pbdb_taxon_rank.1': 'pbdb_taxon_rank'
})
log_df(PI_df)
# 7763, 35

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [135]:
PI_df.to_csv(input_file, index=False)

## fix incorect pbdb_taxon_id

incorported  pbdb_taxon_id that the PIs corrected into the taxalist.


In [3]:
def fix_pbdb_id(df, correction_text, correct_id):
    print(correct_id)
    
    col = 'Corrections to pbdb_taxon_rank'
    url_parent = PBDB_TAXA_ID + str(correct_id)
    response = requests.get(url_parent)
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            df.loc[df[col]== correction_text, 'pbdb_taxon_name' ] = data[0]["taxon_name"]
            df.loc[df[col]== correction_text, 'pbdb_taxon_rank' ] = data[0]["taxon_rank"]
            df.loc[df[col]== correction_text, 'pbdb_taxon_id' ] = correct_id

            for index, row in df[df[col] == correction_text].iterrows():
                round = 0
                get_parent_taxa(df, data[0]["parent_no"], data[0]["taxon_rank"], round, index,  None)

        else:
            raise ValueError('multipe ID found')
    else:
        raise ValueError('ID not found')

    df.loc[df[col]== correction_text, 'corrected' ] =  True



In [70]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['corrected'] = False

log_df(PI_df)
# 7763

(7763, 39)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,corrected
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [5]:
fix_pbdb_id(PI_df, 'genus; id 1064', 1064)
fix_pbdb_id(PI_df, 'genus, id 1124', 1124)
fix_pbdb_id(PI_df, 'genus taxon_no=2092', 2092)
fix_pbdb_id(PI_df, 'genus taxon_no=2542', 2542)
fix_pbdb_id(PI_df, 'genus; ID 71247', 71247)
fix_pbdb_id(PI_df, 'genus; ID 82145', 82145)
fix_pbdb_id(PI_df, 'genus; ID 432650', 432650)
fix_pbdb_id(PI_df, 'genus: ID 68421', 68421)
fix_pbdb_id(PI_df, 'genus; ID 432651', 432651)
fix_pbdb_id(PI_df, 'homonym with an isect; ID# is 414258', 414258)
fix_pbdb_id(PI_df, 'genus; ID 421517', 421517)
fix_pbdb_id(PI_df, 'ID# 24521', 24521)
fix_pbdb_id(PI_df, 'genus; ID 165526', 165526)


fix_pbdb_id(PI_df, 'homonym; this ID # is incorrect ', 374615)
fix_pbdb_id(PI_df, 'Note: homonym with a plant once this is entered, update taxon #', 319949)
fix_pbdb_id(PI_df, 'Note: homonym with an ichnofossil; once this authority is entered, need to update taxon ID number', 83895)
fix_pbdb_id(PI_df, 'homonym with a brachiopod!', 26514)
fix_pbdb_id(PI_df, 'homonym with a plant', 410573)
fix_pbdb_id(PI_df, 'homonym with heart urchin', 259666)

1064
1124
2092
2542
71247
82145
432650
68421
432651
414258
421517
24521
165526
374615
319949
83895
26514
410573
259666


In [6]:
PI_df.shape
# 7763

(7763, 39)

In [7]:
PI_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',
       'pbdb_taxon_id.1', 'pbdb_taxon_name.1', 'pbdb_taxon_rank.1',
       'Corrections to pbdb_taxon', 'family_taxon_id', 'family_taxon_name',
       'superfamily_taxon_id', 'superfamily_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name', 'corrected'],
      dtype='object')

In [8]:
PI_df = PI_df.reindex(columns=[
    'taxon_group', 'verbatim_name', 'name', 'Comment',
    'Notes (change to Internal only notes?)', 'Any taxon above genus',
    'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 'subspecies modifier',
    'subspecies name', 'non-taxa descriptor', 'comments', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    'Corrections to pbdb_taxon_rank',
    'class_taxon_id','class_taxon_name',
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
])


In [9]:
PI_df.to_csv(input_file, index=False)


## create higher taxa csv with pbdb info

In [10]:
PI_df = pd.read_csv(input_file, dtype=str)
log_df(PI_df)
# 7763

(7763, 32)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,103796.0,Chilostomellidae,279579.0,Rotaliida,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,82213.0,Alabaminidae,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,82213.0,Alabaminidae,,,288974,Foraminifera,212476,Rhizaria,,


In [11]:
higher_df = pd.DataFrame(PI_df['Any taxon above genus'].dropna().unique(), columns=['Any taxon above genus'])
log_df(higher_df)
# 77

(77, 1)


Unnamed: 0,Any taxon above genus
0,"""Astronion charlottensis"""
1,Miliolidae indet.
2,Foraminifera indet.
3,Chrysophyta indet.
4,"""Dimerogramma elegans"""


In [12]:
for index, row in higher_df.iterrows():  
#     if index < 40:
#         continue
        
    if index % 20 == 0:
        print(index)
        
    time.sleep(0.5)
        
    name = row['Any taxon above genus'].replace(' indet.', '')
    name = name.replace(' indent.', '')

  
    url =  PBDB_TAXA_NAME +  name
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            rank = data[0]["taxon_rank"]
            higher_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            higher_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            higher_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            higher_df.at[index, f'{rank}_taxon_id'] = str(data[0]["taxon_no"])
            higher_df.at[index, f'{rank}_taxon_name'] = data[0]["taxon_name"]
               
            round = 0
            get_parent_taxa(higher_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' found')
    else:
        print(name, ' not found')

0
"Astronion charlottensis"  not found
"Dimerogramma elegans"  not found
"Diogramma sp."  not found
Globigerinida  not found
20
Crotonoideae  not found
40
60
"Tripodiscinos clavipes"  not found


In [13]:
higher_df.shape
# 77

(77, 18)

In [14]:
higher_df.to_csv(higher_taxa_pbdb_file, index=False)

## add higher taxa pbdb to input file

In [7]:
higher_df = pd.read_csv(higher_taxa_pbdb_file, dtype=str)
log_df(higher_df)
# 77

(77, 18)


Unnamed: 0,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,superfamily_taxon_id,superfamily_taxon_name
0,"""Astronion charlottensis""",,,,,,,,,,,,,,,,,
1,Miliolidae indet.,81704.0,Miliolidae,family,81704.0,Miliolidae,256604.0,Miliolida,428719.0,Tubothalamea,288974.0,Foraminifera,212476.0,Rhizaria,,,,
2,Foraminifera indet.,288974.0,Foraminifera,phylum,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,,,
3,Chrysophyta indet.,69586.0,Chrysophyta,unranked clade,,,,,,,,,,,28595.0,Life,,
4,"""Dimerogramma elegans""",,,,,,,,,,,,,,,,,


In [8]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index
log_df(PI_df)
# 7763

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,4


In [9]:
PI_higher_df = PI_df[PI_df['Any taxon above genus'].notna()]
log_df(PI_higher_df)
# 82

(82, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
114,benthic_forams,Astronion charlottensis,"""Astronion charlottensis""",Consulted with ET; not a known genus-species pair,"JAS: need help with this one, could be a missp...","""Astronion charlottensis""",,,,,...,,,,,,,,,,114
994,benthic_forams,Miliolidae sp.,Miliolidae indet.,,,Miliolidae indet.,,,,,...,Miliolida,428719.0,Tubothalamea,288974.0,Foraminifera,212476.0,Rhizaria,,,994
1555,benthic_forams,Unidentified benthic forams,Foraminifera indet.,,,Foraminifera indet.,,,,,...,,,,288974.0,Foraminifera,212476.0,Rhizaria,,,1555
1829,diatoms,Chrysophyta cysts a,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,,28595.0,Life,1829
1830,diatoms,Chrysophyta cysts b,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,,28595.0,Life,1830


In [10]:
merge_df = PI_higher_df.merge(higher_df, 
                       on=['Any taxon above genus'],   
                       how='inner',
                       suffixes=('_prev', None))
log_df(merge_df)


(82, 52)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,superfamily_taxon_id,superfamily_taxon_name
0,benthic_forams,Astronion charlottensis,"""Astronion charlottensis""",Consulted with ET; not a known genus-species pair,"JAS: need help with this one, could be a missp...","""Astronion charlottensis""",,,,,...,,,,,,,,,,
1,benthic_forams,Miliolidae sp.,Miliolidae indet.,,,Miliolidae indet.,,,,,...,428719.0,Tubothalamea,288974.0,Foraminifera,212476.0,Rhizaria,,,,
2,benthic_forams,Unidentified benthic forams,Foraminifera indet.,,,Foraminifera indet.,,,,,...,,,288974.0,Foraminifera,212476.0,Rhizaria,,,,
3,diatoms,Chrysophyta cysts a,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,28595.0,Life,,
4,diatoms,Chrysophyta cysts b,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,28595.0,Life,,


In [11]:
merge_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments',
       'pbdb_taxon_id_prev', 'pbdb_taxon_name_prev', 'pbdb_taxon_rank_prev',
       'Corrections to pbdb_taxon_rank', 'family_taxon_id_prev',
       'family_taxon_name_prev', 'superfamily_taxon_id_prev',
       'superfamily_taxon_name_prev', 'order_taxon_id_prev',
       'order_taxon_name_prev', 'class_taxon_id_prev', 'class_taxon_name_prev',
       'phylum_taxon_id_prev', 'phylum_taxon_name_prev',
       'kingdom_taxon_id_prev', 'kingdom_taxon_name_prev',
       'unranked clade_taxon_id_prev', 'unranked clade_taxon_name_prev',
       'row_index', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'order

In [12]:
cols = [
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name', 
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name'
]

for col in cols:
    if col not in PI_df.columns:
        PI_df[col] = ''
        
for index, row in merge_df.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]

PI_df.shape 
# 7763

(7763, 35)

In [13]:
PI_df = PI_df.reindex(columns=[
    'taxon_group', 'verbatim_name', 'name', 'Comment',
    'Notes (change to Internal only notes?)', 'Any taxon above genus',
    'genus modifier', 'genus name', 
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 
    'subspecies modifier', 'subspecies name', 
    'non-taxa descriptor', 'comments', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    'Corrections to pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name',
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name', 
    'phylum_taxon_id','phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name',
    ])



In [14]:
PI_df.to_csv(input_file, index=False)

## add pbdb data to input file list for fixed genus

PIs fixed some misspellings in the Google Sheet. add pbdb data for fixed genus that are in genus file.

In [15]:
genus_df = pd.read_csv(genus_path, dtype=str)
log_df(genus_df)

(1707, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613,Ochrophyta,,,,
1,Abathomphalus,758,Abathomphalus,genus,,,,,,,288974,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139,Pinophyta,54311.0,Plantae,,
3,Abutilon,454155,Abutilon,genus,452804.0,Angiospermae,54911.0,Malvales,53879.0,Malvaceae,55350,Spermatophyta,54311.0,Plantae,,
4,Abyssamina,762,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974,Foraminifera,212476.0,Rhizaria,,


In [16]:
genus_df_with_pbdb = genus_df[genus_df['pbdb_taxon_id'].notna()]
genus_df_with_pbdb.shape

(1536, 16)

In [17]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index

log_df(PI_df)

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,4


In [18]:
PI_df_no_pbdb = PI_df[PI_df['pbdb_taxon_id'].isna()]
PI_df_no_pbdb.shape

(781, 35)

In [19]:
PI_df.shape

(7763, 35)

In [20]:
temp_genus = set(PI_df_no_pbdb['genus name']).intersection(set(genus_df_with_pbdb['genus name']))
len(temp_genus)

326

In [21]:
merge = PI_df_no_pbdb.merge(genus_df_with_pbdb, on='genus name', suffixes=('_old', None))
merge = merge.drop_duplicates(subset='genus name')
log_df(merge)

(326, 50)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Aragonina aragonensis,Aragonia aragonensis,appears to be a misspelling; WoRMS (JAS),,,,Aragonia,,,...,279579.0,Rotaliida,432106,Loxostomatidae,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Boliminella exilis,Bulimina exilis,"misspelled genus, ET",JAS: misspelling? http://www.marinespecies.org...,,,Bulimina,,,...,,,103766,Buliminidae,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Bolvina sp. (q),? Bolivina sp.,"in PBDB, so it will be classified correctly","in PBDB, so it will be classified correctly Si...",,?,Bolivina,,,...,,,112279,Bolivinidae,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Bradynella subglobosa,,,"in PBDB, so it will be classified correctly",,,Bradynella,,,...,,,103771,Cassidulinidae,288974,Foraminifera,212476,Rhizaria,,
5,benthic_forams,Candiena sp.,Candeina sp.,"misspelling, AF",think this is it? https://www.marinespecies.or...,,,Candeina,,,...,,,422277,Candeinidae,288974,Foraminifera,212476,Rhizaria,,


In [22]:
merge['genus name'].unique()

array(['Aragonia', 'Bulimina', 'Bolivina', 'Bradynella', 'Candeina',
       'Globocassidulina', 'Cancris', 'Discopulvinulina',
       'Ellipsoglandulina', 'Ellipsolagena', 'Favocassidulina',
       'Gavelinopsis', 'Globotextularia', 'Helenina', 'Karreriella',
       'Martinottiella', 'Noviuva', 'Palaeonummulites', 'Paradentalina',
       'Praeglobobulimina', 'Pseudovalvulineria', 'Pulvinulinella',
       'Smyrnella', 'Stensioina', 'Trochamminoides', 'Acanthosphaeridium',
       'Amphiprora', 'Archaeomonas', 'Asterionella', 'Bacillaria',
       'Baxteria', 'Baxteriopsis', 'Benetorus', 'Bergonia',
       'Campylodiscus', 'Campyloneis', 'Campylosira', 'Cerataulina',
       'Cerataulus', 'Ceratoneis', 'Coscinodiscus', 'Cymatodiscus',
       'Cymatopleura', 'Denticula', 'Denticulopsis', 'Detonula',
       'Diatomella', 'Discodiscus', 'Ditylum', 'Endictya', 'Entogonia',
       'Epithelion', 'Eunotogramma', 'Fenestrella', 'Glyphodesmis',
       'Glyphodiscus', 'Grammatophora', 'Gyrosigma', 'H

In [23]:
merge.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments',
       'pbdb_taxon_id_old', 'pbdb_taxon_name_old', 'pbdb_taxon_rank_old',
       'Corrections to pbdb_taxon_rank', 'family_taxon_id_old',
       'family_taxon_name_old', 'superfamily_taxon_id',
       'superfamily_taxon_name', 'order_taxon_id_old', 'order_taxon_name_old',
       'class_taxon_id_old', 'class_taxon_name_old', 'phylum_taxon_id_old',
       'phylum_taxon_name_old', 'kingdom_taxon_id_old',
       'kingdom_taxon_name_old', 'unranked clade_taxon_id_old',
       'unranked clade_taxon_name_old', 'row_index', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'class_taxon_id',
       'class_taxon_name', 'order_taxon_id', 'order_taxon_na

In [24]:
for col in merge.columns:
    if '_old' in col:
        print(col,'   ', merge[col].unique())

pbdb_taxon_id_old     [nan]
pbdb_taxon_name_old     [nan]
pbdb_taxon_rank_old     [nan]
family_taxon_id_old     ['432106' nan '112279' '103771' '422277' '241044' '241473' '355065'
 '155901' '241432' '112207' '430081' '103762' '82201' '103799' '103764'
 '103766' '112328' '433657' '103761' '433093' '387103' '432671' '387088'
 '441252' '71208' '71203' '432644' '433229' '134837' '71206' '433269'
 '433094' '441495' '427313' '441492' '433471' '441539' '71212' '441547'
 '441549' '441554' '71210' '442333' '442335' '433387' '433618' '442337'
 '442356' '321644' '277915' '321603' '321604' '434217' '443819' '321581'
 '445346' '323951' '424307' '82109' '166233' '82191' '55444' '157257'
 '55443' '54606' '55398' '54794' '152336']
family_taxon_name_old     ['Loxostomatidae' nan 'Bolivinidae' 'Cassidulinidae' 'Candeinidae'
 'Bagginidae' 'Discorbinellidae' 'Ellipsoidinidae' 'Ellipsolagenidae'
 'Rosalinidae' 'Globotextulariidae' 'Ammoniidae' 'Valvulinidae'
 'Uvigerinidae' 'Nummulitidae' 'Polymorphinidae'

In [25]:
cols = [    
    'pbdb_taxon_name','pbdb_taxon_rank', 
    'family_taxon_id', 'family_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name',
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id','unranked clade_taxon_name', 
    'pbdb_taxon_id', 
]
for index, row in merge.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]



In [26]:
del PI_df['row_index']

In [27]:
PI_df.to_csv(input_file, index=False)


# Update genus csv with genus in input file that don't have pbdb data

look for all genus that don't have pbdb_taxon_id in input file

In [37]:
PI_df = pd.read_csv(input_file, dtype=str)
log_df(PI_df)

(7763, 34)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [38]:
genus_df = pd.read_csv(genus_path, dtype=str)
log_df(genus_df)

(1707, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613,Ochrophyta,,,,
1,Abathomphalus,758,Abathomphalus,genus,,,,,,,288974,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139,Pinophyta,54311.0,Plantae,,
3,Abutilon,454155,Abutilon,genus,452804.0,Angiospermae,54911.0,Malvales,53879.0,Malvaceae,55350,Spermatophyta,54311.0,Plantae,,
4,Abyssamina,762,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974,Foraminifera,212476.0,Rhizaria,,


In [39]:
new_genus = PI_df[PI_df['genus name'].notna() &  PI_df['pbdb_taxon_id'].isna()][['genus name']].drop_duplicates()
log_df(new_genus)

(209, 1)


Unnamed: 0,genus name
12,Alabaminella
15,Alveolophragmium
96,Aragonia
112,Astrorhiza
113,Astrononion


In [40]:
for index, row in new_genus.iterrows():  
    if index % 20 == 0:
        print(index)
        
    time.sleep(0.5)
        
    name = row['genus name']

  
    url =  PBDB_TAXA_NAME +  name
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # print(index, name, ' found')
            new_genus.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            new_genus.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            new_genus.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(new_genus, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' items found')
    else:
        pass
#         print(name, ' not found')

1000
3180
3560
3900
3980
5680
6080
7160
7740


In [41]:
new_genus.sort_values('genus name', inplace=True)
log_df(new_genus)

(209, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
7160,Trissocyclus,85877.0,Trissocyclus,genus,85775.0,Acanthodesmiidae,4.0,Radiolaria,212476.0,Rhizaria,402.0,Nassellaria,,,,
512,"""Discocibicides""",,,,,,,,,,,,,,,
5709,"""Gomphrenia""",,,,,,,,,,,,,,,
5749,"""Liguliflores""",,,,,,,,,,,,,,,
3327,Acanthoica,453143.0,Acanthoica,genus,420233.0,Rhabdosphaeraceae,87644.0,Haptophyta,,,87650.0,Syracosphaerales,418920.0,Coccolithophyceae,28595.0,Life


In [42]:
set(new_genus.columns) - set(genus_df.columns) 

set()

In [43]:
genus_df['unranked clade_taxon_id'] = np.nan
genus_df['unranked clade_taxon_name'] = np.nan

In [44]:
new_genus.columns

Index(['genus name', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'order_taxon_id', 'order_taxon_name', 'class_taxon_id',
       'class_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [45]:
cols = [
    'genus name', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'class_taxon_id', 'class_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'family_taxon_id', 'family_taxon_name', 
    'phylum_taxon_id','phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
]

genus_df = genus_df.reindex(columns=cols)
new_genus = new_genus.reindex(columns=cols)

In [46]:
new_df = pd.concat([genus_df, new_genus])
log_df(new_df)

(1916, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613,Ochrophyta,,,,
1,Abathomphalus,758,Abathomphalus,genus,,,,,,,288974,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139,Pinophyta,54311.0,Plantae,,
3,Abutilon,454155,Abutilon,genus,452804.0,Angiospermae,54911.0,Malvales,53879.0,Malvaceae,55350,Spermatophyta,54311.0,Plantae,,
4,Abyssamina,762,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974,Foraminifera,212476.0,Rhizaria,,


In [47]:
cols = ['genus name', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'class_taxon_id', 'class_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'family_taxon_id', 'family_taxon_name', 
    'phylum_taxon_id','phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name']

new_df = new_df.drop_duplicates(subset=cols)
log_df(new_df)

(1784, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613,Ochrophyta,,,,
1,Abathomphalus,758,Abathomphalus,genus,,,,,,,288974,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139,Pinophyta,54311.0,Plantae,,
3,Abutilon,454155,Abutilon,genus,452804.0,Angiospermae,54911.0,Malvales,53879.0,Malvaceae,55350,Spermatophyta,54311.0,Plantae,,
4,Abyssamina,762,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974,Foraminifera,212476.0,Rhizaria,,


In [48]:
new_df[new_df.duplicated('genus name')]

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
3327,Acanthoica,453143,Acanthoica,genus,418920.0,Coccolithophyceae,87650.0,Syracosphaerales,420233.0,Rhabdosphaeraceae,87644.0,Haptophyta,,,28595.0,Life
2951,Ascostomocystis,443687,Ascostomocystis,genus,,,,,,,,,,,28595.0,Life
5623,Cephalotaxus,444516,Cephalotaxus,genus,82141.0,Pinopsida,82140.0,Pinales,54797.0,Taxaceae,55350.0,Spermatophyta,54311.0,Plantae,,
5732,Keteleeria,443061,Keteleeria,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,55350.0,Spermatophyta,54311.0,Plantae,,
2609,Riedelia,458683,Riedelia,genus,,,,,,,,,54311.0,Plantae,,


In [49]:
new_df.to_csv(genus_path, index=False)

# update genus with no higher pbdb ranks

In [50]:
df = pd.read_csv(genus_path, dtype=str)

temp_df = df[
    df['unranked clade_taxon_id'].isna() &
    df['kingdom_taxon_id'].isna() &
    df['phylum_taxon_id'].isna() &
    df['family_taxon_id'].isna() &
    df['order_taxon_id'].isna() &
    df['class_taxon_id'].isna() &
    df['pbdb_taxon_id'].notna() 
]
log_df(temp_df)


(20, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
329,Chrysophyta,69586,Chrysophyta,unranked clade,,,,,,,,,,,,
370,Comasphaeridium,141162,Comasphaeridium,genus,,,,,,,,,,,,
405,Crassosphaera,264671,Crassosphaera,genus,,,,,,,,,,,,
443,Cyclopsiella,277123,Cyclopsiella,genus,,,,,,,,,,,,
446,Cymatiosphaera,170215,Cymatiosphaera,genus,,,,,,,,,,,,


In [51]:
for index, row in temp_df.iterrows():  
        
    time.sleep(0.5)
        
    url =  PBDB_TAXA_ID + str(row['pbdb_taxon_id'])
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            rank = data[0]["taxon_rank"]
            temp_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            temp_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            temp_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
               
            round = 0
            get_parent_taxa(temp_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' found')
    else:
        print(name, ' not found')

In [52]:
for index, row in temp_df.iterrows():
    df.at[index, 'unranked clade_taxon_id'] = row['unranked clade_taxon_id']
    df.at[index, 'unranked clade_taxon_name'] = row['unranked clade_taxon_name']


In [53]:
df.to_csv(genus_path, index=False)

## add pbdb data to genus in input file that does not have pbdb taxon id

In [15]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index
log_df(PI_df)

(7763, 33)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,family_taxon_name,order_taxon_id,order_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Chilostomellidae,279579.0,Rotaliida,288974,Foraminifera,212476,Rhizaria,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,Alabaminidae,,,288974,Foraminifera,212476,Rhizaria,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,Alabaminidae,,,288974,Foraminifera,212476,Rhizaria,,,4


In [16]:
genus_df = pd.read_csv(genus_path, dtype=str)
genus_df = genus_df[genus_df['pbdb_taxon_id'].notna()]
log_df(genus_df)

FileNotFoundError: [Errno 2] No such file or directory: '../../output/taxa/draft/NOAA/genus_pbdb_2022-09-12.csv'

In [17]:
PI_update_df = PI_df[PI_df['pbdb_taxon_id'].isna() & PI_df['genus name'].notna()]
log_df(PI_update_df)

(776, 33)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,family_taxon_name,order_taxon_id,order_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
12,benthic_forams,Alabinella wedellensis,Alabaminella weddellensis,JAS: appears to be a misspelling; WoRMS,,,,Alabaminella,,,...,Eponididae,,,288974,Foraminifera,212476,Rhizaria,,,12
15,benthic_forams,Alveolophraginium crassimargo,Alveolophragmium crassimargo,JAS: appears to be a misspelling; WoRMS,,,,Alveolophragmium,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,15
95,benthic_forams,Aragonina aragonensis,Aragonia aragonensis,appears to be a misspelling; WoRMS (JAS),,,,Aragonia,,,...,Loxostomatidae,279579.0,Rotaliida,288974,Foraminifera,212476,Rhizaria,,,95
96,benthic_forams,Aragonina velascoensis,Aragonia velascoensis,appears to be a misspelling; WoRMS (JAS),,,,Aragonia,,,...,Loxostomatidae,279579.0,Rotaliida,288974,Foraminifera,212476,Rhizaria,,,96
112,benthic_forams,Astrohiza granulosa,Astrorhiza granulosa,appears to be a misspelling; WoRMS (JAS),,,,Astrorhiza,,,...,Astrorhizidae,,,288974,Foraminifera,212476,Rhizaria,,,112


In [57]:
merge_df = PI_update_df.merge(genus_df, on='genus name', how='left', suffixes=('_prev', None))
log_df(merge_df)

(456, 50)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Alabinella wedellensis,Alabaminella weddellensis,JAS: appears to be a misspelling; WoRMS,,,,Alabaminella,,,...,,,241423.0,Eponididae,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Alveolophraginium crassimargo,Alveolophragmium crassimargo,JAS: appears to be a misspelling; WoRMS,,,,Alveolophragmium,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Aragonina velascoensis,Aragonia velascoensis,appears to be a misspelling; WoRMS (JAS),,,,Aragonia,,,...,279579.0,Rotaliida,432106.0,Loxostomatidae,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Astrohiza granulosa,Astrorhiza granulosa,appears to be a misspelling; WoRMS (JAS),,,,Astrorhiza,,,...,,,147614.0,Astrorhizidae,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Astronion astrale,Astrononion australe,appears to be a misspelling; WoRMS (JAS),,,,Astrononion,,,...,,,82211.0,Nonionidae,288974,Foraminifera,212476,Rhizaria,,


In [58]:
genus_df.columns

Index(['genus name', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'class_taxon_id', 'class_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'family_taxon_id', 'family_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [59]:
cols = [
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'class_taxon_id', 'class_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'family_taxon_id', 'family_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name', 
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
]

for index, row in merge_df.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]
    

In [60]:
len(PI_df[PI_df['pbdb_taxon_id'].isna() & PI_df['genus name'].notna()])

44

In [61]:
PI_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'superfamily_taxon_id',
       'superfamily_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'unranked clade_taxon_id', 'unranked clade_taxon_name', 'row_index'],
      dtype='object')

In [62]:
del PI_df['row_index']

In [63]:
PI_df.to_csv(input_file, index=False)

# create crosswalk file

In [136]:
PI_df = pd.read_csv(input_file, dtype=str)
log_df(PI_df)
# 7763

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [137]:
df = nt.add_normalized_name_column(PI_df)
log_df(df)

(7763, 36)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,Abyssamina incisa
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Adercotryma glomeratum
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Adercotryma sp.
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Alabamina decorata
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Alabamina haitiensis


In [138]:
crosswalk_df = nt.create_taxa_crosswalk_df(df)
log_df(crosswalk_df)
# 7763
# 7760
# 7759

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'verbatim_name', 'name comment field', 'Comment', 'Notes (change to Internal only notes?)', 'comments']
initial df:  (7763, 17)
remove nontaxa df:  (7760, 17)
drop duplicates df:  (7759, 17)
(7759, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Abyssamina,,,,incisa,,,,Abyssamina incisa,benthic_forams,Abyssamina incisa,,,,
1,,,Adercotryma,,,,glomeratum,,,,Adercotryma glomeratum,benthic_forams,Adercotryma glomeratum,,,,
2,,,Adercotryma,,,,sp.,,,,Adercotryma sp.,benthic_forams,Adercotryma sp.,,,,
3,,,Alabamina,,,,decorata,,,,Alabamina decorata,benthic_forams,Alabamina decorata,,,,
4,,,Alabamina,,,,haitiensis,,,,Alabamina haitiensis,benthic_forams,Alabamina haitiensis,,,,


In [139]:
PI_df.to_csv(crosswalk_file, index=False)

## create taxa file

In [140]:
PI_df = pd.read_csv(input_file, dtype=str)
log_df(PI_df)
# 7763

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [141]:
df = nt.add_normalized_name_column(PI_df)
log_df(df)

(7763, 36)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,Abyssamina incisa
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Adercotryma glomeratum
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Adercotryma sp.
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Alabamina decorata
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,Alabamina haitiensis


In [142]:
taxa_df = nt.create_taxa_list_df(df)
log_df(taxa_df)
# 7763
# 7760
# 7658

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id', 'family_taxon_name', 'order_taxon_id', 'order_taxon_name', 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name']
initial df:  (7763, 25)
remove nontaxa df:  (7760, 25)
drop duplicates df:  (7658, 25)
(7658, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Abyssamina,,,,incisa,,,,...,103796.0,Chilostomellidae,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria
1,,,Adercotryma,,,,glomeratum,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
2,,,Adercotryma,,,,sp.,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
3,,,Alabamina,,,,decorata,,,,...,82213.0,Alabaminidae,,,,,288974,Foraminifera,212476,Rhizaria
4,,,Alabamina,,,,haitiensis,,,,...,82213.0,Alabaminidae,,,,,288974,Foraminifera,212476,Rhizaria


In [143]:
taxa_df.to_csv(taxa_file, index=False)