#  normalize NOAA DSDP taxa list
## 1-96 taxa

Cleanup the normalized taxa list from the eODP researchers.

In [1]:
import sys
import csv
import glob
import os
import requests
import re
import time 

sys.path.append('../../')
import pandas as pd
import numpy as np

# import db 
import scripts.normalize_taxa as nt
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR
from scripts.pbdb import get_parent_taxa, PBDB_TAXA_ID, PBDB_TAXA_NAME

In [2]:
date='2021-11-29'

base_dir = CLEAN_DATA_DIR

initial_input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_{date}.csv'
input_file = OUTPUT_DIR/'taxa'/'NOAA'/f"PI_normalized_taxa_list_with_pbdb_{date}.csv"

genus_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'genus_pbdb_{date}.csv'
higher_taxa_pbdb_file = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'higher_taxa_pbdb_{date}.csv'  

taxa_pbdb_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_pbdb_{date}.csv'


In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## fix incorect pbdb_taxon_id

incorported  pbdb_taxon_id that the PIs corrected into the taxalist.


In [4]:
def fix_pbdb_id(df, correction_text, correct_id):
    print(correct_id)
    
    col = 'Corrections to pbdb_taxon_rank'
    url_parent = PBDB_TAXA_ID + str(correct_id)
    response = requests.get(url_parent)
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            df.loc[df[col]== correction_text, 'pbdb_taxon_name' ] = data[0]["taxon_name"]
            df.loc[df[col]== correction_text, 'pbdb_taxon_rank' ] = data[0]["taxon_rank"]
            df.loc[df[col]== correction_text, 'pbdb_taxon_id' ] = correct_id

            for index, row in df[df[col] == correction_text].iterrows():
                round = 0
                get_parent_taxa(df, data[0]["parent_no"], data[0]["taxon_rank"], round, index,  None)

        else:
            raise ValueError('multipe ID found')
    else:
        raise ValueError('ID not found')

    df.loc[df[col]== correction_text, 'corrected' ] =  True



In [5]:
PI_df = pd.read_csv(initial_input_file, dtype=str)
PI_df['corrected'] = False

log_df(PI_df)

(7763, 21)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Corrections to pbdb_taxon_rank,corrected
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,incisa,,,,,762,Abyssamina,genus,,False
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,glomeratum,,,,,774,Adercotryma,genus,,False
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,sp.,,,,,774,Adercotryma,genus,,False
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,decorata,,,,,788,Alabamina,genus,,False
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,haitiensis,,,,,788,Alabamina,genus,,False


In [6]:
tmp = PI_df[PI_df['Corrections to pbdb_taxon_rank'].notna() & (PI_df['corrected'] == False)]
tmp['Corrections to pbdb_taxon_rank'].unique()

array(['genus; id 1064', 'genus, id 1124', 'genus taxon_no=2092',
       'genus taxon_no=2542', 'homonym; this ID # is incorrect ',
       'genus; ID 71247', 'genus; ID 82145',
       'Note: homonym with a plant once this is entered, update taxon #',
       'Note: homonym with an ichnofossil; once this authority is entered, need to update taxon ID number',
       'genus; ID 432650', 'genus: ID 68421', 'genus; ID 432651',
       'homonym with a brachiopod!',
       'homonym with an isect; ID# is 414258', 'genus; ID 421517',
       'homonym with a plant', 'homonym with heart urchin', 'ID# 24521',
       'genus; ID 165526'], dtype=object)

In [7]:
fix_pbdb_id(PI_df, 'genus; id 1064', 1064)
fix_pbdb_id(PI_df, 'genus, id 1124', 1124)
fix_pbdb_id(PI_df, 'genus taxon_no=2092', 2092)
fix_pbdb_id(PI_df, 'genus taxon_no=2542', 2542)
fix_pbdb_id(PI_df, 'genus; ID 71247', 71247)
fix_pbdb_id(PI_df, 'genus; ID 82145', 82145)
fix_pbdb_id(PI_df, 'genus; ID 432650', 432650)
fix_pbdb_id(PI_df, 'genus: ID 68421', 68421)
fix_pbdb_id(PI_df, 'genus; ID 432651', 432651)
fix_pbdb_id(PI_df, 'homonym with an isect; ID# is 414258', 414258)
fix_pbdb_id(PI_df, 'genus; ID 421517', 421517)
fix_pbdb_id(PI_df, 'ID# 24521', 24521)
fix_pbdb_id(PI_df, 'genus; ID 165526', 165526)


fix_pbdb_id(PI_df, 'homonym; this ID # is incorrect ', 374615)
fix_pbdb_id(PI_df, 'Note: homonym with a plant once this is entered, update taxon #', 319949)
fix_pbdb_id(PI_df, 'Note: homonym with an ichnofossil; once this authority is entered, need to update taxon ID number', 83895)
fix_pbdb_id(PI_df, 'homonym with a brachiopod!', 26514)
fix_pbdb_id(PI_df, 'homonym with a plant', 410573)
fix_pbdb_id(PI_df, 'homonym with heart urchin', 259666)




1064
1124
2092
2542
71247
82145
432650
68421
432651
414258
421517
24521
165526
374615
319949
83895
26514
410573
259666


In [8]:
PI_df.shape

(7763, 33)

In [9]:
PI_df[PI_df['Corrections to pbdb_taxon_rank'].notna() & (PI_df['corrected'] == False)]


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name


In [10]:
PI_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',
       'corrected', 'family_taxon_id', 'family_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'order_taxon_id', 'order_taxon_name', 'class_taxon_id',
       'class_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [11]:
PI_df = PI_df.reindex(columns=[
    'taxon_group', 'verbatim_name', 'name', 'Comment',
    'Notes (change to Internal only notes?)', 'Any taxon above genus',
    'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 'subspecies modifier',
    'subspecies name', 'non-taxa descriptor', 'comments', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    'Corrections to pbdb_taxon_rank',
    'class_taxon_id','class_taxon_name',
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
     
    
])


In [12]:
PI_df.to_csv(input_file, index=False)
# PI_df.to_csv('foo.csv', index=False)

## create higher taxa csv with pbdb info

In [13]:
PI_df = pd.read_csv(input_file, dtype=str)
log_df(PI_df)

(7763, 32)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,,,,,,,,,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,,,,,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,,,,,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,,,,,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,,,,,,


In [14]:
len(PI_df['Any taxon above genus'].dropna().unique())

72

In [15]:
len(PI_df['Any taxon above genus'].str.strip().dropna().unique())

72

In [16]:
higher_df = pd.DataFrame(PI_df['Any taxon above genus'].dropna().unique(), columns=['Any taxon above genus'])
log_df(higher_df)

(72, 1)


Unnamed: 0,Any taxon above genus
0,Miliolidae indet.
1,Foraminifera indet.
2,Chrysophyta indet.
3,"""Dimerogramma elegans"""
4,"""Diogramma sp."""


In [36]:
for index, row in higher_df.iterrows():  
#     if index < 40:
#         continue
        
    if index % 20 == 0:
        print(index)
        
    time.sleep(0.5)
        
    name = row['Any taxon above genus'].replace(' indet.', '')
    name = name.replace(' indent.', '')

  
    url =  PBDB_TAXA_NAME +  name
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            rank = data[0]["taxon_rank"]
            higher_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            higher_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            higher_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            higher_df.at[index, f'{rank}_taxon_id'] = str(data[0]["taxon_no"])
            higher_df.at[index, f'{rank}_taxon_name'] = data[0]["taxon_name"]
               
            round = 0
            get_parent_taxa(higher_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' found')
    else:
        print(name, ' not found')

0
"Dimerogramma elegans"  not found
"Diogramma sp."  not found
"Rhabdonoma angulatum"  not found
Globigerinida  not found
Ampelidaceae  not found
20
Cambretaceae  not found
Crotonoideae  not found
Graminae  not found
Guttifere  not found
40
Liliaceace  not found
Nympheaceae  not found
60
Thymeleaceae  not found
Urticeae  not found


In [37]:
higher_df.columns

Index(['Any taxon above genus', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'family_taxon_id', 'family_taxon_name',
       'family_taxon_rank', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'phylum_taxon_rank', 'unranked clade_taxon_id',
       'unranked clade_taxon_name', 'unranked clade_taxon_rank',
       'class_taxon_rank', 'superfamily_taxon_id', 'superfamily_taxon_name',
       'superfamily_taxon_rank', 'order_taxon_rank'],
      dtype='object')

In [38]:
higher_df.to_csv(higher_taxa_pbdb_file, index=False)

## add higher taxa pbdb to input file

In [20]:
higher_df = pd.read_csv(higher_taxa_pbdb_file, dtype=str)
log_df(higher_df)

(72, 18)


Unnamed: 0,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,superfamily_taxon_id,superfamily_taxon_name
0,Miliolidae indet.,81704.0,Miliolidae,family,81704.0,Miliolidae,256604.0,Miliolida,428719.0,Tubothalamea,288974.0,Foraminifera,212476.0,Rhizaria,,,,
1,Foraminifera indet.,288974.0,Foraminifera,phylum,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,,,
2,Chrysophyta indet.,69586.0,Chrysophyta,unranked clade,,,,,,,,,,,28595.0,Life,,
3,"""Dimerogramma elegans""",,,,,,,,,,,,,,,,,
4,"""Diogramma sp.""",,,,,,,,,,,,,,,,,


In [21]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index
log_df(PI_df)

(7763, 33)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,family_taxon_name,order_taxon_id,order_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,,,,,,,,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,,,,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,,,,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,,,,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,,,,,,4


In [22]:
PI_higher_df = PI_df[PI_df['Any taxon above genus'].notna()]
log_df(PI_higher_df)

(77, 33)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,family_taxon_name,order_taxon_id,order_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
994,benthic_forams,Miliolidae sp.,Miliolidae indet.,,,Miliolidae indet.,,,,,...,,,,,,,,,,994
1555,benthic_forams,Unidentified benthic forams,Foraminifera indet.,,,Foraminifera indet.,,,,,...,,,,,,,,,,1555
1829,diatoms,Chrysophyta cysts a,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,,,,1829
1830,diatoms,Chrysophyta cysts b,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,,,,1830
1831,diatoms,Chrysophyta sp.,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,,,,1831


In [23]:
merge_df = PI_higher_df.merge(higher_df, 
                       on=['Any taxon above genus'],   
                       how='inner',
                       suffixes=('_prev', None))
log_df(merge_df)


(77, 50)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,superfamily_taxon_id,superfamily_taxon_name
0,benthic_forams,Miliolidae sp.,Miliolidae indet.,,,Miliolidae indet.,,,,,...,428719.0,Tubothalamea,288974.0,Foraminifera,212476.0,Rhizaria,,,,
1,benthic_forams,Unidentified benthic forams,Foraminifera indet.,,,Foraminifera indet.,,,,,...,,,288974.0,Foraminifera,212476.0,Rhizaria,,,,
2,diatoms,Chrysophyta cysts a,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,28595.0,Life,,
3,diatoms,Chrysophyta cysts b,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,28595.0,Life,,
4,diatoms,Chrysophyta sp.,Chrysophyta indet.,,,Chrysophyta indet.,,,,,...,,,,,,,28595.0,Life,,


In [24]:
merge_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments',
       'pbdb_taxon_id_prev', 'pbdb_taxon_name_prev', 'pbdb_taxon_rank_prev',
       'Corrections to pbdb_taxon_rank', 'class_taxon_id_prev',
       'class_taxon_name_prev', 'family_taxon_id_prev',
       'family_taxon_name_prev', 'order_taxon_id_prev',
       'order_taxon_name_prev', 'phylum_taxon_id_prev',
       'phylum_taxon_name_prev', 'kingdom_taxon_id_prev',
       'kingdom_taxon_name_prev', 'unranked clade_taxon_id_prev',
       'unranked clade_taxon_name_prev', 'row_index', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'cl

In [25]:
cols = [
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name', 
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name'
]

for col in cols:
    if col not in PI_df.columns:
        PI_df[col] = ''
        
for index, row in merge_df.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]

PI_df.shape    

(7763, 35)

In [26]:
PI_df = PI_df.reindex(columns=[
    'taxon_group', 'verbatim_name', 'name', 'Comment',
    'Notes (change to Internal only notes?)', 'Any taxon above genus',
    'genus modifier', 'genus name', 
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 
    'subspecies modifier', 'subspecies name', 
    'non-taxa descriptor', 'comments', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    'Corrections to pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name',
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name', 
    'phylum_taxon_id','phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name',
    ])



In [27]:
PI_df.to_csv(input_file, index=False)
# PI_df.to_csv('foo.csv', index=False)

## add pbdb data to input file list for fixed genus

PIs fixed some misspellings in the Google Sheet. add pbdb data for fixed genus that are in genus file.

In [28]:
genus_df = pd.read_csv(genus_path, dtype=str)
log_df(genus_df)

(1707, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,
3,Abutilon,,,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,


In [29]:
genus_df_with_pbdb = genus_df[genus_df['pbdb_taxon_id'].notna()]
genus_df_with_pbdb.shape

(1364, 16)

In [30]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index

log_df(PI_df)

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,,,,,,,,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,,,,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,,,,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,,,,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,,,,,,4


In [31]:
PI_df_no_pbdb = PI_df[PI_df['pbdb_taxon_id'].isna()]
PI_df_no_pbdb.shape

(792, 35)

In [32]:
PI_df.shape

(7763, 35)

In [33]:
temp_genus = set(PI_df_no_pbdb['genus name']).intersection(set(genus_df_with_pbdb['genus name']))
len(temp_genus)

145

In [34]:
merge = PI_df_no_pbdb.merge(genus_df_with_pbdb, on='genus name', suffixes=('_old', None))
merge = merge.drop_duplicates(subset='genus name')
log_df(merge)

(145, 50)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Aragonina aragonensis,Aragonia aragonensis,JAS: appears to be a misspelling; WoRMS,,,,Aragonia,,,...,279579.0,Rotaliida,432106,Loxostomatidae,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Bolvina sp. (q),? Bolivina sp.,"in PBDB, so it will be classified correctly","Since its just sp., hard to know what this is ...",,?,Bolivina,,,...,,,112279,Bolivinidae,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Bradynella subglobosa,,"in PBDB, so it will be classified correctly",http://www.marinespecies.org/foraminifera/aphi...,,,Bradynella,,,...,,,103771,Cassidulinidae,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Candiena sp.,Candeina sp.,"in PBDB, so it will be classified correctly",think this is it? https://www.marinespecies.or...,,,Candeina,,,...,,,422277,Candeinidae,288974,Foraminifera,212476,Rhizaria,,
5,benthic_forams,Clobocassidulina arata,Globocassidulina arata,JAS: appears to be a misspelling; WoRMS https:...,,,,Globocassidulina,,,...,,,103771,Cassidulinidae,288974,Foraminifera,212476,Rhizaria,,


In [35]:
merge['genus name'].unique()

array(['Aragonia', 'Bolivina', 'Bradynella', 'Candeina',
       'Globocassidulina', 'Cancris', 'Discopulvinulina',
       'Ellipsoglandulina', 'Ellipsolagena', 'Favocassidulina',
       'Gavelinopsis', 'Globotextularia', 'Helenina', 'Karreriella',
       'Martinottiella', 'Noviuva', 'Palaeonummulites', 'Paradentalina',
       'Praeglobobulimina', 'Pseudovalvulineria', 'Pulvinulinella',
       'Smyrnella', 'Stensioina', 'Trochamminoides', 'Acanthosphaeridium',
       'Amphiprora', 'Archaeomonas', 'Asterionella', 'Bacillaria',
       'Baxteria', 'Baxteriopsis', 'Benetorus', 'Bergonia',
       'Campylodiscus', 'Campyloneis', 'Campylosira', 'Cerataulina',
       'Cerataulus', 'Ceratoneis', 'Coscinodiscus', 'Cymatodiscus',
       'Cymatopleura', 'Denticula', 'Detonula', 'Diatomella',
       'Discodiscus', 'Ditylum', 'Endictya', 'Entogonia', 'Epithelion',
       'Eunotogramma', 'Glyphodesmis', 'Glyphodiscus', 'Grammatophora',
       'Gyrosigma', 'Hantzschia', 'Hendeya', 'Ikebea', 'Isthmia',


In [36]:
merge.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments',
       'pbdb_taxon_id_old', 'pbdb_taxon_name_old', 'pbdb_taxon_rank_old',
       'Corrections to pbdb_taxon_rank', 'family_taxon_id_old',
       'family_taxon_name_old', 'superfamily_taxon_id',
       'superfamily_taxon_name', 'order_taxon_id_old', 'order_taxon_name_old',
       'class_taxon_id_old', 'class_taxon_name_old', 'phylum_taxon_id_old',
       'phylum_taxon_name_old', 'kingdom_taxon_id_old',
       'kingdom_taxon_name_old', 'unranked clade_taxon_id_old',
       'unranked clade_taxon_name_old', 'row_index', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'class_taxon_id',
       'class_taxon_name', 'order_taxon_id', 'order_taxon_na

In [37]:
for col in merge.columns:
    if '_old' in col:
        print(col,'   ', merge[col].unique())

pbdb_taxon_id_old     [nan]
pbdb_taxon_name_old     [nan]
pbdb_taxon_rank_old     [nan]
family_taxon_id_old     [nan]
family_taxon_name_old     [nan]
order_taxon_id_old     [nan]
order_taxon_name_old     [nan]
class_taxon_id_old     [nan]
class_taxon_name_old     [nan]
phylum_taxon_id_old     [nan]
phylum_taxon_name_old     [nan]
kingdom_taxon_id_old     [nan]
kingdom_taxon_name_old     [nan]
unranked clade_taxon_id_old     [nan]
unranked clade_taxon_name_old     [nan]


In [38]:
cols = [    
    'pbdb_taxon_name','pbdb_taxon_rank', 
    'family_taxon_id', 'family_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name',
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id','unranked clade_taxon_name', 
    'pbdb_taxon_id', 
]
for index, row in merge.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]



In [39]:
del PI_df['row_index']

In [41]:
PI_df.to_csv(input_file, index=False)


## add all pbdb data to input file list for existing genus

add class to kingdom pbdb data to genus that have pdbdb taxon id

In [48]:
genus_df = pd.read_csv(genus_path, dtype=str)
log_df(genus_df)

(1707, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,
3,Abutilon,,,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,


In [49]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index
log_df(PI_df)

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,,,,,,,,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,,,,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,,,,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,,,,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,,,,,,4


In [50]:

PI_missing_higher_df = PI_df[(PI_df['pbdb_taxon_id'].notna()) &
                             (PI_df['kingdom_taxon_id'].isna()) & 
                             (PI_df['phylum_taxon_id'].isna()) & 
                             (PI_df['family_taxon_id'].isna()) &
                             (PI_df['order_taxon_id'].isna()) &
                             (PI_df['class_taxon_id'].isna()) 
                            ]
log_df(PI_missing_higher_df)

(6820, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,,,,,,,,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,,,,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,,,,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,,,,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,,,,,,4


In [51]:
set(PI_missing_higher_df['pbdb_taxon_id']) - set(genus_df['pbdb_taxon_id'])

set()

In [52]:
merge_df = PI_missing_higher_df.merge(genus_df, on='pbdb_taxon_id', suffixes=('_prev', None))
log_df(merge_df)

(6820, 50)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name_prev,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,103796.0,Chilostomellidae,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,82213.0,Alabaminidae,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,82213.0,Alabaminidae,288974,Foraminifera,212476,Rhizaria,,


In [53]:
merge_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name_prev', 'subgenera modifier',
       'subgenera name', 'species modifier', 'species name',
       'subspecies modifier', 'subspecies name', 'non-taxa descriptor',
       'comments', 'pbdb_taxon_id', 'pbdb_taxon_name_prev',
       'pbdb_taxon_rank_prev', 'Corrections to pbdb_taxon_rank',
       'family_taxon_id_prev', 'family_taxon_name_prev',
       'superfamily_taxon_id', 'superfamily_taxon_name', 'order_taxon_id_prev',
       'order_taxon_name_prev', 'class_taxon_id_prev', 'class_taxon_name_prev',
       'phylum_taxon_id_prev', 'phylum_taxon_name_prev',
       'kingdom_taxon_id_prev', 'kingdom_taxon_name_prev',
       'unranked clade_taxon_id_prev', 'unranked clade_taxon_name_prev',
       'row_index', 'genus name', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'class_taxon_id', 'class_taxon_name', 'order_taxon_id',
   

In [54]:
cols = [
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name',
    'family_taxon_id', 'family_taxon_name',
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'superfamily_taxon_id', 'superfamily_taxon_name', 
    'unranked clade_taxon_id','unranked clade_taxon_name', 
]
for index, row in merge_df.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]



In [55]:
del PI_df['row_index']

In [56]:
PI_df.to_csv(input_file, index=False)
# PI_df.to_csv('foo.csv', index=False)

# Update genus csv with genus in input file that don't have pbdb data

look for all genus that don't have pbdb_taxon_id in input file

In [101]:
PI_df = pd.read_csv(input_file, dtype=str)
log_df(PI_df)

(7763, 34)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [113]:
genus_df = pd.read_csv(genus_path, dtype=str)
log_df(genus_df)

(1707, 14)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae
3,Abutilon,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria


In [103]:
new_genus = PI_df[PI_df['genus name'].notna() &  PI_df['pbdb_taxon_id'].isna()][['genus name']].drop_duplicates()
log_df(new_genus)

(328, 1)


Unnamed: 0,genus name
12,Alabaminella
15,Alveolophragmium
96,Aragonia
112,Astrorhiza
113,Astrononion


In [105]:
for index, row in new_genus.iterrows():  
    if index % 20 == 0:
        print(index)
        
    time.sleep(0.5)
        
    name = row['genus name']

  
    url =  PBDB_TAXA_NAME +  name
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # print(index, name, ' found')
            new_genus.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            new_genus.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            new_genus.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(new_genus, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' items found')
    else:
        pass
#         print(name, ' not found')

1000
2080
3180
3480
3520
4160
5680
5820
6080
6960
7160


In [106]:
new_genus.sort_values('genus name', inplace=True)
log_df(new_genus)

(328, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
114,"""Astronion""",,,,,,,,,,,,,,,
242,"""Bulava""",,,,,,,,,,,,,,,
512,"""Discocibicides""",,,,,,,,,,,,,,,
5561,Abutilon,,,,,,,,,,,,,,,
3326,Acanthoica,,,,,,,,,,,,,,,


In [114]:
set(new_genus.columns) - set(genus_df.columns) 

{'unranked clade_taxon_id', 'unranked clade_taxon_name'}

In [115]:
genus_df['unranked clade_taxon_id'] = np.nan
genus_df['unranked clade_taxon_name'] = np.nan

In [116]:
new_genus.columns

Index(['genus name', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'class_taxon_id', 'class_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'family_taxon_id', 'family_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [117]:
cols = [
    'genus name', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'class_taxon_id', 'class_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'family_taxon_id', 'family_taxon_name', 
    'phylum_taxon_id','phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
]

genus_df = genus_df.reindex(columns=cols)
new_genus = new_genus.reindex(columns=cols)

In [118]:
new_df = pd.concat([genus_df, new_genus])
log_df(new_df)

(2035, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,
3,Abutilon,,,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,


In [131]:
cols = ['genus name', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'class_taxon_id', 'class_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'family_taxon_id', 'family_taxon_name', 
    'phylum_taxon_id','phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name']

new_df = new_df.drop_duplicates(subset=cols)
log_df(new_df)

(1751, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,
3,Abutilon,,,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,


In [133]:
new_df[new_df.duplicated('genus name')]

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name


In [134]:
new_df.to_csv(genus_path, index=False)

# update genus with no higher pbdb ranks

In [147]:
df = pd.read_csv(genus_path, dtype=str)

temp_df = df[
    df['unranked clade_taxon_id'].isna() &
    df['kingdom_taxon_id'].isna() &
    df['phylum_taxon_id'].isna() &
    df['family_taxon_id'].isna() &
    df['order_taxon_id'].isna() &
    df['class_taxon_id'].isna() &
    df['pbdb_taxon_id'].notna() 
]
log_df(temp_df)


(17, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
7,Acaciapollenites,249963,Acaciapollenites,genus,,,,,,,,,,,,
94,Anthocorys,159808,Anthocorys,genus,,,,,,,,,,,,
151,Australopollis,308943,Australopollis,genus,,,,,,,,,,,,
319,Chenopodipollis,252756,Chenopodipollis,genus,,,,,,,,,,,,
553,Echitricolporites,252828,Echitricolporites,genus,,,,,,,,,,,,


In [141]:
for index, row in temp_df.iterrows():  
        
    time.sleep(0.5)
        
    url =  PBDB_TAXA_ID + str(row['pbdb_taxon_id'])
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            rank = data[0]["taxon_rank"]
            temp_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            temp_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            temp_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
               
            round = 0
            get_parent_taxa(temp_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' found')
    else:
        print(name, ' not found')

In [144]:
for index, row in temp_df.iterrows():
    df.at[index, 'unranked clade_taxon_id'] = row['unranked clade_taxon_id']
    df.at[index, 'unranked clade_taxon_name'] = row['unranked clade_taxon_name']


In [145]:
df.to_csv(genus_path, index=False)

## add pbdb data to genus in input file that does not have pbdb taxon id

In [158]:
PI_df = pd.read_csv(input_file, dtype=str)
PI_df['row_index'] = PI_df.index
log_df(PI_df)

(7763, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,...,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476,Rhizaria,,,0
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,1
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,2
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,3
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,4


In [159]:
genus_df = pd.read_csv(genus_path, dtype=str)
genus_df = genus_df[genus_df['pbdb_taxon_id'].notna()]
log_df(genus_df)

(1402, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613,Ochrophyta,,,,
1,Abathomphalus,758,Abathomphalus,genus,,,,,,,288974,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139,Pinophyta,54311.0,Plantae,,
4,Abyssamina,762,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974,Foraminifera,212476.0,Rhizaria,,
5,Abyssocythere,23843,Abyssocythere,genus,22826.0,Ostracoda,23837.0,Podocopida,82093.0,Trachyleberididae,18891,Arthropoda,325038.0,Animalia,,


In [160]:
PI_update_df = PI_df[PI_df['pbdb_taxon_id'].isna() & PI_df['genus name'].notna()]
log_df(PI_update_df)

(634, 35)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,row_index
12,benthic_forams,Alabinella wedellensis,Alabaminella weddellensis,JAS: appears to be a misspelling; WoRMS,,,,Alabaminella,,,...,,,,,,,,,,12
15,benthic_forams,Alveolophraginium crassimargo,Alveolophragmium crassimargo,JAS: appears to be a misspelling; WoRMS,,,,Alveolophragmium,,,...,,,,,,,,,,15
96,benthic_forams,Aragonina velascoensis,Aragonia velascoensis,JAS: appears to be a misspelling; WoRMS,,,,Aragonia,,,...,,,,,,,,,,96
112,benthic_forams,Astrohiza granulosa,Astrorhiza granulosa,JAS: appears to be a misspelling; WoRMS,,,,Astrorhiza,,,...,,,,,,,,,,112
113,benthic_forams,Astronion astrale,Astrononion australe,JAS: appears to be a misspelling; WoRMS,,,,Astrononion,,,...,,,,,,,,,,113


In [161]:
merge_df = PI_update_df.merge(genus_df, on='genus name', how='left', suffixes=('_prev', None))
log_df(merge_df)

(634, 50)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Alabinella wedellensis,Alabaminella weddellensis,JAS: appears to be a misspelling; WoRMS,,,,Alabaminella,,,...,,,241423.0,Eponididae,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Alveolophraginium crassimargo,Alveolophragmium crassimargo,JAS: appears to be a misspelling; WoRMS,,,,Alveolophragmium,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Aragonina velascoensis,Aragonia velascoensis,JAS: appears to be a misspelling; WoRMS,,,,Aragonia,,,...,279579.0,Rotaliida,432106.0,Loxostomatidae,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Astrohiza granulosa,Astrorhiza granulosa,JAS: appears to be a misspelling; WoRMS,,,,Astrorhiza,,,...,,,147614.0,Astrorhizidae,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Astronion astrale,Astrononion australe,JAS: appears to be a misspelling; WoRMS,,,,Astrononion,,,...,,,82211.0,Nonionidae,288974,Foraminifera,212476,Rhizaria,,


In [162]:
genus_df.columns

Index(['genus name', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'class_taxon_id', 'class_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'family_taxon_id', 'family_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [163]:
cols = [
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'class_taxon_id', 'class_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'family_taxon_id', 'family_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name', 
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
]

for index, row in merge_df.iterrows():
    for col in cols:
        PI_df.at[row['row_index'], col] = row[col]
    

In [164]:
len(PI_df[PI_df['pbdb_taxon_id'].isna() & PI_df['genus name'].notna()])

453

In [165]:
PI_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'superfamily_taxon_id',
       'superfamily_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'unranked clade_taxon_id', 'unranked clade_taxon_name', 'row_index'],
      dtype='object')

In [166]:
del PI_df['row_index']

In [167]:
PI_df.to_csv(input_file, index=True)