#  NOAA Janus taxa list
## 101-210 taxa

Create list of taxa for NOAA Janus files. Compare NOAA taxa with the taxa that the PIs have already approved in order create a list of unapproved taxa. Add PBDB data to unapproved taxa.

In [199]:
import sys
import csv
import glob
import os
import requests
import re

sys.path.append('../scripts/')
sys.path.append('../')
import pandas as pd
import numpy as np

import db 
import normalize_taxa as nt


In [200]:
base_directory = 'cleaned_data'
date='2021-07-28'
metadata_path = os.path.join(base_directory, 'metadata', 'NOAA', 'noaa_janus_iodp_files.csv')
approved_taxa_path = os.path.join(base_directory, 'taxa', 'LIMS', f'taxa_list_{date}.csv')
noaa_1_96_taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_crosswalk_{date}.csv')
noaa_1_96_genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'genus_{date}.csv')

date='2021-08-03'
crosswalk_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_crosswalk_{date}.csv')
merged_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_merged_{date}.csv')
merged2_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_merged2_{date}.csv')

taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_list_{date}.csv')
genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA', f'genus_101_210_{date}.csv')
taxa_pbdb_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_list_pbdb_{date}.csv')


In [201]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## Create taxa list

In [202]:
metadata = pd.read_csv(metadata_path)
metadata = metadata[metadata['type'] == 'taxa']
log_df(metadata)

(2045, 5)


Unnamed: 0,path,type,expedition,site,taxon_group
436,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,taxa,135,835,nannofossils
437,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,taxa,135,834,nannofossils
438,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,taxa,135,834,nannofossils
439,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,taxa,135,841,nannofossils
440,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,taxa,135,841,benthic_foraminfera


In [203]:
common_fields = {
    'Data',
    'Age From (oldest)',
    'Age To (youngest)',
    'Zone From (bottom)',
    'Zone To  (top)',
    'Leg',
    'Site',
    'H',
    'Cor',
    'T',
    'Sc',
    'Top(cm)',
    'Depth (mbsf)',
    'Scientist',
    'Fossil Group',
    'Comment', 
    'Group Abundance',
    'Group Preservation',
}


In [204]:
path = 'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Miscellaneous.csv'
df = pd.read_csv(path)
df.dropna(axis=0, inplace=True, how='all')
file_taxa = set([col.strip() for col in df.columns]) - common_fields
file_taxa


# taxa = [col.strip() + '|' +  row['taxon_group'] for col in file_taxa if col is not None]
# taxa[0:5]

{'Inoceramus (prisms)'}

read all the taxa files to get unique taxa names

In [219]:
# 13285
taxa = set()

skip_files = [
    # needs fixing - Z, X; manually remove quotes
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1077/HOLE_A/Diatoms.csv', 
    # needs fixing - Reticulofenestra Z 
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Nannofossils.csv',
    # needs fixing - Form A, Form B    
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_B/Radiolarians.csv',
    # needs fixing - Form A      
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_B/Radiolarians.csv'
]

for index, row in metadata.iterrows():
    path = row['path']
    if path in skip_files:
        continue
        
    df = pd.read_csv(path)
    df.dropna(axis='index', inplace=True, how='all')
    
    file_taxa = set([col.strip() for col in df.columns]) - common_fields
    temp_taxa = [taxon.strip() + '|' +  row['taxon_group'] for taxon in file_taxa if isinstance(taxon, str)]    
    taxa.update(temp_taxa)
        
len(taxa)

13285

In [220]:
list(taxa)[20:25]

['Odontochitina spp.|dinoflagellates/acritarchs/prasinophytes',
 'Enneadocysta harrisii|nannofossils',
 'Tenuitella iota|planktic_foraminfera',
 'Stereisporites taxa|pollen',
 'Globorotalia inflata (4 chambered)|nannofossils']

In [224]:
# 13285
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                'simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

13285

create taxa list csv

In [225]:
taxa_df = pd.DataFrame(taxa_list)
log_df(taxa_df)

(13285, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,
1,Pseudoclavulina rugolosa,benthic_foraminfera,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,
3,Globanomalina planocompressa,planktic_foraminfera,Globanomalina,Globanomalina planocompressa,planocompressa,
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,


In [226]:
taxa_df.to_csv(crosswalk_path, index=False)

## compare and replace taxon groups 

In [227]:
noaa_taxa_df = pd.read_csv(crosswalk_path)
approved_taxa_df = pd.read_csv(approved_taxa_path)

In [228]:
approved_groups = list(approved_taxa_df['taxon_group'].unique())
approved_groups.sort()
approved_groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [229]:
noaa_groups = list(noaa_taxa_df['taxon_group'].unique())
noaa_groups.sort()
noaa_groups

['benthic_foraminfera',
 'bolboformids',
 'diatoms',
 'dinoflagellates/acritarchs/prasinophytes',
 'macrofossils',
 'miscellaneous',
 'nannofossils',
 'ostracods',
 'planktic_foraminfera',
 'pollen',
 'pteropods',
 'radiolarians',
 'silicoflagellates/ebridians/actiniscidians',
 'sponge_spicules',
 'trace_fossils']

In [230]:
set(noaa_groups) -  set(approved_groups)

{'benthic_foraminfera',
 'dinoflagellates/acritarchs/prasinophytes',
 'macrofossils',
 'miscellaneous',
 'planktic_foraminfera',
 'pollen',
 'pteropods',
 'silicoflagellates/ebridians/actiniscidians',
 'sponge_spicules',
 'trace_fossils'}

In [231]:
noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],
                                    ['benthic_forams', 'planktic_forams'])
noaa_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,


In [232]:
noaa_taxa_df.to_csv(crosswalk_path, index=False)

## combine noaa taxa

In [233]:
# 9932

noaa_1_taxa_df = pd.read_csv(noaa_1_96_taxa_path)
log_df(noaa_1_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Reticulofenestra minutulus (q),nannofossils,Reticulofenestra,Reticulofenestra minutulus,minutulus,
1,Theocotyle alpha,radiolarians,Theocotyle,Theocotyle alpha,alpha,
2,Coscinodiscus kuetzingii,diatoms,Coscinodiscus,Coscinodiscus kuetzingii,kuetzingii,
3,Fragilaria hirosakiensis,diatoms,Fragilaria,Fragilaria hirosakiensis,hirosakiensis,
4,Globorotalia truncatulinoides (sin),planktic_forams,Globorotalia,Globorotalia truncatulinoides,truncatulinoides,


In [235]:
# 13285

noaa_taxa_df = pd.read_csv(crosswalk_path)
log_df(noaa_taxa_df)

(13285, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,


In [236]:
# 23217

combined_df = pd.concat([noaa_1_taxa_df, noaa_taxa_df])
log_df(combined_df)

(23217, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Reticulofenestra minutulus (q),nannofossils,Reticulofenestra,Reticulofenestra minutulus,minutulus,
1,Theocotyle alpha,radiolarians,Theocotyle,Theocotyle alpha,alpha,
2,Coscinodiscus kuetzingii,diatoms,Coscinodiscus,Coscinodiscus kuetzingii,kuetzingii,
3,Fragilaria hirosakiensis,diatoms,Fragilaria,Fragilaria hirosakiensis,hirosakiensis,
4,Globorotalia truncatulinoides (sin),planktic_forams,Globorotalia,Globorotalia truncatulinoides,truncatulinoides,


In [237]:
# 20281

combined_df = combined_df.drop_duplicates()
log_df(combined_df)

(20281, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Reticulofenestra minutulus (q),nannofossils,Reticulofenestra,Reticulofenestra minutulus,minutulus,
1,Theocotyle alpha,radiolarians,Theocotyle,Theocotyle alpha,alpha,
2,Coscinodiscus kuetzingii,diatoms,Coscinodiscus,Coscinodiscus kuetzingii,kuetzingii,
3,Fragilaria hirosakiensis,diatoms,Fragilaria,Fragilaria hirosakiensis,hirosakiensis,
4,Globorotalia truncatulinoides (sin),planktic_forams,Globorotalia,Globorotalia truncatulinoides,truncatulinoides,


## Compare NOAA 1-96 taxa with NOAA 101-210

In [238]:
noaa_1_taxa_df = pd.read_csv(noaa_1_96_taxa_path, usecols=['simplified_name', 'taxon_group'])
log_df(noaa_1_taxa_df)

(9932, 2)


Unnamed: 0,taxon_group,simplified_name
0,nannofossils,Reticulofenestra minutulus
1,radiolarians,Theocotyle alpha
2,diatoms,Coscinodiscus kuetzingii
3,diatoms,Fragilaria hirosakiensis
4,planktic_forams,Globorotalia truncatulinoides


In [239]:
noaa_taxa_df = pd.read_csv(crosswalk_path)
log_df(noaa_taxa_df)

(13285, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,


### merge NOAA taxa

In [241]:
# 14140 
merged_df = pd.merge(noaa_taxa_df, noaa_1_taxa_df,  
                     on=['simplified_name', 'taxon_group' ], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged_df)


(14140, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,left_only


In [243]:
# 13285
merged_df = merged_df.drop_duplicates()
log_df(merged_df)

(13285, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,left_only


In [244]:
merged_df.to_csv(merged_path, index=False)


In [246]:
# 14511 
merged2_df = pd.merge(noaa_taxa_df, noaa_1_taxa_df,  
                     on=['simplified_name'], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged2_df)


(14511, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,,left_only


In [248]:
# 13296
merged2_df = merged2_df.drop_duplicates()
log_df(merged2_df)


(13296, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,,left_only


save changes to taxa_all csv

In [249]:
merged2_df.to_csv(merged2_path, index=False)

the reason for the count difference is because a taxa in multiple taxon groups


## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [250]:
merged_df = pd.read_csv(merged_path)
log_df(merged_df)

(13285, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,left_only


In [252]:
# (10099, 7)
unapproved_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

log_df(unapproved_taxa_df)

(10099, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,left_only


create csv of unapproved NOAA taxa

In [253]:
unapproved_taxa_df.to_csv(taxa_path, index=False)

## Add PBDB data for taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [254]:
unapproved_taxa_df = pd.read_csv(taxa_path)
log_df(unapproved_taxa_df)

(10099, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Talimudinium scissurum,dinoflagellates/acritarchs/prasinophytes,Talimudinium,Talimudinium scissurum,scissurum,,left_only
1,Pseudoclavulina rugolosa,benthic_forams,Pseudoclavulina,Pseudoclavulina rugolosa,rugolosa,,left_only
2,Pentadinium goniferum,nannofossils,Pentadinium,Pentadinium goniferum,goniferum,,left_only
3,Globanomalina planocompressa,planktic_forams,Globanomalina,Globanomalina planocompressa,planocompressa,,left_only
4,Obliquipithonella multistrata,dinoflagellates/acritarchs/prasinophytes,Obliquipithonella,Obliquipithonella multistrata,multistrata,,left_only


create a dataframe of unique genera

In [260]:
unapproved_genus_df = pd.DataFrame(unapproved_taxa_df['genus name'].unique(), columns=['genus name'])

log_df(unapproved_genus_df)

(2554, 1)


Unnamed: 0,genus name
0,Talimudinium
1,Pseudoclavulina
2,Pentadinium
3,Globanomalina
4,Obliquipithonella


In [261]:
noaa_1_96_genus_df = pd.read_csv(noaa_1_96_genus_path)
log_df(noaa_1_96_genus_df)

(1707, 4)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Astromma,33.0,Astromma,genus
1,Lagena,1739.0,Lagena,genus
2,Cretarhabdus,87816.0,Cretarhabdus,genus
3,Fasciculithus,424283.0,Fasciculithus,genus
4,Coscinodiscus,71292.0,Coscinodiscus,genus


In [262]:
janus_genus = set(unapproved_genus_df['genus name']) - set(noaa_1_96_genus_df['genus name'])
len(janus_genus)

1651

add pbdb taxa data

In [34]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

In [35]:
for index, row in genus_df.iterrows():
    if index % 50 == 0:
        print(index, end=' ')
        
    url =  PBDB_TAXA +  row['genus name']
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # cast taxon_no to string to avoid pandas converting it to a float           
            genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 

In [36]:
genus_df.head()

Unnamed: 0,genus name
0,Dictyocha
1,Planorotalites
2,Globorotalia
3,Globoquadrina
4,Spirocyrtis


create genus csv

In [48]:
genus_df.to_csv(genus_path, index=False)

## add pbdb info to unapproved taxa 

In [37]:
genus_df = pd.read_csv(genus_path, dtype={'pbdb_taxon_id': str})
log_df(genus_df)

(1707, 4)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Astromma,33,Astromma,genus
1,Lagena,1739,Lagena,genus
2,Cretarhabdus,87816,Cretarhabdus,genus
3,Fasciculithus,424283,Fasciculithus,genus
4,Coscinodiscus,71292,Coscinodiscus,genus


In [38]:
unapproved_df = pd.read_csv(taxa_path)

log_df(unapproved_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Dictyocha brevispina brevispina (q),silicoflagellates,Dictyocha,Dictyocha brevispina brevispina,brevispina,brevispina,left_only
1,Planorotalites ehrenbergi,planktic_forams,Planorotalites,Planorotalites ehrenbergi,ehrenbergi,,left_only
2,Globorotalia miozea sphericomiozea,planktic_forams,Globorotalia,Globorotalia miozea sphericomiozea,miozea,sphericomiozea,left_only
3,Globoquadrina globosa,planktic_forams,Globoquadrina,Globoquadrina globosa,globosa,,left_only
4,Spirocyrtis scalaris,radiolarians,Spirocyrtis,Spirocyrtis scalaris,scalaris,,left_only


merge NOAA unapproved taxa with pbdb data

In [39]:
merged_df = pd.merge(unapproved_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

log_df(merged_df)

(7763, 11)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_merge_pbdb
0,Dictyocha brevispina brevispina (q),silicoflagellates,Dictyocha,Dictyocha brevispina brevispina,brevispina,brevispina,left_only,71284,Dictyocha,genus,both
1,Planorotalites ehrenbergi,planktic_forams,Planorotalites,Planorotalites ehrenbergi,ehrenbergi,,left_only,2146,Planorotalites,genus,both
2,Globorotalia miozea sphericomiozea,planktic_forams,Globorotalia,Globorotalia miozea sphericomiozea,miozea,sphericomiozea,left_only,1521,Globorotalia,genus,both
3,Globoquadrina globosa,planktic_forams,Globoquadrina,Globoquadrina globosa,globosa,,left_only,1518,Globoquadrina,genus,both
4,Spirocyrtis scalaris,radiolarians,Spirocyrtis,Spirocyrtis scalaris,scalaris,,left_only,654,Spirocyrtis,genus,both


reorder columns and sort rows

In [40]:
merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', 
                                       'genus name', 'species name', 
                                       'subspecies name',
                                       'pbdb_taxon_id', 'pbdb_taxon_name',
                                       'pbdb_taxon_rank', '_simplified_name',
                                       '_merge_approved', '_merge_pbdb'
                                      ])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

log_df(merged_df)

(7763, 11)


Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
822,benthic_forams,Abyssamina incisa,Abyssamina,incisa,,762,Abyssamina,genus,,left_only,both
6100,benthic_forams,Adercotryma glomeratum,Adercotryma,glomeratum,,774,Adercotryma,genus,,left_only,both
2822,benthic_forams,Adercotryma sp.,Adercotryma,sp.,,774,Adercotryma,genus,,left_only,both
6167,benthic_forams,Alabamina decorata,Alabamina,decorata,,788,Alabamina,genus,,left_only,both
3517,benthic_forams,Alabamina haitiensis,Alabamina,haitiensis,,788,Alabamina,genus,,left_only,both


save csv

In [41]:
merged_df.to_csv(taxa_pbdb_path, index=False)