#  NOAA DSDP taxa list
## 1-96 taxa

Create list of taxa for NOAA DSDP files. Compare NOAA taxa with the taxa that the PIs have already approved in order create a list of unapproved taxa. Add PBDB data to unapproved taxa.

In [1]:
import sys
import csv
import glob
import os
import requests
import re

sys.path.append('../scripts/')
sys.path.append('../')
import pandas as pd
import numpy as np

import db 
import normalize_taxa as nt


In [88]:
base_directory = 'cleaned_data'
date='2021-07-28'
metadata_path = os.path.join(base_directory, 'metadata', 'NOAA', 'noaa_dsdp_files.csv')
approved_taxa_path = os.path.join(base_directory, 'taxa', 'LIMS', f'taxa_list_{date}.csv')

crosswalk_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_crosswalk_{date}.csv')
merged_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_merged_{date}.csv')
merged2_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_merged2_{date}.csv')

taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_list_{date}.csv')
genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA', f'genus_{date}.csv')
taxa_pbdb_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_list_pbdb_{date}.csv')


In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## Create taxa list

In [4]:
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,cleaned_data/NOAA_csv/DSDP_core_data/61/462/ra...,taxa,radiolarians,61,462
1,cleaned_data/NOAA_csv/DSDP_core_data/61/462/ag...,age,,61,462
2,cleaned_data/NOAA_csv/DSDP_core_data/61/462/b_...,taxa,benthic_foraminfera,61,462
3,cleaned_data/NOAA_csv/DSDP_core_data/61/462/p_...,taxa,planktic_foraminfera,61,462
4,cleaned_data/NOAA_csv/DSDP_core_data/61/462/hr...,hard_rock,,61,462


read all the taxa files to get unique taxa names

In [5]:
# 9933
taxa = set()

for index, row in metadata.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['path'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'].str.strip() +  '|' + row['taxon_group']
        
        taxa.update(df['temp'])
        
len(taxa)

9933

In [6]:
list(taxa)[0:20]

[nan,
 'Triloculina tricarinata|benthic_foraminfera',
 'Globigerapsis semiinvoluta|planktic_foraminfera',
 'Engelhardtia sp.(q)|pollen',
 'Bulimina pagoda|benthic_foraminfera',
 'Thoracosphaera granifera|nannofossils',
 'Discolithina enormis|nannofossils',
 'Amphipyndax sp.|radiolarians',
 'Distephanus speculum speculum pseudofibula (q)|silicoflagellates',
 'Coccolithus formosus|nannofossils',
 'Anomalinoides globulosus|benthic_foraminfera',
 'Cleistosphaeridium sp.|dinoflagellates',
 'Distephanus speculum patulus|silicoflagellates',
 'Samlandia chlamydophora|dinoflagellates',
 'Coccolithus primalis|nannofossils',
 'Cibicidoides robertsonianus (q)|benthic_foraminfera',
 'Tricolpites reticulatus (q)|pollen',
 'Asteromphalus parvulus|diatoms',
 'Botryocyrtis scutum|radiolarians',
 'Polysolenia murrayana|radiolarians']

In [7]:
# 9932
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                'simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

9932

create taxa list csv

In [8]:
taxa_df = pd.DataFrame(taxa_list)
log_df(taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Triloculina tricarinata,benthic_foraminfera,Triloculina,Triloculina tricarinata,tricarinata,
1,Globigerapsis semiinvoluta,planktic_foraminfera,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,
3,Bulimina pagoda,benthic_foraminfera,Bulimina,Bulimina pagoda,pagoda,
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,


In [9]:
taxa_df.to_csv(crosswalk_path, index=False)

## compare and replace taxon groups 

In [None]:
noaa_taxa_df = pd.read_csv(crosswalk_path)
approved_taxa_df = pd.read_csv(approved_taxa_path)

In [27]:
approved_groups = list(approved_taxa_df['taxon_group'].unique())
approved_groups.sort()
approved_groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [28]:
noaa_groups = list(noaa_taxa_df['taxon_group'].unique())
noaa_groups.sort()
noaa_groups

['benthic_foraminfera',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'phytoliths',
 'planktic_foraminfera',
 'pollen',
 'radiolarians',
 'silicoflagellates']

In [29]:
set(noaa_groups) -  set(approved_groups)

{'benthic_foraminfera', 'phytoliths', 'planktic_foraminfera', 'pollen'}

In [31]:
noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],
                                    ['benthic_forams', 'planktic_forams'])
noaa_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Triloculina tricarinata,benthic_forams,Triloculina,Triloculina tricarinata,tricarinata,
1,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,
3,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,


In [32]:
noaa_taxa_df.to_csv(crosswalk_path, index=False)

## Compare NOAA taxa with approved taxa

get NOAA taxa

In [33]:
# 9932
noaa_taxa_df = pd.read_csv(crosswalk_path)

log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Triloculina tricarinata,benthic_forams,Triloculina,Triloculina tricarinata,tricarinata,
1,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,
3,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,


get approved taxa

In [34]:
# 4209
approved_taxa_df = pd.read_csv(approved_taxa_path)
log_df(approved_taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


add simplified_name without descriptor

In [35]:
del approved_taxa_df['normalized_name'] 


In [36]:
approved_taxa_df = nt.add_normalized_name_column(approved_taxa_df, 
                                                 include_descriptor=False, 
                                                 col_name="simplified_name")

In [37]:
approved_taxa_df[approved_taxa_df['non-taxa descriptor'].notna()].head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,taxon_group,simplified_name
28,,,Globigerinoides,,,,sacculifer,,,without sac,planktic_forams,Globigerinoides sacculifer
201,,,Bolivina,,,cf.,crenulata,,,crenulate,benthic_forams,Bolivina cf. crenulata


In [38]:
approved_taxa_df = pd.DataFrame(approved_taxa_df[['taxon_group', 'simplified_name']])
log_df(approved_taxa_df)

(4209, 2)


Unnamed: 0,taxon_group,simplified_name
0,benthic_forams,Euuvigerina miozea
1,benthic_forams,Euuvigerina rodleyi
2,benthic_forams,Foraminifera indet.
3,benthic_forams,Pleurostomellidae indet.
4,benthic_forams,Ostracoda indet.


### merge NOAA taxa with approved taxa

In [57]:
# 10109 
merged_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['simplified_name', 'taxon_group' ], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged_df)


(10109, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Triloculina tricarinata,benthic_forams,Triloculina,Triloculina tricarinata,tricarinata,,both
1,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,left_only
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,left_only
3,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,left_only
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,left_only


In [58]:
merged_df.to_csv(merged_path, index=False)


In [61]:
# 10114 
merged2_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['simplified_name'], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged2_df)


(10114, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Triloculina tricarinata,benthic_forams,Triloculina,Triloculina tricarinata,tricarinata,,benthic_forams,both
1,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,,left_only
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,,left_only
3,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,,left_only
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,,left_only


save changes to taxa_all csv

In [62]:
merged2_df.to_csv(merged2_path, index=False)

the reason for the count difference is because the LIMS taxa list sometimes puts a taxa in two groups

NOAA: Selenopemphix nephroides - dinoflagellates 
LIMS: Selenopemphix nephroides - dinoflagellates, palynology

## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [66]:
merged_df = pd.read_csv(merged_path)
log_df(merged_df)

(10109, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Triloculina tricarinata,benthic_forams,Triloculina,Triloculina tricarinata,tricarinata,,both
1,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,left_only
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,left_only
3,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,left_only
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,left_only


In [67]:
(7763, 7)
unapproved_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

log_df(unapproved_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
1,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,left_only
2,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,left_only
3,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,left_only
4,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,left_only
5,Discolithina enormis,nannofossils,Discolithina,Discolithina enormis,enormis,,left_only


create csv of unapproved NOAA taxa

In [68]:
unapproved_taxa_df.to_csv(taxa_path, index=False)

## Add PBDB data for taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [70]:
unapproved_taxa_df = pd.read_csv(taxa_path)
log_df(unapproved_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,left_only
1,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,left_only
2,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,left_only
3,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,left_only
4,Discolithina enormis,nannofossils,Discolithina,Discolithina enormis,enormis,,left_only


create a dataframe of unique genera

In [71]:
genus_df = pd.DataFrame(unapproved_taxa_df['genus name'].unique(), columns=['genus name'])

log_df(genus_df)

(1707, 1)


Unnamed: 0,genus name
0,Globigerapsis
1,Engelhardtia
2,Bulimina
3,Thoracosphaera
4,Discolithina


add pbdb taxa data

In [72]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

In [78]:
for index, row in genus_df.iterrows():
    if index % 50 == 0:
        print(index, end=' ')
        
    url =  PBDB_TAXA +  row['genus name']
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # cast taxon_no to string to avoid pandas converting it to a float           
            genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 

In [79]:
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Globigerapsis,1497.0,Globigerapsis,genus
1,Engelhardtia,,,
2,Bulimina,1032.0,Bulimina,genus
3,Thoracosphaera,432568.0,Thoracosphaera,genus
4,Discolithina,87709.0,Discolithina,genus


create genus csv

In [80]:
genus_df.to_csv(genus_path, index=False)

## add pbdb info to unapproved taxa 

In [82]:
genus_df = pd.read_csv(genus_path, dtype={'pbdb_taxon_id': str})
log_df(genus_df)

(1707, 4)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Globigerapsis,1497.0,Globigerapsis,genus
1,Engelhardtia,,,
2,Bulimina,1032.0,Bulimina,genus
3,Thoracosphaera,432568.0,Thoracosphaera,genus
4,Discolithina,87709.0,Discolithina,genus


In [83]:
unapproved_df = pd.read_csv(taxa_path)

log_df(unapproved_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,left_only
1,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,left_only
2,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,left_only
3,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,left_only
4,Discolithina enormis,nannofossils,Discolithina,Discolithina enormis,enormis,,left_only


merge NOAA unapproved taxa with pbdb data

In [86]:
merged_df = pd.merge(unapproved_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

log_df(merged_df)

(7763, 11)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_merge_pbdb
0,Globigerapsis semiinvoluta,planktic_forams,Globigerapsis,Globigerapsis semiinvoluta,semiinvoluta,,left_only,1497.0,Globigerapsis,genus,both
1,Engelhardtia sp.(q),pollen,Engelhardtia,Engelhardtia sp.,sp.,,left_only,,,,both
2,Bulimina pagoda,benthic_forams,Bulimina,Bulimina pagoda,pagoda,,left_only,1032.0,Bulimina,genus,both
3,Thoracosphaera granifera,nannofossils,Thoracosphaera,Thoracosphaera granifera,granifera,,left_only,432568.0,Thoracosphaera,genus,both
4,Discolithina enormis,nannofossils,Discolithina,Discolithina enormis,enormis,,left_only,87709.0,Discolithina,genus,both


reorder columns and sort rows

In [87]:
merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', 
                                       'genus name', 'species name', 
                                       'subspecies name',
                                       'pbdb_taxon_id', 'pbdb_taxon_name',
                                       'pbdb_taxon_rank', '_simplified_name',
                                       '_merge_approved', '_merge_pbdb'
                                      ])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

log_df(merged_df)

(7763, 11)


Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
236,benthic_forams,Abyssamina incisa,Abyssamina,incisa,,762,Abyssamina,genus,,left_only,both
2142,benthic_forams,Adercotryma glomeratum,Adercotryma,glomeratum,,774,Adercotryma,genus,,left_only,both
4841,benthic_forams,Adercotryma sp.,Adercotryma,sp.,,774,Adercotryma,genus,,left_only,both
198,benthic_forams,Alabamina decorata,Alabamina,decorata,,788,Alabamina,genus,,left_only,both
3369,benthic_forams,Alabamina haitiensis,Alabamina,haitiensis,,788,Alabamina,genus,,left_only,both


save csv

In [89]:
merged_df.to_csv(taxa_pbdb_path, index=False)