# NOAA DSDP taxa list

Create list of taxa for NOAA DSDP files. Compare NOAA taxa with the taxa that the PIs have already approved in order create a list of unapproved taxa. Add PBDB data to unapproved taxa.

In [1]:
import sys
import csv
import glob
import os
import requests
import re

sys.path.append('../scripts/')
sys.path.append('../')
import pandas as pd
import numpy as np

import db 

In [2]:
base_directory = 'cleaned_data'
metadata_path = os.path.join(base_directory, 'metadata', 'noaa_dsdp_files.csv')
approved_taxa_path = os.path.join(base_directory, 'taxa', 'approved_eodp_taxa_list.csv')
taxa_all_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA','taxa_list_noaa_dsdp_all.csv')
taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA','taxa_list_noaa_dsdp.csv')
genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA', 'genus.csv')

## Create taxa list

In [3]:
metadata = pd.read_csv(metadata_path)
metadata.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,cleaned_data/NOAA_csv/DSDP_core_data/61/462/ra...,taxa,radiolarians,61,462
1,cleaned_data/NOAA_csv/DSDP_core_data/61/462/ag...,age,,61,462
2,cleaned_data/NOAA_csv/DSDP_core_data/61/462/b_...,taxa,benthic_foraminfera,61,462
3,cleaned_data/NOAA_csv/DSDP_core_data/61/462/p_...,taxa,planktic_foraminfera,61,462
4,cleaned_data/NOAA_csv/DSDP_core_data/61/462/hr...,hard_rock,,61,462


read all the taxa files to get unique taxa names

In [5]:
# 9933
taxa = set()

for index, row in metadata.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['path'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'].str.strip() +  '|' + row['taxon_group']
        
        taxa.update(df['temp'])
        
len(taxa)

9933

In [6]:
# 9932
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                '_simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

9932

create taxa list csv

In [7]:
taxa_df = pd.DataFrame(taxa_list)
taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name
0,Pseudotriceratium cinnamomeum,diatoms,Pseudotriceratium,Pseudotriceratium cinnamomeum,cinnamomeum,
1,Globigerinoides quadrilobatus,planktic_foraminfera,Globigerinoides,Globigerinoides quadrilobatus,quadrilobatus,
2,Lamptonium fabaeforme chaunothorax (q),radiolarians,Lamptonium,Lamptonium fabaeforme chaunothorax,fabaeforme,chaunothorax
3,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,
4,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata


In [8]:
taxa_df.to_csv(taxa_all_path, index=False)

## Compare NOAA taxa with approved taxa

get NOAA taxa

In [11]:
# 9932
noaa_taxa_df = pd.read_csv(taxa_all_path)

print(len(noaa_taxa_df))
noaa_taxa_df.head()

9932


Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name
0,Pseudotriceratium cinnamomeum,diatoms,Pseudotriceratium,Pseudotriceratium cinnamomeum,cinnamomeum,
1,Globigerinoides quadrilobatus,planktic_foraminfera,Globigerinoides,Globigerinoides quadrilobatus,quadrilobatus,
2,Lamptonium fabaeforme chaunothorax (q),radiolarians,Lamptonium,Lamptonium fabaeforme chaunothorax,fabaeforme,chaunothorax
3,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,
4,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata


get approved taxa

In [12]:
# 4600
approved_taxa_df = pd.read_csv(approved_taxa_path, 
                               usecols=['normalized_name', 'taxon_group', 'non-taxa descriptor'])
print(len(approved_taxa_df))
approved_taxa_df.head()

4600


Unnamed: 0,non-taxa descriptor,normalized_name,taxon_group
0,,Euuvigerina miozea,benthic_foraminfera
1,,Euuvigerina rodleyi,benthic_foraminfera
2,,Foraminifera indet.,benthic_foraminfera
3,,Pleurostomellia indet.,benthic_foraminfera
4,,Ostracoda indet.,benthic_foraminfera


add '_simplified_name' so we can compare NOAA taxa with approved taxa

In [13]:
approved_taxa_df['_simplified_name'] = (
    approved_taxa_df['normalized_name'].str.replace(' +\(.*?\)$', '')
)

In [14]:
approved_taxa_df[approved_taxa_df['non-taxa descriptor'].notna()].head(2)

Unnamed: 0,non-taxa descriptor,normalized_name,taxon_group,_simplified_name
170,crenulate,Bolivina cf. crenulata (crenulate),benthic_foraminfera,Bolivina cf. crenulata
171,smooth flat,Bolivina sp. (smooth flat),benthic_foraminfera,Bolivina sp.


In [15]:
del approved_taxa_df['normalized_name'] 
del approved_taxa_df['non-taxa descriptor'] 

In [16]:
# 4047
approved_taxa_df.drop_duplicates(inplace=True)

print(len(approved_taxa_df))
approved_taxa_df.head()

4047


Unnamed: 0,taxon_group,_simplified_name
0,benthic_foraminfera,Euuvigerina miozea
1,benthic_foraminfera,Euuvigerina rodleyi
2,benthic_foraminfera,Foraminifera indet.
3,benthic_foraminfera,Pleurostomellia indet.
4,benthic_foraminfera,Ostracoda indet.


merge NOAA taxa with approved taxa

In [17]:
# 9932
merged_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['_simplified_name', 'taxon_group'], 
                     how='left',
                     indicator='_merge_approved')

print(len(merged_df))
merged_df.head(6)

9932


Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name,_merge_approved
0,Pseudotriceratium cinnamomeum,diatoms,Pseudotriceratium,Pseudotriceratium cinnamomeum,cinnamomeum,,both
1,Globigerinoides quadrilobatus,planktic_foraminfera,Globigerinoides,Globigerinoides quadrilobatus,quadrilobatus,,both
2,Lamptonium fabaeforme chaunothorax (q),radiolarians,Lamptonium,Lamptonium fabaeforme chaunothorax,fabaeforme,chaunothorax,both
3,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,,left_only
4,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata,left_only
5,Dictyophyllidites sp.,pollen,Dictyophyllidites,Dictyophyllidites sp.,sp.,,left_only


save changes to taxa_all csv

In [39]:
merged_df.to_csv(taxa_all_path, index=False)

## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [18]:
merged_df = pd.read_csv(taxa_all_path)
merged_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name,_merge_approved
0,Pseudotriceratium cinnamomeum,diatoms,Pseudotriceratium,Pseudotriceratium cinnamomeum,cinnamomeum,,both
1,Globigerinoides quadrilobatus,planktic_foraminfera,Globigerinoides,Globigerinoides quadrilobatus,quadrilobatus,,both
2,Lamptonium fabaeforme chaunothorax (q),radiolarians,Lamptonium,Lamptonium fabaeforme chaunothorax,fabaeforme,chaunothorax,both
3,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,,left_only
4,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata,left_only


In [20]:
(7763, 7)
unapproved_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

print(unapproved_taxa_df.shape)
unapproved_taxa_df.head()

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name,_merge_approved
3,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,,left_only
4,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata,left_only
5,Dictyophyllidites sp.,pollen,Dictyophyllidites,Dictyophyllidites sp.,sp.,,left_only
6,Cassidulinella renulinoformis,benthic_foraminfera,Cassidulinella,Cassidulinella renulinoformis,renulinoformis,,left_only
8,Conococcolithus minutus,nannofossils,Conococcolithus,Conococcolithus minutus,minutus,,left_only


create csv of unapproved NOAA taxa

In [53]:
unapproved_taxa_df.to_csv(taxa_path, index=False)

## Add PBDB data for taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [59]:
unapproved_taxa_df = pd.read_csv(taxa_path)
unapproved_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name,_merge_approved
0,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,,left_only
1,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata,left_only
2,Dictyophyllidites sp.,pollen,Dictyophyllidites,Dictyophyllidites sp.,sp.,,left_only
3,Cassidulinella renulinoformis,benthic_foraminfera,Cassidulinella,Cassidulinella renulinoformis,renulinoformis,,left_only
4,Conococcolithus minutus,nannofossils,Conococcolithus,Conococcolithus minutus,minutus,,left_only


create a dataframe of unique genera

In [61]:
genus_df = pd.DataFrame(unapproved_taxa_df['genus name'].unique(), columns=['genus name'])

print(len(genus_df))
genus_df.head()

1707


Unnamed: 0,genus name
0,Rucinolithus
1,Melosira
2,Dictyophyllidites
3,Cassidulinella
4,Conococcolithus


add pbdb taxa data

In [60]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

In [50]:
for index, row in genus_df.iterrows():
    pass
    url =  PBDB_TAXA +  row['genus name']
    response =requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # cast taxon_no to string to avoid pandas converting it to a float           
            genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]
        


In [51]:
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Rucinolithus,87736.0,Rucinolithus,genus
1,Melosira,71289.0,Melosira,genus
2,Dictyophyllidites,252445.0,Dictyophyllidites,genus
3,Cassidulinella,1066.0,Cassidulinella,genus
4,Conococcolithus,,,


create genus csv

In [55]:
genus_df.to_csv(genus_path, index=False)

## add pbdb info to unapproved taxa 

In [67]:
genus_df = pd.read_csv(genus_path, dtype={'pbdb_taxon_id': str})
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Rucinolithus,87736.0,Rucinolithus,genus
1,Melosira,71289.0,Melosira,genus
2,Dictyophyllidites,252445.0,Dictyophyllidites,genus
3,Cassidulinella,1066.0,Cassidulinella,genus
4,Conococcolithus,,,


In [68]:
unapproved_taxa_df = pd.read_csv(taxa_path)

print(len(unapproved_taxa_df))
unapproved_taxa_df.head()

7763


Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name,_merge_approved
0,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,,left_only
1,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata,left_only
2,Dictyophyllidites sp.,pollen,Dictyophyllidites,Dictyophyllidites sp.,sp.,,left_only
3,Cassidulinella renulinoformis,benthic_foraminfera,Cassidulinella,Cassidulinella renulinoformis,renulinoformis,,left_only
4,Conococcolithus minutus,nannofossils,Conococcolithus,Conococcolithus minutus,minutus,,left_only


merge NOAA unapproved taxa with pbdb data

In [69]:
merged_df = pd.merge(unapproved_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

print(len(merged_df))
merged_df.head()

7763


Unnamed: 0,verbatim_name,taxon_group,genus name,_simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_merge_pbdb
0,Rucinolithus hayi (q),nannofossils,Rucinolithus,Rucinolithus hayi,hayi,,left_only,87736.0,Rucinolithus,genus,both
1,Melosira granulata curvata,diatoms,Melosira,Melosira granulata curvata,granulata,curvata,left_only,71289.0,Melosira,genus,both
2,Dictyophyllidites sp.,pollen,Dictyophyllidites,Dictyophyllidites sp.,sp.,,left_only,252445.0,Dictyophyllidites,genus,both
3,Cassidulinella renulinoformis,benthic_foraminfera,Cassidulinella,Cassidulinella renulinoformis,renulinoformis,,left_only,1066.0,Cassidulinella,genus,both
4,Conococcolithus minutus,nannofossils,Conococcolithus,Conococcolithus minutus,minutus,,left_only,,,,both


reorder columns and sort rows

In [74]:
merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', 
                                       'genus name', 'species name', 
                                       'subspecies name',
                                       'pbdb_taxon_id', 'pbdb_taxon_name',
                                       'pbdb_taxon_rank', '_simplified_name',
                                       '_merge_approved', '_merge_pbdb'
                                      ])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

merged_df.head()

Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
7261,benthic_foraminfera,Abyssamina incisa,Abyssamina,incisa,,762,Abyssamina,genus,Abyssamina incisa,left_only,both
7330,benthic_foraminfera,Adercotryma glomeratum,Adercotryma,glomeratum,,774,Adercotryma,genus,Adercotryma glomeratum,left_only,both
6161,benthic_foraminfera,Adercotryma sp.,Adercotryma,sp.,,774,Adercotryma,genus,Adercotryma sp.,left_only,both
89,benthic_foraminfera,Alabamina decorata,Alabamina,decorata,,788,Alabamina,genus,Alabamina decorata,left_only,both
7627,benthic_foraminfera,Alabamina haitiensis,Alabamina,haitiensis,,788,Alabamina,genus,Alabamina haitiensis,left_only,both


save csv

In [75]:
merged_df.to_csv(taxa_path, index=False)