# NOAA DSDP taxa list

create list of taxa for NOAA files

In [22]:
import sys
import csv
import glob
import os
sys.path.append('../scripts/')
sys.path.append('../')

import pandas as pd
import numpy as np

import db 

In [35]:
path = os.path.join('cleaned_data', 'metadata', 'noaa_dsdp_files.csv')
files_df = pd.read_csv(path)

## columns

In [14]:
columns = set()
columns_count = set()

for index, row in files_df.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['file'], nrows=1)
        columns.update(df.columns)
        columns_count.add(len(df.columns))

In [8]:
columns

{'age',
 'bottom interval depth (cm)',
 'chemical dissolution',
 'chemical overgrowth',
 'core',
 'coredepth(m)',
 'dsdp initial report volume number',
 'fossil',
 'fossil abundance',
 'fossil code',
 'fossil group',
 'fossil preservation',
 'group abundance',
 'hole',
 'investigators name',
 'leg',
 'mechanical preservations',
 'page number reference',
 'publication date (month/year)',
 'record join code',
 'sample depth(m)',
 'section',
 'site',
 'top interval depth(cm)',
 'total number of observed fossils'}

In [9]:
columns_count

{25}

## taxa

create list of unique taxa names

read taxa files to get unique taxa names

In [94]:
taxa = set()

for index, row in files_df.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['file'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'] +  '|' + row['taxon_group']
        
        taxa.update(list(df['temp']))
        
print(len(taxa))

9933


In [95]:
taxa_list = []
taxon_groups = []

for taxon in taxa:
    if not pd.isna(taxon):
        parts = taxon.split('|')
        taxa_list.append(parts[0])
        taxon_groups.append(parts[1])

In [161]:
dict = {
        "taxon": taxa_list,
        "taxon_group": taxon_groups
        }
taxa_df = pd.DataFrame(dict)
taxa_df.tail()

Unnamed: 0,taxon,taxon_group
9927,Membranilarnacia ursulae,dinoflagellates
9928,Ellipsoglandulina subconica,benthic foraminifera
9929,Ericsonia fenestra (small),nannofossils
9930,Cassigerinella chipolensis,planktic foraminfera
9931,Melonis pacificum,benthic foraminifera


In [163]:
taxa_df.shape

(9932, 2)

get rid of (xxx) from taxa name

In [164]:
taxa_clean_df = taxa_df.replace(to_replace =' \(.*?\)$', value = '', regex = True)
taxa_clean_df.drop_duplicates(inplace=True)
taxa_clean_df.tail()

Unnamed: 0,taxon,taxon_group
9926,Guttulina irregularis,benthic foraminifera
9927,Membranilarnacia ursulae,dinoflagellates
9928,Ellipsoglandulina subconica,benthic foraminifera
9930,Cassigerinella chipolensis,planktic foraminfera
9931,Melonis pacificum,benthic foraminifera


In [165]:
taxa_clean_df.shape

(8683, 2)

check if taxa exists in database

In [181]:
# https://stackoverflow.com/a/56012147
!pip install psycopg2



In [182]:
# https://www.datacamp.com/community/tutorials/tutorial-postgresql-python

sql = "select name, pbdb_taxon_id, pbdb_taxon_name, pbdb_taxon_rank from taxa"
db_taxa_df = pd.read_sql(sql, db.conn)
db_taxa_df.head()

Unnamed: 0,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Algirosphaera robusta,424331,Algirosphaera,genus
1,Amaurolithus tricorniculatus,388308,Amaurolithus,genus
2,? Biantholithus flosculus,424468,Biantholithus,genus
3,Blackites creber,424333,Blackites,genus
4,Blackites spinosus,424333,Blackites,genus


merge NOAA taxa with database taxa

In [167]:
merged_db_df = pd.merge(taxa_clean_df, db_taxa_df,  left_on='taxon', right_on='name', how='left')
merged_db_df.head()

Unnamed: 0,taxon,taxon_group,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Distephanus speculum septenarius,silicoflagellates,,,,
1,Distephanus minutus,silicoflagellates,,,,
2,Spinidinium macmurdoense,dinoflagellates,,,,
3,Mesocena triangula,silicoflagellates,,,,
4,Globorotalia humilis,planktic foraminfera,,,,


In [168]:
merged_db_df.shape

(8683, 6)

get all lims taxa

In [169]:
lims_path = os.path.join('raw_data', 'taxa', 'Micropal_headers_PBDB_Taxonomy_notes_2021-04-23.csv')
lims_df = pd.read_csv(lims_path)
lims_df.shape

(4721, 21)

In [170]:
lims_df['normalized_name'] = np.where(
    lims_df['name to use (if different from "name")'].notnull(),
    lims_df['name to use (if different from "name")'],
    lims_df['name']
)

lims_name_df = lims_df[['normalized_name']]

lims_name_df.head()

Unnamed: 0,normalized_name
0,Pyrite
1,Euuvigerina miozea
2,Euuvigerina rodleyi
3,Foraminifera indet.
4,Pleurostomellia indet.


In [172]:
lims_name_df.shape

(4721, 1)

merge NOAA/db taxa with LIMS taxa

In [173]:
merged_lims_df = pd.merge(merged_db_df, lims_name_df,  left_on='taxon', right_on='normalized_name', how='left')
merged_lims_df.head()

Unnamed: 0,taxon,taxon_group,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
0,Distephanus speculum septenarius,silicoflagellates,,,,,
1,Distephanus minutus,silicoflagellates,,,,,
2,Spinidinium macmurdoense,dinoflagellates,,,,,Spinidinium macmurdoense
3,Spinidinium macmurdoense,dinoflagellates,,,,,Spinidinium macmurdoense
4,Mesocena triangula,silicoflagellates,,,,,


In [176]:
merged_lims_df.shape

(9004, 7)

look up pbdb

In [183]:
filtered_lims_df = merged_lims_df[merged_lims_df['normalized_name'].isna()].copy()
filtered_lims_df.drop_duplicates(inplace=True)

filtered_lims_df.shape

(6976, 7)

In [187]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

'Leiotriletes'

In [244]:
import requests
import re


for index, row in filtered_lims_df.iterrows():
    url =  PBDB_TAXA + re.sub(' sp\.$', '', row['taxon'])
    response =requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            filtered_lims_df.at[index, 'pbdb_taxon_id'] = data[0]["taxon_no"]
            filtered_lims_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            filtered_lims_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]


In [252]:
filtered_lims_df.head()

Unnamed: 0,taxon,taxon_group,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
0,Distephanus speculum septenarius,silicoflagellates,,88737.0,,,
1,Distephanus minutus,silicoflagellates,,88737.0,,,
4,Mesocena triangula,silicoflagellates,,,,,
5,Globorotalia humilis,planktic foraminfera,,,,,
6,Cymatosira sp.,diatoms,,82156.0,Cymatosira,genus,


In [250]:
filtered_lims_df.shape

(6976, 8)

create csv

In [253]:
output_df = filtered_lims_df
output_df.drop_duplicates(inplace=True)
output_df.shape

(6976, 7)

In [254]:
path = os.path.join('cleaned_data', 'taxa', 'noaa_dsdp_taxa_list.csv')
output_df.to_csv(path, index=False)