# NOAA DSDP taxa list

create list of taxa for NOAA DSDP files

check if postgres connector is installed.

In [1]:
# https://stackoverflow.com/a/56012147
!pip install psycopg2-binary

You should consider upgrading via the '/Users/wyk/.pyenv/versions/3.6.8/envs/eodp-data/bin/python -m pip install --upgrade pip' command.[0m


In [17]:
import sys
import csv
import glob
import os
import requests
import re

sys.path.append('../scripts/')
sys.path.append('../')
import pandas as pd
import numpy as np

import db 

In [3]:
path = os.path.join('cleaned_data', 'metadata', 'noaa_dsdp_files.csv')
files_df = pd.read_csv(path)

## columns

get unique column names from all the taxa files

In [4]:
columns = set()
columns_count = set()

for index, row in files_df.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['file'], nrows=1)
        columns.update(df.columns)
        columns_count.add(len(df.columns))

In [5]:
columns

{'age',
 'bottom interval depth (cm)',
 'chemical dissolution',
 'chemical overgrowth',
 'core',
 'coredepth(m)',
 'dsdp initial report volume number',
 'fossil',
 'fossil abundance',
 'fossil code',
 'fossil group',
 'fossil preservation',
 'group abundance',
 'hole',
 'investigators name',
 'leg',
 'mechanical preservations',
 'page number reference',
 'publication date (month/year)',
 'record join code',
 'sample depth(m)',
 'section',
 'site',
 'top interval depth(cm)',
 'total number of observed fossils'}

In [6]:
columns_count

{25}

## taxa

### fetch taxa names from csv

read taxa files to get unique taxa names

In [7]:
taxa = set()

for index, row in files_df.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['file'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'] +  '|' + row['taxon_group']
        
        taxa.update(list(df['temp']))
        
print(len(taxa))

9933


In [8]:
taxa_list = []
taxon_groups = []

for taxon in taxa:
    if not pd.isna(taxon):
        parts = taxon.split('|')
        taxa_list.append(parts[0])
        taxon_groups.append(parts[1])

In [9]:
dict = {
        "taxon": taxa_list,
        "taxon_group": taxon_groups
        }
taxa_df = pd.DataFrame(dict)
taxa_df.tail()

Unnamed: 0,taxon,taxon_group
9927,Periphaena tripyramis tripyramis,radiolarians
9928,Globorotalia pusilla,planktic foraminfera
9929,Morozovella conicotruncata,planktic foraminfera
9930,Pterocorys hirundo,radiolarians
9931,Thoracosphaera saxea,nannofossils


In [10]:
taxa_df.shape

(9932, 2)

get rid of (xxx) from taxa name

In [11]:
taxa_clean_df = taxa_df.replace(to_replace =' \(.*?\)$', value = '', regex = True)
taxa_clean_df.drop_duplicates(inplace=True)
taxa_clean_df.tail()

Unnamed: 0,taxon,taxon_group
9924,Eucyrtidium hexastichum,radiolarians
9927,Periphaena tripyramis tripyramis,radiolarians
9928,Globorotalia pusilla,planktic foraminfera
9929,Morozovella conicotruncata,planktic foraminfera
9930,Pterocorys hirundo,radiolarians


In [12]:
taxa_clean_df.shape

(8683, 2)

### fetch taxa from database

check if taxa exists in database

In [13]:
# https://www.datacamp.com/community/tutorials/tutorial-postgresql-python

sql = "select name, pbdb_taxon_id, pbdb_taxon_name, pbdb_taxon_rank from taxa"
db_taxa_df = pd.read_sql(sql, db.conn)
db_taxa_df.head()

Unnamed: 0,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Algirosphaera robusta,424331,Algirosphaera,genus
1,Amaurolithus tricorniculatus,388308,Amaurolithus,genus
2,? Biantholithus flosculus,424468,Biantholithus,genus
3,Blackites creber,424333,Blackites,genus
4,Blackites spinosus,424333,Blackites,genus


merge NOAA taxa with database taxa

In [14]:
merged_db_df = pd.merge(taxa_clean_df, db_taxa_df,  left_on='taxon', right_on='name', how='left')
merged_db_df.head()

Unnamed: 0,taxon,taxon_group,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Denticula lauta,diatoms,,,,
1,Triceratium aries,diatoms,,,,
2,Asterolampra hyalinas,diatoms,,,,
3,Corbisema navicula constricta,silicoflagellates,,,,
4,Dictyocha fibula fibula,silicoflagellates,,,,


In [15]:
merged_db_df.shape

(8683, 6)

### fetch LIMS taxa list that PIs are processing

get all LIMS taxa

In [19]:
lims_path = os.path.join('raw_data', 'taxa', 'Micropal_headers_PBDB_Taxonomy_notes_2021-04-23.csv')
lims_df = pd.read_csv(lims_path)
lims_df.shape

(4721, 21)

In [20]:
lims_df['normalized_name'] = np.where(
    lims_df['name to use (if different from "name")'].notnull(),
    lims_df['name to use (if different from "name")'],
    lims_df['name']
)

lims_name_df = lims_df[['normalized_name']]

lims_name_df.head()

Unnamed: 0,normalized_name
0,Pyrite
1,Euuvigerina miozea
2,Euuvigerina rodleyi
3,Foraminifera indet.
4,Pleurostomellia indet.


In [21]:
lims_name_df.shape

(4721, 1)

merge NOAA/db taxa with LIMS taxa

In [22]:
merged_noaa_lims_df = pd.merge(merged_db_df, lims_name_df,  left_on='taxon', right_on='normalized_name', how='left')
merged_noaa_lims_df.head()

Unnamed: 0,taxon,taxon_group,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
0,Denticula lauta,diatoms,,,,,
1,Triceratium aries,diatoms,,,,,
2,Asterolampra hyalinas,diatoms,,,,,
3,Corbisema navicula constricta,silicoflagellates,,,,,
4,Dictyocha fibula fibula,silicoflagellates,,,,,


In [23]:
merged_noaa_lims_df.shape

(9004, 7)

filter out the NOAA taxa that are in LIMS

In [25]:
new_noaa_df = merged_noaa_lims_df[merged_noaa_lims_df['normalized_name'].isna()].copy()
new_noaa_df.drop_duplicates(inplace=True)

new_noaa_df.shape

(6976, 7)

### search pbdb api

In [187]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

add pbdb taxa data

In [244]:
for index, row in new_noaa_df.iterrows():
    url =  PBDB_TAXA + re.sub(' sp\.$', '', row['taxon'])
    response =requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            new_noaa_df.at[index, 'pbdb_taxon_id'] = data[0]["taxon_no"]
            new_noaa_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            new_noaa_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]


In [252]:
new_noaa_df.head()

Unnamed: 0,taxon,taxon_group,name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
0,Distephanus speculum septenarius,silicoflagellates,,88737.0,,,
1,Distephanus minutus,silicoflagellates,,88737.0,,,
4,Mesocena triangula,silicoflagellates,,,,,
5,Globorotalia humilis,planktic foraminfera,,,,,
6,Cymatosira sp.,diatoms,,82156.0,Cymatosira,genus,


In [250]:
new_noaa_df.shape

(6976, 8)

### create csv

In [253]:
output_df = new_noaa_df
output_df.drop_duplicates(inplace=True)
output_df.shape

(6976, 7)

In [254]:
path = os.path.join('cleaned_data', 'taxa', 'noaa_dsdp_taxa_list.csv')
# output_df.to_csv(path, index=False)