#  NOAA DSDP taxa list
## 1-96 taxa

Create list of taxa for NOAA DSDP files. Compare NOAA taxa with the taxa that the PIs have already approved in order create a list of unapproved taxa. Add PBDB data to unapproved taxa.

In [1]:
import sys
import csv
import glob
import os
import requests
import re

sys.path.append('../scripts/')
sys.path.append('../')
import pandas as pd
import numpy as np

import db 
import normalize_taxa as nt


In [2]:
base_directory = 'cleaned_data'
date='2021-07-28'
metadata_path = os.path.join(base_directory, 'metadata', 'NOAA', 'noaa_dsdp_files.csv')
approved_taxa_path = os.path.join(base_directory, 'taxa', 'LIMS', f'taxa_list_{date}.csv')

crosswalk_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_crosswalk_{date}.csv')
merged_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_merged_{date}.csv')
merged2_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_merged2_{date}.csv')

taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_list_{date}.csv')
genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA', f'genus_{date}.csv')
taxa_pbdb_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_list_pbdb_{date}.csv')


In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## Create taxa list

In [4]:
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,cleaned_data/NOAA_csv/DSDP_core_data/61/462/ra...,taxa,radiolarians,61,462
1,cleaned_data/NOAA_csv/DSDP_core_data/61/462/ag...,age,,61,462
2,cleaned_data/NOAA_csv/DSDP_core_data/61/462/b_...,taxa,benthic_foraminfera,61,462
3,cleaned_data/NOAA_csv/DSDP_core_data/61/462/p_...,taxa,planktic_foraminfera,61,462
4,cleaned_data/NOAA_csv/DSDP_core_data/61/462/hr...,hard_rock,,61,462


read all the taxa files to get unique taxa names

In [5]:
# 9933
taxa = set()

for index, row in metadata.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(row['path'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'].str.strip() +  '|' + row['taxon_group']
        
        taxa.update(df['temp'])
        
len(taxa)

9933

In [6]:
list(taxa)[0:20]

[nan,
 'Pararotalia sp. (q)|benthic_foraminfera',
 'Cannopilus longispinus|silicoflagellates',
 'Zygolithus chiastus|nannofossils',
 'Sphaeroidinellopsis subdehiscens|planktic_foraminfera',
 'Stylatractus sp.|radiolarians',
 'Thamnospyris schizopodia (q)|radiolarians',
 'Kozloviella minor|diatoms',
 'Globorotalia nana semivera|planktic_foraminfera',
 'Bolivina albatrossi|benthic_foraminfera',
 'Triceraspyris sp.|radiolarians',
 'Eucyrtidium hexagonatum|radiolarians',
 'Eucampia balaustium minor|diatoms',
 'Mesocena diodon nodosa|silicoflagellates',
 'Paralecaniella indentata|dinoflagellates',
 'Globigerina pseudofoliata|planktic_foraminfera',
 'Dictyocha longa|silicoflagellates',
 'Globoquadrina pseudovenezuelana|planktic_foraminfera',
 'Rhizosolenia matuyamai|diatoms',
 'Rhabdammina sp.|benthic_foraminfera']

In [7]:
# 9932
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                'simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

9932

create taxa list csv

In [8]:
taxa_df = pd.DataFrame(taxa_list)
log_df(taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Pararotalia sp. (q),benthic_foraminfera,Pararotalia,Pararotalia sp.,sp.,
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,
3,Sphaeroidinellopsis subdehiscens,planktic_foraminfera,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,


In [9]:
taxa_df.to_csv(crosswalk_path, index=False)

## compare and replace taxon groups 

In [10]:
noaa_taxa_df = pd.read_csv(crosswalk_path)
approved_taxa_df = pd.read_csv(approved_taxa_path)

In [11]:
approved_groups = list(approved_taxa_df['taxon_group'].unique())
approved_groups.sort()
approved_groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [12]:
noaa_groups = list(noaa_taxa_df['taxon_group'].unique())
noaa_groups.sort()
noaa_groups

['benthic_foraminfera',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'phytoliths',
 'planktic_foraminfera',
 'pollen',
 'radiolarians',
 'silicoflagellates']

In [13]:
set(noaa_groups) -  set(approved_groups)

{'benthic_foraminfera', 'phytoliths', 'planktic_foraminfera', 'pollen'}

In [14]:
noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],
                                    ['benthic_forams', 'planktic_forams'])
noaa_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,


In [15]:
noaa_taxa_df.to_csv(crosswalk_path, index=False)

## Compare NOAA taxa with approved taxa

get NOAA taxa

In [16]:
# 9932
noaa_taxa_df = pd.read_csv(crosswalk_path)

log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,


get approved taxa

In [17]:
# 4209
approved_taxa_df = pd.read_csv(approved_taxa_path)
log_df(approved_taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


add simplified_name without descriptor

In [18]:
del approved_taxa_df['normalized_name'] 


In [19]:
approved_taxa_df = nt.add_normalized_name_column(approved_taxa_df, 
                                                 include_descriptor=False, 
                                                 col_name="simplified_name")

In [20]:
approved_taxa_df[approved_taxa_df['non-taxa descriptor'].notna()].head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,taxon_group,simplified_name
28,,,Globigerinoides,,,,sacculifer,,,without sac,planktic_forams,Globigerinoides sacculifer
201,,,Bolivina,,,cf.,crenulata,,,crenulate,benthic_forams,Bolivina cf. crenulata


In [21]:
approved_taxa_df = pd.DataFrame(approved_taxa_df[['taxon_group', 'simplified_name']])
log_df(approved_taxa_df)

(4209, 2)


Unnamed: 0,taxon_group,simplified_name
0,benthic_forams,Euuvigerina miozea
1,benthic_forams,Euuvigerina rodleyi
2,benthic_forams,Foraminifera indet.
3,benthic_forams,Pleurostomellidae indet.
4,benthic_forams,Ostracoda indet.


### merge NOAA taxa with approved taxa

In [22]:
# 10109 
merged_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['simplified_name', 'taxon_group' ], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged_df)


(10109, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,,both
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,,both


In [23]:
# 9932
merged_df = merged_df.drop_duplicates()
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,,both
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,,both


In [24]:
merged_df.to_csv(merged_path, index=False)


In [25]:
# 10114 
merged2_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['simplified_name'], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged2_df)


(10114, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,,left_only
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,,planktic_forams,both
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,,radiolarians,both


In [26]:
# 9937
merged2_df = merged2_df.drop_duplicates()
log_df(merged2_df)

(9937, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,,left_only
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,,planktic_forams,both
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,,radiolarians,both


save changes to taxa_all csv

In [27]:
merged2_df.to_csv(merged2_path, index=False)

the reason for the count difference is because the LIMS taxa list sometimes puts a taxa in two groups

NOAA: Selenopemphix nephroides - dinoflagellates 
LIMS: Selenopemphix nephroides - dinoflagellates, palynology

## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [28]:
merged_df = pd.read_csv(merged_path)
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only
3,Sphaeroidinellopsis subdehiscens,planktic_forams,Sphaeroidinellopsis,Sphaeroidinellopsis subdehiscens,subdehiscens,,both
4,Stylatractus sp.,radiolarians,Stylatractus,Stylatractus sp.,sp.,,both


In [29]:
# (7763, 7)
unapproved_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

log_df(unapproved_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only
5,Thamnospyris schizopodia (q),radiolarians,Thamnospyris,Thamnospyris schizopodia,schizopodia,,left_only
6,Kozloviella minor,diatoms,Kozloviella,Kozloviella minor,minor,,left_only


create csv of unapproved NOAA taxa

In [30]:
unapproved_taxa_df.to_csv(taxa_path, index=False)

## Add PBDB data for taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [31]:
unapproved_taxa_df = pd.read_csv(taxa_path)
log_df(unapproved_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only
3,Thamnospyris schizopodia (q),radiolarians,Thamnospyris,Thamnospyris schizopodia,schizopodia,,left_only
4,Kozloviella minor,diatoms,Kozloviella,Kozloviella minor,minor,,left_only


create a dataframe of unique genera

In [32]:
genus_df = pd.DataFrame(unapproved_taxa_df['genus name'].unique(), columns=['genus name'])

log_df(genus_df)

(1707, 1)


Unnamed: 0,genus name
0,Pararotalia
1,Cannopilus
2,Zygolithus
3,Thamnospyris
4,Kozloviella


add pbdb taxa data

In [33]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

In [34]:
for index, row in genus_df.iterrows():
    if index % 50 == 0:
        print(index, end=' ')
        
    url =  PBDB_TAXA +  row['genus name']
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # cast taxon_no to string to avoid pandas converting it to a float           
            genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 

In [35]:
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Pararotalia,2072.0,Pararotalia,genus
1,Cannopilus,82179.0,Cannopilus,genus
2,Zygolithus,87686.0,Zygolithus,genus
3,Thamnospyris,,,
4,Kozloviella,,,


create genus csv

In [36]:
genus_df.to_csv(genus_path, index=False)

## add pbdb info to unapproved taxa 

In [37]:
genus_df = pd.read_csv(genus_path, dtype={'pbdb_taxon_id': str})
log_df(genus_df)

(1707, 4)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Astromma,33,Astromma,genus
1,Lagena,1739,Lagena,genus
2,Cretarhabdus,87816,Cretarhabdus,genus
3,Fasciculithus,424283,Fasciculithus,genus
4,Coscinodiscus,71292,Coscinodiscus,genus


In [38]:
unapproved_df = pd.read_csv(taxa_path)

log_df(unapproved_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Dictyocha brevispina brevispina (q),silicoflagellates,Dictyocha,Dictyocha brevispina brevispina,brevispina,brevispina,left_only
1,Planorotalites ehrenbergi,planktic_forams,Planorotalites,Planorotalites ehrenbergi,ehrenbergi,,left_only
2,Globorotalia miozea sphericomiozea,planktic_forams,Globorotalia,Globorotalia miozea sphericomiozea,miozea,sphericomiozea,left_only
3,Globoquadrina globosa,planktic_forams,Globoquadrina,Globoquadrina globosa,globosa,,left_only
4,Spirocyrtis scalaris,radiolarians,Spirocyrtis,Spirocyrtis scalaris,scalaris,,left_only


merge NOAA unapproved taxa with pbdb data

In [39]:
merged_df = pd.merge(unapproved_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

log_df(merged_df)

(7763, 11)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_merge_pbdb
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only,2072.0,Pararotalia,genus,both
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only,82179.0,Cannopilus,genus,both
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only,87686.0,Zygolithus,genus,both
3,Thamnospyris schizopodia (q),radiolarians,Thamnospyris,Thamnospyris schizopodia,schizopodia,,left_only,,,,both
4,Kozloviella minor,diatoms,Kozloviella,Kozloviella minor,minor,,left_only,,,,both


add name column

In [67]:
merged_df['name'] = merged_df['verbatim_name'].str.strip()
merged_df['Comment'] = np.nan
merged_df['Notes (change to Internal only notes?)'] = np.nan
merged_df['Any taxon above genus'] = np.nan
merged_df['genus modifier'] = ''
merged_df['subgenera modifier'] = np.nan
merged_df['subgenera name'] = np.nan
merged_df['species modifier'] = np.nan
merged_df['subspecies modifier'] = np.nan
merged_df['non-taxa descriptor'] = np.nan
merged_df['comments'] = np.nan


In [40]:
for index, row in merged_df.iterrows():
    if '(q)' in row['name']:
        
        merged_df.at[index,'name']=re.sub('(.*?) \(q\)', r'? \1', row['name'])
        merged_df.at[index,'genus modifier'] = '?'
        
log_df(merged_df)

(7763, 22)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,...,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,subgenera modifier,species modifier,subspecies modifier,non-taxa descriptor,comments,subgenera name
0,Pararotalia sp. (q),benthic_forams,Pararotalia,Pararotalia sp.,sp.,,left_only,2072.0,Pararotalia,genus,...,,,,?,,,,,,
1,Cannopilus longispinus,silicoflagellates,Cannopilus,Cannopilus longispinus,longispinus,,left_only,82179.0,Cannopilus,genus,...,,,,,,,,,,
2,Zygolithus chiastus,nannofossils,Zygolithus,Zygolithus chiastus,chiastus,,left_only,87686.0,Zygolithus,genus,...,,,,,,,,,,
3,Thamnospyris schizopodia (q),radiolarians,Thamnospyris,Thamnospyris schizopodia,schizopodia,,left_only,,,,...,,,,?,,,,,,
4,Kozloviella minor,diatoms,Kozloviella,Kozloviella minor,minor,,left_only,,,,...,,,,,,,,,,


reorder columns and sort rows

In [70]:
merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', 'name', 'Comment',
                                       'Notes (change to Internal only notes?)',
                                       'Any taxon above genus', 
                                       'genus modifier', 'genus name', 
                                       'subgenera modifier', 'subgenera name',
                                       'species modifier', 'species name', 
                                       'subspecies modifier', 'subspecies name',
                                       'non-taxa descriptor', 'comments',
                                       'pbdb_taxon_id', 'pbdb_taxon_name',
                                       'pbdb_taxon_rank', '_simplified_name',
                                       '_merge_approved', '_merge_pbdb'
                                      ])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

log_df(merged_df)

(7763, 22)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
1455,benthic_forams,Abyssamina incisa,Abyssamina incisa,,,,,Abyssamina,,,...,,,,,762,Abyssamina,genus,,left_only,both
6982,benthic_forams,Adercotryma glomeratum,Adercotryma glomeratum,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
2565,benthic_forams,Adercotryma sp.,Adercotryma sp.,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
7313,benthic_forams,Alabamina decorata,Alabamina decorata,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both
2216,benthic_forams,Alabamina haitiensis,Alabamina haitiensis,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both


save csv

In [41]:
merged_df.to_csv(taxa_pbdb_path, index=False)

## process taxalist from PIs

update google sheet taxa list from PIs to deal with (q) in verbatim name.

In [7]:
path = 'raw_data/taxa/NOAA_taxa_lists_taxa list_2021-08-05.csv'
taxa_df = pd.read_csv(path, dtype={'pbdb_taxon_id': str, 'genus modifier':str})
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [8]:

for index, row in taxa_df.iterrows():
    # don't overwrite existing name
    if isinstance(row['name'], str) :
        continue
    # if verbatim name has '(q)'
    if '(q)' in row['verbatim_name']:
        # set 'name' to 'verbatim name' without '(q)' 
        taxa_df.at[index,'name']=re.sub('(.*?) ?\(q\)', r'? \1', row['verbatim_name'])
        # set 'genus modifier' to '?'
        taxa_df.at[index,'genus modifier'] = '?'

        
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [112]:
taxa_df.to_csv('cleaned_data/taxa/draft/NOAA/google_sheet_taxa lists_2021-08-05.csv', index=False)