#  NOAA DSDP taxa list
## 1-96 taxa

Create list of taxa for NOAA DSDP files. Compare NOAA taxa with the taxa that the PIs have already approved in order create a list of unapproved taxa. Add PBDB data to unapproved taxa.

In [1]:
import sys
import csv
import glob
import os
import requests
import re

sys.path.append('../../')
import pandas as pd
import numpy as np

import db 
import scripts.normalize_taxa as nt
from config import OUTPUT_DIR, CLEAN_DATA_DIR

In [6]:
date='2021-07-28'

base_dir = CLEAN_DATA_DIR

metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'
approved_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_list_{date}.csv'

crosswalk_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_crosswalk_{date}.csv'
merged_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_merged_{date}.csv'
merged2_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_merged2_{date}.csv'

taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_{date}.csv'
genus_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'genus_{date}.csv'
taxa_pbdb_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_pbdb_{date}.csv'

In [7]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## Create taxa list

In [8]:
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


read all the taxa files to get unique taxa names

In [9]:
# 9933
taxa = set()

for index, row in metadata.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(base_dir/row['path'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'].str.strip() +  '|' + row['taxon_group']
        
        taxa.update(df['temp'])
        
len(taxa)

9933

In [10]:
list(taxa)[0:20]

[nan,
 'Globorotalia zealandica zealandica|planktic_foraminfera',
 'Lithomelissa mitra (q)|radiolarians',
 'Pyrgo rotularia|benthic_foraminfera',
 'Dictyocha fallacea|silicoflagellates',
 'Pinus sp.|pollen',
 'Thalassiphora pelagica (q)|dinoflagellates',
 'Planorotalites australiformis (q)|planktic_foraminfera',
 'Globigerina angustiumbilicata|planktic_foraminfera',
 'Globorotalia convexa|planktic_foraminfera',
 'Globulina lacrima|benthic_foraminfera',
 'Incisoria lanceolata|diatoms',
 'Dendrospyris stabilis (q)|radiolarians',
 'Trifarina fluews|benthic_foraminfera',
 'Planulina mexicana (q)|benthic_foraminfera',
 'Lithomelissa challengerae|radiolarians',
 'Triceratium kuepperi|diatoms',
 'Bekoma bidartensis|radiolarians',
 'Globorotalia intermedia|planktic_foraminfera',
 'Dorcadospyris alata|radiolarians']

In [11]:
# 9932
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                'simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

9932

create taxa list csv

In [13]:
taxa_df = pd.DataFrame(taxa_list).sort_values('verbatim_name')
log_df(taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
513,Abas wittii,diatoms,Abas,Abas wittii,wittii,
2833,Abathomphalus intermedius,planktic_foraminfera,Abathomphalus,Abathomphalus intermedius,intermedius,
7340,Abathomphalus mayaroensis,planktic_foraminfera,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
1588,Abies sp.,pollen,Abies,Abies sp.,sp.,
5259,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


In [14]:
taxa_df.to_csv(crosswalk_path, index=False)

## compare and replace taxon groups 

In [15]:
noaa_taxa_df = pd.read_csv(crosswalk_path)
approved_taxa_df = pd.read_csv(approved_taxa_path)

In [16]:
approved_groups = list(approved_taxa_df['taxon_group'].unique())
approved_groups.sort()
approved_groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [17]:
noaa_groups = list(noaa_taxa_df['taxon_group'].unique())
noaa_groups.sort()
noaa_groups

['benthic_foraminfera',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'phytoliths',
 'planktic_foraminfera',
 'pollen',
 'radiolarians',
 'silicoflagellates']

In [18]:
set(noaa_groups) -  set(approved_groups)

{'benthic_foraminfera', 'phytoliths', 'planktic_foraminfera', 'pollen'}

In [19]:
noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],
                                    ['benthic_forams', 'planktic_forams'])
noaa_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
3,Abies sp.,pollen,Abies,Abies sp.,sp.,
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


In [20]:
noaa_taxa_df.to_csv(crosswalk_path, index=False)

## Compare NOAA taxa with approved taxa

get NOAA taxa

In [21]:
# 9932
noaa_taxa_df = pd.read_csv(crosswalk_path)

log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
3,Abies sp.,pollen,Abies,Abies sp.,sp.,
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


get approved taxa

In [22]:
# 4209
approved_taxa_df = pd.read_csv(approved_taxa_path)
log_df(approved_taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


add simplified_name without descriptor

In [23]:
del approved_taxa_df['normalized_name'] 


In [24]:
approved_taxa_df = nt.add_normalized_name_column(approved_taxa_df, 
                                                 include_descriptor=False, 
                                                 col_name="simplified_name")

In [25]:
approved_taxa_df[approved_taxa_df['non-taxa descriptor'].notna()].head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,taxon_group,simplified_name
28,,,Globigerinoides,,,,sacculifer,,,without sac,planktic_forams,Globigerinoides sacculifer
201,,,Bolivina,,,cf.,crenulata,,,crenulate,benthic_forams,Bolivina cf. crenulata


In [26]:
approved_taxa_df = pd.DataFrame(approved_taxa_df[['taxon_group', 'simplified_name']])
log_df(approved_taxa_df)

(4209, 2)


Unnamed: 0,taxon_group,simplified_name
0,benthic_forams,Euuvigerina miozea
1,benthic_forams,Euuvigerina rodleyi
2,benthic_forams,Foraminifera indet.
3,benthic_forams,Pleurostomellidae indet.
4,benthic_forams,Ostracoda indet.


### merge NOAA taxa with approved taxa

In [27]:
# 10109 
merged_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['simplified_name', 'taxon_group' ], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged_df)


(10109, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [28]:
# 9932
merged_df = merged_df.drop_duplicates()
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [29]:
merged_df.to_csv(merged_path, index=False)


In [30]:
# 10114 
merged2_df = pd.merge(noaa_taxa_df, approved_taxa_df,  
                     on=['simplified_name'], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged2_df)


(10114, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,planktic_forams,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,,left_only


In [31]:
# 9937
merged2_df = merged2_df.drop_duplicates()
log_df(merged2_df)

(9937, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,planktic_forams,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,,left_only


save changes to taxa_all csv

In [32]:
merged2_df.to_csv(merged2_path, index=False)

the reason for the count difference is because the LIMS taxa list sometimes puts a taxa in two groups

NOAA: Selenopemphix nephroides - dinoflagellates 
LIMS: Selenopemphix nephroides - dinoflagellates, palynology

## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [33]:
merged_df = pd.read_csv(merged_path)
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [34]:
# (7763, 7)
unapproved_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

log_df(unapproved_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
5,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


create csv of unapproved NOAA taxa

In [35]:
unapproved_taxa_df.to_csv(taxa_path, index=False)

## Add PBDB data for taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [36]:
unapproved_taxa_df = pd.read_csv(taxa_path)
log_df(unapproved_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


create a dataframe of unique genera

In [37]:
genus_df = pd.DataFrame(unapproved_taxa_df['genus name'].unique(), columns=['genus name'])

log_df(genus_df)

(1707, 1)


Unnamed: 0,genus name
0,Abas
1,Abathomphalus
2,Abies
3,Abutilon
4,Abyssamina


add pbdb taxa data

In [38]:
PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="

In [39]:
for index, row in genus_df.iterrows():
    if index % 50 == 0:
        print(index, end=' ')
        
    url =  PBDB_TAXA +  row['genus name']
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            # cast taxon_no to string to avoid pandas converting it to a float           
            genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]

0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 

In [40]:
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Abas,374615.0,Abas,genus
1,Abathomphalus,758.0,Abathomphalus,genus
2,Abies,55065.0,Abies,genus
3,Abutilon,,,
4,Abyssamina,762.0,Abyssamina,genus


create genus csv

In [41]:
genus_df.to_csv(genus_path, index=False)

## add pbdb info to unapproved taxa 

In [42]:
genus_df = pd.read_csv(genus_path, dtype={'pbdb_taxon_id': str})
log_df(genus_df)

(1707, 4)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Abas,374615.0,Abas,genus
1,Abathomphalus,758.0,Abathomphalus,genus
2,Abies,55065.0,Abies,genus
3,Abutilon,,,
4,Abyssamina,762.0,Abyssamina,genus


In [43]:
unapproved_df = pd.read_csv(taxa_path)

log_df(unapproved_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


merge NOAA unapproved taxa with pbdb data

In [44]:
merged_df = pd.merge(unapproved_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

log_df(merged_df)

(7763, 11)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_merge_pbdb
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only,374615.0,Abas,genus,both
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only,758.0,Abathomphalus,genus,both
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only,55065.0,Abies,genus,both
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only,,,,both
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only,762.0,Abyssamina,genus,both


add name column

In [45]:
merged_df['name'] = merged_df['verbatim_name'].str.strip()
merged_df['Comment'] = np.nan
merged_df['Notes (change to Internal only notes?)'] = np.nan
merged_df['Any taxon above genus'] = np.nan
merged_df['genus modifier'] = ''
merged_df['subgenera modifier'] = np.nan
merged_df['subgenera name'] = np.nan
merged_df['species modifier'] = np.nan
merged_df['subspecies modifier'] = np.nan
merged_df['non-taxa descriptor'] = np.nan
merged_df['comments'] = np.nan


In [46]:
for index, row in merged_df.iterrows():
    if '(q)' in row['name']:
        
        merged_df.at[index,'name']=re.sub('(.*?) \(q\)', r'? \1', row['name'])
        merged_df.at[index,'genus modifier'] = '?'
        
log_df(merged_df)

(7763, 22)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,...,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,subgenera modifier,subgenera name,species modifier,subspecies modifier,non-taxa descriptor,comments
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only,374615.0,Abas,genus,...,,,,,,,,,,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only,758.0,Abathomphalus,genus,...,,,,,,,,,,
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only,55065.0,Abies,genus,...,,,,,,,,,,
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only,,,,...,,,,?,,,,,,
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only,762.0,Abyssamina,genus,...,,,,,,,,,,


reorder columns and sort rows

In [47]:
merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', 'name', 'Comment',
                                       'Notes (change to Internal only notes?)',
                                       'Any taxon above genus', 
                                       'genus modifier', 'genus name', 
                                       'subgenera modifier', 'subgenera name',
                                       'species modifier', 'species name', 
                                       'subspecies modifier', 'subspecies name',
                                       'non-taxa descriptor', 'comments',
                                       'pbdb_taxon_id', 'pbdb_taxon_name',
                                       'pbdb_taxon_rank', '_simplified_name',
                                       '_merge_approved', '_merge_pbdb'
                                      ])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

log_df(merged_df)

(7763, 22)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
4,benthic_forams,Abyssamina incisa,Abyssamina incisa,,,,,Abyssamina,,,...,,,,,762,Abyssamina,genus,,left_only,both
131,benthic_forams,Adercotryma glomeratum,Adercotryma glomeratum,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
132,benthic_forams,Adercotryma sp.,Adercotryma sp.,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
144,benthic_forams,Alabamina decorata,Alabamina decorata,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both
145,benthic_forams,Alabamina haitiensis,Alabamina haitiensis,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both


save csv

In [48]:
merged_df.to_csv(taxa_pbdb_path, index=False)

## process taxalist from PIs

update google sheet taxa list from PIs to deal with (q) in verbatim name.

In [7]:
path = 'raw_data/taxa/NOAA_taxa_lists_taxa list_2021-08-05.csv'
taxa_df = pd.read_csv(path, dtype={'pbdb_taxon_id': str, 'genus modifier':str})
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [8]:

for index, row in taxa_df.iterrows():
    # don't overwrite existing name
    if isinstance(row['name'], str) :
        continue
    # if verbatim name has '(q)'
    if '(q)' in row['verbatim_name']:
        # set 'name' to 'verbatim name' without '(q)' 
        taxa_df.at[index,'name']=re.sub('(.*?) ?\(q\)', r'? \1', row['verbatim_name'])
        # set 'genus modifier' to '?'
        taxa_df.at[index,'genus modifier'] = '?'

        
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [112]:
taxa_df.to_csv('cleaned_data/taxa/draft/NOAA/google_sheet_taxa lists_2021-08-05.csv', index=False)