#  NOAA DSDP taxa list
## 1-96 taxa

Create list of taxa for NOAA DSDP files. Compare NOAA taxa with the LIMS approved taxa that the PIs have already approved in order create a list of unapproved NOAA taxa. Add PBDB data to unapproved taxa.

In [1]:
import sys
import csv
import glob
import os
import requests
import re
import time 

sys.path.append('../../')
import pandas as pd
import numpy as np

# import db 
import scripts.normalize_taxa as nt
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR
from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME
from scripts.shared_utils import (
    log_df
)

In [2]:
date = '2021-07-28'
# date = '2021-08-05'
# date='2021-11-29'
# date='2022-09-12'
# date='2022-10-26'


base_dir = CLEAN_DATA_DIR

input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_{date}.csv'
input_pbdb_file = OUTPUT_DIR/'taxa'/'NOAA'/f"PI_normalized_taxa_list_with_pbdb_{date}.csv"


metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'
LIMS_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_list_2021-07-28.csv'


merged_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_merged_{date}.csv'
merged2_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_merged2_{date}.csv'

all_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'all_taxa_list_{date}.csv'

taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_{date}.csv'
genus_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'genus_pbdb_{date}.csv'
taxa_pbdb_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_pbdb_{date}.csv'

# date='2021-11-29'
# old_genus_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'genus_pbdb_{date}.csv'

## Create NOAA taxa list

In [3]:
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


read all the taxa files to get unique taxa names

In [4]:
# 9933
taxa = set()

for index, row in metadata.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(base_dir/row['path'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'].str.strip() +  '|' + row['taxon_group']
        
        taxa.update(df['temp'])
        
len(taxa)
#9933

9933

In [5]:
list(taxa)[0:5]

[nan,
 'Theocalyptra bicornis|radiolarians',
 'Zygodiscus splendens|nannofossils',
 'Glandulina sp.|benthic_foraminfera',
 'Coscinodiscus radiatus var.|diatoms']

In [6]:
# 9932
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        
        # remove (text) (q) from taxon name    
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                'simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

9932

In [7]:
noaa_taxa_df = pd.DataFrame(taxa_list).sort_values('verbatim_name')
log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
6496,Abas wittii,diatoms,Abas,Abas wittii,wittii,
8183,Abathomphalus intermedius,planktic_foraminfera,Abathomphalus,Abathomphalus intermedius,intermedius,
6550,Abathomphalus mayaroensis,planktic_foraminfera,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
8864,Abies sp.,pollen,Abies,Abies sp.,sp.,
6898,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


## compare and replace taxon groups 

replace NOAA taxon groups with LIMS taxon groups

In [8]:
LIMS_taxa_df = pd.read_csv(LIMS_taxa_path)
log_df(LIMS_taxa_df)
# 4209

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


In [9]:
LIMS_groups = list(LIMS_taxa_df['taxon_group'].unique())
LIMS_groups.sort()
LIMS_groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [10]:
noaa_groups = list(noaa_taxa_df['taxon_group'].unique())
noaa_groups.sort()
noaa_groups

['benthic_foraminfera',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'phytoliths',
 'planktic_foraminfera',
 'pollen',
 'radiolarians',
 'silicoflagellates']

In [11]:
set(noaa_groups) -  set(LIMS_groups)

{'benthic_foraminfera', 'phytoliths', 'planktic_foraminfera', 'pollen'}

In [12]:
noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],
                                    ['benthic_forams', 'planktic_forams'])
noaa_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
6496,Abas wittii,diatoms,Abas,Abas wittii,wittii,
8183,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,
6550,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
8864,Abies sp.,pollen,Abies,Abies sp.,sp.,
6898,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


In [13]:
noaa_taxa_df.to_csv(all_taxa_path, index=False)

## create csv that compares NOAA taxa list with LIMS approved taxa

get NOAA taxa

In [15]:
# 9932
noaa_taxa_df = pd.read_csv(all_taxa_path)

log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
3,Abies sp.,pollen,Abies,Abies sp.,sp.,
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


get LIMS taxa

In [16]:
# 4209
LIMS_taxa_df = pd.read_csv(LIMS_taxa_path)
log_df(LIMS_taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


add simplified_name without descriptor

In [17]:
del LIMS_taxa_df['normalized_name'] 


In [18]:
LIMS_taxa_df = nt.add_normalized_name_column(LIMS_taxa_df, 
                                                 include_descriptor=False, 
                                                 col_name="simplified_name")

In [19]:
LIMS_taxa_df[LIMS_taxa_df['non-taxa descriptor'].notna()].head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,taxon_group,simplified_name
28,,,Globigerinoides,,,,sacculifer,,,without sac,planktic_forams,Globigerinoides sacculifer
201,,,Bolivina,,,cf.,crenulata,,,crenulate,benthic_forams,Bolivina cf. crenulata


In [20]:
LIMS_taxa_df = pd.DataFrame(LIMS_taxa_df[['taxon_group', 'simplified_name']])
log_df(LIMS_taxa_df)

(4209, 2)


Unnamed: 0,taxon_group,simplified_name
0,benthic_forams,Euuvigerina miozea
1,benthic_forams,Euuvigerina rodleyi
2,benthic_forams,Foraminifera indet.
3,benthic_forams,Pleurostomellidae indet.
4,benthic_forams,Ostracoda indet.


### merge NOAA taxa with LIMS taxa

In [21]:
# 10109 
merged_df = pd.merge(noaa_taxa_df, LIMS_taxa_df,  
                     on=['simplified_name', 'taxon_group' ], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged_df)


(10109, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [22]:
# 9932
merged_df = merged_df.drop_duplicates()
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [23]:
merged_df.to_csv(merged_path, index=False)


### compare merge methods
do merge on simplified_name without taxon groups

In [24]:
# 10114 
merged2_df = pd.merge(noaa_taxa_df, LIMS_taxa_df,  
                     on=['simplified_name'], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged2_df)


(10114, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,planktic_forams,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,,left_only


In [25]:
# 9937
merged2_df = merged2_df.drop_duplicates()
log_df(merged2_df)

(9937, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,planktic_forams,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,,left_only


the reason for the count difference is because the LIMS taxa list sometimes puts a taxa in two groups

NOAA: Selenopemphix nephroides - dinoflagellates 
LIMS: Selenopemphix nephroides - dinoflagellates, palynology

## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [26]:
merged_df = pd.read_csv(merged_path)
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [27]:
# (7763, 7)
LIMS_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

log_df(LIMS_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
5,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


create csv of unapproved NOAA taxa

In [28]:
LIMS_taxa_df.to_csv(taxa_path, index=False)

## Create genus list with PBDB data for NOAA taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [29]:
taxa_df = pd.read_csv(taxa_path)
log_df(taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


In [30]:
# old_genus_df = pd.read_csv(old_genus_path, dtype=str)
# log_df(old_genus_df)

create a dataframe of unique genera

In [31]:
genus_df = pd.DataFrame(taxa_df['genus name'].unique(), columns=['genus name'])

log_df(genus_df)

(1707, 1)


Unnamed: 0,genus name
0,Abas
1,Abathomphalus
2,Abies
3,Abutilon
4,Abyssamina


In [226]:
# genus_df = genus_df.merge(old_genus_df, how='left')
# log_df(genus_df)

(1707, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,
3,Abutilon,,,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,


add pbdb taxa data

In [12]:
for index, row in genus_df.iterrows():        
    if index > 5:
        continue
    
    if index % 50 == 0:
        print(index, end=' ')
        
    # if pd.notna(row['pbdb_taxon_id']):
    #     continue

    time.sleep(0.25)

        
    url =  PBDB_TAXA_NAME +  row['genus name']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            genus_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            # round = 0
            # get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)

           

0 

In [13]:
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Abas,374615,Abas,genus
1,Abathomphalus,758,Abathomphalus,genus
2,Abies,55065,Abies,genus
3,Abutilon,454155,Abutilon,genus
4,Abyssamina,762,Abyssamina,genus


create genus csv

In [229]:
genus_df.to_csv(genus_path, index=False)

## create taxa list with pbdb info for the PIs

In [42]:
genus_df = pd.read_csv(genus_path, dtype= str)
log_df(genus_df)
# 1707

(1707, 4)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,Abas,374615.0,Abas,genus
1,Abathomphalus,758.0,Abathomphalus,genus
2,Abies,55065.0,Abies,genus
3,Abutilon,,,
4,Abyssamina,762.0,Abyssamina,genus


In [43]:
unapproved_df = pd.read_csv(taxa_path)

log_df(unapproved_df)
# 7763

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


merge NOAA unapproved taxa with pbdb data

In [44]:
merged_df = pd.merge(LIMS_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

log_df(merged_df)

(7763, 11)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_merge_pbdb
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only,374615.0,Abas,genus,both
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only,758.0,Abathomphalus,genus,both
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only,55065.0,Abies,genus,both
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only,,,,both
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only,762.0,Abyssamina,genus,both


add columns 

In [45]:
merged_df['name'] = merged_df['verbatim_name'].str.strip()
merged_df['Comment'] = np.nan
merged_df['Notes (change to Internal only notes?)'] = np.nan
merged_df['Any taxon above genus'] = np.nan
merged_df['genus modifier'] = ''
merged_df['subgenera modifier'] = np.nan
merged_df['subgenera name'] = np.nan
merged_df['species modifier'] = np.nan
merged_df['subspecies modifier'] = np.nan
merged_df['non-taxa descriptor'] = np.nan
merged_df['comments'] = np.nan
merged_df['Corrections to pbdb_taxon_rank'] = np.nan


remove (q) from name, and add (q) to genus modifier

In [46]:
for index, row in merged_df.iterrows():
    if '(q)' in row['name']:
        
        merged_df.at[index,'name']=re.sub('(.*?) \(q\)', r'? \1', row['name'])
        merged_df.at[index,'genus modifier'] = '?'
        
log_df(merged_df)

(7763, 23)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,...,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,subgenera modifier,subgenera name,species modifier,subspecies modifier,non-taxa descriptor,comments,Corrections to pbdb_taxon_rank
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only,374615.0,Abas,genus,...,,,,,,,,,,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only,758.0,Abathomphalus,genus,...,,,,,,,,,,
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only,55065.0,Abies,genus,...,,,,,,,,,,
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only,,,,...,,,?,,,,,,,
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only,762.0,Abyssamina,genus,...,,,,,,,,,,


In [47]:
merged_df.columns

Index(['verbatim_name', 'taxon_group', 'genus name', 'simplified_name',
       'species name', 'subspecies name', '_merge_approved', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', '_merge_pbdb', 'name', 'Comment',
       'Notes (change to Internal only notes?)', 'Any taxon above genus',
       'genus modifier', 'subgenera modifier', 'subgenera name',
       'species modifier', 'subspecies modifier', 'non-taxa descriptor',
       'comments', 'Corrections to pbdb_taxon_rank'],
      dtype='object')

reorder columns and sort rows

In [48]:
merged_df = merged_df.reindex(columns=[
    'taxon_group', 'verbatim_name', 'name', 'Comment',
    'Notes (change to Internal only notes?)',
    # 'Any taxon above genus modifier', 
    'Any taxon above genus', 
    'genus modifier', 'genus name', 
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 
    'subspecies modifier', 'subspecies name',
    'non-taxa descriptor', 
    'comments',
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 
    '_simplified_name','_merge_approved','_merge_pbdb'
    # 'Corrections to pbdb_taxon_rank',
    # 'family_taxon_id', 'family_taxon_name',
    # 'superfamily_taxon_id', 'superfamily_taxon_name',
    # 'order_taxon_id', 'order_taxon_name',
    # 'class_taxon_id', 'class_taxon_name',
    # 'phylum_taxon_id', 'phylum_taxon_name',
    # 'kingdom_taxon_id', 'kingdom_taxon_name',
    # 'unranked clade_taxon_id', 'unranked clade_taxon_name'
])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

log_df(merged_df)

(7763, 22)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
4,benthic_forams,Abyssamina incisa,Abyssamina incisa,,,,,Abyssamina,,,...,,,,,762,Abyssamina,genus,,left_only,both
131,benthic_forams,Adercotryma glomeratum,Adercotryma glomeratum,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
132,benthic_forams,Adercotryma sp.,Adercotryma sp.,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
144,benthic_forams,Alabamina decorata,Alabamina decorata,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both
145,benthic_forams,Alabamina haitiensis,Alabamina haitiensis,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both


In [49]:
merged_df.to_csv(taxa_pbdb_path, index=False)

## fix taxalist from PIs

update google sheet taxa list from PIs to deal with (q) in verbatim name. only need to do this once.

In [45]:
input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_2021-08-05.csv'

taxa_df = pd.read_csv(input_file, dtype=str)
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [46]:

for index, row in taxa_df.iterrows():
    # don't overwrite existing name
    if isinstance(row['name'], str) :
        continue
    # if verbatim name has '(q)'
    if '(q)' in row['verbatim_name']:
        # set 'name' to 'verbatim name' without '(q)' 
        taxa_df.at[index,'name']=re.sub('(.*?) ?\(q\)', r'? \1', row['verbatim_name'])
        # set 'genus modifier' to '?'
        taxa_df.at[index,'genus modifier'] = '?'

        
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [47]:
taxa_df.to_csv(OUTPUT_DIR/'taxa'/'draft'/'NOAA'/'google_sheet_taxa_lists_2021-08-05.csv', index=False)