#  NOAA DSDP taxa list
## 1-96 taxa

Create list of taxa for NOAA DSDP files. Compare NOAA taxa with the LIMS approved taxa that the PIs have already approved in order create a list of unapproved NOAA taxa. Add PBDB data to unapproved taxa.

In [1]:
import sys
import csv
import glob
import os
import requests
import re
import time 

sys.path.append('../../')
import pandas as pd
import numpy as np

# import db 
import scripts.normalize_taxa as nt
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR
from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME

In [2]:
# date = '2021-07-28'
# date = '2021-08-05'
date='2021-11-29'

base_dir = CLEAN_DATA_DIR

input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_{date}.csv'
input_pbdb_file = OUTPUT_DIR/'taxa'/'NOAA'/f"PI_normalized_taxa_list_with_pbdb_{date}.csv"


metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'
LIMS_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_list_2021-07-28.csv'


merged_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_merged_{date}.csv'
merged2_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_merged2_{date}.csv'

all_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'all_taxa_list_{date}.csv'

taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_{date}.csv'
genus_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'genus_pbdb_{date}.csv'
taxa_pbdb_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_list_pbdb_{date}.csv'

In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## Create NOAA taxa list

In [4]:
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(4477, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


read all the taxa files to get unique taxa names

In [5]:
# 9933
taxa = set()

for index, row in metadata.iterrows():
    if row['type'] == 'taxa':
        df = pd.read_csv(base_dir/row['path'])
        df.dropna(axis=0, inplace=True, how='all')
        df['temp'] = df['fossil'].str.strip() +  '|' + row['taxon_group']
        
        taxa.update(df['temp'])
        
len(taxa)
#9933

9933

In [7]:
list(taxa)[0:5]

[nan,
 'Boldia sp.|benthic_foraminfera',
 'Dictyocha perlaevis flexatella|silicoflagellates',
 'Cladogramma californicum|diatoms',
 'Spiroplectammina biformis|benthic_foraminfera']

In [8]:
# 9932
taxa_list = []

for taxon in taxa:
    if not pd.isna(taxon):
        taxon_name, taxon_group = taxon.split('|')
        
        # remove (text) (q) from taxon name    
        simplified_name = re.sub('\(.*?\)$', '', taxon_name).strip()
        taxon_name_parts = simplified_name.split(' ')

        data = {'verbatim_name': taxon_name, 
                'taxon_group': taxon_group, 
                'genus name': taxon_name_parts[0],
                'simplified_name': simplified_name}
        if len(taxon_name_parts) > 1:
            data['species name'] = taxon_name_parts[1]
        if len(taxon_name_parts) == 3:
            data['subspecies name'] = taxon_name_parts[2]

        taxa_list.append(data)
        
len(taxa_list)

9932

In [9]:
noaa_taxa_df = pd.DataFrame(taxa_list).sort_values('verbatim_name')
log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
4650,Abas wittii,diatoms,Abas,Abas wittii,wittii,
585,Abathomphalus intermedius,planktic_foraminfera,Abathomphalus,Abathomphalus intermedius,intermedius,
6324,Abathomphalus mayaroensis,planktic_foraminfera,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
2806,Abies sp.,pollen,Abies,Abies sp.,sp.,
3588,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


## compare and replace taxon groups 

replace NOAA taxon groups with LIMS taxon groups

In [11]:
LIMS_taxa_df = pd.read_csv(LIMS_taxa_path)

In [12]:
LIMS_groups = list(LIMS_taxa_df['taxon_group'].unique())
LIMS_groups.sort()
LIMS_groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [13]:
noaa_groups = list(noaa_taxa_df['taxon_group'].unique())
noaa_groups.sort()
noaa_groups

['benthic_foraminfera',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'phytoliths',
 'planktic_foraminfera',
 'pollen',
 'radiolarians',
 'silicoflagellates']

In [14]:
set(noaa_groups) -  set(LIMS_groups)

{'benthic_foraminfera', 'phytoliths', 'planktic_foraminfera', 'pollen'}

In [15]:
noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],
                                    ['benthic_forams', 'planktic_forams'])
noaa_taxa_df.head()

Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
4650,Abas wittii,diatoms,Abas,Abas wittii,wittii,
585,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,
6324,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
2806,Abies sp.,pollen,Abies,Abies sp.,sp.,
3588,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


In [16]:
noaa_taxa_df.to_csv(all_taxa_path, index=False)

## create csv that compares NOAA taxa list with LIMS approved taxa

get NOAA taxa

In [17]:
# 9932
noaa_taxa_df = pd.read_csv(all_taxa_path)

log_df(noaa_taxa_df)

(9932, 6)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,
3,Abies sp.,pollen,Abies,Abies sp.,sp.,
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,


get LIMS taxa

In [18]:
# 4209
LIMS_taxa_df = pd.read_csv(LIMS_taxa_path)
log_df(LIMS_taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


add simplified_name without descriptor

In [19]:
del LIMS_taxa_df['normalized_name'] 


In [20]:
LIMS_taxa_df = nt.add_normalized_name_column(LIMS_taxa_df, 
                                                 include_descriptor=False, 
                                                 col_name="simplified_name")

In [21]:
LIMS_taxa_df[LIMS_taxa_df['non-taxa descriptor'].notna()].head(2)

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,taxon_group,simplified_name
28,,,Globigerinoides,,,,sacculifer,,,without sac,planktic_forams,Globigerinoides sacculifer
201,,,Bolivina,,,cf.,crenulata,,,crenulate,benthic_forams,Bolivina cf. crenulata


In [22]:
LIMS_taxa_df = pd.DataFrame(LIMS_taxa_df[['taxon_group', 'simplified_name']])
log_df(LIMS_taxa_df)

(4209, 2)


Unnamed: 0,taxon_group,simplified_name
0,benthic_forams,Euuvigerina miozea
1,benthic_forams,Euuvigerina rodleyi
2,benthic_forams,Foraminifera indet.
3,benthic_forams,Pleurostomellidae indet.
4,benthic_forams,Ostracoda indet.


### merge NOAA taxa with LIMS taxa

In [23]:
# 10109 
merged_df = pd.merge(noaa_taxa_df, LIMS_taxa_df,  
                     on=['simplified_name', 'taxon_group' ], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged_df)


(10109, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [24]:
# 9932
merged_df = merged_df.drop_duplicates()
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [25]:
merged_df.to_csv(merged_path, index=False)


### compare merge methods
do merge on simplified_name without taxon groups

In [26]:
# 10114 
merged2_df = pd.merge(noaa_taxa_df, LIMS_taxa_df,  
                     on=['simplified_name'], 
                     how='left',
                     indicator='_merge_approved')

log_df(merged2_df)


(10114, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,planktic_forams,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,,left_only


In [27]:
# 9937
merged2_df = merged2_df.drop_duplicates()
log_df(merged2_df)

(9937, 8)


Unnamed: 0,verbatim_name,taxon_group_x,genus name,simplified_name,species name,subspecies name,taxon_group_y,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,planktic_forams,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,,left_only


the reason for the count difference is because the LIMS taxa list sometimes puts a taxa in two groups

NOAA: Selenopemphix nephroides - dinoflagellates 
LIMS: Selenopemphix nephroides - dinoflagellates, palynology

## Created taxa list with unapproved NOAA taxa

Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved.

In [28]:
merged_df = pd.read_csv(merged_path)
log_df(merged_df)

(9932, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abathomphalus mayaroensis,planktic_forams,Abathomphalus,Abathomphalus mayaroensis,mayaroensis,,both
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only


In [29]:
# (7763, 7)
LIMS_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()

log_df(LIMS_taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
3,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
4,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
5,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


create csv of unapproved NOAA taxa

In [30]:
LIMS_taxa_df.to_csv(taxa_path, index=False)

## Create genus list with PBDB data for NOAA taxa that aren't approved

Look up the genus for unapproved taxa in PBDB

In [31]:
taxa_df = pd.read_csv(taxa_path)
log_df(taxa_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


create a dataframe of unique genera

In [32]:
genus_df = pd.DataFrame(taxa_df['genus name'].unique(), columns=['genus name'])

log_df(genus_df)

(1707, 1)


Unnamed: 0,genus name
0,Abas
1,Abathomphalus
2,Abies
3,Abutilon
4,Abyssamina


add pbdb taxa data

In [34]:
for index, row in genus_df.iterrows():        
    if index < 250:
        continue
    time.sleep(0.5)
    
    if index % 50 == 0:
        print(index, end=' ')

        
    url =  PBDB_TAXA_NAME +  row['genus name']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            genus_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)

           

In [None]:
genus_df.head()

create genus csv

In [33]:
genus_df.to_csv(genus_path, index=False)

# update genus file
look for genus that don't have higher pbdb data

In [35]:
genus_df = pd.read_csv(genus_path, dtype=str)

In [36]:
missing_df = genus_df[genus_df['pbdb_taxon_id'].notna() & 
         genus_df['family_taxon_id'].isna() &
         genus_df['order_taxon_id'].isna() &
         genus_df['class_taxon_id'].isna() &
         genus_df['phylum_taxon_id'].isna() &
         genus_df['kingdom_taxon_id'].isna()]

missing_df.shape

(32, 16)

In [37]:
for index, row in missing_df.iterrows():        
    time.sleep(0.5)
    
    
    print(index, row['genus name'])

        
    url =  PBDB_TAXA_NAME +  row['genus name']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            genus_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)


7 Acaciapollenites
94 Anthocorys
151 Australopollis
319 Chenopodipollis
329 Chrysophyta
370 Comasphaeridium
405 Crassosphaera
443 Cyclopsiella
446 Cymatiosphaera
507 Dictyosphaeridium
553 Echitricolporites
746 Helminthopsis
779 Homotryblium
792 Hypericum
861 Leiofusa
862 Leiosphaeridia
974 Membranosphaera
981 Micrhystridium
993 Mimosaceae
1182 Platycystidia
1379 Sagittaria
1396 Schizocolpus
1409 Sethamphora
1458 Spondias
1509 Striatricolpites
1535 Tasmanites
1540 Tectatodinium
1637 Tuberculodinium
1675 Veryhachium
1678 Virola
1737 Nipponocythere
1739 Palaeostomocystis


In [38]:
genus_df.to_csv(genus_path, index=False)

## create taxa list with pbdb info for the PIs

In [39]:
genus_df = pd.read_csv(genus_path, dtype= str)
log_df(genus_df)

(1751, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Abas,441243.0,Abas,genus,69587.0,Bacillariophyceae,426784.0,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,
1,Abathomphalus,758.0,Abathomphalus,genus,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
2,Abies,55065.0,Abies,genus,82141.0,Pinopsida,82140.0,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,
3,Abutilon,,,,,,,,,,,,,,,
4,Abyssamina,762.0,Abyssamina,genus,428504.0,Globothalamea,279579.0,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,


In [40]:
unapproved_df = pd.read_csv(taxa_path)

log_df(unapproved_df)

(7763, 7)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only


merge NOAA unapproved taxa with pbdb data

In [41]:
merged_df = pd.merge(LIMS_taxa_df, genus_df, 
                     on = 'genus name', 
                     how='left',
                     indicator='_merge_pbdb')

log_df(merged_df)

(7763, 23)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,...,order_taxon_name,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,_merge_pbdb
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only,441243.0,Abas,genus,...,Hemiaulales,387088.0,Hemiaulaceae,432613.0,Ochrophyta,,,,,both
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only,758.0,Abathomphalus,genus,...,,,,288974.0,Foraminifera,212476.0,Rhizaria,,,both
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only,55065.0,Abies,genus,...,Pinales,54794.0,Pinaceae,82139.0,Pinophyta,54311.0,Plantae,,,both
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only,,,,...,,,,,,,,,,both
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only,762.0,Abyssamina,genus,...,Rotaliida,103796.0,Chilostomellidae,288974.0,Foraminifera,212476.0,Rhizaria,,,both


add columns 

In [42]:
merged_df['name'] = merged_df['verbatim_name'].str.strip()
merged_df['Comment'] = np.nan
merged_df['Notes (change to Internal only notes?)'] = np.nan
merged_df['Any taxon above genus'] = np.nan
merged_df['genus modifier'] = ''
merged_df['subgenera modifier'] = np.nan
merged_df['subgenera name'] = np.nan
merged_df['species modifier'] = np.nan
merged_df['subspecies modifier'] = np.nan
merged_df['non-taxa descriptor'] = np.nan
merged_df['comments'] = np.nan


remove (q) from name, and add (q) to genus modifier

In [43]:
for index, row in merged_df.iterrows():
    if '(q)' in row['name']:
        
        merged_df.at[index,'name']=re.sub('(.*?) \(q\)', r'? \1', row['name'])
        merged_df.at[index,'genus modifier'] = '?'
        
log_df(merged_df)

(7763, 34)


Unnamed: 0,verbatim_name,taxon_group,genus name,simplified_name,species name,subspecies name,_merge_approved,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,...,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,subgenera modifier,subgenera name,species modifier,subspecies modifier,non-taxa descriptor,comments
0,Abas wittii,diatoms,Abas,Abas wittii,wittii,,left_only,441243.0,Abas,genus,...,,,,,,,,,,
1,Abathomphalus intermedius,planktic_forams,Abathomphalus,Abathomphalus intermedius,intermedius,,left_only,758.0,Abathomphalus,genus,...,,,,,,,,,,
2,Abies sp.,pollen,Abies,Abies sp.,sp.,,left_only,55065.0,Abies,genus,...,,,,,,,,,,
3,Abutilon sp. (q),pollen,Abutilon,Abutilon sp.,sp.,,left_only,,,,...,,,,?,,,,,,
4,Abyssamina incisa,benthic_forams,Abyssamina,Abyssamina incisa,incisa,,left_only,762.0,Abyssamina,genus,...,,,,,,,,,,


reorder columns and sort rows

In [44]:
merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', 'name', 'Comment',
                                       'Notes (change to Internal only notes?)',
                                       'Any taxon above genus', 
                                       'genus modifier', 'genus name', 
                                       'subgenera modifier', 'subgenera name',
                                       'species modifier', 'species name', 
                                       'subspecies modifier', 'subspecies name',
                                       'non-taxa descriptor', 'comments',
                                       'pbdb_taxon_id', 'pbdb_taxon_name',
                                       'pbdb_taxon_rank', '_simplified_name',
                                       '_merge_approved', '_merge_pbdb'
                                      ])

merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)

log_df(merged_df)

(7763, 22)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,_simplified_name,_merge_approved,_merge_pbdb
4,benthic_forams,Abyssamina incisa,Abyssamina incisa,,,,,Abyssamina,,,...,,,,,762,Abyssamina,genus,,left_only,both
131,benthic_forams,Adercotryma glomeratum,Adercotryma glomeratum,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
132,benthic_forams,Adercotryma sp.,Adercotryma sp.,,,,,Adercotryma,,,...,,,,,774,Adercotryma,genus,,left_only,both
144,benthic_forams,Alabamina decorata,Alabamina decorata,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both
145,benthic_forams,Alabamina haitiensis,Alabamina haitiensis,,,,,Alabamina,,,...,,,,,788,Alabamina,genus,,left_only,both


In [45]:
merged_df.to_csv(taxa_pbdb_path, index=False)

## fix taxalist from PIs

update google sheet taxa list from PIs to deal with (q) in verbatim name. only need to do this once.

In [45]:
input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_taxa_lists_taxa_list_2021-08-05.csv'

taxa_df = pd.read_csv(input_file, dtype=str)
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [46]:

for index, row in taxa_df.iterrows():
    # don't overwrite existing name
    if isinstance(row['name'], str) :
        continue
    # if verbatim name has '(q)'
    if '(q)' in row['verbatim_name']:
        # set 'name' to 'verbatim name' without '(q)' 
        taxa_df.at[index,'name']=re.sub('(.*?) ?\(q\)', r'? \1', row['verbatim_name'])
        # set 'genus modifier' to '?'
        taxa_df.at[index,'genus modifier'] = '?'

        
log_df(taxa_df)

(7763, 20)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,Unnamed: 19
0,benthic_forams,Abyssamina incisa,,,,,,Abyssamina,,,,incisa,,,,,762,Abyssamina,genus,
1,benthic_forams,Adercotryma glomeratum,,,,,,Adercotryma,,,,glomeratum,,,,,774,Adercotryma,genus,
2,benthic_forams,Adercotryma sp.,,,,,,Adercotryma,,,,sp.,,,,,774,Adercotryma,genus,
3,benthic_forams,Alabamina decorata,,,,,,Alabamina,,,,decorata,,,,,788,Alabamina,genus,
4,benthic_forams,Alabamina haitiensis,,,,,,Alabamina,,,,haitiensis,,,,,788,Alabamina,genus,


In [47]:
taxa_df.to_csv(OUTPUT_DIR/'taxa'/'draft'/'NOAA'/'google_sheet_taxa_lists_2021-08-05.csv', index=False)