# Create Taxa list csv for missing taxa for micropal 4

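Create a CSV of the taxa names that appear as column headers in the Micropal 4 files but are missing from both the existing LIMS taxa crosswalk and the NOAA taxa list, then add PBDB taxonomy for each genus.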

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path
import time
import requests

import pandas as pd
import numpy as np

from scripts.shared_utils import extract_taxon_group_from_filename
from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name
)
from scripts.normalize_taxa import add_normalized_name_column, taxon_name_parser
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME
from scripts.shared_utils import (
    log_df
)

In [2]:
# Root directory of the cleaned data.
base_dir = CLEAN_DATA_DIR

# Inputs: the cleaned Micropal 4 CSVs and the metadata CSV that lists them.
micropal_4 = CLEAN_DATA_DIR / 'LIMS' / 'Micropal_CSV_4'
metadata_path = OUTPUT_DIR / 'metadata' / 'LIMS' / 'Micropal_changes_4.csv'

# Reference taxa lists used to decide which names are already known.
LIMS_taxa_file = OUTPUT_DIR / 'taxa' / 'LIMS' / 'taxa_crosswalk_2022-08-08.csv'
NOAA_taxa_file = RAW_DATA_DIR / 'PI_processed_files' / 'NOAA_taxa_lists_taxa_list_2022-02-24.csv'

# Outputs: dated draft files written by this notebook.
date = '2022-08-08'
draft_dir = OUTPUT_DIR / 'taxa' / 'draft' / 'LIMS'
taxa_file = draft_dir / f"micropal_4_taxa_{date}.csv"
genus_file = draft_dir / f"micropal_4_genus_{date}.csv"
taxa_pbdb_file = draft_dir / f'micropal_4_taxa_pbdb_{date}.csv'

In [3]:
# Every cleaned Micropal 4 CSV; show the first three as a sanity check.
clean_csvs = list(micropal_4.glob("*.csv"))

clean_csvs[:3]

[PosixPath('../../../output/cleaned_data/LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv'),
 PosixPath('../../../output/cleaned_data/LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv'),
 PosixPath('../../../output/cleaned_data/LIMS/Micropal_CSV_4/372_U1517C_planktic_forams.csv')]

In [5]:
# Short aliases used by the cells below.
clean_data_path, metadata_file = CLEAN_DATA_DIR, metadata_path

## Create a csv of all taxa

In [6]:
# Load the per-file metadata; its 'path' and 'taxon_group' columns drive the loops below.
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(137, 12)


Unnamed: 0,file,path,taxon_group,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,update_sample_col,update_top_bottom,add_missing_cols
0,320_U1334_benthic_forams.csv,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False
1,361_U1479B_nannofossils.csv,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,nannofossils,True,True,False,True,False,False,False,False,False
2,372_U1517C_planktic_forams.csv,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,planktic_forams,False,False,False,True,False,False,False,False,False
3,317_U1352_planktic_forams.csv,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,planktic_forams,False,False,False,True,False,False,True,False,False
4,323_U1339_palynology.csv,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,palynology,False,False,False,True,False,False,True,False,False


Get the headers of all columns that contain data.

In [7]:
# Collect the whitespace-stripped header of every column that holds at least
# one value, across all files listed in the metadata.
all_columns = set()
for path in metadata['path']:
    csv_path = clean_data_path/path
    df = csv_cleanup(pd.read_csv(csv_path, dtype=str), csv_path)
    # Columns that are empty in every row carry no data and are ignored.
    df = df.dropna(axis='columns', how='all')
    all_columns |= {col.strip() for col in df.columns}

In [8]:
# Number of distinct (stripped) column headers that contain data.
len(all_columns)

1603

In [9]:
# List copy of the headers with surrounding whitespace removed.
strip_cols = list(map(str.strip, all_columns))

Get the processed LIMS taxa.

In [10]:
# Verbatim names already present in the processed LIMS taxa crosswalk.
# The column is read directly instead of looping over rows with iterrows();
# missing values are dropped because a NaN can never equal a column header.
existing_taxa_df = pd.read_csv(LIMS_taxa_file)
existing_LIMS_taxa = set(existing_taxa_df['verbatim_name'].dropna())

len(existing_LIMS_taxa)

5269

Get the NOAA taxa.

In [11]:
# Verbatim names already present in the NOAA taxa list.
# The column is read directly instead of looping over rows with iterrows();
# missing values are dropped because a NaN can never equal a column header.
existing_taxa_df = pd.read_csv(NOAA_taxa_file)
existing_NOAA_taxa = set(existing_taxa_df['verbatim_name'].dropna())

len(existing_NOAA_taxa)

7758

In [12]:
# Column headers that are NOT taxon names (sample location, depths, abundance,
# preservation, zones, comments, unnamed columns, ...). They are subtracted
# from all_columns to leave only taxon-like headers.
# NOTE: all_columns holds whitespace-stripped headers, so entries below that
# carry trailing spaces (e.g. 'Abundance ') cannot match any of them.
nontaxa = {
 'Abundance',
 'Abundance ',
 'Abundance (%)',
 'Abundances',
 'Age',
 'Age:',
 'Benthic abundance',
 'Biozone name',
 'Biozone name (short)',
 'Bottom (cm)',
 'Bottom (m CSF-A)',
 'Bottom CSF-A (m)',
 'Bottom Depth (m)',
 'Bottom Depth (m) CSF-A',
 'Bottom Depth CSF-A (m)',
 'Bottom Depth [CFS m]',
 'Bottom Depth [m]',
 'Bottom Offset (cm) on Parent Sample',
 'Bottom [cm]',
 'Bottom depth CSF-B (m)',
 'Bottom depth CSF-B (m):',
 'Bottom interval (cm)',
 'COMMENTS',
 'Comments',
 'Core',
 'Core Type',
 'Core Type - Section',
 'Core type',
 'Core,    section',
 'Core, Section',
 'Core, Section, Interval',
 'Core, section',
 'Core, section, interval',
 'Core, section, interval (cm)',
 'Datum age average (Ma)',
 'Datum name',
 'Datum type',
 'Depth (cm)',
 'Depth (csf)',
 'Depth (m) CSF-A',
 'Depth CSF (m)',
 'Depth CSF-A (m)',
 'Depth Method',
 'Depth bottom CSF-A (m)',
 'Depth m (m csf)',
 'Depth top CSF-A (m)',
 'Exp',
 'Expedition',
 'Expedition ',
 'Expedition, site, hole, core, section, interval (cm):',
 'Foraminferal preservation',
 'Foraminiferal abundance',
 'Foraminiferal preservation',
 'Group Abundance',
 'Group abundance',
 'Half',
 'Hole',
 'Hole, Core, Section',
 'Hole.1',
 'IRD',
 'Interval (bottom)',
 'Interval (top)',
 'Interval Top (cm) on SHLF',
 'Interval Bot (cm) on SHLF',   
 'Miscellaneous',
 'Nannofossil Zone',
 'Nannofossil abundance',
 'Nannofossil comment',
 'Oberservations',
 'Observations',
 'Original Bottom Depth (m)',
 'Original Top Depth (m)',
 'Other fossil material',
 'Other observations',
 'Other taxa',
 'Preservation',
 'Presevation',
 'REMARKS',
 'Remarks',
 'Sample',
 'Section',
 'Section Half',
 'Secton Half',
 'Site',
 'Top (cm)',
 'Top (m CSF-A)',
 'Top CSF-A (m)',
 'Top Depth (CSF m)',
 'Top Depth (m)',
 'Top Depth (m) CSF-A',
 'Top Depth CFS (m)',
 'Top Depth CSF-A (m)',
 'Top Depth [CFS m]',
 'Top Depth [CSF m]',
 'Top Depth [m]',
 'Top Offset (cm) on Parent Sample',
 'Top [cm]',
 'Top depth CSF (m)',
 'Top depth CSF-B (m)',
 'Top depth CSF-B (m):',
 'Top depth [CSF m]',
 'Top interval (cm)',
 'Total pollen',
 'Total radiolarians',
 'Type',
 'Unnamed: 148',
 'Unnamed: 21',
 'Unnamed: 3',
 'Unnamed: 61',
 'Unnamed: 81',
 'Zone',
 'Zone name (short)',
 'Zone/Subzone',
 'bottom (cm)',
 'bottom interval (cm)',
 'comments',
 'core, section',
 'depth Bottom (m CSF-A)',
 'depth Bottom (m)',
 'depth Bottom CSF-A (m)',
 'depth CSF-A',
 'depth CSF-A (m)',
 'depth CSF-A Bottom (m)',
 'depth CSF-A Top (m)',
 'depth Top (m CSF-A)',
 'depth Top (m)',
 'depth Top CSF-A (m)',
 'interval (cm)',
 'mean depth (mbsf)',
 'preservation',
 'section',
 'top (cm)',
 'top interval (cm)',
'A/W',
'eodp_id',
"Core, Type, Section",
'Bathymetry',
'Diatom Zone (NPD) in Yanagisawa and Akiba (1998)',
'Diatom Zone (Yanagisawa and Akiba, 1998)',
'Gen. et sp. indet',
'Marine',
'Martini (1971) Zone',
'Planktic foraminiferal %',
'Planktic foraminiferal (%)',
'Radiolarian zone',
'Radiolarian zone/subzone',
'Silicoflagellate Zone in Ling (1992)',
'Tintinids',
'Zone in Ling (1992)',
''    
}

# Taxon-like headers: every header that is not a known non-taxa header.
all_taxa_names = all_columns  - nontaxa
# Missing taxa: taxon-like headers found in neither the LIMS nor the NOAA list.
taxa_names = all_taxa_names - existing_LIMS_taxa - existing_NOAA_taxa

In [13]:
# Counts: taxon-like headers, taxa missing from both reference lists, non-taxa entries.
print(len(all_taxa_names), len(taxa_names), len(nontaxa))


1515 14 150


In [14]:
# Taxa names found in neither the LIMS crosswalk nor the NOAA taxa list.
taxa_names

{'Actinocyclus ingens nodus',
 'Actinocyclus senarius',
 'Actinocyclus vulgaris',
 'Aulacoseira sp.',
 'Coccolithus streckeri',
 'Discoaster spp. (six-rayed)',
 'Distephanus boliviensis',
 'Dorcadospyris scambos',
 'Globoconella explicationis',
 'Impagidinium spp.',
 'Parasubbotina griffinae',
 'Reticulofenestra sp.',
 'Tenuitella sp.',
 'Tenuitella spp.'}

### Create csv

Create a taxa list csv that contains all the taxon names and the associated taxon group.

In [15]:
# Pair every missing taxon name with the taxon group of each file it occurs in.
# Entries are "<verbatim name>|<taxon group>" strings (the following cell splits
# them on "|"); the set removes repeats across files of the same group.
taxa_and_group = set()

for index, row in metadata.iterrows():
    file = clean_data_path/row['path']

    # This one file is read with header=1, i.e. its column names are taken
    # from the second line. The check must inspect the file of the CURRENT
    # row; testing a leftover loop variable from an earlier cell would apply
    # one and the same header setting to every file.
    header = 1 if '317_U1351_planktic_forams.csv' in str(file) else 0

    # nrows=0: only the header row is needed, not the data.
    df = pd.read_csv(file, dtype=str, header=header, nrows=0)

    for col in df.columns:
        # taxa_names was built from whitespace-stripped headers, so strip
        # here as well before comparing.
        name = col.strip()
        if name in taxa_names:
            taxa_and_group.add(f'{name}|{row["taxon_group"]}')


len(taxa_and_group)

14

In [16]:
# Name parts that are copied from the parser result when present.
all_ranks = [
    'genus modifier', 'genus name', 'species modifier', 'species name',
    'subspecies modifier', 'subspecies name', 'non-taxa descriptor'
]

# One record per "<verbatim name>|<taxon group>" entry.
taxa_list = []
for taxon in taxa_and_group:
    if pd.isna(taxon):
        continue

    taxon_name, taxon_group = taxon.split('|')
    taxon_name_parts = taxon_name_parser(taxon_name)

    data = {'taxon_group': taxon_group, 'verbatim_name': taxon_name}
    data.update({
        rank: taxon_name_parts[rank]
        for rank in all_ranks
        if rank in taxon_name_parts
    })
    taxa_list.append(data)

len(taxa_list)

14

In [17]:
# Tabulate the records, ordered by taxon group and then by verbatim name.
taxa_df = pd.DataFrame(taxa_list).sort_values(['taxon_group', 'verbatim_name'])
taxa_df.head()

Unnamed: 0,taxon_group,verbatim_name,genus name,species name,non-taxa descriptor,subspecies name
9,diatoms,Actinocyclus ingens nodus,Actinocyclus,ingens,,nodus
4,diatoms,Actinocyclus senarius,Actinocyclus,senarius,,
8,diatoms,Actinocyclus vulgaris,Actinocyclus,vulgaris,,
5,diatoms,Aulacoseira sp.,Aulacoseira,sp.,,
2,dinoflagellates,Impagidinium spp.,Impagidinium,spp.,,


In [18]:
# Fixed column order for the draft file; reindex also creates (empty) any
# listed column that the parser never produced.
cols = [
    'taxon_group', 'verbatim_name',
    'genus name', 'species name', 'subspecies name',
    'species modifier', 'non-taxa descriptor',
    'subspecies modifier', 'genus modifier',
]

taxa_df = taxa_df.reindex(columns=cols)
taxa_df.head()

Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier,genus modifier
9,diatoms,Actinocyclus ingens nodus,Actinocyclus,ingens,nodus,,,,
4,diatoms,Actinocyclus senarius,Actinocyclus,senarius,,,,,
8,diatoms,Actinocyclus vulgaris,Actinocyclus,vulgaris,,,,,
5,diatoms,Aulacoseira sp.,Aulacoseira,sp.,,,,,
2,dinoflagellates,Impagidinium spp.,Impagidinium,spp.,,,,,


In [19]:
# Write the draft list of missing taxa (without the DataFrame index).
taxa_df.to_csv(taxa_file, index=False)

# Add PBDB data

In [20]:
# Re-load the draft taxa list from disk, reading every column as text.
taxa_df = pd.read_csv(taxa_file, dtype=str)
log_df(taxa_df)

(14, 9)


Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier,genus modifier
0,diatoms,Actinocyclus ingens nodus,Actinocyclus,ingens,nodus,,,,
1,diatoms,Actinocyclus senarius,Actinocyclus,senarius,,,,,
2,diatoms,Actinocyclus vulgaris,Actinocyclus,vulgaris,,,,,
3,diatoms,Aulacoseira sp.,Aulacoseira,sp.,,,,,
4,dinoflagellates,Impagidinium spp.,Impagidinium,spp.,,,,,


In [21]:
# One row per distinct genus, in order of first appearance.
unique_genera = taxa_df['genus name'].unique()
genus_df = pd.DataFrame({'genus name': unique_genera})

log_df(genus_df)

(11, 1)


Unnamed: 0,genus name
0,Actinocyclus
1,Aulacoseira
2,Impagidinium
3,Coccolithus
4,Discoaster


In [22]:
# Look up every genus in the Paleobiology Database (PBDB) and store its id,
# name and rank in genus_df; get_parent_taxa is then called for the matched record.
# Rows that already carry a pbdb_taxon_id are skipped.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block forever

for index, row in genus_df.iterrows():
    if 'pbdb_taxon_id' in row and pd.notna(row['pbdb_taxon_id']):
        continue

    genus = row['genus name']
    # A missing genus (NaN) cannot be appended to the URL; nothing to look up.
    if pd.isna(genus):
        continue

    # Pause between consecutive requests.
    time.sleep(0.5)

    # Lightweight progress indicator.
    if index % 50 == 0:
        print(index, end=' ')

    url = PBDB_TAXA_NAME + genus

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # Report the failure and keep going so one bad request does not
        # abort the whole lookup.
        print(f'\nrequest failed for {genus}: {error}')
        continue

    if response.status_code != 200:
        print(f'\nPBDB returned status {response.status_code} for {genus}')
        continue

    data = response.json().get("records", [])
    # Only an unambiguous match (exactly one record) is accepted; genera with
    # zero or several records are left without PBDB data.
    if len(data) == 1:
        genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
        genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
        genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]

        # The fourth argument (0) is passed as a literal rather than through a
        # variable named `round`, which would shadow the builtin round().
        get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], 0, index, None)

        

0 

In [23]:
# Inspect the genus table after the PBDB lookup.
log_df(genus_df)

(11, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Actinocyclus,82146,Actinocyclus,genus,71207.0,Hemidiscaceae,426780.0,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,
1,Aulacoseira,432983,Aulacoseira,genus,427312.0,Aulacoseiraceae,426698.0,Aulacoseirales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,
2,Impagidinium,276906,Impagidinium,genus,321603.0,Gonyaulacaceae,321606.0,Gonyaulacales,321578,Dinophyceae,,,,,,
3,Coccolithus,87684,Coccolithus,genus,,,,,418920,Coccolithophyceae,87644.0,Haptophyta,28595.0,Life,,
4,Discoaster,87682,Discoaster,genus,,,,,418920,Coccolithophyceae,87644.0,Haptophyta,28595.0,Life,,


In [24]:
# Fixed column order for the genus file; reindex also creates (empty) any
# listed column that the PBDB lookup never filled.
cols = [
    'genus name',
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name',
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name',
    'kingdom_taxon_id', 'kingdom_taxon_name',
]

genus_df = genus_df.reindex(columns=cols)
genus_df.head()

Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Actinocyclus,82146,Actinocyclus,genus,71207.0,Hemidiscaceae,426780.0,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,
1,Aulacoseira,432983,Aulacoseira,genus,427312.0,Aulacoseiraceae,426698.0,Aulacoseirales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,
2,Impagidinium,276906,Impagidinium,genus,321603.0,Gonyaulacaceae,321606.0,Gonyaulacales,321578,Dinophyceae,,,,,,
3,Coccolithus,87684,Coccolithus,genus,,,,,418920,Coccolithophyceae,87644.0,Haptophyta,28595.0,Life,,
4,Discoaster,87682,Discoaster,genus,,,,,418920,Coccolithophyceae,87644.0,Haptophyta,28595.0,Life,,


In [25]:
# Write the genus table with its PBDB data (without the DataFrame index).
genus_df.to_csv(genus_file, index=False)

## Create taxa list with PBDB info for the PIs

In [36]:
# Re-load the genus table from disk, reading every column (ids included) as text.
genus_df = pd.read_csv(genus_file, dtype= str)
log_df(genus_df)

(11, 16)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Actinocyclus,82146,Actinocyclus,genus,71207.0,Hemidiscaceae,426780.0,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,
1,Aulacoseira,432983,Aulacoseira,genus,427312.0,Aulacoseiraceae,426698.0,Aulacoseirales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,
2,Impagidinium,276906,Impagidinium,genus,321603.0,Gonyaulacaceae,321606.0,Gonyaulacales,321578,Dinophyceae,,,,,,
3,Coccolithus,87684,Coccolithus,genus,,,,,418920,Coccolithophyceae,87644.0,Haptophyta,28595.0,Life,,
4,Discoaster,87682,Discoaster,genus,,,,,418920,Coccolithophyceae,87644.0,Haptophyta,28595.0,Life,,


In [37]:
# Re-load the draft taxa list. Every column is read as text, like the other
# read-backs of the draft files, so that neither the merge key nor any
# name part is type-inferred by pandas.
unapproved_df = pd.read_csv(taxa_file, dtype=str)

log_df(unapproved_df)

(14, 9)


Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier,genus modifier
0,diatoms,Actinocyclus ingens nodus,Actinocyclus,ingens,nodus,,,,
1,diatoms,Actinocyclus senarius,Actinocyclus,senarius,,,,,
2,diatoms,Actinocyclus vulgaris,Actinocyclus,vulgaris,,,,,
3,diatoms,Aulacoseira sp.,Aulacoseira,sp.,,,,,
4,dinoflagellates,Impagidinium spp.,Impagidinium,spp.,,,,,


In [38]:
# Attach the genus-level PBDB data to every taxon. A left join keeps taxa
# whose genus has no PBDB data; '_merge_pbdb' records where each row matched.
merged_df = unapproved_df.merge(
    genus_df,
    how='left',
    on='genus name',
    indicator='_merge_pbdb',
)

log_df(merged_df)

(14, 25)


Unnamed: 0,taxon_group,verbatim_name,genus name,species name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier,genus modifier,pbdb_taxon_id,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,kingdom_taxon_id,kingdom_taxon_name,_merge_pbdb
0,diatoms,Actinocyclus ingens nodus,Actinocyclus,ingens,nodus,,,,,82146,...,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,,both
1,diatoms,Actinocyclus senarius,Actinocyclus,senarius,,,,,,82146,...,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,,both
2,diatoms,Actinocyclus vulgaris,Actinocyclus,vulgaris,,,,,,82146,...,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,,both
3,diatoms,Aulacoseira sp.,Aulacoseira,sp.,,,,,,432983,...,Aulacoseirales,69587,Bacillariophyceae,432613.0,Ochrophyta,28595.0,Life,,,both
4,dinoflagellates,Impagidinium spp.,Impagidinium,spp.,,,,,,276906,...,Gonyaulacales,321578,Dinophyceae,,,,,,,both


In [40]:
# List the columns available after the merge.
merged_df.columns

Index(['taxon_group', 'verbatim_name', 'genus name', 'species name',
       'subspecies name', 'species modifier', 'non-taxa descriptor',
       'subspecies modifier', 'genus modifier', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       '_merge_pbdb'],
      dtype='object')

In [41]:
# Final column layout. Columns not present in merged_df are created empty by
# reindex; columns not listed here (e.g. '_merge_pbdb') are dropped.
pi_columns = [
    'taxon_group', 'verbatim_name', 'name',
    'Comment', 'Notes (change to Internal only notes?)',
    'Any taxon above genus',
    'genus modifier', 'genus name',
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name',
    'subspecies modifier', 'subspecies name',
    'non-taxa descriptor',
    'comments',
    'pbdb_taxon_id',
    'Corrections to pbdb_taxon_id',
    'pbdb_taxon_name', 'pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name',
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name',
    'kingdom_taxon_id', 'kingdom_taxon_name',
    'unranked clade_taxon_id', 'unranked clade_taxon_name',
]

merged_df = (
    merged_df
    .reindex(columns=pi_columns)
    .sort_values(by=['taxon_group', 'verbatim_name'])
)

log_df(merged_df)

(14, 32)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,diatoms,Actinocyclus ingens nodus,,,,,,Actinocyclus,,,...,426780,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,,,28595.0,Life
1,diatoms,Actinocyclus senarius,,,,,,Actinocyclus,,,...,426780,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,,,28595.0,Life
2,diatoms,Actinocyclus vulgaris,,,,,,Actinocyclus,,,...,426780,Coscinodiscales,69587,Bacillariophyceae,432613.0,Ochrophyta,,,28595.0,Life
3,diatoms,Aulacoseira sp.,,,,,,Aulacoseira,,,...,426698,Aulacoseirales,69587,Bacillariophyceae,432613.0,Ochrophyta,,,28595.0,Life
4,dinoflagellates,Impagidinium spp.,,,,,,Impagidinium,,,...,321606,Gonyaulacales,321578,Dinophyceae,,,,,,


In [35]:
# Write the taxa list with PBDB data (without the DataFrame index).
# NOTE(review): this cell's execution count is out of sequence with the cells
# above; re-run the notebook top to bottom to confirm the file matches merged_df.
merged_df.to_csv(taxa_pbdb_file, index=False)