# Normalize taxa list micropal 4

Cleanup the normalized micropal 4 taxa list from the eODP researchers. Add PBDB taxa data.

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
import numpy as np
import requests

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

import scripts.normalize_taxa as nt
import scripts.pbdb as pbdb
from scripts.normalize_data import remove_whitespace

from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME, PBDB_TAXA_ID
from scripts.shared_utils import (
    log_df
)

In [2]:
date = '2022-08-08'

input_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS Micropal_CSV_4_taxa_{date}.csv'

pi_taxa_file_pbdb = OUTPUT_DIR /'taxa'/'LIMS'/f'PI_Micropal_CSV_4_normalized_taxa_list_with_pbdb_{date}.csv'

genus_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"genera_pbdb_lims_4_{date}.csv"
higher_taxa_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"higher_taxa_pbdb_lims_4_{date}.csv"

genus_letter_old_file= OUTPUT_DIR/'taxa'/'draft'/'LIMS'/f"micropal_4_genus_letter_2022-02-24.csv"
genus_letter_file= OUTPUT_DIR/'taxa'/'draft'/'LIMS'/f"micropal_4_genus_letter_{date}.csv"



crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"



## fix abbreviated genus 

In [4]:
df = pd.read_csv(input_file, dtype=str)
df = df.dropna(axis=0, how='all')
if 'abbrev_genus' in df.columns:
    del df['abbrev_genus']
           
log_df(df)
# 695

(695, 32)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddelensis,genus misspelled,,,,Alabaminella,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [5]:
genus_df = pd.read_csv(genus_letter_old_file, dtype=str, usecols=['verbatim_name'])
genus_df['abbrev_genus'] = True
log_df(genus_df)

(34, 2)


Unnamed: 0,verbatim_name,abbrev_genus
0,Calc. Hyaline Foram. gen. et sp. indet.,True
1,Gen. et sp. indet,True
2,O. centrocarpum var. Arctic,True
3,O. centrocarpum var. short processes,True
4,O. centrocarpum-Arctic morphotype,True


In [6]:
df = df.merge(genus_df, on ='verbatim_name', how='left')
df.loc[df['abbrev_genus'].isna() == True, 'abbrev_genus'] = False

log_df(df)

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddelensis,genus misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


set name parts columns to null; reparse name parts column

In [7]:
columns = [
    'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 'subspecies modifier',
    'subspecies name', 'non-taxa descriptor', 'pbdb_taxon_id',
     'pbdb_taxon_name', 'pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
    'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
    'kingdom_taxon_name', 'unranked clade_taxon_id',
    'unranked clade_taxon_name'
]

def reparse_names():
    for index, row in df[df['abbrev_genus'] & df['Name'].notna()].iterrows():    
        taxon_name_parts = nt.taxon_name_parser(row['Name'])

        for col in columns:
            df.at[index, col] = np.nan

        for rank in taxon_name_parts:
            df.at[index, rank] = taxon_name_parts[rank]
  

add pbdb id

In [8]:
def add_pbdb_id():
    for index, row in df[df['abbrev_genus'] & df['Name'].notna()].iterrows(): 
        if index % 10 == 0:
            print(index)

        url =  PBDB_TAXA_NAME +  row['genus name']

        response = requests.get(url)

        if response.status_code == 200:
            data = response.json()["records"]
            if len(data) == 1:
                df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
                df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
                df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]

                round = 0
                get_parent_taxa(df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)


In [9]:
# reparse_names()
# add_pbdb_id()

df.to_csv( pi_taxa_file_pbdb,index=False)

## fix incorect pbdb_taxon_id

incorported  pbdb_taxon_id that the PIs corrected into the taxalist.


In [10]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
df = df.dropna(how="all", axis="index")
df['corrected'] = False

log_df(df)
# 695

(695, 34)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus,corrected
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddelensis,genus misspelled,,,,Alabaminella,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False


In [11]:
tmp = df[df['Corrections to pbdb_taxon_id'].notna() & (df['corrected'] == False)]
tmp['Corrections to pbdb_taxon_id'].unique()

array(['1064', '432650', '68421', '71266', '421517', '54452', '446918',
       '2894', '71284', '82180'], dtype=object)

In [12]:
pbdb.fix_pbdb_id(df, '1064', 1064)
pbdb.fix_pbdb_id(df, '432650', 432650)
pbdb.fix_pbdb_id(df, '68421', 68421)
pbdb.fix_pbdb_id(df, '71266', 71266)
pbdb.fix_pbdb_id(df, '421517', 421517)
pbdb.fix_pbdb_id(df, '71284', 71284)
pbdb.fix_pbdb_id(df, '82180', 82180)
pbdb.fix_pbdb_id(df, '54452', 54452)
pbdb.fix_pbdb_id(df, '2894', 2894)


1064
432650
68421
71266
421517
71284
82180
54452
2894


In [13]:
df.shape

(695, 34)

In [14]:
del df['corrected']

In [15]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## add missing pbdb id

In [16]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)

log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [17]:
tmp = df[df['pbdb_taxon_id'].isna() & df['Any taxon above genus'].isna() ]

log_df(tmp)
# 13

(12, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
25,benthic_forams,Bathymetry,,,keep and move to eODP unified data structure,,,,,,...,,,,,,,,,,False
283,diatoms,Diatom Zone (NPD) in Yanagisawa and Akiba (1998),,,keep and move to eODP unified data structure,,,,,,...,,,,,,,,,,False
284,diatoms,"Diatom Zone (Yanagisawa and Akiba, 1998)",,,keep and move to eODP unified data structure,,,,,,...,,,,,,,,,,False
299,diatoms,Gen. et sp. indet,,DELETE,"JAS: Not useful, delete? AF: Yes",,,,,,...,,,,,,,,,,True
307,diatoms,Marine,,DELETE,“Marine” only appears in one file and has abun...,,,,,,...,,,,,,,,,,False


In [18]:
genus_df = pd.DataFrame(tmp['genus name'].dropna().unique(), columns=['genus name'])
log_df(genus_df)

(0, 1)


Unnamed: 0,genus name


In [19]:
for index, row in genus_df.iterrows(): 
    if index % 50 == 0:
        print(index, end=' ')
        
    url =  PBDB_TAXA_NAME +  row['genus name']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            genus_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)


In [20]:
log_df(genus_df)

(0, 1)


Unnamed: 0,genus name


In [21]:
genus_df.to_csv(genus_pbdb_file, index=False)

In [22]:
genus_df.columns

Index(['genus name'], dtype='object')

In [23]:
columns = [
    
     'pbdb_taxon_name', 'pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
    'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
    'kingdom_taxon_name', 'unranked clade_taxon_id',
    'unranked clade_taxon_name', 'pbdb_taxon_id',
    
]

for index, row  in genus_df.iterrows():
    for col in columns:
        if col not in genus_df:
            continue
        df.loc[ (df['pbdb_taxon_id'].isna()) & (df['genus name'] == row['genus name']), col] = row[col]
        


In [24]:
log_df(df)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddelensis,genus misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [25]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## higher taxa

In [26]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [27]:
tmp = df[df['pbdb_taxon_id'].isna() & df['Any taxon above genus'].notna() ]

higher_df = pd.DataFrame(tmp['Any taxon above genus'].str.strip().dropna().unique(), columns=['Any taxon above genus']) 
higher_df.replace('', np.nan, inplace=True)
higher_df.dropna(inplace=True)
higher_df

Unnamed: 0,Any taxon above genus
0,"""Fish scales"""
1,"""Fish teeth"""
2,"""Otoliths"""
3,"""Undifferentiated pollen from herbs"""
4,"""Undifferentiated pollen"""
5,"""Undifferentiated pollen from shrubs"""
6,"""Undifferentiated pollen from trees"""
7,"""Reworked palynomorphs"""
8,"""Undifferentiated spores from moss or ferns"""
9,Tintinnida indet.


In [28]:
for index, row in higher_df.iterrows():        
    if index % 20 == 0:
        print(index)
        
    name = row['Any taxon above genus'].replace(' indet.', '')

  
    url =  PBDB_TAXA_NAME +  name
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            higher_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            higher_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            higher_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(higher_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' found')
    else:
        print(name, ' not found')

0
"Fish scales"  not found
"Fish teeth"  not found
"Otoliths"  not found
"Undifferentiated pollen from herbs"  not found
"Undifferentiated pollen"  not found
"Undifferentiated pollen from shrubs"  not found
"Undifferentiated pollen from trees"  not found
"Reworked palynomorphs"  not found
"Undifferentiated spores from moss or ferns"  not found


In [29]:
higher_df.to_csv(higher_taxa_pbdb_file, index=False)

In [30]:
higher_df.columns

Index(['Any taxon above genus', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'phylum_taxon_id', 'phylum_taxon_name',
       'unranked clade_taxon_id', 'unranked clade_taxon_name'],
      dtype='object')

In [31]:
columns = [
    
     'pbdb_taxon_name', 'pbdb_taxon_rank',
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name', 
    'unranked clade_taxon_id',  'unranked clade_taxon_name', 'pbdb_taxon_id',
    
]

for index, row  in higher_df.iterrows():
    for col in columns:
        if col not in higher_df.columns:
            continue
        df.loc[ (df['pbdb_taxon_id'].isna()) & (df['Any taxon above genus'] == row['Any taxon above genus']), col] = row[col]
        

In [32]:
log_df(df)

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddelensis,genus misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [33]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## add LIMS 4 to taxa crosswalk csv

In [34]:
existing_crosswalk = pd.read_csv(crosswalk_file, dtype=str)
log_df(existing_crosswalk, 3)
# 4593

(4593, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera


In [35]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
df.rename(columns={'Name': 'name'}, inplace=True)
log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [37]:
filtered_taxa = nt.create_taxa_crosswalk_df(df)
# (695, 17)
# (683, 17)
# (683, 17)

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'verbatim_name', 'name comment field', 'Comment', 'Notes (change to Internal only notes?)', 'comments']
initial df:  (695, 17)
remove nontaxa df:  (683, 17)
drop duplicates df:  (683, 17)


In [38]:
existing_crosswalk.columns == filtered_taxa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [40]:
combine_df = pd.concat([existing_crosswalk, filtered_taxa])
remove_whitespace(combine_df)

combine_df = combine_df.drop_duplicates()
combine_df.shape
# 5276

(5276, 17)

In [41]:
combine_df.to_csv(crosswalk_file, index=False)

## add LIMS 4 to taxa csv


In [42]:
existing_taxa = pd.read_csv(taxa_list_file, dtype=str)
log_df(existing_taxa, 3)
# 4168

(4168, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria


In [43]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [45]:
filtered_taxa = nt.create_taxa_list_df(df)
# 695
# 683
# 610

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id', 'family_taxon_name', 'order_taxon_id', 'order_taxon_name', 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name']
initial df:  (695, 25)
remove nontaxa df:  (683, 25)
drop duplicates df:  (610, 25)


In [46]:
existing_taxa.columns == filtered_taxa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [47]:
combine_df = pd.concat([existing_taxa, filtered_taxa])
remove_whitespace(combine_df)
combine_df = combine_df.drop_duplicates()

combine_df.shape
# 4646

(4646, 25)

In [48]:
combine_df.to_csv(taxa_list_file, index=False)