# Normalize taxa list micropal 4

Cleanup the normalized micropal 4 taxa list from the eODP researchers. Add PBDB taxa data.

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
import numpy as np
import requests

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

import scripts.normalize_taxa as nt
import scripts.pbdb as pbdb
from scripts.normalize_data import remove_whitespace

from scripts.pbdb import (
    get_parent_taxa, 
    PBDB_TAXA_NAME, 
    PBDB_TAXA_ID,
    check_multiple_pbdb_id,
    create_genus_df,
    fetch_pdbd_data,
    add_pbdb_data,
    create_higher_taxa_df
)

from scripts.shared_utils import (
    log_df
)

In [2]:
date = '2022-11-15'

input_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS Micropal_CSV_4_taxa_{date}.csv'

pi_taxa_file_pbdb = OUTPUT_DIR /'taxa'/'LIMS'/f'PI_Micropal_CSV_4_normalized_taxa_list_with_pbdb_{date}.csv'

genus_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"genera_pbdb_lims_4_{date}.csv"
higher_taxa_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"higher_taxa_pbdb_lims_4_{date}.csv"

genus_letter_old_file= OUTPUT_DIR/'taxa'/'draft'/'LIMS'/f"micropal_4_genus_letter_2022-02-24.csv"
genus_letter_file= OUTPUT_DIR/'taxa'/'draft'/'LIMS'/f"micropal_4_genus_letter_{date}.csv"

crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"


## fix abbreviated genus 

In [3]:
df = pd.read_csv(input_file, dtype=str)
df = df.dropna(axis=0, how='all')
if 'abbrev_genus' in df.columns:
    del df['abbrev_genus']
           
log_df(df)
# 695

(695, 32)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,,288974,Foraminifera,212476,Rhizaria,,


In [4]:
genus_df = pd.read_csv(genus_letter_old_file, dtype=str, usecols=['verbatim_name'])
genus_df['abbrev_genus'] = True
log_df(genus_df)

(34, 2)


Unnamed: 0,verbatim_name,abbrev_genus
0,Calc. Hyaline Foram. gen. et sp. indet.,True
1,Gen. et sp. indet,True
2,O. centrocarpum var. Arctic,True
3,O. centrocarpum var. short processes,True
4,O. centrocarpum-Arctic morphotype,True


In [5]:
df = df.merge(genus_df, on ='verbatim_name', how='left')
df.loc[df['abbrev_genus'].isna() == True, 'abbrev_genus'] = False

log_df(df)

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


set name parts columns to null; reparse name parts column

In [6]:
columns = [
    'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
    'species modifier', 'species name', 'subspecies modifier',
    'subspecies name', 'non-taxa descriptor', 'pbdb_taxon_id',
     'pbdb_taxon_name', 'pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
    'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
    'kingdom_taxon_name', 'unranked clade_taxon_id',
    'unranked clade_taxon_name'
]

def reparse_names():
    for index, row in df[df['abbrev_genus'] & df['Name'].notna()].iterrows():    
        taxon_name_parts = nt.taxon_name_parser(row['Name'])

        for col in columns:
            df.at[index, col] = np.nan

        for rank in taxon_name_parts:
            df.at[index, rank] = taxon_name_parts[rank]
  

add pbdb id

In [7]:
def add_pbdb_id():
    for index, row in df[df['abbrev_genus'] & df['Name'].notna()].iterrows(): 
        if index % 10 == 0:
            print(index)

        url =  PBDB_TAXA_NAME +  row['genus name']

        response = requests.get(url)

        if response.status_code == 200:
            data = response.json()["records"]
            if len(data) == 1:
                df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
                df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
                df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]

                round = 0
                get_parent_taxa(df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)


check if pbdb data is same for each pbdb taxon id

In [8]:
check_multiple_pbdb_id(df)

Unnamed: 0,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


In [9]:
# reparse_names()
# add_pbdb_id()

df.to_csv( pi_taxa_file_pbdb,index=False)

## fix incorect pbdb_taxon_id

incorported  pbdb_taxon_id that the PIs corrected into the taxalist.


In [10]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
df = df.dropna(how="all", axis="index")
df['corrected'] = False

log_df(df)
# 695

(695, 34)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus,corrected
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,288974,Foraminifera,212476,Rhizaria,,,False,False


In [11]:
df[['pbdb_taxon_id', 'Corrections to pbdb_taxon_id']].dropna(subset=['Corrections to pbdb_taxon_id'])

Unnamed: 0,pbdb_taxon_id,Corrections to pbdb_taxon_id
49,1064,1064
50,1064,1064
51,1064,1064
52,1064,1064
53,1064,1064
54,1064,1064
304,432650,432650
311,68421,68421
325,71266,71266
437,421517,421517


In [12]:
tmp = df[df['Corrections to pbdb_taxon_id'].notna() & (df['corrected'] == False)]
tmp['Corrections to pbdb_taxon_id'].unique()

array(['1064', '432650', '68421', '71266', '421517', '54452', '446918',
       '2894', '71284', '82180'], dtype=object)

In [13]:
pbdb.fix_pbdb_id(df, '1064', 1064)
pbdb.fix_pbdb_id(df, '432650', 432650)
pbdb.fix_pbdb_id(df, '68421', 68421)
pbdb.fix_pbdb_id(df, '71266', 71266)
pbdb.fix_pbdb_id(df, '421517', 421517)
pbdb.fix_pbdb_id(df, '54452', 54452)
pbdb.fix_pbdb_id(df, '446918', 446918)
pbdb.fix_pbdb_id(df, '2894', 2894)
pbdb.fix_pbdb_id(df, '71284', 71284)
pbdb.fix_pbdb_id(df, '82180', 82180)


1064
432650
68421
71266
421517
54452
446918
2894
71284
82180


In [14]:
df.shape

(695, 34)

In [15]:
del df['corrected']

In [16]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## QA rows without pbdb id

In [17]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)

log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [18]:
tmp = df[df['pbdb_taxon_id'].isna() & df['Any taxon above genus'].isna() ]

log_df(tmp)
# 12

tmp['verbatim_name']

(12, 33)


25                                           Bathymetry
283    Diatom Zone (NPD) in Yanagisawa and Akiba (1998)
284            Diatom Zone (Yanagisawa and Akiba, 1998)
299                                   Gen. et sp. indet
307                                              Marine
462                                 Martini (1971) Zone
616                            Planktic foraminiferal %
617                          Planktic foraminiferal (%)
651                                    Radiolarian zone
652                            Radiolarian zone/subzone
679                Silicoflagellate Zone in Ling (1992)
680                                 Zone in Ling (1992)
Name: verbatim_name, dtype: object

## create genera csv

In [58]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [59]:
genus_df =  create_genus_df(df)

log_df(genus_df)
# 242

(242, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
2,benthic_forams,Alabaminella,423746,Alabaminella,genus,241423,Eponididae,,,,,288974,Foraminifera,212476,Rhizaria
3,benthic_forams,Alabamina,788,Alabamina,genus,82213,Alabaminidae,,,,,288974,Foraminifera,212476,Rhizaria
5,benthic_forams,Ammodiscus,818,Ammodiscus,genus,112199,Ammodiscidae,,,,,288974,Foraminifera,212476,Rhizaria
6,benthic_forams,Ammonia,823,Ammonia,genus,103768,Rotaliidae,,,,,288974,Foraminifera,212476,Rhizaria
7,benthic_forams,Amphicoryna,835,Amphicoryna,genus,112281,Vaginulinidae,428878.0,Vaginulinida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria


check for duplicate genus

In [60]:
genus_df[genus_df.duplicated(subset=['taxon_group', 'genus name'])]

Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


check for genus name used in more than taxon group

In [61]:
genus_df[genus_df.duplicated(subset=['genus name'])]

Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


fetch pbdb info for genus that do not have pbdb info

In [62]:
fetch_pdbd_data(genus_df, 'genus name')


200 250 300 350 400 600 650 

In [63]:
genus_df.drop_duplicates(inplace=True)

log_df(genus_df)
# 242

(242, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
2,benthic_forams,Alabaminella,423746,Alabaminella,genus,241423,Eponididae,,,,,288974,Foraminifera,212476,Rhizaria
3,benthic_forams,Alabamina,788,Alabamina,genus,82213,Alabaminidae,,,,,288974,Foraminifera,212476,Rhizaria
5,benthic_forams,Ammodiscus,818,Ammodiscus,genus,112199,Ammodiscidae,,,,,288974,Foraminifera,212476,Rhizaria
6,benthic_forams,Ammonia,823,Ammonia,genus,103768,Rotaliidae,,,,,288974,Foraminifera,212476,Rhizaria
7,benthic_forams,Amphicoryna,835,Amphicoryna,genus,112281,Vaginulinidae,428878.0,Vaginulinida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria


In [25]:
genus_df.to_csv(genus_pbdb_file, index=False)

## add genus pbdb to input_file

In [27]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df)
# 695, 32

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [28]:
genus_df= pd.read_csv(genus_pbdb_file, dtype=str)

log_df(genus_df)
#242

(242, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Alabaminella,423746,Alabaminella,genus,241423,Eponididae,,,,,288974,Foraminifera,212476,Rhizaria
1,benthic_forams,Alabamina,788,Alabamina,genus,82213,Alabaminidae,,,,,288974,Foraminifera,212476,Rhizaria
2,benthic_forams,Ammodiscus,818,Ammodiscus,genus,112199,Ammodiscidae,,,,,288974,Foraminifera,212476,Rhizaria
3,benthic_forams,Ammonia,823,Ammonia,genus,103768,Rotaliidae,,,,,288974,Foraminifera,212476,Rhizaria
4,benthic_forams,Amphicoryna,835,Amphicoryna,genus,112281,Vaginulinidae,428878.0,Vaginulinida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria


In [29]:
add_pbdb_data(df, genus_df, 'genus name')

In [30]:
log_df(df)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [31]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## create higher taxa

In [71]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


check for for taxon that has both higher taxa and genus name 

In [72]:
df[df['Any taxon above genus'].notna() & df['genus name'].notna()]

Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus


In [65]:
higher_df =  create_higher_taxa_df(df)

log_df(higher_df)
# 35

(35, 15)


Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Textulariia indet.,434530,Textulariia,subclass,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
47,benthic_forams,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
366,dinoflagellates,Dinoflagellata indet.,354791,Dinoflagellata,superclass,,,,,,,277918.0,Myzozoa,,
381,dinoflagellates,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
395,dinoflagellates,Dinophyceae indet.,321578,Dinophyceae,class,,,,,,,,,,


In [66]:
higher_df[higher_df.duplicated(subset=['taxon_group', 'Any taxon above genus'])]

Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


In [67]:
higher_df[higher_df.duplicated(subset=['Any taxon above genus'])]

Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
381,dinoflagellates,Foraminifera indet.,288974.0,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
531,other,Ostracoda indet.,22826.0,Ostracoda,class,,,,,,,18891.0,Arthropoda,325038.0,Animalia
559,planktic_forams,Bryozoa indet.,24773.0,Bryozoa,phylum,,,,,,,,,325038.0,Animalia
567,planktic_forams,Echinoidea indet.,32733.0,Echinoidea,class,,,,,,,30739.0,Echinodermata,325038.0,Animalia
569,planktic_forams,"""Fish teeth""",,,,,,,,,,,,,
608,planktic_forams,Ostracoda indet.,22826.0,Ostracoda,class,,,,,,,18891.0,Arthropoda,325038.0,Animalia
609,planktic_forams,"""Otoliths""",,,,,,,,,,,,,
620,planktic_forams,Radiolaria indet.,4.0,Radiolaria,phylum,,,,,,,,,212476.0,Rhizaria


In [68]:
higher_df['Any taxon above genus edit'] = higher_df['Any taxon above genus'].replace(' indet.', '')

fetch_pdbd_data(higher_df, 'Any taxon above genus edit')

0 "Fish scales"  not found
"Fish teeth"  not found
"Otoliths"  not found
"Undifferentiated pollen from herbs"  not found
"Undifferentiated pollen"  not found
"Undifferentiated pollen from shrubs"  not found
"Undifferentiated pollen from trees"  not found
"Reworked palynomorphs"  not found
"Undifferentiated spores from moss or ferns"  not found
"Fish teeth"  not found
"Otoliths"  not found


In [69]:
del higher_df['Any taxon above genus edit']
higher_df.drop_duplicates(inplace=True)

log_df(higher_df)
# 35

(35, 15)


Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Textulariia indet.,434530,Textulariia,subclass,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
47,benthic_forams,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
366,dinoflagellates,Dinoflagellata indet.,354791,Dinoflagellata,superclass,,,,,,,277918.0,Myzozoa,,
381,dinoflagellates,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
395,dinoflagellates,Dinophyceae indet.,321578,Dinophyceae,class,,,,,,,,,,


In [70]:
higher_df.columns

Index(['taxon_group', 'Any taxon above genus', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name'],
      dtype='object')

In [38]:
higher_df.to_csv(higher_taxa_pbdb_file, index=False)

## add higher pbdb to input_file

In [39]:
df = pd.read_csv(pi_taxa_file_pbdb,  dtype=str)
log_df(df)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [40]:
higher_df = pd.read_csv(higher_taxa_pbdb_file, dtype=str)
log_df(higher_df)

(35, 15)


Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Textulariia indet.,434530,Textulariia,subclass,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
1,benthic_forams,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
2,dinoflagellates,Dinoflagellata indet.,354791,Dinoflagellata,superclass,,,,,,,277918.0,Myzozoa,,
3,dinoflagellates,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
4,dinoflagellates,Dinophyceae indet.,321578,Dinophyceae,class,,,,,,,,,,


In [41]:
add_pbdb_data(df, higher_df, 'Any taxon above genus')

In [42]:
log_df(df)

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
3,benthic_forams,Alabammina sp.,Alabamina sp.,genus misspelled,,,,Alabamina,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
4,benthic_forams,Alabamminella weddelensis,Alabaminella weddellensis,genus & species misspelled,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [43]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## add LIMS 4 to taxa crosswalk csv

In [44]:
existing_crosswalk = pd.read_csv(crosswalk_file, dtype=str)
log_df(existing_crosswalk, 3)
# 4593

(4593, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",Euuvigerina miozea (group) >100 m
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",Euuvigerina rodleyi (group) >50 m
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera


In [45]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
df.rename(columns={'Name': 'name'}, inplace=True)
log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [46]:
filtered_taxa = nt.create_taxa_crosswalk_df(df)
# (695, 17)
# (683, 17)
# (683, 17)

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'verbatim_name', 'name comment field', 'Comment', 'Notes (change to Internal only notes?)', 'comments']
initial df:  (695, 17)
remove nontaxa df:  (683, 17)
drop duplicates df:  (683, 17)


In [47]:
existing_crosswalk.columns == filtered_taxa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

In [48]:
combine_df = pd.concat([existing_crosswalk, filtered_taxa])
remove_whitespace(combine_df)

combine_df = combine_df.drop_duplicates()
combine_df.shape
# 5276

(5276, 17)

In [49]:
combine_df.drop_duplicates(subset=['normalized_name', 'taxon_group', 'verbatim_name'], 
                           keep='first', inplace=True)
log_df(combine_df)
# 5275

(5275, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",Euuvigerina miozea (group) >100 m
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",Euuvigerina rodleyi (group) >50 m
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",


In [50]:
combine_df.to_csv(crosswalk_file, index=False)

## add LIMS 4 to taxa csv


In [51]:
existing_taxa = pd.read_csv(taxa_list_file, dtype=str)
log_df(existing_taxa, 3)
# 4153

(4153, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria


In [52]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)
# 695

(695, 33)


Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False
2,benthic_forams,Alabaminella weddelensis,Alabaminella weddellensis,,,,,Alabaminella,,,...,,,,288974,Foraminifera,212476,Rhizaria,,,False


In [53]:
filtered_taxa = nt.create_taxa_list_df(df)
# 695
# 683
# 607

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id', 'family_taxon_name', 'order_taxon_id', 'order_taxon_name', 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name']
initial df:  (695, 25)
remove nontaxa df:  (683, 25)
drop duplicates df:  (607, 25)


In [54]:
existing_taxa.columns == filtered_taxa.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [55]:
combine_df = pd.concat([existing_taxa, filtered_taxa])
remove_whitespace(combine_df)
combine_df = combine_df.drop_duplicates()

combine_df.shape
# 4606

(4606, 25)

In [56]:
check_multiple_pbdb_id(combine_df)

Unnamed: 0,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


In [57]:
combine_df.to_csv(taxa_list_file, index=False)