# Normalize taxa list

Cleanup the normalized taxa list from the eODP researchers. Add PBDB taxa data.

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
import numpy as np
import requests

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

import scripts.normalize_taxa as nt
from scripts.normalize_data import remove_whitespace
import scripts.pbdb as pbdb

from scripts.pbdb import (
    get_parent_taxa, 
    PBDB_TAXA_NAME, 
    check_multiple_pbdb_id,
    create_genus_df,
    fetch_pdbd_data,
    add_pbdb_data,
    create_higher_taxa_df
)
from scripts.shared_utils import (
    log_df
)

In [2]:
date = '2022-11-15'

input_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'

pi_taxa_file_pbdb = OUTPUT_DIR /'taxa'/'LIMS'/f'PI_normalized_taxa_list_with_pbdb_{date}.csv'
crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"

genus_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"genera_pbdb_{date}.csv"
higher_taxa_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"higher_taxa_pbdb_{date}.csv"


## create file

In [3]:
df = pd.read_csv(input_file, skiprows = 9, dtype=str)
df = df.drop(df.index[[0, 1]])
df = df.dropna(how="all", axis="index")
log_df(df)
# 4741

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria
5,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
6,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [4]:
check_multiple_pbdb_id(df)

Unnamed: 0,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


In [5]:
df.columns

Index(['taxon_group', 'verbatim_name', 'name',
       'name to use (if different from "name")', 'name comment field',
       'Comment', 'Notes (change to Internal only notes?)',
       'Any taxon above genus', 'genus modifier', 'genus name',
       'subgenera modifier', 'subgenera name', 'species modifier',
       'species name', 'subspecies modifier', 'subspecies name',
       'non-taxa descriptor', 'comments', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'Corrections to pbdb_taxon_id', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name'],
      dtype='object')

In [7]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## fix incorect pbdb_taxon_id

incorported  pbdb_taxon_id that the PIs corrected into the taxalist.


In [10]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
df['corrected'] = False

log_df(df)
# 4741

(4741, 33)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,corrected
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,288974.0,Foraminifera,212476,Rhizaria,False
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,288974.0,Foraminifera,212476,Rhizaria,False
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,212476,Rhizaria,False
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,288974.0,Foraminifera,212476,Rhizaria,False
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,18891.0,Arthropoda,325038,Animalia,False


In [11]:
df[['pbdb_taxon_id', 'Corrections to pbdb_taxon_id']].dropna(subset=['Corrections to pbdb_taxon_id'])

Unnamed: 0,pbdb_taxon_id,Corrections to pbdb_taxon_id
309,1064,genus; id 1064
310,1064,genus; id 1064
311,1064,genus; id 1064
312,1064,genus; id 1064
313,1064,genus; id 1064
314,1064,genus; id 1064
315,1064,genus; id 1064
316,1064,genus; id 1064
317,1064,genus; id 1064
318,1064,genus; id 1064


In [30]:
tmp = df[df['Corrections to pbdb_taxon_id'].notna() & (df['corrected'] == False)]
tmp['Corrections to pbdb_taxon_id'].unique()

array(['genus; id 1064', 'genus, id 1124', 'genus; taxon_no= 2092',
       'genus; ID 421517', 'genus; ID 432678', 'genus; ID 71247',
       'genus; ID 82145', 'genus; ID 432650', 'genus: ID 68421',
       'genus; ID 432651', 'genus; ID 443753', 'genus; ID 434997',
       'genus; ID 165526'], dtype=object)

In [20]:
pbdb.fix_pbdb_id(df, 'genus; id 1064', 1064)
pbdb.fix_pbdb_id(df, 'genus, id 1124', 1124)
pbdb.fix_pbdb_id(df, 'genus; taxon_no= 2092', 2092)
pbdb.fix_pbdb_id(df, 'genus; ID 421517', 421517)
pbdb.fix_pbdb_id(df, 'genus; ID 432678', 432678)
pbdb.fix_pbdb_id(df, 'genus; ID 71247', 71247)
pbdb.fix_pbdb_id(df, 'genus; ID 82145', 82145)
pbdb.fix_pbdb_id(df, 'genus; ID 432650', 432650)
pbdb.fix_pbdb_id(df, 'genus: ID 68421', 68421)
pbdb.fix_pbdb_id(df, 'genus; ID 432651', 432651)
pbdb.fix_pbdb_id(df, 'genus; ID 443753', 443753)
pbdb.fix_pbdb_id(df, 'genus; ID 434997', 434997)
pbdb.fix_pbdb_id(df, 'genus; ID 165526', 165526)


1064
1124
2092
421517
432678
71247
82145
432650
68421
432651
443753
434997
165526


In [21]:
df.shape
# 4741

(4741, 36)

In [22]:
if 'corrected' in df:
    del df['corrected']

In [39]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## create genera csv

In [48]:
taxa_df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(taxa_df)
# 4741

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [49]:
genus_df =  create_genus_df(taxa_df)

log_df(genus_df)
# 1026

(1026, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
13,benthic_forams,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
18,benthic_forams,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
22,benthic_forams,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
23,planktic_forams,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria


check for duplicate genus

In [50]:
genus_df[genus_df.duplicated(subset=['taxon_group', 'genus name'])]

Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


check for genus name used in more than taxon group

In [51]:
genus_df[genus_df.duplicated(subset=['genus name'])]

Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
645,benthic_forams,Globigerinoides,1504,Globigerinoides,genus,82191.0,Globigerinidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
891,benthic_forams,Neogloboquadrina,1917,Neogloboquadrina,genus,82192.0,Globorotaliidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
3109,palynology,Brigantedinium,264634,Brigantedinium,genus,277915.0,Peridiniaceae,277919.0,Peridiniales,321578.0,Dinophyceae,,,,
3118,palynology,Cymatiosphaera,170215,Cymatiosphaera,genus,,,,,,,,,,
3126,palynology,Enneadocysta,435001,Enneadocysta,genus,323989.0,Cladopyxiaceae,321606.0,Gonyaulacales,321578.0,Dinophyceae,,,,
3135,palynology,Impagidinium,276906,Impagidinium,genus,321603.0,Gonyaulacaceae,321606.0,Gonyaulacales,321578.0,Dinophyceae,,,,
3139,palynology,Lejeunecysta,264642,Lejeunecysta,genus,323951.0,Protoperidiniaceae,277919.0,Peridiniales,321578.0,Dinophyceae,,,,
3171,palynology,Selenopemphix,208980,Selenopemphix,genus,323951.0,Protoperidiniaceae,277919.0,Peridiniales,321578.0,Dinophyceae,,,,
3201,planktic_forams,Paragloborotalia,2065,Paragloborotalia,genus,82191.0,Globigerinidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4732,silicoflagellates,Corbisema,71282,Corbisema,genus,434214.0,Corbisemaceae,306702.0,Dictyochales,306701.0,Dictyochophyceae,432613.0,Ochrophyta,,


check for genus names that aren't in pbdb

In [52]:
genus_df[genus_df['pbdb_taxon_id'].isna()]

Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
1592,diatoms,"""Skeletonema""",,,,,,,,,,,,,
2201,dinoflagellates,"""Forma T""",,,,,,,,,,,,,
3205,planktic_forams,"""Globigerina""",,,,,,,,,,,,,


check for genus that are not genus

In [53]:
genus_df[genus_df['pbdb_taxon_rank'] != 'genus']

Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
1592,diatoms,"""Skeletonema""",,,,,,,,,,,,,
2201,dinoflagellates,"""Forma T""",,,,,,,,,,,,,
3205,planktic_forams,"""Globigerina""",,,,,,,,,,,,,
3744,planktic_forams,Hirsutella,422423.0,Globorotalia (Hirsutella),subgenus,82192.0,Globorotaliidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
3769,planktic_forams,Menardella,422437.0,Globorotalia (Menardella),subgenus,82192.0,Globorotaliidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4052,planktic_forams,Truncorotalia,422428.0,Globorotalia (Truncorotalia),subgenus,82192.0,Globorotaliidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4691,radiolarians,Theocotylissa,152649.0,Theocotyle (Theocotylissa),subgenus,87065.0,Theocotylidae,402.0,Nassellaria,,,4.0,Radiolaria,212476.0,Rhizaria


fetch pbdb info for genus that do not have pbdb info

In [54]:
fetch_pdbd_data(genus_df, 'genus name')
                

0 100 200 300 600 750 850 1000 1200 1250 1300 1400 "Skeletonema"  not found
1600 1750 1800 1950 2200 "Forma T"  not found
2300 2800 3000 3050 3100 "Globigerina"  not found
3350 4150 4300 4350 4600 

In [55]:
genus_df.drop_duplicates(inplace=True)

log_df(genus_df)

(1026, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
13,benthic_forams,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
18,benthic_forams,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
22,benthic_forams,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
23,planktic_forams,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [56]:
genus_df.to_csv(genus_pbdb_file, index=False)

## add genus pbdb to input_file

In [21]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df)
# 4741, 32

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [22]:
genus_df= pd.read_csv(genus_pbdb_file, dtype=str)

log_df(genus_df)
#1026

(1026, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
1,benthic_forams,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
2,benthic_forams,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
3,benthic_forams,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,planktic_forams,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria


add genus data if it doesn't exist

In [23]:
add_pbdb_data(df, genus_df, 'genus name')

In [24]:
print(df.shape)
df.columns

(4741, 32)


Index(['taxon_group', 'verbatim_name', 'name',
       'name to use (if different from "name")', 'name comment field',
       'Comment', 'Notes (change to Internal only notes?)',
       'Any taxon above genus', 'genus modifier', 'genus name',
       'subgenera modifier', 'subgenera name', 'species modifier',
       'species name', 'subspecies modifier', 'subspecies name',
       'non-taxa descriptor', 'comments', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'Corrections to pbdb_taxon_id', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name'],
      dtype='object')

In [25]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## create higher csv

In [64]:
taxa_df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(taxa_df)

# 4741

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


check for for taxon that has both higher taxa and genus name 

In [65]:
taxa_df[taxa_df['Any taxon above genus'].notna() & taxa_df['genus name'].notna()]

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
4116,radiolarians,Acrobotrys tritubus,Acrobotrys tritubus,,,,"in PBDB, so it will be classified correctly",,,Acrobotrys,...,85922.0,Cannobotryidae,402,Nassellaria,,,4,Radiolaria,212476,Rhizaria
4262,radiolarians,Corocalyptra cervus,Corocalyptra cervus,Corocalyptra cervus,,,"in PBDB, so it will be classified correctly",,,Corocalyptra,...,,,402,Nassellaria,,,4,Radiolaria,212476,Rhizaria


In [58]:
higher_df =  create_higher_taxa_df(taxa_df)

log_df(higher_df)
# 72

(72, 15)


Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
2,benthic_forams,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellidae indet.,155922,Pleurostomellidae,family,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda indet.,22826,Ostracoda,class,,,,,,,18891.0,Arthropoda,325038,Animalia
8,benthic_forams,Textulariia indet.,434530,Textulariia,subclass,,,,,,,288974.0,Foraminifera,212476,Rhizaria
72,benthic_forams,Elphidiidae indet.,82206,Elphidiidae,family,,,,,,,288974.0,Foraminifera,212476,Rhizaria


In [59]:
higher_df[higher_df.duplicated(subset=['taxon_group', 'Any taxon above genus'])]

Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


In [60]:
higher_df[higher_df.duplicated(subset=['Any taxon above genus'])]

Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
2202,dinoflagellates,Foraminifera indet.,288974.0,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
2223,nannofossils,Dinoflagellata indet.,354791.0,Dinoflagellata,superclass,,,,,,,277918.0,Myzozoa,,
3062,palynology,"""Amorphous organic matter""",,,,,,,,,,,,,
3063,palynology,"""Black phytoclasts""",,,,,,,,,,,,,
3065,palynology,"""Brown phytoclasts""",,,,,,,,,,,,,
3073,palynology,Foraminifera indet.,288974.0,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria
3077,palynology,Dinoflagellata indet.,354791.0,Dinoflagellata,superclass,,,,,,,277918.0,Myzozoa,,
3203,planktic_forams,Foraminifera indet.,288974.0,Foraminifera,phylum,,,,,,,,,212476.0,Rhizaria


In [61]:
higher_df['Any taxon above genus edit'] = higher_df['Any taxon above genus'].replace(' indet.', '')

fetch_pdbd_data(higher_df, 'Any taxon above genus edit')

"Challengeria spp."  not found
1450 "Phytolith"  not found
1500 "Amorphous organic matter"  not found
"Black phytoclasts"  not found
"Brown phytoclasts"  not found
"round browns indet., psilate"  not found
"Terrestrial palynomorphs"  not found
"Unknown palynology taxa"  not found
"Other pollen"  not found
"Spores"  not found
"Fungal spores"  not found
"Pollen/spores reworked"  not found
"Sporomorphs"  not found
"Amorphous organic matter"  not found
"Black phytoclasts"  not found
"Black woody phytoclasts"  not found
"Brown phytoclasts"  not found
"Brown woody phytoclasts"  not found
"Saccate pollen"  not found
"Monolete ornamented"  not found
"Monolete psilate"  not found
"Trilete ornamented"  not found
"Trilete psilate"  not found


In [62]:
del higher_df['Any taxon above genus edit']
higher_df.drop_duplicates(inplace=True)

log_df(higher_df)

(72, 15)


Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
2,benthic_forams,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellidae indet.,155922,Pleurostomellidae,family,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda indet.,22826,Ostracoda,class,,,,,,,18891.0,Arthropoda,325038,Animalia
8,benthic_forams,Textulariia indet.,434530,Textulariia,subclass,,,,,,,288974.0,Foraminifera,212476,Rhizaria
72,benthic_forams,Elphidiidae indet.,82206,Elphidiidae,family,,,,,,,288974.0,Foraminifera,212476,Rhizaria


In [63]:
higher_df.columns

Index(['taxon_group', 'Any taxon above genus', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name'],
      dtype='object')

In [32]:
higher_df.to_csv(higher_taxa_pbdb_file, index=False)

## add higher pbdb to input_file

In [33]:
df = pd.read_csv(pi_taxa_file_pbdb,  dtype=str)
log_df(df)
# 4741

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [34]:
df.columns

Index(['taxon_group', 'verbatim_name', 'name',
       'name to use (if different from "name")', 'name comment field',
       'Comment', 'Notes (change to Internal only notes?)',
       'Any taxon above genus', 'genus modifier', 'genus name',
       'subgenera modifier', 'subgenera name', 'species modifier',
       'species name', 'subspecies modifier', 'subspecies name',
       'non-taxa descriptor', 'comments', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'Corrections to pbdb_taxon_id', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name'],
      dtype='object')

In [35]:
higher_df = pd.read_csv(higher_taxa_pbdb_file, dtype=str)
log_df(higher_df)

(72, 15)


Unnamed: 0,taxon_group,Any taxon above genus,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Foraminifera indet.,288974,Foraminifera,phylum,,,,,,,,,212476,Rhizaria
1,benthic_forams,Pleurostomellidae indet.,155922,Pleurostomellidae,family,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Ostracoda indet.,22826,Ostracoda,class,,,,,,,18891.0,Arthropoda,325038,Animalia
3,benthic_forams,Textulariia indet.,434530,Textulariia,subclass,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Elphidiidae indet.,82206,Elphidiidae,family,,,,,,,288974.0,Foraminifera,212476,Rhizaria


In [36]:
add_pbdb_data(df, higher_df, 'Any taxon above genus')

In [37]:
log_df(df)

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [38]:
df.to_csv(pi_taxa_file_pbdb,  index=False)


## Create crosswalk csv

In [39]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)
# 4741

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria


In [40]:
df.loc[df["verbatim_name"].str.startswith("Dextral:Sinistral"), "verbatim_name"] = df["name"]


In [41]:
filtered_taxa = nt.create_taxa_crosswalk_df(df)

# 4741
# 4732
# 4593

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'verbatim_name', 'name comment field', 'Comment', 'Notes (change to Internal only notes?)', 'comments']
initial df:  (4741, 17)
remove nontaxa df:  (4732, 17)
drop duplicates df:  (4593, 17)


In [42]:
filtered_taxa[filtered_taxa.duplicated()]

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments


In [43]:
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

create taxa list with unique taxa fields

In [44]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)
# 4741

(4741, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,212476,Rhizaria


In [45]:
filtered_taxa = nt.create_taxa_list_df(df)
# 4741
# 4732
# 4153

fields: ['Any taxon above genus', 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name', 'species modifier', 'species name', 'subspecies modifier', 'subspecies name', 'non-taxa descriptor', 'normalized_name', 'taxon_group', 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id', 'family_taxon_name', 'order_taxon_id', 'order_taxon_name', 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name']
initial df:  (4741, 25)
remove nontaxa df:  (4732, 25)
drop duplicates df:  (4153, 25)


In [46]:
filtered_taxa[filtered_taxa.duplicated()]

Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name


In [47]:
filtered_taxa.to_csv(taxa_list_file, index=False)