#  NOAA Janus taxa list
## 101-210 taxa

Create a list of taxa for the NOAA Janus files. Compare the NOAA taxa with the taxa that the PIs have already approved in order to create a list of unapproved taxa. Add PBDB data to the unapproved taxa.

In [448]:
import sys
import csv
import glob
import os
import requests
import re
import shutil

sys.path.append('../../')

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR


import pandas as pd
import numpy as np

import db 
import scripts.normalize_taxa as nt
from scripts.shared_utils import (
    log_df
)
import scripts.pbdb as pbdb 


In [449]:
# base_directory = 'cleaned_data'
# Date stamp embedded in the LIMS file names below.
date='2022-11-15'

lims_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_list_{date}.csv'
lims_crosswalk_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_crosswalk_{date}.csv'
lims_genus_path = OUTPUT_DIR/'taxa'/'LIMS'/f'genera_pbdb_{date}.csv'


# NOTE: `date` is rebound here; every f-string path from this point on uses
# the new value, while the LIMS paths above keep the earlier date.
date='2022-12-01'

# NOAA 101-210 files: file metadata, draft taxa lists, and PBDB lookup tables.
metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_janus_iodp_files.csv'
noaa_101_210_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_101_210_{date}.csv'
unapproved_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_unapproved_101_210_{date}.csv'
species_path = OUTPUT_DIR/'taxa'/'NOAA'/f'species_101_210_{date}.csv'
genus_path = OUTPUT_DIR/'taxa'/'NOAA'/f'genus_101_210_{date}.csv'
higher_path = OUTPUT_DIR/'taxa'/'NOAA'/f'higher_taxa_101_210_{date}.csv'


# NOAA 1-96 files: file metadata and draft taxa list.
metadata_1_96_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'
noaa_1_96_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_1_96_{date}.csv'

# PI-processed taxa list; its file name hard-codes its own date instead of using `date`.
PI_noaa_1_96_taxa_path =  RAW_DATA_DIR/'PI_processed_files'/'NOAA_taxa_lists_taxa_list_2022-11-15.csv'


## Create taxa list

In [450]:
metadata = pd.read_csv(metadata_path)
metadata = metadata[metadata['type'] == 'taxa']
log_df(metadata)

(2045, 5)


Unnamed: 0,path,type,expedition,site,taxon_group
0,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
1,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
2,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
3,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
4,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,planktic_forams


In [451]:
taxa_df = nt.create_noaa_2_taxa_crosswalk_df(metadata, CLEAN_DATA_DIR)
log_df(taxa_df)
# 13066

(13066, 10)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor
12318,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,
12841,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,
5864,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,
10216,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,
4982,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,


In [51]:
taxa_df.to_csv(noaa_101_210_taxa_path, index=False)

## create noaa 1 taxa

In [52]:
metadata_1 = pd.read_csv(metadata_1_96_path)
metadata_1 = metadata_1[metadata_1['type'] == 'taxa']
log_df(metadata_1)

(2093, 5)


Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_forams,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_forams,61,462
5,NOAA/DSDP_core_data/61/462/nannos.csv,taxa,nannofossils,61,462
7,NOAA/DSDP_core_data/61/462A/radiolar.csv,taxa,radiolarians,61,462A


In [53]:
taxa_1_df = nt.create_noaa_1_taxa_crosswalk_df(metadata_1, CLEAN_DATA_DIR)
log_df(taxa_1_df)
# 9933

(9933, 10)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor
6274,Ebridians and Actinicidians,Actiniscus elongatus (q),? Actiniscus elongatus,?,Actiniscus,,elongatus,,,
3303,Ebridians and Actinicidians,Actiniscus laciniatus (q),? Actiniscus laciniatus,?,Actiniscus,,laciniatus,,,
5703,Ebridians and Actinicidians,Actiniscus pentasterias,,,Actiniscus,,pentasterias,,,
3706,Ebridians and Actinicidians,Actiniscus sp.,,,Actiniscus,,sp.,,,
3985,Ebridians and Actinicidians,Actiniscus squamosus,,,Actiniscus,,squamosus,,,


In [54]:
taxa_1_df.to_csv(noaa_1_96_taxa_path, index=False)

## Create taxa list with unapproved NOAA taxa

In [452]:
noaa_2_taxa_df = pd.read_csv(noaa_101_210_taxa_path)
log_df(noaa_2_taxa_df)

# 13066

(13066, 10)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,


In [453]:
def process_verbatim_names(file, skip=0):
    """Load the distinct (taxon_group, verbatim_name) pairs from a taxa CSV.

    file: path or buffer accepted by ``pd.read_csv``.
    skip: forwarded to ``pd.read_csv`` as ``skiprows``.

    Returns a DataFrame with duplicate rows removed and the
    ``verbatim_name`` column renamed to ``existing_name``.
    """
    wanted_columns = ['taxon_group', 'verbatim_name']
    names = pd.read_csv(file, usecols=wanted_columns, skiprows=skip)
    unique_names = names.drop_duplicates()
    return unique_names.rename(columns={'verbatim_name': 'existing_name'})


def process_normalized_names(file, skip=0):
    """Load the distinct (taxon_group, normalized name) pairs from a taxa CSV.

    file: path or buffer accepted by ``pd.read_csv``.
    skip: forwarded to ``pd.read_csv`` as ``skiprows``.

    Returns a new DataFrame with columns ``taxon_group`` and
    ``existing_name`` (the normalized name), with duplicate rows and rows
    missing either value removed.
    """
    df = pd.read_csv(file, skiprows=skip)

    # Called for its side effect: the 'normalized_name' column selected below
    # is expected to be added to df in place by this helper.
    nt.add_normalized_name_column(df)

    # Build the result as a chain of non-mutating calls. The previous version
    # ran inplace=True operations on a column slice of df, which is the
    # pattern pandas flags with SettingWithCopyWarning.
    return (
        df[['taxon_group', 'normalized_name']]
        .drop_duplicates()
        .dropna(subset=['taxon_group', 'normalized_name'])
        .rename(columns={'normalized_name': 'existing_name'})
    )




In [454]:
# Distinct, non-null normalized names from the NOAA 1-96 taxa list,
# exposed under the shared 'existing_name' column.
noaa_1_names = (
    pd.read_csv(noaa_1_96_taxa_path, usecols=['taxon_group', 'name'])
    .drop_duplicates()
    .dropna(subset=['name'])
    .rename(columns={'name': 'existing_name'})
)


log_df(noaa_1_names)

(1656, 2)


Unnamed: 0,taxon_group,existing_name
0,Ebridians and Actinicidians,? Actiniscus elongatus
1,Ebridians and Actinicidians,? Actiniscus laciniatus
9,Ebridians and Actinicidians,? Ammodochium sp.
11,Ebridians and Actinicidians,? Cintactiniscus sp.
21,Ebridians and Actinicidians,? Ebriopsis aplanata


Use verbatim names for NOAA 1 as well, since the PIs are still working on correcting the names.

In [455]:
noaa_1_vnames =  process_verbatim_names(noaa_1_96_taxa_path)

log_df(noaa_1_vnames)

(9933, 2)


Unnamed: 0,taxon_group,existing_name
0,Ebridians and Actinicidians,Actiniscus elongatus (q)
1,Ebridians and Actinicidians,Actiniscus laciniatus (q)
2,Ebridians and Actinicidians,Actiniscus pentasterias
3,Ebridians and Actinicidians,Actiniscus sp.
4,Ebridians and Actinicidians,Actiniscus squamosus


Use both verbatim names and corrected names for LIMS.

In [456]:
lims_vnames = process_verbatim_names(lims_crosswalk_path)

log_df(lims_vnames)

(5380, 2)


Unnamed: 0,taxon_group,existing_name
0,benthic_forams,Euuvigerina miozea (group) >100 m
1,benthic_forams,Euuvigerina rodleyi (group) >50 m
2,benthic_forams,Others
3,benthic_forams,Pleurostomellids comment
4,benthic_forams,Ostracoda spp.


In [457]:
lims_names = process_normalized_names(lims_crosswalk_path)

log_df(lims_names)

(4694, 2)


Unnamed: 0,taxon_group,existing_name
0,benthic_forams,Euuvigerina miozea
1,benthic_forams,Euuvigerina rodleyi
2,benthic_forams,Foraminifera indet.
3,benthic_forams,Pleurostomellidae indet.
4,benthic_forams,Ostracoda indet.


get taxa that PIs have looked at

In [458]:
# Stack every name list (verbatim and normalized, NOAA 1 and LIMS) and keep
# each (taxon_group, existing_name) pair once.
approved_sources = [noaa_1_vnames, noaa_1_names, lims_vnames, lims_names]
approved_taxa_df = pd.concat(approved_sources).drop_duplicates()

log_df(approved_taxa_df)
# 15863

(15863, 2)


Unnamed: 0,taxon_group,existing_name
0,Ebridians and Actinicidians,Actiniscus elongatus (q)
1,Ebridians and Actinicidians,Actiniscus laciniatus (q)
2,Ebridians and Actinicidians,Actiniscus pentasterias
3,Ebridians and Actinicidians,Actiniscus sp.
4,Ebridians and Actinicidians,Actiniscus squamosus


Select the NOAA taxa that need approval.

In [459]:
# Left-join the NOAA 101-210 taxa onto the already-reviewed names.
# '_merge_type' records whether each NOAA row found a match ('both') or
# not ('left_only'); the right-hand key column is dropped afterwards.
merged_df = (
    noaa_2_taxa_df
    .merge(
        approved_taxa_df,
        how='left',
        left_on=['verbatim_name', 'taxon_group'],
        right_on=['existing_name', 'taxon_group'],
        indicator='_merge_type',
    )
    .drop_duplicates()
    .drop(columns=['existing_name'])
)

log_df(merged_df)
# 13066, 11

(13066, 11)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,_merge_type
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,left_only
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,left_only
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,left_only
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,left_only
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,left_only


Select unapproved NOAA taxa. When `_merge_type` is `both`, the taxon has already been approved. When `_merge_type` is `left_only`, the taxon has not been approved.

In [460]:
# Keep only the rows that found no match among the reviewed names, then
# discard the merge indicator column.
has_no_match = merged_df['_merge_type'] == 'left_only'
unapproved_taxa_df = merged_df[has_no_match].drop(columns=['_merge_type']).copy()

log_df(unapproved_taxa_df)

# (8975, 10)

(8975, 10)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,


In [461]:
unapproved_taxa_df.to_csv(unapproved_taxa_path, index=False)

## create species csv

Look up the `genus species` names of the unapproved taxa in PBDB.

In [435]:
def add_genus_species(taxa_df):
    """Add a 'genus species' column to taxa_df in place.

    Rows whose 'species name' contains 'sp.' or 'spp.' are not assigned a
    value. Every other row gets '<genus name> <species name>' with any
    '(?)' and '?' markers removed and surrounding whitespace stripped; the
    result is NaN when either part is missing.

    taxa_df: DataFrame with string columns 'genus name' and 'species name'.

    Returns None; taxa_df is modified in place.
    """
    # na=False makes str.contains return a real boolean mask even when
    # 'species name' has missing values, so `~` below is a logical NOT.
    # The raw string avoids the invalid '\.' escape of a plain literal.
    is_open_nomenclature = taxa_df['species name'].str.contains(
        r'spp\.|sp\.', regex=True, na=False
    )
    taxa_df.loc[~is_open_nomenclature, 'genus species'] = (
        taxa_df['genus name'] + ' ' + taxa_df['species name']
    )

    # Strip '(?)' before '?' so no empty '()' is left behind.
    taxa_df['genus species'] = (
        taxa_df['genus species']
        .str.replace('(?)', '', regex=False)
        .str.replace('?', '', regex=False)
        .str.strip()
    )

In [436]:
df = pd.read_csv(unapproved_taxa_path)
add_genus_species(df)
log_df(df)
# (8975, 9)

(8975, 40)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (change to Internal only notes?),Any taxon above genus modifier,Any taxon above genus,genus modifier,genus name,subgenera modifier,...,subclass_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,genus species
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,,,,,?,Labyrinthodinium,,...,,321578.0,Dinophyceae,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,,,,,?,Maduradinium,,...,,321578.0,Dinophyceae,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,,,,,?,Pyxidiella,,...,,321578.0,Dinophyceae,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,,,,,Aandalusiella,,...,,,,,,,,,,Aandalusiella ivoirensis
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,,,,,Abratopdinium,,...,,,,,,,,,,Abratopdinium cardioforme


In [437]:
# Distinct (taxon_group, genus species) pairs for rows that have a
# 'genus species' value. .copy() is applied last so species_df is an
# independent frame; previously the copy was taken before the column
# selection, so the in-place drop_duplicates still ran on a derived slice.
species_df = (
    df.loc[df['genus species'].notna(), ['taxon_group', 'genus species']]
    .drop_duplicates()
    .copy()
)
# species_df['check'] = False

log_df(species_df)
# 5509

(5509, 2)


Unnamed: 0,taxon_group,genus species
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme
5,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium kerguelense
6,Dinoflagellates/Acritarchs/Prasinophytes,Acanthaulax granulata
7,Dinoflagellates/Acritarchs/Prasinophytes,Acanthaulax wilsonii


In [346]:
pbdb.fetch_pdbd_data(species_df, 'genus species')

In [91]:
species_df[species_df['check'] == False].shape

(0, 20)

In [94]:
# del species_df['check']


In [342]:
species_df.to_csv(species_path, index=False)

## add species pbdb info to unapproved taxa 

In [464]:
def add_genus_species(taxa_df):
    """Add a 'genus species' column to taxa_df in place.

    Rows whose 'species name' contains 'sp.' or 'spp.' are not assigned a
    value. Every other row gets '<genus name> <species name>' with any
    '(?)' and '?' markers removed and surrounding whitespace stripped; the
    result is NaN when either part is missing.

    taxa_df: DataFrame with string columns 'genus name' and 'species name'.

    Returns None; taxa_df is modified in place.
    """
    # na=False makes str.contains return a real boolean mask even when
    # 'species name' has missing values, so `~` below is a logical NOT.
    # The raw string avoids the invalid '\.' escape of a plain literal.
    is_open_nomenclature = taxa_df['species name'].str.contains(
        r'spp\.|sp\.', regex=True, na=False
    )
    taxa_df.loc[~is_open_nomenclature, 'genus species'] = (
        taxa_df['genus name'] + ' ' + taxa_df['species name']
    )

    # Strip '(?)' before '?' so no empty '()' is left behind.
    taxa_df['genus species'] = (
        taxa_df['genus species']
        .str.replace('(?)', '', regex=False)
        .str.replace('?', '', regex=False)
        .str.strip()
    )

In [465]:
species_df = pd.read_csv(species_path, dtype=str)

log_df(species_df)
# 5509

(5509, 19)


Unnamed: 0,taxon_group,genus species,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,,,,,,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,,,,,,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium kerguelense,,,,,,,,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Acanthaulax granulata,,,,,,,,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Acanthaulax wilsonii,,,,,,,,,,,,,,,,,


In [466]:
species_df = species_df[species_df['pbdb_taxon_name'].notna() & (species_df['pbdb_taxon_rank'] == 'species')]
log_df(species_df)
# 958

(958, 19)


Unnamed: 0,taxon_group,genus species,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
5,Dinoflagellates/Acritarchs/Prasinophytes,Achilleodinium bianii,323992,Achilleodinium bianii,species,323991,Achilleodinium,321603,Gonyaulacaceae,321606,Gonyaulacales,321578,Dinophyceae,,,,,,
15,Dinoflagellates/Acritarchs/Prasinophytes,Achomosphaera ramulifera,277049,Achomosphaera ramulifera,species,277048,Achomosphaera,321603,Gonyaulacaceae,321606,Gonyaulacales,321578,Dinophyceae,,,,,,
16,Dinoflagellates/Acritarchs/Prasinophytes,Achomosphaera sagena,323552,Achomosphaera sagena,species,277048,Achomosphaera,321603,Gonyaulacaceae,321606,Gonyaulacales,321578,Dinophyceae,,,,,,
17,Dinoflagellates/Acritarchs/Prasinophytes,Achomosphaera triangulata,323940,Achomosphaera triangulata,species,277048,Achomosphaera,321603,Gonyaulacaceae,321606,Gonyaulacales,321578,Dinophyceae,,,,,,
18,Dinoflagellates/Acritarchs/Prasinophytes,Achomosphaera verdieri,323553,Achomosphaera verdieri,species,277048,Achomosphaera,321603,Gonyaulacaceae,321606,Gonyaulacales,321578,Dinophyceae,,,,,,


In [467]:
unapproved_df = pd.read_csv(unapproved_taxa_path, dtype=str)
add_genus_species(unapproved_df)
log_df(unapproved_df)
# (8975, 11)


(8975, 11)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,genus species
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,Aandalusiella ivoirensis
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,Abratopdinium cardioforme


In [468]:
pbdb.add_pbdb_data(unapproved_df, species_df, 'genus species')

In [469]:
diff = set(species_df.columns) - set(unapproved_df.columns)
diff

set()

In [470]:
log_df(unapproved_df)
# (8975, 28)

(8975, 28)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus_taxon_id,genus_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,...,,,,,,,,,,


In [471]:
unapproved_df.to_csv(unapproved_taxa_path, index=False)

## create genus csv

In [409]:
lims_genus_df = pd.read_csv(lims_genus_path, dtype=str)
log_df(lims_genus_df)

(1026, 15)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
1,benthic_forams,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
2,benthic_forams,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
3,benthic_forams,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,planktic_forams,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [410]:
df = pd.read_csv(unapproved_taxa_path)
log_df(df)

(8975, 28)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus_taxon_id,genus_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,...,,,,,,,,,,


In [411]:
# Distinct (taxon_group, genus name) pairs for rows that have a genus.
# .copy() is applied last so genus_df is an independent frame; previously
# the copy was taken before the column selection, so adding the 'check'
# column below was an assignment onto a derived slice.
genus_df = (
    df.loc[df['genus name'].notna(), ['taxon_group', 'genus name']]
    .drop_duplicates()
    .copy()
)
# Every genus starts out unchecked.
genus_df['check'] = False

log_df(genus_df)

(2537, 3)


Unnamed: 0,taxon_group,genus name,check
0,Dinoflagellates/Acritarchs/Prasinophytes,Labyrinthodinium,False
1,Dinoflagellates/Acritarchs/Prasinophytes,Maduradinium,False
2,Dinoflagellates/Acritarchs/Prasinophytes,Pyxidiella,False
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella,False
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium,False


In [412]:
# No `on=` is passed, so pandas joins on the columns the two frames have in
# common; how='left' keeps every genus row whether or not LIMS has a match.
genus_df = genus_df.merge(lims_genus_df, how='left')
# Rows that picked up a pbdb_taxon_id from the LIMS genus file are marked as
# checked (presumably so they need no further PBDB lookup; confirm in scripts.pbdb).
genus_df.loc[genus_df['pbdb_taxon_id'].notna(), 'check'] = True

log_df(genus_df)

(2537, 16)


Unnamed: 0,taxon_group,genus name,check,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,Labyrinthodinium,False,,,,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,Maduradinium,False,,,,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,Pyxidiella,False,,,,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella,False,,,,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium,False,,,,,,,,,,,,,


In [99]:
pbdb.fetch_pdbd_data(genus_df, 'genus name')

Index(['taxon_group', 'genus name', 'check', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'unranked clade_taxon_id', 'unranked clade_taxon_name',
       'subclass_taxon_id', 'subclass_taxon_name', 'genus_taxon_id',
       'genus_taxon_name'],
      dtype='object')
800 1000 1350 1750 1800 1850 2100 2150 2200 2250 2300 2350 2400 2450 2500 

In [106]:
genus_df[genus_df['check'] == False].shape

(0, 22)

In [107]:
log_df(genus_df)

(2545, 22)


Unnamed: 0,taxon_group,genus name,check,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,...,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,subclass_taxon_id,subclass_taxon_name,genus_taxon_id,genus_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,Labyrinthodinium,True,443826.0,Labyrinthodinium,genus,,,321606.0,Gonyaulacales,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,Maduradinium,True,325673.0,Maduradinium,genus,277915.0,Peridiniaceae,277919.0,Peridiniales,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,Pyxidiella,True,336773.0,Pyxidiella,genus,277915.0,Peridiniaceae,277919.0,Peridiniales,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella,True,,,,,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium,True,,,,,,,,...,,,,,,,,,,


In [108]:
del genus_df['check']

In [110]:
genus_df.to_csv(genus_path, index=False)

## add genus pbdb info to unapproved taxa 

In [472]:
genus_df = pd.read_csv(genus_path, dtype=str)
log_df(genus_df)
# (2537, 21)

(2537, 21)


Unnamed: 0,taxon_group,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,...,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,subclass_taxon_id,subclass_taxon_name,genus_taxon_id,genus_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,Labyrinthodinium,443826.0,Labyrinthodinium,genus,,,321606.0,Gonyaulacales,321578.0,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,Maduradinium,325673.0,Maduradinium,genus,277915.0,Peridiniaceae,277919.0,Peridiniales,321578.0,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,Pyxidiella,336773.0,Pyxidiella,genus,277915.0,Peridiniaceae,277919.0,Peridiniales,321578.0,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella,,,,,,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium,,,,,,,,,...,,,,,,,,,,


In [473]:
unapproved_df = pd.read_csv(unapproved_taxa_path, dtype=str)

log_df(unapproved_df)
#  (8975, 28)


(8975, 28)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus_taxon_id,genus_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,...,,,,,,,,,,


In [474]:
pbdb.add_pbdb_data(unapproved_df, genus_df, 'genus name')

In [475]:
diff = set(genus_df.columns) - set(unapproved_df.columns)
diff

set()

In [476]:
log_df(unapproved_df)
# (8975, 30)

(8975, 30)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus_taxon_id,genus_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,subclass_taxon_id,subclass_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,...,,,,,,,,,,


In [477]:
unapproved_df.to_csv(unapproved_taxa_path, index=False)

## update columns in taxa list

In [478]:
unapproved_df = pd.read_csv(unapproved_taxa_path, dtype=str)

log_df(unapproved_df)
#  (8975, 30)

(8975, 30)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus_taxon_id,genus_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,subclass_taxon_id,subclass_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,...,,,,,,,,,,


In [479]:
PI_df = pd.read_csv(PI_noaa_1_96_taxa_path, dtype=str)

In [480]:
set([col.replace('.1', '') for col in PI_df.columns]) - set(unapproved_df.columns)

{'Any taxon above genus',
 'Any taxon above genus modifier',
 'Comment',
 'Corrections to pbdb_taxon',
 'Corrections to pbdb_taxon_rank',
 'Notes (change to Internal only notes?)',
 'comments',
 'subgenera modifier',
 'subgenera name',
 'superfamily_taxon_id',
 'superfamily_taxon_name'}

In [481]:
# Empty columns appended to unapproved_df, in this order.
placeholder_columns = [
    'Any taxon above genus',
    'Any taxon above genus modifier',
    'Comment',
    'Corrections to pbdb_taxon',
    'Notes (change to Internal only notes?)',
    'comments',
    'subgenera modifier',
    'subgenera name',
    'species_taxon_id',
    'species_taxon_name',
]
for placeholder in placeholder_columns:
    unapproved_df[placeholder] = pd.NA

# The helper lookup key is removed from the taxa list.
unapproved_df.pop('genus species')

In [482]:
len(unapproved_df.columns)
# 39

39

In [483]:
# For each rank, copy the matched PBDB name/id into that rank's own
# '<rank>_taxon_name' / '<rank>_taxon_id' columns on the rows whose
# pbdb_taxon_rank equals the rank.
for rank in ['species', 'genus', 'family', 'order', 'class', 'phylum']:
    is_rank = unapproved_df['pbdb_taxon_rank'] == rank
    unapproved_df.loc[is_rank, f'{rank}_taxon_name'] = unapproved_df['pbdb_taxon_name']
    unapproved_df.loc[is_rank, f'{rank}_taxon_id'] = unapproved_df['pbdb_taxon_id']

log_df(unapproved_df)
# (8975, 39)

(8975, 39)


Unnamed: 0,taxon_group,verbatim_name,name,genus modifier,genus name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,Any taxon above genus,Any taxon above genus modifier,Comment,Corrections to pbdb_taxon,Notes (change to Internal only notes?),comments,subgenera modifier,subgenera name,species_taxon_id,species_taxon_name
0,Dinoflagellates/Acritarchs/Prasinophytes,?Labyrinthodinium sp. 1,,?,Labyrinthodinium,,sp.,,1.0,,...,,,,,,,,,,
1,Dinoflagellates/Acritarchs/Prasinophytes,?Maduradinium sp.,,?,Maduradinium,,sp.,,,,...,,,,,,,,,,
2,Dinoflagellates/Acritarchs/Prasinophytes,?Pyxidiella sp. 1,,?,Pyxidiella,,sp.,,1.0,,...,,,,,,,,,,
3,Dinoflagellates/Acritarchs/Prasinophytes,Aandalusiella ivoirensis,,,Aandalusiella,,ivoirensis,,,,...,,,,,,,,,,
4,Dinoflagellates/Acritarchs/Prasinophytes,Abratopdinium cardioforme,,,Abratopdinium,,cardioforme,,,,...,,,,,,,,,,


In [484]:
PI_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'Comment',
       'Notes (change to Internal only notes?)',
       'Any taxon above genus modifier', 'Any taxon above genus',
       'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',
       'pbdb_taxon_id.1', 'pbdb_taxon_name.1', 'pbdb_taxon_rank.1',
       'Corrections to pbdb_taxon', 'family_taxon_id', 'family_taxon_name',
       'superfamily_taxon_id', 'superfamily_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name'],
      dtype='object')

In [485]:
unapproved_df.columns

Index(['taxon_group', 'verbatim_name', 'name', 'genus modifier', 'genus name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'pbdb_taxon_id',
       'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',
       'family_taxon_name', 'order_taxon_id', 'order_taxon_name',
       'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',
       'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'genus_taxon_id', 'genus_taxon_name', 'unranked clade_taxon_id',
       'unranked clade_taxon_name', 'subclass_taxon_id', 'subclass_taxon_name',
       'Any taxon above genus', 'Any taxon above genus modifier', 'Comment',
       'Corrections to pbdb_taxon', 'Notes (change to Internal only notes?)',
       'comments', 'subgenera modifier', 'subgenera name', 'species_taxon_id',
       'species_taxon_name'],
      dtype='object')

In [486]:
old_cols = set(unapproved_df.columns)

In [487]:
# Final column order for the unapproved taxa list: name parts first, then
# the PBDB match, then the taxonomy columns from species up to unranked clade.
column_order = [
    'taxon_group',
    'verbatim_name',
    'name',
    'Comment',
    'Notes (change to Internal only notes?)',
    'Any taxon above genus modifier',
    'Any taxon above genus',
    'genus modifier',
    'genus name',
    'subgenera modifier',
    'subgenera name',
    'species modifier',
    'species name',
    'subspecies modifier',
    'subspecies name',
    'non-taxa descriptor',
    'comments',
    'pbdb_taxon_id',
    'pbdb_taxon_name',
    'pbdb_taxon_rank',
    'Corrections to pbdb_taxon',
    'species_taxon_id',
    'species_taxon_name',
    'genus_taxon_id',
    'genus_taxon_name',
    'family_taxon_id',
    'family_taxon_name',
    'order_taxon_id',
    'order_taxon_name',
    'subclass_taxon_id',
    'subclass_taxon_name',
    'class_taxon_id',
    'class_taxon_name',
    'phylum_taxon_id',
    'phylum_taxon_name',
    'kingdom_taxon_id',
    'kingdom_taxon_name',
    'unranked clade_taxon_id',
    'unranked clade_taxon_name',
]
unapproved_df = unapproved_df.reindex(columns=column_order)
len(unapproved_df.columns)

39

In [488]:
old_cols - set(unapproved_df.columns)

set()

In [489]:
unapproved_df.to_csv(unapproved_taxa_path, index=False)