# Normalize taxa list

Cleanup the normalized taxa list from the eODP researchers. Add PBDB taxa data.

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
import numpy as np
import requests

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

import scripts.normalize_taxa as nt
import scripts.pbdb as pbdb

from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME

In [2]:
date = '2022-03-30'

input_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'

pi_taxa_file_pbdb = OUTPUT_DIR /'taxa'/'LIMS'/f'PI_normalized_taxa_list_with_pbdb_{date}.csv'
crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"

genus_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"genera_pbdb_{date}.csv"
higher_taxa_pbdb_file = OUTPUT_DIR/'taxa'/'LIMS'/f"higher_taxa_pbdb_{date}.csv"



In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## fix incorect pbdb_taxon_id

incorported  pbdb_taxon_id that the PIs corrected into the taxalist.


In [None]:
df = pd.read_csv(input_file, skiprows = 9, dtype=str)
df = df.drop(df.index[[0, 1]])
df = df.dropna(how="all", axis="index")
df['corrected'] = False

log_df(df)

In [None]:
tmp = df[df['Corrections to pbdb_taxon_id'].notna() & (df['corrected'] == False)]
tmp['Corrections to pbdb_taxon_rank'].unique()

array(['1064', '432650', '68421', '71266', '421517', '71284', '82180'],
      dtype=object)

In [20]:
pbdb.fix_pbdb_id(df, 'genus; id 1064', 1064)
pbdb.fix_pbdb_id(df, 'genus, id 1124', 1124)
pbdb.fix_pbdb_id(df, 'genus; taxon_no= 2092', 2092)
pbdb.fix_pbdb_id(df, 'genus; ID 421517', 421517)
pbdb.fix_pbdb_id(df, 'genus; ID 432678', 432678)
pbdb.fix_pbdb_id(df, 'genus; ID 71247', 71247)
pbdb.fix_pbdb_id(df, 'genus; ID 82145', 82145)
pbdb.fix_pbdb_id(df, 'genus; ID 432650', 432650)
pbdb.fix_pbdb_id(df, 'genus: ID 68421', 68421)
pbdb.fix_pbdb_id(df, 'genus; ID 432651', 432651)
pbdb.fix_pbdb_id(df, 'genus; ID 443753', 443753)
pbdb.fix_pbdb_id(df, 'genus; ID 434997', 434997)
pbdb.fix_pbdb_id(df, 'genus; ID 165526', 165526)


1064
1124
2092
421517
432678
71247
82145
432650
68421
432651
443753
434997
165526


In [21]:
df.shape

(4742, 35)

In [22]:
del df['corrected']

In [23]:
df.to_csv(pi_taxa_file_pbdb, index=False)

## Create crosswalk csv

In [4]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)

(4742, 34)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,212476,Rhizaria,,


In [5]:
fields = nt.taxa_rank_fields + nt.taxa_fields + nt.metadata_fields  
fields

['Any taxon above genus',
 'genus modifier',
 'genus name',
 'subgenera modifier',
 'subgenera name',
 'species modifier',
 'species name',
 'subspecies modifier',
 'subspecies name',
 'non-taxa descriptor',
 'normalized_name',
 'taxon_group',
 'verbatim_name',
 'comments']

In [6]:
filtered_taxa = pd.DataFrame(df, columns=fields)
log_df(filtered_taxa, 2)


(4742, 14)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
0,,,Euuvigerina,,,,miozea,,,,,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group


set normalized_name using the taxa fields

In [7]:
nt.add_normalized_name_column(filtered_taxa)
log_df(filtered_taxa)

filtered_taxa = filtered_taxa.drop(filtered_taxa[filtered_taxa['normalized_name'] == ''].index)
log_df(filtered_taxa)

filtered_taxa.drop_duplicates(keep='first', inplace=True, subset =['verbatim_name', 'normalized_name'])
log_df(filtered_taxa)

(4742, 14)
(4735, 14)
(4590, 14)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,comments
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,>100 m group
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,>50 m group
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,other benthic foraminifera
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,


create crosswalk csv

In [8]:
filtered_taxa.to_csv(crosswalk_file, index=False)

## Create taxa list csv

create taxa list with unique taxa fields

In [9]:
df = pd.read_csv(pi_taxa_file_pbdb, dtype=str)
log_df(df, 3)

(4742, 34)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,212476,Rhizaria,,


In [10]:
fields = nt.taxa_rank_fields  + nt.taxa_fields + nt.pdbd_fields + ['verbatim_name']
fields

['Any taxon above genus',
 'genus modifier',
 'genus name',
 'subgenera modifier',
 'subgenera name',
 'species modifier',
 'species name',
 'subspecies modifier',
 'subspecies name',
 'non-taxa descriptor',
 'normalized_name',
 'taxon_group',
 'pbdb_taxon_id',
 'pbdb_taxon_name',
 'pbdb_taxon_rank',
 'family_taxon_id',
 'family_taxon_name',
 'order_taxon_id',
 'order_taxon_name',
 'class_taxon_id',
 'class_taxon_name',
 'phylum_taxon_id',
 'phylum_taxon_name',
 'kingdom_taxon_id',
 'kingdom_taxon_name',
 'verbatim_name']

In [11]:
filtered_taxa = pd.DataFrame(df, columns=fields)
log_df(filtered_taxa, 2)

(4742, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,verbatim_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea (group) >100 m
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi (group) >50 m


In [12]:
nt.add_normalized_name_column(filtered_taxa)
log_df(filtered_taxa)

filtered_taxa = filtered_taxa.drop(filtered_taxa[filtered_taxa['normalized_name'] == ''].index)
log_df(filtered_taxa)

filtered_taxa.drop_duplicates(keep='first', inplace=True, subset =['verbatim_name', 'normalized_name'])
log_df(filtered_taxa)

(4742, 26)
(4735, 26)
(4590, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,verbatim_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974.0,Foraminifera,212476,Rhizaria,Euuvigerina miozea (group) >100 m
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974.0,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi (group) >50 m
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,212476,Rhizaria,Others
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,288974.0,Foraminifera,212476,Rhizaria,Pleurostomellids comment
4,Ostracoda indet.,,,,,,,,,,...,,,,,,18891.0,Arthropoda,325038,Animalia,Ostracoda spp.


In [13]:
fields.remove('verbatim_name')
taxa_df = pd.DataFrame(filtered_taxa, columns=fields)
taxa_df = taxa_df.drop_duplicates() 

log_df(taxa_df)

(4205, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


### create taxa csv

In [9]:
taxa_df.to_csv(taxa_list_file, index=False)

NameError: name 'taxa_df' is not defined

## create genera csv

In [8]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)

(606, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Textulariia indet.,,,,,,,,,,...,,,,,,,,,,
1,,,Alabaminella,,,,weddelensis,,,,...,241423.0,Eponididae,,,,,288974.0,Foraminifera,212476.0,Rhizaria
2,,,Alabamina,,,,sp.,,,,...,,,,,,,,,,
3,,,Alabaminella,,,,weddelensis,,,,...,,,,,,,,,,
4,,,Ammodiscus,,,,latus,,,,...,112199.0,Ammodiscidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria


In [9]:
genus_df = pd.DataFrame(taxa_df['genus name'].str.strip().dropna().unique(), columns=['genus name']) 

log_df(genus_df)

(247, 1)


Unnamed: 0,genus name
0,Alabaminella
1,Alabamina
2,Ammodiscus
3,Ammonia
4,Amphicoryna


In [10]:
genus_df.loc[genus_df['genus name'] == 'Martinotiella', 'genus name'] = 'Martinottiella'

In [11]:
for index, row in genus_df.iterrows(): 
    if index > 2:
        continue
    if index % 50 == 0:
        print(index, end=' ')

    url =  PBDB_TAXA_NAME +  row['genus name']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            genus_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)

                

0 0
1
2
3
4
5
0
1
2
3
4
5
0
1
2
3
4
5


In [284]:
genus_df = genus_df.reindex(columns=[
    'genus name', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'family_taxon_id', 'family_taxon_name',
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name', 
    ])

In [None]:
genus_df[0:20]

In [286]:
genus_df.shape

(1015, 14)

In [287]:
genus_df.to_csv(genus_pbdb_file, index=False)

### add genus pbdb to input_file

In [288]:
df = pd.read_csv(input_file, skiprows = 9, dtype=str)
log_df(df)


(4754, 21)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,>100 m group,,,
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,rodleyi,,,,>50 m group,,,
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,other benthic foraminifera,,,


In [269]:
genus_df= pd.read_csv(genus_pbdb_file, dtype=str)

log_df(genus_df)

(1015, 14)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Euuvigerina,1408,Euuvigerina,genus,,,,,,,288974,Foraminifera,212476,Rhizaria
1,Nodosaria,1952,Nodosaria,genus,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria
2,Cibicides,1107,Cibicides,genus,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria
3,Brizalina,1017,Brizalina,genus,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Candeina,1053,Candeina,genus,422277.0,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [289]:
df['genus name'] = df['genus name'].str.strip()
genus_df['genus name'] = genus_df['genus name'].str.strip()

merged_df = df.merge(genus_df, on="genus name", how="left", suffixes=('_prev', None))
log_df(merged_df)

(4754, 34)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,,


In [52]:
merged_df.to_csv(pi_taxa_file_pbdb, index=False)

## create higher csv

In [9]:
taxa_df = pd.read_csv(taxa_list_file)
log_df(taxa_df)

(4209, 12)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams


In [120]:
higher_df = pd.DataFrame(taxa_df['Any taxon above genus'].str.strip().dropna().unique(), columns=['Any taxon above genus']) 
higher_df.replace('', np.nan, inplace=True)
higher_df.dropna(inplace=True)
log_df(higher_df)

(66, 1)


Unnamed: 0,Any taxon above genus
0,Foraminifera indet.
1,Pleurostomellidae indet.
2,Ostracoda indet.
3,Textulariia indet.
4,Elphidiidae indet.


In [123]:
for index, row in higher_df.iterrows():        
    if index % 20 == 0:
        print(index)
        
    name = row['Any taxon above genus'].replace(' indet.', '')

  
    url =  PBDB_TAXA_NAME +  name
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            higher_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            higher_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            higher_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(higher_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)
        else:
            print(name, len(data), ' found')
    else:
        print(name, ' not found')

0
"Challengeria spp."  not found
"Phytolith"  not found
20
"Amorphous organic matter"  not found
"Black phytoclasts"  not found
"Brown phytoclasts"  not found
"round browns, psilate"  not found
"Terrestrial palynomorphs"  not found
"Unknown taxa"  not found
"other pollen"  not found
"Spores"  not found
"Fungal spores"  not found
"Pollen/spores reworked"  not found
"Other pollen"  not found
40
"Sporomorphs"  not found
"Black woody phytoclasts"  not found
"Brown woody phytoclasts"  not found
"Saccate pollen"  not found
"Monolete ornamented"  not found
"Monolete psilate"  not found
"Trilete ornamented"  not found
"Trilete psilate"  not found
60


In [124]:
higher_df.columns

Index(['Any taxon above genus', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'kingdom_taxon_id', 'kingdom_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name'],
      dtype='object')

In [None]:
higher_df = higher_df.reindex(columns=[
    'Any taxon above genus', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name', 
    ])
higher_df

In [68]:
higher_df.to_csv(higher_taxa_pbdb_file, index=False)

### add higher pbdb to input_file

In [125]:
df = pd.read_csv(pi_taxa_file_pbdb,  dtype=str)
log_df(df)

(4754, 34)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria
4,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,,,,


In [107]:
df.columns

Index(['taxon_group', 'verbatim_name', 'name',
       'name to use (if different from "name")', 'name comment field',
       'Comment', 'notes', 'Any taxon above genus', 'genus modifier',
       'genus name', 'subgenera modifier', 'subgenera name',
       'species modifier', 'species name', 'subspecies modifier',
       'subspecies name', 'non-taxa descriptor', 'comments',
       'pbdb_taxon_id_prev', 'pbdb_taxon_name_prev', 'pbdb_taxon_rank_prev',
       'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name'],
      dtype='object')

In [126]:
higher_df = pd.read_csv(higher_taxa_pbdb_file, dtype=str)

In [127]:
for index, row in higher_df.iterrows():    
    for col in higher_df.columns:
        if col == 'Any taxon above genus':
            continue

        df.loc[df['Any taxon above genus'] == row['Any taxon above genus'], col] = row[col]

    

In [129]:
df.to_csv(pi_taxa_file_pbdb,  index=False)
