# Normalize taxa list

Cleanup the normalized taxa list from the eODP researchers. Add PBDB taxa data.

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
import numpy as np
import requests

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

import scripts.normalize_taxa as nt
from scripts.normalize_data import remove_whitespace
import scripts.pbdb as pbdb

from scripts.pbdb import (
    get_parent_taxa, 
    PBDB_TAXA_NAME, 
    PBDB_TAXA_ID,
    check_multiple_pbdb_id
)
from scripts.shared_utils import (
    log_df
)

In [2]:
date = '2022-11-08'

taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"

species_dir = OUTPUT_DIR/'taxa'/'draft'/'LIMS'/'species'
species_file = species_dir/f'species_list_{date}.csv'


## create species csv - species and genus ranks only

In [3]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)
# 4736

(4736, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [4]:
taxa_df = taxa_df[taxa_df['Any taxon above genus'].isna()]
taxa_df = taxa_df[taxa_df['species name'].notna()]
log_df(taxa_df)

taxa_df.loc[~taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True), 'genus species name'] = taxa_df['genus name'] + ' ' + taxa_df['species name']

taxa_df['genus species name'] = taxa_df['genus species name'].str.strip()

log_df(taxa_df)
# 4628

(4628, 25)
(4628, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
6,,,Nodosaria,,,,spp.,,,,...,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,
7,,,Cibicides,,,,spp.,,,,...,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,
8,,,Brizalina,,,,spp.,,,,...,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,


In [5]:
species_df = taxa_df[['genus species name']].copy().dropna().drop_duplicates().reset_index(drop=True)

log_df(species_df)
# 3302

(3302, 1)


Unnamed: 0,genus species name
0,Euuvigerina miozea
1,Euuvigerina rodleyi
2,Candeina nitida
3,Dentoglobigerina altispira
4,Dentoglobigerina baroemoenensis


In [6]:
genus_ids = {}

for index, row in species_df.iterrows(): 
    need_genus = True
    count = 0

    # if index > 10:
    #     continue
    
    if index % 50 == 0:
        print(index, end=' ')
        
    url =  PBDB_TAXA_NAME +  row['genus species name']
    
    # if row['genus species name'] != 'Charcotia actinochilus':
    #        continue
    # print('species: ', url)
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            species_df.at[index, 'species_taxon_id'] = str(data[0]["taxon_no"])
            species_df.at[index, 'species_taxon_name'] = data[0]["taxon_name"]
            # print(data[0])
            
            
            
            parent_id = data[0]['parent_no']
            
            if parent_id in genus_ids:
                species_df.at[index, 'tmp_genus_taxon_id'] = str(parent_id)
                species_df.at[index, 'tmp_genus_taxon_name'] = genus_ids[parent_id]
                    
                need_genus = False
            
            while need_genus:
                url =  PBDB_TAXA_ID +  parent_id
                # print(url)
                response = requests.get(url)
                if response.status_code == 200:
                    data = response.json()["records"]
                    if data[0]["taxon_rank"] == 'genus':
                        genus_id = data[0]["taxon_no"]
                        genus_ids[genus_id] = data[0]["taxon_name"]
                        species_df.at[index, 'tmp_genus_taxon_id'] = str(genus_id)
                        species_df.at[index, 'tmp_genus_taxon_name'] =  data[0]["taxon_name"]
                        # print(data[0])


                        need_genus = False
                    else:
                        parent_id = data[0]['parent_no']
                if count > 20:
                    need_genus = False
                count += 1
        else:
            print('multiple matches for: ', row['genus species name'] )
    else:
        pass
        # print('not found: ', row['genus species name'])



0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 2600 2650 2700 2750 2800 2850 2900 2950 3000 3050 3100 3150 3200 3250 3300 

In [8]:
species_df.head(10)

Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina
5,Globigerina bulloides,113301.0,Globigerina bulloides,1498.0,Globigerina
6,Globigerina falconensis,388387.0,Globigerina falconensis,1498.0,Globigerina
7,Globigerina rubescens,422320.0,Globigerina rubescens,1529.0,Globoturborotalita
8,Globigerinella calida,422302.0,Globigerinella calida,1501.0,Globigerinella
9,Globigerinella siphonifera,422304.0,Globigerinella siphonifera,1501.0,Globigerinella


In [7]:
species_df.shape

(3302, 5)

In [9]:
species_df.to_csv(species_file, index=False)

## create species csv - all ranks

In [35]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)
# 4736

(4736, 29)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,155922.0,Pleurostomellidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,22826.0,Ostracoda,18891,Arthropoda,325038,Animalia


In [36]:
taxa_df = taxa_df[taxa_df['Any taxon above genus'].isna()]
taxa_df = taxa_df[taxa_df['species name'].notna()]
log_df(taxa_df)

taxa_df.loc[~taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True), 'genus species name'] = taxa_df['genus name'] + ' ' + taxa_df['species name']

taxa_df['genus species name'] = taxa_df['genus species name'].str.strip()

log_df(taxa_df)
# 4628

(4628, 29)
(4628, 30)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
6,,,Nodosaria,,,,spp.,,,,...,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,
7,,,Cibicides,,,,spp.,,,,...,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,
8,,,Brizalina,,,,spp.,,,,...,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,


In [60]:
species_df = taxa_df[['genus species name']].copy().dropna().drop_duplicates().reset_index(drop=True)

 
log_df(species_df)
# 3302

(3302, 1)


Unnamed: 0,genus species name
0,Euuvigerina miozea
1,Euuvigerina rodleyi
2,Candeina nitida
3,Dentoglobigerina altispira
4,Dentoglobigerina baroemoenensis


In [56]:
for index, row in species_df.iterrows(): 
    if index % 50 == 0:
        print(index, end=' ')
        
    if row['genus species name'] == 'Hirsutella hirsuta':
        row['genus species name'] = 'Globorotalia (Hirsutella) hirsuta'
        

    url =  PBDB_TAXA_NAME +  row['genus species name']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            species_df.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            species_df.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            species_df.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            species_df.at[index, 'species_taxon_id'] = str(data[0]["taxon_no"])
            species_df.at[index, 'species_taxon_name'] = data[0]["taxon_name"]
            
            round = 0
            get_parent_taxa(species_df, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)

                   
        
        

In [59]:
log_df(species_df, 2)

(2, 20)


Unnamed: 0,genus species name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,species_taxon_id,species_taxon_name,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
2280,Globorotalia (Hirsutella) hirsuta,422434,Globorotalia (Hirsutella) hirsuta,species,422434,Globorotalia (Hirsutella) hirsuta,1521,Globorotalia,82192,Globorotaliidae,,,,,288974,Foraminifera,212476,Rhizaria,,
2281,Hirsutella juanai,422436,Globorotalia (Hirsutella) juanai,species,422436,Globorotalia (Hirsutella) juanai,1521,Globorotalia,82192,Globorotaliidae,,,,,288974,Foraminifera,212476,Rhizaria,,


In [9]:
species_df.columns

Index(['genus species name', 'pbdb_taxon_id', 'pbdb_taxon_name',
       'pbdb_taxon_rank', 'species_taxon_id', 'species_taxon_name',
       'genus_taxon_id', 'genus_taxon_name', 'family_taxon_id',
       'family_taxon_name', 'phylum_taxon_id', 'phylum_taxon_name',
       'kingdom_taxon_id', 'kingdom_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'unranked clade_taxon_id', 'unranked clade_taxon_name'],
      dtype='object')

In [58]:
species_df = species_df.reindex(columns=[
    'genus species name', 
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'species_taxon_id', 'species_taxon_name',
    'genus_taxon_id', 'genus_taxon_name',
    'family_taxon_id', 'family_taxon_name',
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name', 
    'unranked clade_taxon_id', 'unranked clade_taxon_name'
    ])

log_df(species_df, 2)

(2, 20)


Unnamed: 0,genus species name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,species_taxon_id,species_taxon_name,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
2280,Globorotalia (Hirsutella) hirsuta,422434,Globorotalia (Hirsutella) hirsuta,species,422434,Globorotalia (Hirsutella) hirsuta,1521,Globorotalia,82192,Globorotaliidae,,,,,288974,Foraminifera,212476,Rhizaria,,
2281,Hirsutella juanai,422436,Globorotalia (Hirsutella) juanai,species,422436,Globorotalia (Hirsutella) juanai,1521,Globorotalia,82192,Globorotaliidae,,,,,288974,Foraminifera,212476,Rhizaria,,


In [11]:
species_df.to_csv(species_file, index=False)

compare pbdb ids for 

In [12]:
df = pd.read_csv(species_file, 
                 usecols =['genus species name', 'pbdb_taxon_id'])

df2 = pd.read_csv(str(species_file).replace(date, '2022-10-27'),
                 usecols =['genus species name', 'species_taxon_id'])


merge_df = df.merge(df2)
merge_df.fillna('', inplace=True)
merge_df[merge_df['pbdb_taxon_id'] != merge_df['species_taxon_id']]

Unnamed: 0,genus species name,pbdb_taxon_id,species_taxon_id


## QA species

In [10]:
species_df = pd.read_csv(species_file, dtype=str)
log_df(species_df)
# 3302

(3302, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina


In [11]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)
# 4736

(4736, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [12]:
taxa_df.columns

Index(['Any taxon above genus', 'genus modifier', 'genus name',
       'subgenera modifier', 'subgenera name', 'species modifier',
       'species name', 'subspecies modifier', 'subspecies name',
       'non-taxa descriptor', 'normalized_name', 'taxon_group',
       'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name'],
      dtype='object')

In [14]:
set(species_df['tmp_genus_taxon_id'].dropna()) - set(taxa_df['pbdb_taxon_id'])

{'29328'}

In [15]:
species_df[species_df['tmp_genus_taxon_id'] == '29328']

Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
2280,Hirsutella hirsuta,237025,Hirsutella hirsuta,29328,Hirsutella


In [16]:
tmp = pd.DataFrame([{
    'Any taxon above genus': pd.NA,
    'genus modifier': pd.NA, 
    'genus name': 'Hirsutella',
    'subgenera modifier': pd.NA, 
    'subgenera name': pd.NA, 
    'species modifier': pd.NA,
    'species name': 'hirsuta', 
    'subspecies modifier': pd.NA, 
    'subspecies name': pd.NA,
    'non-taxa descriptor': pd.NA, 
    'normalized_name': pd.NA, 
    'taxon_group': 'planktic_forams',
    'pbdb_taxon_id': '29328', 
    'pbdb_taxon_name': 'Hirsutella', 
    'pbdb_taxon_rank': 'genus',
    'genus_taxon_id': '29328', 
    'genus_taxon_name': 'Hirsutella'
}])
log_df(tmp)

(1, 17)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,genus_taxon_id,genus_taxon_name
0,,,Hirsutella,,,,hirsuta,,,,,planktic_forams,29328,Hirsutella,genus,29328,Hirsutella


In [17]:
for index, row in tmp.iterrows(): 

    url =  PBDB_TAXA_ID +  row['genus_taxon_id']
        
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()["records"]
        if len(data) == 1:
            tmp.at[index, f'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            tmp.at[index, f'pbdb_taxon_name'] = data[0]["taxon_name"]
            tmp.at[index, f'pbdb_taxon_rank'] = data[0]["taxon_rank"]
            
            round = 0
            get_parent_taxa(tmp, data[0]["parent_no"], data[0]["taxon_rank"], round, index, None)

log_df(tmp)

(1, 27)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Hirsutella,,,,hirsuta,,,,...,126841,Bittnerulidae,83137,Spiriferinida,82921,Rhynchonellata,26322,Brachiopoda,325038,Animalia


In [19]:
tmp.to_csv(OUTPUT_DIR/'tmp'/'Hirsutella_hirsuta.csv', index=False)

## create taxa list with species

In [21]:
species_df = pd.read_csv(species_file, dtype=str)
log_df(species_df)
# 3302

(3302, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina


In [22]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)

(4736, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [23]:
taxa_df = taxa_df[taxa_df['Any taxon above genus'].isna()]
taxa_df = taxa_df[taxa_df['species name'].notna()]
log_df(taxa_df)

taxa_df.loc[~taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True), 'genus species name'] = taxa_df['genus name'] + ' ' + taxa_df['species name']

taxa_df['genus species name'] = taxa_df['genus species name'].str.strip()

log_df(taxa_df)
# 4628

(4628, 25)
(4628, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
6,,,Nodosaria,,,,spp.,,,,...,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,
7,,,Cibicides,,,,spp.,,,,...,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,
8,,,Brizalina,,,,spp.,,,,...,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,


In [12]:
cols = [
    'pbdb_taxon_id', 'pbdb_taxon_name',  'pbdb_taxon_rank', 
    'genus species name',
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name', 
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name'
]

merge_df = taxa_df[cols].merge(species_df, how='left', on='genus species name')


log_df(merge_df)
# 4635

(4635, 18)


Unnamed: 0,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,genus species name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,1408,Euuvigerina,genus,Euuvigerina miozea,,,,,,,288974,Foraminifera,212476,Rhizaria,,,,
1,1408,Euuvigerina,genus,Euuvigerina rodleyi,,,,,,,288974,Foraminifera,212476,Rhizaria,,,,
2,1952,Nodosaria,genus,,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,,,,
3,1107,Cibicides,genus,,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,,,,
4,1017,Brizalina,genus,,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,,,,


In [13]:
merge_df.to_csv(species_dir/f'taxa_list_with_species_{date}.csv', index=False)

## create mistched genus list 

In [24]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)

(4736, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [25]:
taxa_df = taxa_df[taxa_df['Any taxon above genus'].isna()]
taxa_df = taxa_df[taxa_df['species name'].notna()]
log_df(taxa_df)

taxa_df.loc[~taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True), 'genus species name'] = taxa_df['genus name'] + ' ' + taxa_df['species name']

taxa_df['genus species name'] = taxa_df['genus species name'].str.strip()

log_df(taxa_df)

(4628, 25)
(4628, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
6,,,Nodosaria,,,,spp.,,,,...,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,
7,,,Cibicides,,,,spp.,,,,...,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,
8,,,Brizalina,,,,spp.,,,,...,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,


In [26]:
species_df = pd.read_csv(species_file, dtype=str)
log_df(species_df)

(3302, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina


In [27]:
species_filter_df = species_df.dropna(subset=['species_taxon_id'])
log_df(species_filter_df)
# 495

(495, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
2,Candeina nitida,422278,Candeina nitida,1053,Candeina
3,Dentoglobigerina altispira,402661,Dentoglobigerina altispira,1264,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289,Dentoglobigerina baroemoenensis,1264,Dentoglobigerina
5,Globigerina bulloides,113301,Globigerina bulloides,1498,Globigerina
6,Globigerina falconensis,388387,Globigerina falconensis,1498,Globigerina


In [28]:
cols = [
    'pbdb_taxon_id', 'pbdb_taxon_name',  'pbdb_taxon_rank', 
    'genus species name',
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name',
    'class_taxon_id', 'class_taxon_name', 
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name'
]

merge_df = species_filter_df.merge(taxa_df[cols], how='left', on='genus species name')
merge_df.drop_duplicates(inplace=True)


log_df(merge_df)
# 495

(495, 18)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Candeina nitida,422278,Candeina nitida,1053,Candeina,1053,Candeina,genus,422277,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria
1,Dentoglobigerina altispira,402661,Dentoglobigerina altispira,1264,Dentoglobigerina,1264,Dentoglobigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
3,Dentoglobigerina baroemoenensis,422289,Dentoglobigerina baroemoenensis,1264,Dentoglobigerina,1264,Dentoglobigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Globigerina bulloides,113301,Globigerina bulloides,1498,Globigerina,1498,Globigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
6,Globigerina falconensis,388387,Globigerina falconensis,1498,Globigerina,1498,Globigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [30]:
mismatch_df = merge_df[
    (merge_df['tmp_genus_taxon_id'] != merge_df['pbdb_taxon_id'])
    & (merge_df['pbdb_taxon_rank'] == 'genus')
]
log_df(mismatch_df)
# 39

(39, 18)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
7,Globigerina rubescens,422320,Globigerina rubescens,1529,Globoturborotalita,1498,Globigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476.0,Rhizaria
27,Globigerinoides sacculifer,388389,Globigerinoides sacculifer,422361,Trilobatus,1504,Globigerinoides,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476.0,Rhizaria
39,Globorotalia humerosa,402487,Globorotalia humerosa,1917,Neogloboquadrina,1521,Globorotalia,genus,82192,Globorotaliidae,,,,,288974,Foraminifera,212476.0,Rhizaria
100,Anomalina praeacuta,382642,Anomalina praeacuta,859,Anomalinoides,86769,Anomalina,genus,103798,Anomalinidae,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476.0,Rhizaria
209,Actinocyclus senarius,387049,Actinocyclus senarius,71244,Actinoptychus,82146,Actinocyclus,genus,71207,Hemidiscaceae,426780.0,Coscinodiscales,69587.0,Bacillariophyceae,432613,Ochrophyta,,


In [32]:
mismatch_df.to_csv(species_dir/f'species_mistmach_genus_{date}.csv', index=False)

## add genus columns to taxa file

In [73]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)
log_df(taxa_df)

# 4736

(4736, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [74]:
taxa_df['species_taxon_id'] = pd.NA
taxa_df['species_taxon_name'] = pd.NA

taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'genus', 'genus_taxon_name' ] = taxa_df['pbdb_taxon_name']
taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'genus', 'genus_taxon_id' ] = taxa_df['pbdb_taxon_id']

taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'family', 'family_taxon_name' ] = taxa_df['pbdb_taxon_name']
taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'family', 'family_taxon_id' ] = taxa_df['pbdb_taxon_id']

taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'order', 'order_taxon_name' ] = taxa_df['pbdb_taxon_name']
taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'order', 'order_taxon_id' ] = taxa_df['pbdb_taxon_id']


taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'class', 'class_taxon_name' ] = taxa_df['pbdb_taxon_name']
taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'class', 'class_taxon_id' ] = taxa_df['pbdb_taxon_id']

taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'phylum', 'phylum_taxon_name' ] = taxa_df['pbdb_taxon_name']
taxa_df.loc[taxa_df['pbdb_taxon_rank'] == 'phylum', 'phylum_taxon_id' ] = taxa_df['pbdb_taxon_id']



log_df(taxa_df)

(4736, 29)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,species_taxon_id,species_taxon_name,genus_taxon_name,genus_taxon_id
0,,,Euuvigerina,,,,miozea,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,Euuvigerina,1408.0
1,,,Euuvigerina,,,,rodleyi,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,Euuvigerina,1408.0
2,Foraminifera indet.,,,,,,,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,,
3,Pleurostomellidae indet.,,,,,,,,,,...,,,288974,Foraminifera,212476,Rhizaria,,,,
4,Ostracoda indet.,,,,,,,,,,...,22826.0,Ostracoda,18891,Arthropoda,325038,Animalia,,,,


In [75]:
taxa_df.columns

Index(['Any taxon above genus', 'genus modifier', 'genus name',
       'subgenera modifier', 'subgenera name', 'species modifier',
       'species name', 'subspecies modifier', 'subspecies name',
       'non-taxa descriptor', 'normalized_name', 'taxon_group',
       'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
       'family_taxon_id', 'family_taxon_name', 'order_taxon_id',
       'order_taxon_name', 'class_taxon_id', 'class_taxon_name',
       'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',
       'kingdom_taxon_name', 'species_taxon_id', 'species_taxon_name',
       'genus_taxon_name', 'genus_taxon_id'],
      dtype='object')

In [76]:
cols = [
    'Any taxon above genus', 
    'genus modifier', 'genus name',
    'subgenera modifier', 'subgenera name', 
    'species modifier', 'species name', 
    'subspecies modifier', 'subspecies name',
    'non-taxa descriptor', 'normalized_name', 'taxon_group',
    'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank',
    'species_taxon_id', 'species_taxon_name', 
    'genus_taxon_id', 'genus_taxon_name', 
    'family_taxon_id', 'family_taxon_name', 
    'order_taxon_id', 'order_taxon_name', 
    'class_taxon_id', 'class_taxon_name',
    'phylum_taxon_id', 'phylum_taxon_name', 
    'kingdom_taxon_id', 'kingdom_taxon_name'
]

taxa_df = taxa_df.reindex(columns=cols)
log_df(taxa_df)

(4736, 29)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,155922.0,Pleurostomellidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,22826.0,Ostracoda,18891,Arthropoda,325038,Animalia


In [77]:
taxa_df.to_csv(taxa_list_file, index=False)

## add species to taxa list

In [61]:
species_df = pd.read_csv(species_file, dtype=str)

log_df(species_df)
# 3302

(3302, 20)


Unnamed: 0,genus species name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,species_taxon_id,species_taxon_name,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,Euuvigerina miozea,,,,,,,,,,,,,,,,,,,
1,Euuvigerina rodleyi,,,,,,,,,,,,,,,,,,,
2,Candeina nitida,422278.0,Candeina nitida,species,422278.0,Candeina nitida,1053.0,Candeina,422277.0,Candeinidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,species,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina,82191.0,Globigerinidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,species,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina,82191.0,Globigerinidae,,,,,288974.0,Foraminifera,212476.0,Rhizaria,,


In [62]:
taxa_df = pd.read_csv(taxa_list_file, dtype=str)

log_df(taxa_df)
# 4736

(4736, 29)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,155922.0,Pleurostomellidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,22826.0,Ostracoda,18891,Arthropoda,325038,Animalia


In [63]:
filter_species_df = species_df[species_df['species_taxon_id'].notna()]

log_df(filter_species_df)
# 495

(495, 20)


Unnamed: 0,genus species name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,species_taxon_id,species_taxon_name,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
2,Candeina nitida,422278,Candeina nitida,species,422278,Candeina nitida,1053,Candeina,422277,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria,,
3,Dentoglobigerina altispira,402661,Dentoglobigerina altispira,species,402661,Dentoglobigerina altispira,1264,Dentoglobigerina,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria,,
4,Dentoglobigerina baroemoenensis,422289,Dentoglobigerina baroemoenensis,species,422289,Dentoglobigerina baroemoenensis,1264,Dentoglobigerina,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria,,
5,Globigerina bulloides,113301,Globigerina bulloides,species,113301,Globigerina bulloides,1498,Globigerina,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria,,
6,Globigerina falconensis,388387,Globigerina falconensis,species,388387,Globigerina falconensis,1498,Globigerina,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria,,


In [64]:
filter_species_df[filter_species_df['pbdb_taxon_id'].isna()]

Unnamed: 0,genus species name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,species_taxon_id,species_taxon_name,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name


In [65]:
taxa_df['species name'].fillna('', inplace=True)

taxa_df.loc[~taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True), 'genus species name'] = taxa_df['genus name'] + ' ' + taxa_df['species name']

taxa_df['genus species name'] = taxa_df['genus species name'].str.strip()

log_df(taxa_df)
# 4736

(4736, 30)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
2,Foraminifera indet.,,,,,,,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,
3,Pleurostomellidae indet.,,,,,,,,,,...,Pleurostomellidae,,,,,288974,Foraminifera,212476,Rhizaria,
4,Ostracoda indet.,,,,,,,,,,...,,,,22826.0,Ostracoda,18891,Arthropoda,325038,Animalia,


In [66]:
fields = [
    'species_taxon_name', 'species_taxon_id',
    'genus_taxon_name', 'genus_taxon_id',
    'family_taxon_name', 'family_taxon_id',
    'order_taxon_name', 'order_taxon_id',
    'class_taxon_name', 'class_taxon_id',
    'phylum_taxon_name', 'phylum_taxon_id',
    'kingdom_taxon_name', 'kingdom_taxon_id',
    
]

for index, row in filter_species_df.iterrows():
    if pd.isna(row['pbdb_taxon_id']):
        continue
        
    taxa_df.loc[taxa_df['genus species name'] == row['genus species name'], 'pbdb_taxon_id' ] = row['species_taxon_id']
    taxa_df.loc[taxa_df['genus species name'] == row['genus species name'], 'pbdb_taxon_name' ] = row['species_taxon_name']
    taxa_df.loc[taxa_df['genus species name'] == row['genus species name'], 'pbdb_taxon_rank' ] = 'species'

        
    for field in fields:
        taxa_df.loc[taxa_df['genus species name'] == row['genus species name'], field] = row[field]
    


In [67]:
del taxa_df['genus species name']

log_df(taxa_df)

(4736, 29)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,288974,Foraminifera,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,155922.0,Pleurostomellidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,22826.0,Ostracoda,18891,Arthropoda,325038,Animalia


In [68]:
taxa_df.to_csv(taxa_list_file, index=False)