# Normalize taxa list

Clean up the normalized taxa list from the eODP researchers and add PBDB taxa data.

In [1]:
import sys
sys.path.append('../../../')

import pandas as pd
import numpy as np
import requests

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

import scripts.normalize_taxa as nt
from scripts.normalize_data import remove_whitespace
import scripts.pbdb as pbdb

from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME, PBDB_TAXA_ID
from scripts.shared_utils import (
    log_df
)

In [38]:
# Snapshot date embedded in the taxa list file name.
date = '2022-10-27'

# Input: the LIMS taxa list CSV for that snapshot.
taxa_list_file = OUTPUT_DIR.joinpath('taxa', 'LIMS', f"taxa_list_{date}.csv")

# Folder that holds the draft species files this notebook reads and writes.
species_dir = OUTPUT_DIR.joinpath('taxa', 'draft', 'LIMS', 'species')



## create species csv

In [20]:
# Read every column as str so pandas does no numeric type inference.
taxa_df = pd.read_csv(filepath_or_buffer=taxa_list_file, dtype=str)

# log_df is a project helper; the recorded output shows the shape and first rows.
log_df(taxa_df)
# 4745

(4745, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [27]:
# Keep rows that are not recorded as a taxon above genus and that carry a
# species name. The trailing .copy() gives an independent frame, so adding a
# column below cannot trigger SettingWithCopyWarning.
taxa_df = taxa_df[taxa_df['Any taxon above genus'].isna()]
taxa_df = taxa_df[taxa_df['species name'].notna()].copy()
log_df(taxa_df)

# Rows whose species name contains "spp." or "sp." get no combined name.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
is_named_species = ~taxa_df['species name'].str.contains(r'spp\.|sp\..*?', regex=True)

# Series.where keeps "<genus> <species>" where the mask is True and NaN
# elsewhere, and always creates the column, even if no row matches.
taxa_df['genus species name'] = (
    (taxa_df['genus name'] + ' ' + taxa_df['species name'])
    .where(is_named_species)
    .str.strip()
)

log_df(taxa_df)
# 4635

(4635, 25)
(4635, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
6,,,Nodosaria,,,,spp.,,,,...,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,
7,,,Cibicides,,,,spp.,,,,...,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,
8,,,Brizalina,,,,spp.,,,,...,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,


In [6]:
# One row per distinct, non-null "genus species name", with a fresh 0..n-1 index.
species_df = (
    taxa_df[['genus species name']]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

log_df(species_df)
# 3302

(3302, 1)


Unnamed: 0,genus species name
0,Euuvigerina miozea
1,Euuvigerina rodleyi
2,Candeina nitida
3,Dentoglobigerina altispira
4,Dentoglobigerina baroemoenensis


In [16]:
# Look up every species name in PBDB. For a unique match, record the species
# taxon id/name and the genus PBDB places it under, found by walking up the
# parent chain until a taxon of rank "genus" is reached.

# Upper bound on ancestor requests per species; guards against endless walks.
MAX_GENUS_LOOKUPS = 22
# Seconds to wait for PBDB before giving up on a single request.
REQUEST_TIMEOUT = 30

# PBDB genus taxon_no -> genus taxon_name, shared across species so a genus
# that was already resolved is not requested again.
genus_ids = {}


def _fetch_records(session, url):
    """Return the "records" list of a PBDB response, or None on failure.

    None is returned for a transport error (reported on stdout) and for any
    non-200 status; a 200 response without matches yields an empty list.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('request failed: ', url, err)
        return None
    if response.status_code != 200:
        return None
    return response.json().get("records", [])


def _find_genus(session, parent_id):
    """Walk up from parent_id to the first genus-rank taxon.

    Returns (taxon_no, taxon_name) of the genus, or None when the chain ends,
    a request fails, or MAX_GENUS_LOOKUPS ancestors were inspected.
    """
    for _ in range(MAX_GENUS_LOOKUPS):
        if parent_id is None:
            return None
        if parent_id in genus_ids:
            return parent_id, genus_ids[parent_id]

        # str() keeps the URL concatenation valid whether the id is str or int.
        records = _fetch_records(session, PBDB_TAXA_ID + str(parent_id))
        if not records:
            # Failed or empty response: stop instead of re-requesting the same URL.
            return None

        taxon = records[0]
        if taxon.get("taxon_rank") == 'genus':
            genus_ids[taxon["taxon_no"]] = taxon["taxon_name"]
            return taxon["taxon_no"], taxon["taxon_name"]
        parent_id = taxon.get('parent_no')
    return None


# One session reuses the HTTP connection across the thousands of requests.
with requests.Session() as session:
    for index, row in species_df.iterrows():
        # Lightweight progress indicator.
        if index % 50 == 0:
            print(index, end=' ')

        species_name = row['genus species name']
        records = _fetch_records(session, PBDB_TAXA_NAME + species_name)

        if records is None:
            # Not found (non-200) or request failed; leave the row empty.
            continue
        if len(records) != 1:
            if records:
                print('multiple matches for: ', species_name)
            else:
                print('no matches for: ', species_name)
            continue

        species = records[0]
        species_df.at[index, 'species_taxon_id'] = str(species["taxon_no"])
        species_df.at[index, 'species_taxon_name'] = species["taxon_name"]

        genus = _find_genus(session, species.get('parent_no'))
        if genus is not None:
            genus_id, genus_name = genus
            species_df.at[index, 'tmp_genus_taxon_id'] = str(genus_id)
            species_df.at[index, 'tmp_genus_taxon_name'] = genus_name



0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 2600 2650 2700 2750 2800 2850 2900 2950 3000 3050 3100 3150 3200 3250 3300 

In [17]:
# Preview the first ten rows, including the PBDB columns added by the lookup.
species_df.head(n=10)

Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina
5,Globigerina bulloides,113301.0,Globigerina bulloides,1498.0,Globigerina
6,Globigerina falconensis,388387.0,Globigerina falconensis,1498.0,Globigerina
7,Globigerina rubescens,422320.0,Globigerina rubescens,1529.0,Globoturborotalita
8,Globigerinella calida,422302.0,Globigerinella calida,1501.0,Globigerinella
9,Globigerinella siphonifera,422304.0,Globigerinella siphonifera,1501.0,Globigerinella


In [7]:
# Create the draft output folder if needed; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
species_dir.mkdir(parents=True, exist_ok=True)

# Persist the species list (without the index) so later sections can reload it.
species_df.to_csv(species_dir/'species_list.csv', index=False)

## create taxa list with species

In [39]:
# Reload the saved species list with every column as str.
species_list_file = species_dir/'species_list.csv'
species_df = pd.read_csv(species_list_file, dtype=str)

log_df(species_df)
# 3302

(3302, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina


In [40]:
# Start again from the full taxa list, reading every column as str.
taxa_df = pd.read_csv(filepath_or_buffer=taxa_list_file, dtype=str)

log_df(taxa_df)

(4745, 25)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
2,Foraminifera indet.,,,,,,,,,,...,,,,,,,,,212476,Rhizaria
3,Pleurostomellidae indet.,,,,,,,,,,...,,,,,,,288974.0,Foraminifera,212476,Rhizaria
4,Ostracoda indet.,,,,,,,,,,...,,,,,,,18891.0,Arthropoda,325038,Animalia


In [41]:
# Keep rows that are not recorded as a taxon above genus and that carry a
# species name. The trailing .copy() gives an independent frame, so adding a
# column below cannot trigger SettingWithCopyWarning.
taxa_df = taxa_df[taxa_df['Any taxon above genus'].isna()]
taxa_df = taxa_df[taxa_df['species name'].notna()].copy()
log_df(taxa_df)

# Rows whose species name contains "spp." or "sp." get no combined name.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
is_named_species = ~taxa_df['species name'].str.contains(r'spp\.|sp\..*?', regex=True)

# Series.where keeps "<genus> <species>" where the mask is True and NaN
# elsewhere, and always creates the column, even if no row matches.
taxa_df['genus species name'] = (
    (taxa_df['genus name'] + ' ' + taxa_df['species name'])
    .where(is_named_species)
    .str.strip()
)

log_df(taxa_df)
# 4635

(4635, 25)
(4635, 26)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,genus species name
0,,,Euuvigerina,,,,miozea,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina miozea
1,,,Euuvigerina,,,,rodleyi,,,,...,,,,,,288974,Foraminifera,212476,Rhizaria,Euuvigerina rodleyi
6,,,Nodosaria,,,,spp.,,,,...,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,
7,,,Cibicides,,,,spp.,,,,...,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,
8,,,Brizalina,,,,spp.,,,,...,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,


In [42]:
# Taxa-list columns to carry into the merged output.
cols = [
    'pbdb_taxon_id',
    'pbdb_taxon_name',
    'pbdb_taxon_rank',
    'genus species name',
    'family_taxon_id',
    'family_taxon_name',
    'order_taxon_id',
    'order_taxon_name',
    'class_taxon_id',
    'class_taxon_name',
    'phylum_taxon_id',
    'phylum_taxon_name',
    'kingdom_taxon_id',
    'kingdom_taxon_name',
]

# Left join: every taxa row is kept and gains the PBDB species/genus columns
# wherever its "genus species name" appears in the species list.
merge_df = pd.merge(
    taxa_df[cols],
    species_df,
    how='left',
    on='genus species name',
)

log_df(merge_df)
# 4635

(4635, 18)


Unnamed: 0,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,genus species name,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,1408,Euuvigerina,genus,Euuvigerina miozea,,,,,,,288974,Foraminifera,212476,Rhizaria,,,,
1,1408,Euuvigerina,genus,Euuvigerina rodleyi,,,,,,,288974,Foraminifera,212476,Rhizaria,,,,
2,1952,Nodosaria,genus,,82197.0,Nodosariidae,429322.0,Nodosariida,428875.0,Nodosariata,288974,Foraminifera,212476,Rhizaria,,,,
3,1107,Cibicides,genus,,82208.0,Cibicididae,,,,,288974,Foraminifera,212476,Rhizaria,,,,
4,1017,Brizalina,genus,,112279.0,Bolivinidae,,,,,288974,Foraminifera,212476,Rhizaria,,,,


In [43]:
# Save the merged taxa/species table to the draft species folder, without the index.
merge_df.to_csv(species_dir/'taxa_list_with_species.csv', index=False)

## create mismatched genus list

In [44]:
# Reload the saved species list with every column as str.
species_list_file = species_dir/'species_list.csv'
species_df = pd.read_csv(species_list_file, dtype=str)

log_df(species_df)

(3302, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
0,Euuvigerina miozea,,,,
1,Euuvigerina rodleyi,,,,
2,Candeina nitida,422278.0,Candeina nitida,1053.0,Candeina
3,Dentoglobigerina altispira,402661.0,Dentoglobigerina altispira,1264.0,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289.0,Dentoglobigerina baroemoenensis,1264.0,Dentoglobigerina


In [45]:
# Keep only the species that have a PBDB species taxon id.
has_species_id = species_df['species_taxon_id'].notna()
species_filter_df = species_df[has_species_id]

log_df(species_filter_df)
# 495

(495, 5)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name
2,Candeina nitida,422278,Candeina nitida,1053,Candeina
3,Dentoglobigerina altispira,402661,Dentoglobigerina altispira,1264,Dentoglobigerina
4,Dentoglobigerina baroemoenensis,422289,Dentoglobigerina baroemoenensis,1264,Dentoglobigerina
5,Globigerina bulloides,113301,Globigerina bulloides,1498,Globigerina
6,Globigerina falconensis,388387,Globigerina falconensis,1498,Globigerina


In [46]:
# Taxa-list columns to attach to each matched species.
cols = [
    'pbdb_taxon_id',
    'pbdb_taxon_name',
    'pbdb_taxon_rank',
    'genus species name',
    'family_taxon_id',
    'family_taxon_name',
    'order_taxon_id',
    'order_taxon_name',
    'class_taxon_id',
    'class_taxon_name',
    'phylum_taxon_id',
    'phylum_taxon_name',
    'kingdom_taxon_id',
    'kingdom_taxon_name',
]

# Left join from the matched species onto the taxa list, then drop rows that
# are exact duplicates across all columns (the original index labels are kept).
merge_df = pd.merge(
    species_filter_df,
    taxa_df[cols],
    how='left',
    on='genus species name',
).drop_duplicates()

log_df(merge_df)
# 497

(497, 18)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Candeina nitida,422278,Candeina nitida,1053,Candeina,1053,Candeina,genus,422277,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria
1,Dentoglobigerina altispira,402661,Dentoglobigerina altispira,1264,Dentoglobigerina,1264,Dentoglobigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
3,Dentoglobigerina baroemoenensis,422289,Dentoglobigerina baroemoenensis,1264,Dentoglobigerina,1264,Dentoglobigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Globigerina bulloides,113301,Globigerina bulloides,1498,Globigerina,1498,Globigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
6,Globigerina falconensis,388387,Globigerina falconensis,1498,Globigerina,1498,Globigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [47]:
# Rows where the genus reached through the PBDB species lookup differs from
# the genus-rank taxon recorded in the taxa list.
genus_differs = merge_df['tmp_genus_taxon_id'].ne(merge_df['pbdb_taxon_id'])
is_genus_rank = merge_df['pbdb_taxon_rank'].eq('genus')

mismatch_df = merge_df.loc[genus_differs & is_genus_rank]

log_df(mismatch_df)
# 44

(44, 18)


Unnamed: 0,genus species name,species_taxon_id,species_taxon_name,tmp_genus_taxon_id,tmp_genus_taxon_name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
7,Globigerina rubescens,422320,Globigerina rubescens,1529,Globoturborotalita,1498,Globigerina,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476.0,Rhizaria
27,Globigerinoides sacculifer,388389,Globigerinoides sacculifer,422361,Trilobatus,1504,Globigerinoides,genus,82191,Globigerinidae,,,,,288974,Foraminifera,212476.0,Rhizaria
39,Globorotalia humerosa,402487,Globorotalia humerosa,1917,Neogloboquadrina,1521,Globorotalia,genus,82192,Globorotaliidae,,,,,288974,Foraminifera,212476.0,Rhizaria
100,Anomalina praeacuta,382642,Anomalina praeacuta,859,Anomalinoides,86769,Anomalina,genus,103798,Anomalinidae,279579.0,Rotaliida,428504.0,Globothalamea,288974,Foraminifera,212476.0,Rhizaria
209,Actinocyclus senarius,387049,Actinocyclus senarius,71244,Actinoptychus,82146,Actinocyclus,genus,71207,Hemidiscaceae,426780.0,Coscinodiscales,69587.0,Bacillariophyceae,432613,Ochrophyta,,


In [48]:
# Save the genus mismatches to the draft species folder, without the index.
# NOTE(review): the file name is spelled 'mistmach'; renaming it changes the
# output path, so confirm nothing else reads this file before fixing it.
mismatch_df.to_csv(species_dir/'species_mistmach_genus.csv', index=False)