In [15]:
import requests
import sys
import pandas as pd
import time

sys.path.append('../../')

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import scripts.normalize_taxa as nt
from scripts.shared_utils import (
    log_df
)

import scripts.pbdb as pbdb

In [2]:


PBDB_API = "https://paleobiodb.org/data1.2/"
PBDB_TAXA_LIST_NAME = f"{PBDB_API}taxa/list.json?show=class&rel=all_parents&name="


PBDB_TAXA_NAME = f"{PBDB_API}taxa/single.json?vocab=pbdb&name="


date = '2024-01-18'
initial_input_file = RAW_DATA_DIR/'PI_processed_files'/f'NOAA_2_taxa_lists_taxa_list_{date}.csv'


In [3]:
def add_genus_species(taxa_df):

    taxa_df.loc[~taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True).fillna(False), 'genus species']=taxa_df['genus name'] + ' ' + taxa_df['species name']
    taxa_df.loc[taxa_df['species name'].str.contains('spp\.|sp\..*?', regex=True).fillna(False), 'genus species']=taxa_df['genus name'] 
    
    taxa_df.loc[taxa_df['species name'].isna(), 'genus species']=taxa_df['genus name'] 
    taxa_df.loc[taxa_df['Any taxon above genus'].notna(), 'genus species']=''
    taxa_df.loc[taxa_df['Any taxon above genus modifier'].notna(), 'genus species']=''

    

    taxa_df['genus species'] = taxa_df['genus species'].str.strip()

# get taxa ranks 

In [4]:
df = pd.read_csv(initial_input_file)
nt.add_normalized_name_column(df)
add_genus_species(df)

log_df(df)

(8906, 42)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (temp; JAS will remove when finished with dinos),Notes (change to Internal only notes?),Any taxon above genus modifier,Any taxon above genus,genus modifier,genus name,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,genus species
0,dinoflagellates,?Labyrinthodinium sp. 1,? Labyrinthodinium sp. 1,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,?,Labyrinthodinium,...,321578.0,Dinophyceae,,,,,,,? Labyrinthodinium sp. 1,Labyrinthodinium
1,dinoflagellates,?Maduradinium sp.,? Maduradinium sp.,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,?,Maduradinium,...,321578.0,Dinophyceae,,,,,,,? Maduradinium sp.,Maduradinium
2,dinoflagellates,?Pyxidiella sp. 1,? Pyxidiella sp. 1,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,?,Pyxidiella,...,321578.0,Dinophyceae,,,,,,,? Pyxidiella sp. 1,Pyxidiella
3,dinoflagellates,Aandalusiella ivoirensis,Andalusiella ivoirensis,genus missplled,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,,Andalusiella,...,,,,,,,,,Andalusiella ivoirensis,Andalusiella ivoirensis
4,dinoflagellates,Abratopdinium cardioforme,,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,,Abratopdinium,...,,,,,,,,,Abratopdinium cardioforme,Abratopdinium cardioforme


In [199]:
set(df[df['genus species'].isna()]['normalized_name'])

{'', 'Katacycloclypeus spp.', 'bulbosa', 'inornata', 'spp.'}

In [206]:

def get_ranks(name, rank_dict, rank_ids):
    if pd.isna(name):
        return 
    
    url = PBDB_TAXA_LIST_NAME + name
    response = requests.get(url)


    if response.status_code == 200:
        records = response.json()['records']

        for record in records:
            found = False
            rank_id = record['rnk']
            
            if rank_id in rank_ids:
                continue 
            
            if 'gnl' in record:
                if record['gnl'] == record['nam']:
                    rank_dict[rank_id] = 'genus'
                    rank_ids.add(rank_id)
                    found = True

            elif 'fml' in record:
                if record['fml'] == record['nam']:
                    rank_dict[rank_id] = 'family'
                    rank_ids.add(rank_id)  
                    found = True

            elif 'odl' in record:
                if record['odl'] == record['nam']:
                    rank_dict[rank_id] = 'order'
                    rank_ids.add(rank_id)
                    found = True

            elif 'cll' in record:
                if record['cll'] == record['nam']:
                    rank_dict[rank_id] = 'class'
                    rank_ids.add(rank_id)
                    found = True

            elif 'phl' in record:
                if record['phl'] == record['nam']:
                    rank_dict[rank_id] = 'phylum'
                    rank_ids.add(rank_id)
                    found = True

            if not found:
                # print(record)
                url2 = PBDB_TAXA_NAME + record['nam']
                response2 = requests.get(url2)
                if response2.status_code == 200:
                    records2 = response2.json()['records'][0]

                    rank_dict[rank_id] = records2['taxon_rank']
                    rank_ids.add(rank_id)

    return rank_dict

In [207]:
rank_dict = {}
rank_ids = set()

In [211]:
for index, row in df.iterrows():
    # if index < 3500:
    #     continue
    
    if index % 10 == 0:
        time.sleep(0.5)
    if index % 50 == 0:
        print(index, end=' ')
        
    get_ranks(row['genus species'], rank_dict, rank_ids)
    
 

    


3500 3550 3600 3650 3700 3750 3800 3850 3900 3950 4000 4050 4100 4150 4200 4250 4300 4350 4400 4450 4500 4550 4600 4650 4700 4750 4800 4850 4900 4950 5000 5050 5100 5150 5200 5250 5300 5350 5400 5450 5500 5550 5600 5650 5700 5750 5800 5850 5900 5950 6000 6050 6100 6150 6200 6250 6300 6350 6400 6450 6500 6550 6600 6650 6700 6750 6800 6850 6900 6950 7000 7050 7100 7150 7200 7250 7300 7350 7400 7450 7500 7550 7600 7650 7700 7750 7800 7850 7900 7950 8000 8050 8100 8150 8200 8250 8300 8350 8400 8450 8500 8550 8600 8650 8700 8750 8800 8850 8900 

In [212]:
rank_dict

{20: 'phylum',
 19: 'subphylum',
 18: 'superclass',
 17: 'class',
 13: 'order',
 5: 'genus',
 9: 'family',
 3: 'species',
 8: 'subfamily',
 25: 'unranked clade',
 23: 'kingdom',
 21: 'phylum',
 12: 'suborder',
 10: 'superfamily',
 16: 'subclass',
 15: 'infraclass',
 14: 'superorder',
 11: 'infraorder',
 22: 'subkingdom',
 4: 'subgenus',
 2: 'subspecies',
 7: 'tribe'}

{20: 'phylum',
 19: 'subphylum',
 18: 'superclass',
 17: 'class',
 13: 'order',
 5: 'genus',
 9: 'family',
 3: 'species',
 8: 'subfamily',
 25: 'unranked clade',
 23: 'kingdom',
 21: 'phylum',
 12: 'suborder',
 10: 'superfamily',
 16: 'subclass',
 15: 'infraclass',
 14: 'superorder',
 11: 'infraorder',
 22: 'subkingdom',
 4: 'subgenus',
 2: 'subspecies',
 7: 'tribe'}

# get pbdb info

In [5]:
df = pd.read_csv(initial_input_file)

log_df(df)

(8906, 40)


Unnamed: 0,taxon_group,verbatim_name,name,Comment,Notes (temp; JAS will remove when finished with dinos),Notes (change to Internal only notes?),Any taxon above genus modifier,Any taxon above genus,genus modifier,genus name,...,subclass_taxon_id,subclass_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,dinoflagellates,?Labyrinthodinium sp. 1,? Labyrinthodinium sp. 1,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,?,Labyrinthodinium,...,,,321578.0,Dinophyceae,,,,,,
1,dinoflagellates,?Maduradinium sp.,? Maduradinium sp.,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,?,Maduradinium,...,,,321578.0,Dinophyceae,,,,,,
2,dinoflagellates,?Pyxidiella sp. 1,? Pyxidiella sp. 1,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,?,Pyxidiella,...,,,321578.0,Dinophyceae,,,,,,
3,dinoflagellates,Aandalusiella ivoirensis,Andalusiella ivoirensis,genus missplled,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,,Andalusiella,...,,,,,,,,,,
4,dinoflagellates,Abratopdinium cardioforme,,,,"Updated taxon_group from ""Dinoflagellates/Acri...",,,,Abratopdinium,...,,,,,,,,,,


In [18]:
genus_df = df[df['genus name'].notna()].copy()[['taxon_group', 'genus name']]
genus_df.drop_duplicates(inplace=True)
genus_df['check'] = False

log_df(genus_df)

(2067, 3)


Unnamed: 0,taxon_group,genus name,check
0,dinoflagellates,Labyrinthodinium,False
1,dinoflagellates,Maduradinium,False
2,dinoflagellates,Pyxidiella,False
3,dinoflagellates,Andalusiella,False
4,dinoflagellates,Abratopdinium,False


In [19]:

pbdb.fetch_pdbd_data(genus_df, 'genus name')

0 

In [20]:
log_df(genus_df, 20)

(2067, 16)


Unnamed: 0,taxon_group,genus name,check,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,phylum_taxon_id,phylum_taxon_name,class_taxon_id,class_taxon_name,order_taxon_id,order_taxon_name,genus_taxon_id,genus_taxon_name,family_taxon_id,family_taxon_name
0,dinoflagellates,Labyrinthodinium,True,443826.0,Labyrinthodinium,genus,277918.0,Myzozoa,321578.0,Dinophyceae,321606.0,Gonyaulacales,443826.0,Labyrinthodinium,,
1,dinoflagellates,Maduradinium,True,325673.0,Maduradinium,genus,277918.0,Myzozoa,321578.0,Dinophyceae,277919.0,Peridiniales,325673.0,Maduradinium,277915.0,Peridiniaceae
2,dinoflagellates,Pyxidiella,True,336773.0,Pyxidiella,genus,277918.0,Myzozoa,321578.0,Dinophyceae,277919.0,Peridiniales,336773.0,Pyxidiella,277915.0,Peridiniaceae
3,dinoflagellates,Andalusiella,True,276893.0,Andalusiella,genus,277918.0,Myzozoa,321578.0,Dinophyceae,277919.0,Peridiniales,276893.0,Andalusiella,277915.0,Peridiniaceae
4,dinoflagellates,Abratopdinium,True,477210.0,Abratopdinium,genus,277918.0,Myzozoa,321578.0,Dinophyceae,277919.0,Peridiniales,477210.0,Abratopdinium,321581.0,Deflandreaceae
6,dinoflagellates,Acanthaulax,True,276986.0,Acanthaulax,genus,277918.0,Myzozoa,,,,,276986.0,Acanthaulax,,
8,dinoflagellates,Achilleodinium,True,323991.0,Achilleodinium,genus,277918.0,Myzozoa,321578.0,Dinophyceae,321606.0,Gonyaulacales,323991.0,Achilleodinium,321603.0,Gonyaulacaceae
10,dinoflagellates,Achomosphaera,True,277048.0,Achomosphaera,genus,277918.0,Myzozoa,321578.0,Dinophyceae,321606.0,Gonyaulacales,277048.0,Achomosphaera,321603.0,Gonyaulacaceae
35,dinoflagellates,Actiniscus,False,,,,,,,,,,,,,
36,dinoflagellates,Adnatosphaeridium,False,,,,,,,,,,,,,


In [325]:
genus_df.to_csv('tmp/genus.csv', index=False)