# Create Taxa list csv for micropal 4

Create a csv containing taxa names for micropal 4

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path
import time
import requests

import pandas as pd
import numpy as np

from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    extract_taxon_group_from_filename,
    
)
from scripts.normalize_taxa import add_normalized_name_column, taxon_name_parser
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.pbdb import get_parent_taxa, PBDB_TAXA_NAME

In [2]:
# Root that every relative `path` stored in the metadata is resolved against.
base_dir = CLEAN_DATA_DIR

# Inputs: the cleaned Micropal 4 CSVs and the change log that describes them.
micropal_4 = base_dir/'LIMS'/'Micropal_CSV_4'
metadata_path = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes_4.csv'

taxa_output_dir = OUTPUT_DIR/'taxa'
non_taxa_fields_path = taxa_output_dir/'non_taxa_fields.csv'

# Taxa lists that already exist; names found in them are excluded below.
LIMS_taxa_file = taxa_output_dir/'LIMS'/'taxa_crosswalk_2022-02-22.csv'
NOAA_taxa_file = RAW_DATA_DIR/'PI_processed_files'/'NOAA_taxa_lists_taxa_list_2022-02-24.csv'

# Outputs: every draft file written by this notebook carries the same date stamp.
date = '2022-02-24'
draft_dir = taxa_output_dir/'draft'/'LIMS'
taxa_file = draft_dir/f"micropal_4_taxa_{date}.csv"
genus_file = draft_dir/f"micropal_4_genus_{date}.csv"
taxa_pbdb_file = draft_dir/f'micropal_4_taxa_pbdb_{date}.csv'
genus_letter_file = draft_dir/f"micropal_4_genus_letter_{date}.csv"



In [3]:
# Gather every CSV directly inside the Micropal 4 directory (order is not sorted).
clean_csvs = [*micropal_4.glob("*.csv")]

# Preview the first three paths.
clean_csvs[:3]

[PosixPath('../../../output/cleaned_data/LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv'),
 PosixPath('../../../output/cleaned_data/LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv'),
 PosixPath('../../../output/cleaned_data/LIMS/Micropal_CSV_4/372_U1517C_planktic_forams.csv')]

In [4]:
def log_df(df, row_count=5):
    """Print the shape of ``df`` and return its first ``row_count`` rows."""
    dimensions = df.shape
    print(dimensions)
    preview = df.head(row_count)
    return preview


In [5]:
# Aliases for the cleaned-data root and the metadata CSV used by later cells.
clean_data_path, metadata_file = CLEAN_DATA_DIR, metadata_path

# create metadata

get taxon groups from file names and normalize them

In [6]:
# Distinct taxon-group labels exactly as they are spelled in the file names.
raw_taxon_groups = {
    extract_taxon_group_from_filename(csv_path.name) for csv_path in clean_csvs
}

raw_taxon_groups

{'benthic_forams',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'other',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'rads',
 'silicoflagellates'}

In [7]:
# Raw labels that are alternative spellings of a canonical taxon group.
_group_synonyms = {'rads': 'radiolarians'}

# Map every raw file-name label to its canonical taxon group; labels without
# a synonym map to themselves.
taxon_groups = {
    raw_label: _group_synonyms.get(raw_label, raw_label)
    for raw_label in (
        'benthic_forams',
        'diatoms',
        'dinoflagellates',
        'ebridians',
        'nannofossils',
        'ostracods',
        'other',
        'palynology',
        'planktic_forams',
        'radiolarians',
        'rads',
        'silicoflagellates',
    )
}


In [8]:
# Three parallel lists, one entry per cleaned CSV, in the order of clean_csvs.
filenames = [csv_path.name for csv_path in clean_csvs]
relative_paths = [csv_path.relative_to(base_dir) for csv_path in clean_csvs]

# An unknown raw label raises KeyError, so a new spelling must first be added
# to taxon_groups.
file_taxon_groups = [
    taxon_groups[extract_taxon_group_from_filename(name)] for name in filenames
]



In [9]:
# One row per cleaned CSV: file name, path relative to base_dir, taxon group.
# (Named metadata_columns rather than `dict` so the builtin is not shadowed
# for the rest of the session.)
metadata_columns = {
    "file": filenames,
    "path": relative_paths,
    "taxon_group": file_taxon_groups,
}

metadata = pd.DataFrame(metadata_columns)
log_df(metadata)

(137, 3)


Unnamed: 0,file,path,taxon_group
0,320_U1334_benthic_forams.csv,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,benthic_forams
1,361_U1479B_nannofossils.csv,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,nannofossils
2,372_U1517C_planktic_forams.csv,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,planktic_forams
3,317_U1352_planktic_forams.csv,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,planktic_forams
4,323_U1339_palynology.csv,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,palynology


In [10]:
# Write the metadata table to metadata_path without the index column.
metadata.to_csv(metadata_path, index=False)

## Create a csv of all taxa

In [13]:
# Reload the metadata from disk; log_df prints its shape and shows the first rows.
metadata = pd.read_csv(metadata_path)
log_df(metadata)

(137, 9)


Unnamed: 0,file,path,taxon_group,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns
0,320_U1334_benthic_forams.csv,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,benthic_forams,False,False,False,True,False,False
1,361_U1479B_nannofossils.csv,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,nannofossils,True,True,False,True,False,False
2,372_U1517C_planktic_forams.csv,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,planktic_forams,False,False,False,True,False,False
3,317_U1352_planktic_forams.csv,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,planktic_forams,False,False,False,True,False,False
4,323_U1339_palynology.csv,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,palynology,False,False,False,True,False,False


get all columns with data

In [14]:
# Collect the stripped header of every column that holds data in any file.
all_columns = set()
for path in metadata['path']:
    # This one file is read with its second line as the header row.
    header_row = 1 if '317_U1351_planktic_forams.csv' in str(path) else 0
    csv_path = clean_data_path/path
    frame = csv_cleanup(pd.read_csv(csv_path, dtype=str, header=header_row), csv_path)
    # Columns that are empty in every row are not counted.
    populated = frame.dropna(how='all', axis='columns')
    all_columns |= {column.strip() for column in populated.columns}

In [None]:
# Number of distinct column headers found across all files.
len(all_columns)

In [None]:
# List copy of the headers with surrounding whitespace removed.
strip_cols = list(map(str.strip, all_columns))

get processed LIMS taxa

In [None]:
# Every name already present in the LIMS crosswalk, in verbatim or normalized form.
existing_LIMS_taxa = set()

existing_taxa_df = pd.read_csv(LIMS_taxa_file)
for name_column in ('verbatim_name', 'normalized_name'):
    existing_LIMS_taxa.update(existing_taxa_df[name_column])

len(existing_LIMS_taxa)

get NOAA taxa

In [None]:
# Every name already present in the NOAA taxa list: verbatim plus both
# normalized forms (with and without the descriptor).
existing_NOAA_taxa = set()

existing_taxa_df = pd.read_csv(NOAA_taxa_file)
# The return values are ignored; the new columns are read back from
# existing_taxa_df itself.
add_normalized_name_column(existing_taxa_df, include_descriptor=True, col_name="normalized_name_descriptor")
add_normalized_name_column(existing_taxa_df, include_descriptor=False, col_name="normalized_name")

for name_column in ('verbatim_name', 'normalized_name_descriptor', 'normalized_name'):
    existing_NOAA_taxa.update(existing_taxa_df[name_column])

len(existing_NOAA_taxa)

In [None]:
# Column headers that are sample/depth/zone/comment fields rather than taxon
# names. They are subtracted from all_columns below to leave candidate taxa.
#
# NOTE: entries are matched as exact strings. The misspelled variants
# ('Oberservations', 'Presevation', 'Secton Half', 'Foraminferal preservation',
# '[CFS m]', ...) presumably mirror headers as they occur in the data files,
# so do not "correct" them.
# NOTE(review): entries with trailing whitespace ('Abundance ', 'Expedition ')
# can never equal a stripped header; confirm whether they are still needed.
nontaxa = {
 'Abundance',
 'Abundance ',
 'Abundance (%)',
 'Abundances',
 'Age',
 'Age:',
 'Benthic abundance',
 'Biozone name',
 'Biozone name (short)',
 'Bottom (cm)',
 'Bottom (m CSF-A)',
 'Bottom CSF-A (m)',
 'Bottom Depth (m)',
 'Bottom Depth (m) CSF-A',
 'Bottom Depth CSF-A (m)',
 'Bottom Depth [CFS m]',
 'Bottom Depth [m]',
 'Bottom Offset (cm) on Parent Sample',
 'Bottom [cm]',
 'Bottom depth CSF-B (m)',
 'Bottom depth CSF-B (m):',
 'Bottom interval (cm)',
 'COMMENTS',
 'Comments',
 'Core',
 'Core Type',
 'Core Type - Section',
 'Core type',
 'Core,    section',
 'Core, Section',
 'Core, Section, Interval',
 'Core, section',
 'Core, section, interval',
 'Core, section, interval (cm)',
 'Datum age average (Ma)',
 'Datum name',
 'Datum type',
 'Depth (cm)',
 'Depth (csf)',
 'Depth (m) CSF-A',
 'Depth CSF (m)',
 'Depth CSF-A (m)',
 'Depth Method',
 'Depth bottom CSF-A (m)',
 'Depth m (m csf)',
 'Depth top CSF-A (m)',
 'Exp',
 'Expedition',
 'Expedition ',
 'Expedition, site, hole, core, section, interval (cm):',
 'Foraminferal preservation',
 'Foraminiferal abundance',
 'Foraminiferal preservation',
 'Group Abundance',
 'Group abundance',
 'Half',
 'Hole',
 'Hole, Core, Section',
 'Hole.1',
 'IRD',
 'Interval (bottom)',
 'Interval (top)',
 'Interval Top (cm) on SHLF',
 'Interval Bot (cm) on SHLF',   'Miscellaneous',
 'Nannofossil Zone',
 'Nannofossil abundance',
 'Nannofossil comment',
 'Oberservations',
 'Observations',
 'Original Bottom Depth (m)',
 'Original Top Depth (m)',
 'Other fossil material',
 'Other observations',
 'Other taxa',
 'Preservation',
 'Presevation',
 'REMARKS',
 'Remarks',
 'Sample',
 'Section',
 'Section Half',
 'Secton Half',
 'Site',
 'Top (cm)',
 'Top (m CSF-A)',
 'Top CSF-A (m)',
 'Top Depth (CSF m)',
 'Top Depth (m)',
 'Top Depth (m) CSF-A',
 'Top Depth CFS (m)',
 'Top Depth CSF-A (m)',
 'Top Depth [CFS m]',
 'Top Depth [CSF m]',
 'Top Depth [m]',
 'Top Offset (cm) on Parent Sample',
 'Top [cm]',
 'Top depth CSF (m)',
 'Top depth CSF-B (m)',
 'Top depth CSF-B (m):',
 'Top depth [CSF m]',
 'Top interval (cm)',
 'Total pollen',
 'Total radiolarians',
 'Type',
 # 'Unnamed: N' entries are presumably pandas' placeholder names for blank headers.
 'Unnamed: 148',
 'Unnamed: 21',
 'Unnamed: 3',
 'Unnamed: 61',
 'Unnamed: 81',
 'Zone',
 'Zone name (short)',
 'Zone/Subzone',
 'bottom (cm)',
 'bottom interval (cm)',
 'comments',
 'core, section',
 'depth Bottom (m CSF-A)',
 'depth Bottom (m)',
 'depth Bottom CSF-A (m)',
 'depth CSF-A',
 'depth CSF-A (m)',
 'depth CSF-A Bottom (m)',
 'depth CSF-A Top (m)',
 'depth Top (m CSF-A)',
 'depth Top (m)',
 'depth Top CSF-A (m)',
 'interval (cm)',
 'mean depth (mbsf)',
 'preservation',
 'section',
 'top (cm)',
 'top interval (cm)'
}

# Candidate taxa: every header that is not a known non-taxa field.
all_taxa_names = all_columns  - nontaxa
# New taxa: candidates not already present in the LIMS or NOAA taxa lists.
taxa_names = all_taxa_names - existing_LIMS_taxa - existing_NOAA_taxa

In [None]:
# Counts: candidate taxa, taxa not yet in LIMS/NOAA lists, known non-taxa fields.
print(len(all_taxa_names), len(taxa_names), len(nontaxa))


In [None]:
# Display the new taxon names for a manual check.
taxa_names

### Create csv

Create a taxa list csv that contains all the taxon names and the associated taxon group.

In [None]:
# Pair every new taxon name with the taxon group of each file it appears in.
# Entries are stored as 'name|taxon_group' strings.
taxa_and_group = set()

for relative_path, taxon_group in zip(metadata['path'], metadata['taxon_group']):
    file = clean_data_path/relative_path

    # Choose the header row from THIS file's path. The previous check read
    # `path`, a variable left over from an earlier loop, so every file got the
    # same header setting regardless of which file was being read.
    header = 1 if '317_U1351_planktic_forams.csv' in str(relative_path) else 0

    # nrows=0: only the column names are needed, not the data.
    df = pd.read_csv(file, dtype=str, header=header, nrows=0)

    for col in df.columns:
        # taxa_names was built from stripped headers, so compare (and record)
        # the stripped form; otherwise padded headers are silently dropped.
        taxon_name = col.strip()
        if taxon_name in taxa_names:
            taxa_and_group.add(f'{taxon_name}|{taxon_group}')


len(taxa_and_group)

In [None]:
# Name parts copied into each output row when the parser supplies them.
all_ranks = [
    'genus modifier', 'genus name', 'species modifier', 'species name',
    'subspecies modifier', 'subspecies name', 'non-taxa descriptor'
]

# One dict per (taxon name, taxon group) pair, ready to become a DataFrame row.
taxa_list = []

for taxon in taxa_and_group:
    # Split on the LAST '|' only: the group never contains '|', but a verbatim
    # taxon name might, and a plain split('|') would then fail to unpack.
    taxon_name, taxon_group = taxon.rsplit('|', 1)

    taxon_name_parts = taxon_name_parser(taxon_name)

    data = {
        'taxon_group': taxon_group,
        'verbatim_name': taxon_name,
    }
    # Parts the parser did not return are left out; pandas fills them with NaN.
    data.update(
        {rank: taxon_name_parts[rank] for rank in all_ranks if rank in taxon_name_parts}
    )

    taxa_list.append(data)

len(taxa_list)

In [None]:
# Build the taxa table and order it by group, then by verbatim name.
taxa_df = pd.DataFrame(taxa_list).sort_values(['taxon_group', 'verbatim_name'])
taxa_df.head()

In [None]:
# Write the draft taxa list to taxa_file without the index column.
taxa_df.to_csv(taxa_file, index=False)

# add pbdb data 

In [None]:
# Reload the draft taxa list with every column read as a string.
taxa_df = pd.read_csv(taxa_file, dtype=str)
log_df(taxa_df)

In [None]:
# One row per distinct genus so each genus is looked up in PBDB only once.
# The frame must be called genus_df: every later cell reads that name (it was
# previously bound to `genus_dft`, leaving genus_df undefined on a fresh run).
# Missing genus names are dropped because there is nothing to look up for them.
genus_df = pd.DataFrame(
    taxa_df['genus name'].dropna().unique(), columns=['genus name']
)
# The lookup loop checks this column to skip genera that already have a match,
# so it has to exist before the first lookup.
genus_df['pbdb_taxon_id'] = None

log_df(genus_df)

In [None]:
# Query PBDB once per genus and record the match (id, name, rank) plus its
# parent taxa on the corresponding genus_df row.
for index, row in genus_df.iterrows():
    # Rows that already have a match are skipped, so the cell can be re-run
    # after an interruption. .get() tolerates a frame without the column yet.
    if pd.notna(row.get('pbdb_taxon_id')):
        continue

    genus_name = row['genus name']
    # A missing genus is NaN (a float) and cannot be appended to the URL.
    if pd.isna(genus_name):
        continue

    # Pause between requests (presumably to avoid hammering the PBDB service).
    time.sleep(0.5)

    # Lightweight progress indicator.
    if index % 50 == 0:
        print(index, end=' ')

    url = PBDB_TAXA_NAME + genus_name

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        continue

    records = response.json()["records"]
    # Only an unambiguous single match is recorded.
    if len(records) != 1:
        continue

    record = records[0]
    genus_df.at[index, 'pbdb_taxon_id'] = str(record["taxon_no"])
    genus_df.at[index, 'pbdb_taxon_name'] = record["taxon_name"]
    genus_df.at[index, 'pbdb_taxon_rank'] = record["taxon_rank"]

    # Named lookup_round rather than `round` so the builtin is not shadowed.
    lookup_round = 0
    get_parent_taxa(genus_df, record["parent_no"], record["taxon_rank"], lookup_round, index, None)

        


        

In [None]:
# Print the shape of the genus table and show its first rows.
log_df(genus_df)

In [None]:
# Write the genus table to genus_file without the index column.
genus_df.to_csv(genus_file, index=False)

## create taxa list with pbdb info for the PIs

In [None]:
# Reload the genus table with every column read as a string.
genus_df = pd.read_csv(genus_file, dtype= str)
log_df(genus_df)

In [None]:
# Read the draft taxa list as strings, matching the other reads of taxa_file
# and the string-typed genus_df it is merged with on 'genus name'. Letting
# pandas infer dtypes could turn an all-empty or numeric-looking column into
# floats.
unapproved_df = pd.read_csv(taxa_file, dtype=str)

log_df(unapproved_df)

In [None]:
# Attach the PBDB columns to every taxon via its genus. The left join keeps
# taxa whose genus has no PBDB row, and '_merge_pbdb' records the match status.
merged_df = unapproved_df.merge(
    genus_df,
    how='left',
    on='genus name',
    indicator='_merge_pbdb',
)

log_df(merged_df)

In [None]:
# Add empty (NaN) columns to the merged table.
for blank_column in (
    'name',
    'Comment',
    'Notes (change to Internal only notes?)',
    'Any taxon above genus',
    'subgenera modifier',
    'subgenera name',
    'comments',
):
    merged_df[blank_column] = np.nan

In [None]:
# Display the current column names of the merged table.
merged_df.columns

In [None]:
# Taxon identification and name-part columns, in output order.
review_columns = [
    'taxon_group', 'verbatim_name', 'name',
    'Comment', 'Notes (change to Internal only notes?)',
    'Any taxon above genus',
    'genus modifier', 'genus name',
    'subgenera modifier', 'subgenera name',
    'species modifier', 'species name',
    'subspecies modifier', 'subspecies name',
    'non-taxa descriptor',
    'comments',
]
# PBDB match for the genus, followed by an id/name pair per parent rank.
pbdb_match_columns = ['pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank']
parent_rank_columns = [
    f'{rank}_taxon_{field}'
    for rank in ('family', 'order', 'class', 'phylum', 'kingdom', 'unranked clade')
    for field in ('id', 'name')
]

# reindex keeps only the listed columns, in this order; a listed column that
# is absent from the frame is created and filled with NaN.
merged_df = merged_df.reindex(
    columns=review_columns + pbdb_match_columns + parent_rank_columns
)

merged_df = merged_df.sort_values(by=['taxon_group', 'verbatim_name'])

log_df(merged_df)

In [None]:
# Shape of the rows that have a PBDB taxon id.
merged_df[merged_df['pbdb_taxon_id'].notna()].shape

In [None]:
# Shape of the rows that have no PBDB taxon id.
merged_df[merged_df['pbdb_taxon_id'].isna()].shape

In [None]:
# Shape of the whole merged table, for comparison with the two counts above.
merged_df.shape

In [None]:
# Write the taxa list with PBDB columns to taxa_pbdb_file without the index column.
merged_df.to_csv(taxa_pbdb_file, index=False)

In [None]:
# Display the path of the file that was just written.
taxa_pbdb_file

# create csv of taxa whose genus is only a letter

In [None]:
# Reload the draft taxa list with every column read as a string.
taxa_df = pd.read_csv(taxa_file, dtype=str)
log_df(taxa_df)

In [None]:
# Select taxa whose genus name ends with '.'.
# na=False: a missing genus name yields NaN from str.endswith, and a boolean
# mask containing NaN cannot be used to index the frame; treat missing as
# "not a match" instead.
is_letter_genus = taxa_df['genus name'].str.endswith('.', na=False)
letter_genus = taxa_df[is_letter_genus]
letter_genus = letter_genus[['taxon_group', 'verbatim_name', 'genus name']]
log_df(letter_genus)

In [None]:
# Write the letter-genus rows to genus_letter_file without the index column.
letter_genus.to_csv(genus_letter_file, index=False)