# QA Micropal_headers_PBDB_Taxonomy_notes_taxa_list

Check whether the taxa that were approved by the researchers in the Google Sheet match the taxa in the LIMS taxa_list.csv.

In [1]:
import sys
import os 

import pandas as pd
import numpy as np

sys.path.append('../scripts/')
import normalize_taxa as nt

In [2]:
# Approved-taxa sheet export; its file name carries the date stamp from normalize_taxa.
approved_taxa_filename = f'Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{nt.date}.csv'
input_file = os.path.join('raw_data', 'taxa', approved_taxa_filename)

# Draft LIMS taxa list that the approved sheet is compared against.
all_taxa_file = os.path.join(
    'cleaned_data', 'taxa', 'draft', 'LIMS', 'taxa_list.csv'
)

## compare LIMS taxa with approved taxa

In [3]:
# Load the approved-taxa sheet export, ignoring its first 9 lines.
# NOTE(review): presumably those lines are free-text notes above the column
# header row -- confirm against the sheet before changing the offset.
normalized_df = pd.read_csv(input_file, skiprows = 9)
normalized_df.shape

(4754, 21)

In [4]:
# Preview the first two rows of the approved-taxa sheet.
normalized_df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [5]:
# Load the LIMS taxa list that the approved sheet is checked against.
taxa_df = pd.read_csv(all_taxa_file)
taxa_df.shape

(4738, 4)

In [6]:
# Preview the first two rows of the LIMS taxa list.
taxa_df.head(2)

Unnamed: 0,verbatim_name,name,taxon_group,genera
0,"""Globigerina"" angulisuturalis _T","""Globigerina"" angulisuturalis _T",planktic_forams,
1,"""Globigerina"" angulisuturalis _T_","""Globigerina"" angulisuturalis _T_",planktic_forams,


In [7]:
# Collect the verbatim names from each source as sets so they can be diffed.
# Missing values are not dropped here, so a NaN verbatim_name becomes a set
# member and can show up in the differences below.
normalized_names = set(normalized_df['verbatim_name'])
taxa_names = set(taxa_df['verbatim_name'])

Get the taxa in the normalized taxa file that aren't in the LIMS taxa file.

In [8]:
# Number of verbatim names in the approved sheet that are absent from the LIMS list.
len(normalized_names - taxa_names)

2

In [9]:
# The names themselves: present in the approved sheet, absent from the LIMS list.
normalized_names - taxa_names

{'RESOLVED NAMES BY TAXONOMIC GROUP', nan}

Get the taxa in the LIMS taxa file that aren't in the normalized taxa file.

In [10]:
# Number of verbatim names in the LIMS list that are absent from the approved sheet.
len(taxa_names - normalized_names)

1

In [11]:
# The names themselves: present in the LIMS list, absent from the approved sheet.
taxa_names - normalized_names

{'Sponge spicules'}

## check for duplicate taxa


In [12]:
# Work on a copy so the duplicate checks below leave normalized_df untouched.
temp_df = normalized_df.copy()
temp_df.head(4)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,RESOLVED NAMES BY TAXONOMIC GROUP,,,,,,,,,...,,,,,,,,,,
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,>100 m group,,,
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,rodleyi,,,,>50 m group,,,


In [13]:
# Row/column count before filtering by taxon group.
temp_df.shape

(4754, 21)

In [19]:
# Keep only rows whose taxon_group is one of the groups listed in normalize_taxa;
# copy so later column additions do not touch temp_df.
has_known_group = temp_df['taxon_group'].isin(nt.taxon_groups)
filtered_taxa = temp_df.loc[has_known_group].copy()
filtered_taxa.shape

(4743, 21)

In [21]:
# Derive a normalized name for each row using the shared normalize_taxa helper.
# NOTE(review): the return value is discarded and the preview shows a new
# 'normalized_name' column, so the helper presumably mutates filtered_taxa
# in place -- confirm in normalize_taxa.py.
nt.add_normalized_name_column(filtered_taxa)
filtered_taxa.head(3)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,miozea,,,,>100 m group,,,,Euuvigerina miozea
3,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,rodleyi,,,,>50 m group,,,,Euuvigerina rodleyi


In [22]:
# Map each normalized name to the set of taxon groups it appears under.
group_taxa = {}

for _, taxon_row in filtered_taxa.iterrows():
    # setdefault creates the empty set the first time a name is seen.
    groups_for_name = group_taxa.setdefault(taxon_row['normalized_name'], set())
    groups_for_name.add(taxon_row['taxon_group'])


In [23]:
# Select the normalized names that occur under more than one taxon group.
# Falsy names (e.g. an empty string) are skipped, as before.
multi_group_items = [
    (name, groups)
    for name, groups in group_taxa.items()
    if len(groups) > 1 and name
]

taxa_tmp = [name for name, _ in multi_group_items]
# Sort the groups before joining: iterating a set of strings has no stable
# order across kernel sessions, which made the exported text vary between runs.
taxon_groups_tmp = [', '.join(sorted(groups)) for _, groups in multi_group_items]

In [24]:
# Assemble the multi-group taxa into a summary frame.
# The mapping is deliberately not named `dict`: that would shadow the builtin
# for every cell executed afterwards in this kernel.
multi_group_columns = {
    "taxon": taxa_tmp,
    "taxon_group": taxon_groups_tmp
}
df = pd.DataFrame(multi_group_columns)
df.tail()

Unnamed: 0,taxon,taxon_group
5,"""Brown phytoclasts""","dinoflagellates, palynology"
6,Brigantedinium spp.,"dinoflagellates, palynology"
7,Lejeunecysta sp.,"dinoflagellates, palynology"
8,Selenopemphix nephroides,"dinoflagellates, palynology"
9,Dinoflagellata indet.,"dinoflagellates, palynology, nannofossils"


In [25]:
# Number of taxa that appear under more than one taxon group (rows).
df.shape

(10, 2)

In [21]:
# Save the multi-group taxa summary for review.
# Create the output directory first so the cell also works on a fresh checkout
# where ./tmp does not exist yet.
taxa_count_file = './tmp/taxa_count.csv'
os.makedirs(os.path.dirname(taxa_count_file), exist_ok=True)
df.to_csv(taxa_count_file, index=False)


(8, 2)