# QA Micropal_headers_PBDB_Taxonomy_notes_taxa_list

Check whether the taxa that were approved by the researchers in the Google Sheet match the taxa in the LIMS taxa_list.csv.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Date stamp embedded in the input file name below
# (presumably the date the sheet was exported -- TODO confirm).
date = '2021-05-05'
input_file = f'raw_data/taxa/Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'

# Taxon groups kept when rows are filtered for the duplicate-name check.
# Rows whose taxon_group is not listed here (the sheet also contains e.g.
# benthic_forams and dinoflagellates) are left out of that check.
taxon_groups = [
    'nannofossils',
    'silicoflagellates',
    'ostracods',
    'ebridians',
    'chrysophyte_cysts',
    'bolboformids',
    'diatoms',
    'planktic_forams',
    'radiolarians'
]

In [3]:
def add_normalized_name_column(df):
    """Add a ``normalized_name`` column built from the taxon-name columns.

    The name is the space-separated concatenation of "Any taxon above genus"
    and the genus / subgenera / species / subspecies modifier and name
    columns, followed by "(<non-taxa descriptor>)" when a descriptor is
    present. Missing parts are skipped, and leading, trailing and repeated
    spaces are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Any taxon above genus", the eight columns listed in
        ``fields`` and "non-taxa descriptor", each holding strings or NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the column is added in place, so callers may
        ignore the return value.
    """
    fields = [
        "genus modifier",
        "genus name",
        "subgenera modifier",
        "subgenera name",
        "species modifier",
        "species name",
        "subspecies modifier",
        "subspecies name",
    ]

    # concatenate taxa fields into a string; NaN parts become "" and only
    # leave extra separators behind, which are cleaned up below
    df["normalized_name"] = df["Any taxon above genus"].str.cat(
        df[fields], sep=" ", na_rep=""
    )

    # add "(descriptor)" if it exists. The leading space keeps the descriptor
    # from being glued to the last name part when "subspecies name" is filled
    # in (without it the result was e.g. "excelsa(sinistral)"); any surplus
    # space is removed by the clean-up below.
    descriptor = np.where(
        df["non-taxa descriptor"].notnull(),
        " (" + df["non-taxa descriptor"] + ")",
        "",
    )
    df["normalized_name"] = df["normalized_name"] + descriptor

    # get rid of extra spaces
    df["normalized_name"] = df["normalized_name"].str.strip()
    df["normalized_name"] = df["normalized_name"].replace(
        to_replace="  +", value=" ", regex=True
    )

    return df

## compare LIMS taxa with approved taxa

In [4]:
# Path to the LIMS taxa list that the approved taxa are compared against.
all_taxa_file = 'cleaned_data/taxa/taxa_list.csv'

In [5]:
# Load the approved-taxa sheet. skiprows = 9 makes pandas ignore the first
# nine lines of the file, so the tenth line is read as the header row.
# NOTE(review): presumably those lines are notes above the table -- confirm
# the offset whenever a new export of the sheet is used.
normalized_df = pd.read_csv(input_file, skiprows = 9)
normalized_df.shape

(4756, 21)

In [6]:
normalized_df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
0,,PROBLEMATIC NAMES,,,,,,,,,...,,,,,,,,,,
1,dinoflagellates,Amorphous organic matter,Amorphous organic matter,,,not a taxa name,Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,


In [7]:
# Load the LIMS taxa list.
taxa_df = pd.read_csv(all_taxa_file)
taxa_df.shape

(4738, 4)

In [8]:
taxa_df.head(2)

Unnamed: 0,verbatim_name,name,taxon_group,genera
0,"""Globigerina"" angulisuturalis _T","""Globigerina"" angulisuturalis _T",planktic_forams,
1,"""Globigerina"" angulisuturalis _T_","""Globigerina"" angulisuturalis _T_",planktic_forams,


In [9]:
# Build sets of verbatim names so the two files can be compared with set
# differences. No rows are filtered first, so section-header rows of the sheet
# (e.g. 'PROBLEMATIC NAMES') and NaN from rows without a verbatim_name are
# members of normalized_names too.
normalized_names = set(normalized_df['verbatim_name'])
taxa_names = set(taxa_df['verbatim_name'])

Get the taxa in the normalized taxa file that aren't in the LIMS taxa file.

In [10]:
len(normalized_names - taxa_names)

3

In [11]:
normalized_names - taxa_names

{'PROBLEMATIC NAMES', 'RESOLVED NAMES BY TAXONOMIC GROUP', nan}

Get the taxa in the LIMS taxa file that aren't in the normalized taxa file.

In [12]:
len(taxa_names - normalized_names)

1

In [13]:
taxa_names - normalized_names

{'Sponge spicules'}

## check for duplicate taxa


In [14]:
# Leave out the first 29 rows (index labels 0-28) of the loaded sheet; drop()
# returns a new frame, so normalized_df itself is not modified.
# NOTE(review): these rows look like the 'PROBLEMATIC NAMES' section at the
# top of the sheet -- confirm the cutoff if the sheet layout changes.
temp_df = normalized_df.drop(index=range(29))
temp_df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank
29,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,miozea,,,,,,,
30,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,rodleyi,,,,,,,


In [15]:
temp_df.shape

(4727, 21)

In [16]:
# Keep only rows whose taxon_group is one of the configured groups. The copy
# lets a column be added to filtered_taxa later without touching temp_df.
filtered_taxa = temp_df.loc[temp_df['taxon_group'].isin(taxon_groups), :].copy()
filtered_taxa.shape

(3142, 21)

In [17]:
# The function adds the normalized_name column to filtered_taxa in place,
# so its return value (the same frame) is not needed here.
add_normalized_name_column(filtered_taxa)
filtered_taxa.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,comments,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,normalized_name
52,planktic_forams,Candeina nitida,Candeina nitida,,,,"in PBDB, so it will be classified correctly; t...",,,Candeina,...,,nitida,,,,,,,,Candeina nitida
53,planktic_forams,Dentoglobigerina altispira,Dentoglobigerina altispira,,,,"in PBDB, so it will be classified correctly; t...",,,Dentoglobigerina,...,,altispira,,,,,,,,Dentoglobigerina altispira


In [18]:
# One row per distinct (taxon_group, normalized_name) pair; the first
# occurrence of each pair is the one that is kept.
unique_taxa_by_group = filtered_taxa.loc[
    :, ['taxon_group', 'normalized_name']
].drop_duplicates()
unique_taxa_by_group.head(2)

Unnamed: 0,taxon_group,normalized_name
52,planktic_forams,Candeina nitida
53,planktic_forams,Dentoglobigerina altispira


In [19]:
# Save the distinct (group, name) pairs for manual inspection. The frame's
# index is written as the first CSV column (to_csv default), and ./tmp must
# already exist because to_csv does not create directories.
unique_taxa_by_group.to_csv('./tmp/unique_taxa_by_group.csv')
unique_taxa_by_group.shape

(2695, 2)

In [20]:
# Keep only the name column (still a one-column DataFrame) so that names can
# be counted regardless of which taxon group they came from.
unique_taxa = unique_taxa_by_group.loc[:, ['normalized_name']]

In [21]:
# Count the rows per normalized name. unique_taxa comes from distinct
# (taxon_group, normalized_name) pairs, so a count above 1 means the same name
# occurs in more than one taxon group.
# to_frame(name='count') labels the column explicitly. Wrapping the counts in
# pd.DataFrame(..., columns=['count']) only works while the Series is unnamed;
# with a named Series it selects by label and yields an all-NaN column, which
# would make the "count > 1" check pass silently.
taxa_count = unique_taxa.groupby('normalized_name').size().to_frame(name='count')
taxa_count.shape

(2695, 1)

In [22]:
# Narrow taxa_count down to names counted more than once; an empty result
# means no normalized name is shared between taxon groups.
taxa_count = taxa_count.loc[taxa_count['count'].gt(1)]
taxa_count.shape


(0, 1)

In [23]:
# Save the names that were counted more than once (only the header line when
# there are none). ./tmp must already exist; to_csv does not create it.
taxa_count.to_csv('./tmp/taxa_count.csv')