# create stats for paper

In [1]:
import sys
sys.path.append('../../')
import glob
import shutil
from pathlib import Path
import os

import numpy as np
import pandas as pd
from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR
import db as db
from scripts.normalize_taxa import add_normalized_name_column

from scripts.normalize_data import (
    check_duplicate_columns
)

In [2]:
clean_data_path = CLEAN_DATA_DIR

metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 
# metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes_4.csv' 


date = '2022-08-08'


taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
taxa_crosswalk_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"

PI_4_file = OUTPUT_DIR /'taxa'/'LIMS'/f'PI_Micropal_CSV_4_normalized_taxa_list_with_pbdb_{date}.csv'
additional_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/'addtional_species.csv'
PI_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'


## LIMS verbatim taxa names 

In [3]:
sql = """
select count(*) as count
from taxa_crosswalk
where taxon_id in (select taxon_id from samples_taxa)
"""

row = db.fetch_one(sql)
print(  row['count'])
# 5280

5279


## LIMS taxa verbatim names grouped by taxon group

In [4]:
sql = """
select count(*) as count, taxon_group 
from taxa_crosswalk
where taxon_id in (select taxon_id from samples_taxa)
group by taxon_group
order by taxon_group;
"""

sum  = 0

rows = db.fetch_all(sql)
for row in rows:
    print(row['taxon_group'],  row['count'])
    sum  += row['count']

benthic_forams 1689
bolboformids 2
chrysophyte_cysts 1
diatoms 717
dinoflagellates 57
ebridians 8
nannofossils 937
ostracods 16
other 15
palynology 143
planktic_forams 1019
radiolarians 642
silicoflagellates 33


In [5]:
sum

5279

##  LIMS taxa names grouped by taxon group

In [6]:
sql = """
select count(*) as count, taxon_group 
from taxa
where id in (select taxon_id from samples_taxa)
group by taxon_group
order by taxon_group;
"""
sum  = 0

rows = db.fetch_all(sql)
for row in rows:
    print(row['taxon_group'],  row['count'])
    sum  += row['count']

benthic_forams 1508
bolboformids 2
chrysophyte_cysts 1
diatoms 652
dinoflagellates 53
ebridians 6
nannofossils 810
ostracods 15
other 11
palynology 133
planktic_forams 844
radiolarians 598
silicoflagellates 23


In [7]:
sum

4656

## LIMS taxa names

if taxa is in multiple taxon group, count each taxa

In [8]:
sql = """
select count(*) as count 
from taxa 
where id in (select taxon_id from samples_taxa);
"""

row = db.fetch_one(sql)
print(  row['count'])

# 4657

4656


if taxa is in multiple taxon group, count as one taxa

In [9]:
sql = """
select count(distinct(name)) as count 
from taxa 
where id in (select taxon_id from samples_taxa);
"""

row = db.fetch_one(sql)
print(  row['count'])

# 4633

4632


## distinct LIMS taxon name above genus

if taxa is in multiple taxon group, count each taxa

In [10]:
sql = """
select count(*) 
from taxa
where id in (select taxon_id from samples_taxa)
and taxon_name_above_genus is not null;
"""

row = db.fetch_one(sql)
print( row['count'])
# 106

106


if taxa is in multiple taxon group, count as one taxa

In [11]:
sql = """
select count(distinct(name))
from taxa
where id in (select taxon_id from samples_taxa)
and taxon_name_above_genus is not null;
"""

row = db.fetch_one(sql)
print( row['count'])
# 89

89


## distinct LIMS genus

if taxa is in multiple taxon group, count each taxa

In [12]:
sql = """
select count(distinct(genus_name || taxon_group))
from taxa
where id in (select taxon_id from samples_taxa)
and genus_name is not null;
"""
row = db.fetch_one(sql)
print( row['count'])
# 1080

1078


if taxa is in multiple taxon group, count as one taxa

In [13]:
sql = """
select count(distinct(genus_name))
from taxa
where id in (select taxon_id from samples_taxa)
and genus_name is not null;
"""
row = db.fetch_one(sql)
print( row['count'])
# 1080

1060


## distinct LIMS species

if taxa is in multiple taxon group, count each taxa

In [14]:
sql = """
select count(distinct(name || taxon_group))
from taxa
where id in (select taxon_id from samples_taxa)
and species_name is not null;
"""
row = db.fetch_one(sql)
print( row['count'])
# 4566

4546


if taxa is in multiple taxon group, count as one taxa

In [15]:
sql = """
select count(distinct(name))
from taxa
where id in (select taxon_id from samples_taxa)
and species_name is not null;
"""
row = db.fetch_one(sql)
print( row['count'])
# 4566

4539


## distinct LIMS subspecies

if taxa is in multiple taxon group, count each taxa

In [16]:
sql = """
select count(distinct(name || taxon_group))
from taxa
where id in (select taxon_id from samples_taxa)
and subspecies_name is not null;
"""
row = db.fetch_one(sql)
print( row['count'])
# 138

138


if taxa is in multiple taxon group, count as one taxa

In [17]:
sql = """
select count(distinct(name))
from taxa
where id in (select taxon_id from samples_taxa)
and subspecies_name is not null;
"""
row = db.fetch_one(sql)
print( row['count'])
# 138

138


## nontaxa values in taxa files sent to PIs

In [18]:
lims_df = pd.read_csv(PI_file, header=9)
lims_df = lims_df.drop(0)
lims_df.dropna(axis=0, how='all', inplace=True)
add_normalized_name_column(lims_df)

lims_df.head(2)

Unnamed: 0,taxon_group,verbatim_name,name,f# of distinct taxonomic names,name comment field,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,...,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,normalized_name
1,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
2,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,288974.0,Foraminifera,212476.0,Rhizaria,Euuvigerina miozea


In [19]:
lims_4_df = pd.read_csv(PI_4_file)
lims_4_df.dropna(axis=0, how='all', inplace=True)
add_normalized_name_column(lims_4_df)

lims_4_df.head(2)

Unnamed: 0,taxon_group,verbatim_name,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,abbrev_genus,normalized_name
0,benthic_forams,Agglutinated Foram. gen. et sp. indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,288974.0,Foraminifera,212476.0,Rhizaria,,,False,Textulariia indet.
1,benthic_forams,Agglutinated indet.,Textulariia indet.,,,Textulariia indet.,,,,,...,,,288974.0,Foraminifera,212476.0,Rhizaria,,,False,Textulariia indet.


In [20]:
add_df = pd.read_csv(additional_taxa_path)
add_df.dropna(axis=0, how='all', inplace=True)

add_df.head(2)

Unnamed: 0,sample_id,file,sample,taxon_group,verbatim_name,code,Name,Comment,Notes (change to Internal only notes?),Any taxon above genus,...,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name,normalized_name,path
0,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Lonchosphaera spicata,,,,,,...,,,,,,,,,Lonchosphaera spicata,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv
1,203528,374_U1525A_radiolarians.csv,374-U1525A-1H-1-IW_MUDLINE,radiolarians,additional species: Poulpus spp.-like,,Poulpus spp.,,,,...,,,,,,,,,Poulpus spp.,LIMS/Micropal_CSV_2/374_U1525A_radiolarians.csv


In [21]:
cols = ['verbatim_name', 'taxon_group', 'normalized_name']

lims_df_2 = lims_df[cols]
lims_4_df_2 = lims_4_df[cols]
add_df_2 = add_df[cols]

combine = pd.concat([lims_df_2, lims_4_df_2, add_df_2])
combine.loc[combine['normalized_name'] == '', 'normalized_name'] = np.nan

combine.shape

(5453, 3)

In [22]:
combine[combine['normalized_name'].isna()].shape

(23, 3)

In [23]:
combine[combine['normalized_name'].isna()]

Unnamed: 0,verbatim_name,taxon_group,normalized_name
1,Pyrite,benthic_forams,
21,fossil,benthic_forams,
22,fossil_group,benthic_forams,
3068,Preservation palynofacies,palynology,
3082,Exotic,palynology,
3196,Organic matter,planktic_forams,
3197,Terrestrial organic matter,planktic_forams,
3213,Pyrite,planktic_forams,
3215,ADDITIONAL SPECIES,planktic_forams,
4125,ADDITIONAL SPECIES,radiolarians,


In [24]:

combine2 = combine[['normalized_name', 'taxon_group']]
combine2 = combine2.dropna(subset=['normalized_name'])

combine2 = combine2.drop_duplicates() 
combine2.head()

Unnamed: 0,normalized_name,taxon_group
2,Euuvigerina miozea,benthic_forams
3,Euuvigerina rodleyi,benthic_forams
4,Foraminifera indet.,benthic_forams
5,Pleurostomellidae indet.,benthic_forams
6,Ostracoda indet.,benthic_forams


In [25]:
combine2.shape

(4658, 2)

In [26]:
sql = """
select name
from taxa 
where id in (select taxon_id from samples_taxa);
"""

names = set()
rows = db.fetch_all(sql)
for row in rows:
    names.add(row['name'])

In [27]:
len(names)

4632

In [28]:
set(combine2['normalized_name']) - names

{'Gephyrocapsa spp. (large)', 'Hemiaulus danicus'}

In [29]:
names - set(combine2['normalized_name']) 

set()

# create taxa summary reports

In [30]:
def create_report_df(rows):
    records = []
    for row in rows:
        data = {}
        for field, value in row.items():
            data[field] = value if value else np.nan
        records.append(data)

    records

    return pd.DataFrame(records)

In [31]:
sql = """
select  
taxa.name, taxa.taxon_group, taxa.pbdb_taxon_id,
taxa_crosswalk.verbatim_name,
taxa_crosswalk.comment
from taxa 
join taxa_crosswalk on taxa.id = taxa_crosswalk.taxon_id
where   taxa.id in (select taxon_id from samples_taxa)
order by taxa.name, taxa.taxon_group, taxa_crosswalk.verbatim_name,  taxa_crosswalk.comment;
"""

rows = db.fetch_all_dict(sql)
len(rows)

5279

In [32]:
df = create_report_df(rows)
df['pbdb_taxon_id'] = df['pbdb_taxon_id'].astype(pd.Int64Dtype())
df.head()

Unnamed: 0,name,taxon_group,pbdb_taxon_id,verbatim_name,comment
0,"""Amorphous organic matter""",dinoflagellates,,Amorphous organic matter,not a taxa name
1,"""Amorphous organic matter""",palynology,,Amorphous organic matter,not a taxa name
2,"""Black phytoclasts""",dinoflagellates,,Black phytoclasts,not a taxa name
3,"""Black phytoclasts""",palynology,,Black phytoclasts,not a taxa name
4,"""Black woody phytoclasts""",palynology,,Black woody phytoclasts,not a taxa name


In [33]:
df.to_csv(OUTPUT_DIR/'taxa'/'LIMS'/'taxa_list_entries_2022-08-08.csv', index=False)

In [34]:
sql = """
select  
taxa.name, taxa.taxon_group, taxa.pbdb_taxon_id,
array_to_string(array_agg(taxa_crosswalk.verbatim_name), '; ') as verbatim_name, 
array_to_string(array_agg(taxa_crosswalk.comment), '; ') as comment
from taxa 
join taxa_crosswalk on taxa.id = taxa_crosswalk.taxon_id
where   taxa.id in (select taxon_id from samples_taxa)
group by taxa.name, taxa.taxon_group , taxa.pbdb_taxon_id
order by taxa.name, taxa.taxon_group,  verbatim_name, comment ;
"""

rows = db.fetch_all_dict(sql)
len(rows)

4656

In [35]:
df = create_report_df(rows)
df['pbdb_taxon_id'] = df['pbdb_taxon_id'].astype(pd.Int64Dtype())
df.head()

Unnamed: 0,name,taxon_group,pbdb_taxon_id,verbatim_name,comment
0,"""Amorphous organic matter""",dinoflagellates,,Amorphous organic matter,not a taxa name
1,"""Amorphous organic matter""",palynology,,Amorphous organic matter,not a taxa name
2,"""Black phytoclasts""",dinoflagellates,,Black phytoclasts,not a taxa name
3,"""Black phytoclasts""",palynology,,Black phytoclasts,not a taxa name
4,"""Black woody phytoclasts""",palynology,,Black woody phytoclasts,not a taxa name


In [36]:
# df.to_csv(OUTPUT_DIR/'taxa'/'LIMS'/'taxa_list_distinct_taxonomic_entries_2022-08-08.csv', index=False)

In [46]:


sql = """
select  
array_to_string(array_agg(taxa.taxon_group), '; ') as taxon_group, 
array_to_string(array_agg(taxa_crosswalk.verbatim_name), '; ') as verbatim_name, 
taxa.name,
array_to_string(array_agg(taxa_crosswalk.comment), '; ') as comment,
taxon_name_above_genus,
genus_modifier, genus_name,
subgenera_modifier, subgenera_name
species_modifier, species_name,
subspecies_modifier, subspecies_name,
non_taxa_descriptor,
array_to_string(array_agg(taxa_crosswalk.comments), '; ') as comments,
taxa.pbdb_taxon_id
from taxa 
join taxa_crosswalk on taxa.id = taxa_crosswalk.taxon_id
where   taxa.id in (select taxon_id from samples_taxa)
group by taxa.name,  taxon_name_above_genus,
genus_modifier, genus_name,
subgenera_modifier, subgenera_name,
species_modifier, species_name,
subspecies_modifier, subspecies_name,
non_taxa_descriptor,  taxa.pbdb_taxon_id
order by taxa.name,   verbatim_name, comment ;
"""

rows = db.fetch_all_dict(sql)
len(rows)

4632

In [47]:
df = create_report_df(rows)
df['pbdb_taxon_id'] = df['pbdb_taxon_id'].astype(pd.Int64Dtype())
df.head()

Unnamed: 0,taxon_group,verbatim_name,name,comment,taxon_name_above_genus,genus_modifier,genus_name,subgenera_modifier,species_modifier,species_name,subspecies_modifier,subspecies_name,non_taxa_descriptor,comments,pbdb_taxon_id
0,palynology; dinoflagellates,Amorphous organic matter; Amorphous organic ma...,"""Amorphous organic matter""",not a taxa name; not a taxa name,"""Amorphous organic matter""",,,,,,,,,,
1,dinoflagellates; palynology,Black phytoclasts; Black phytoclasts,"""Black phytoclasts""",not a taxa name; not a taxa name,"""Black phytoclasts""",,,,,,,,,,
2,palynology,Black woody phytoclasts,"""Black woody phytoclasts""",not a taxa name,"""Black woody phytoclasts""",,,,,,,,,,
3,dinoflagellates; palynology,Brown phytoclasts; Brown phytoclasts,"""Brown phytoclasts""",not a taxa name; not a taxa name,"""Brown phytoclasts""",,,,,,,,,,
4,palynology,Brown woody phytoclasts,"""Brown woody phytoclasts""",not a taxa name,"""Brown woody phytoclasts""",,,,,,,,,,


In [48]:
df.to_csv(OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_list_distinct_taxonomic_entries_across_groups_{date}.csv', index=False)

## taxa where PIs add quotes to name

taxa name with quotes and verbatim name does not have quotes

In [40]:
sql = """
select distinct taxa.name,
taxa.taxon_group,  
array_to_string(array_agg(taxa_crosswalk.verbatim_name), '; ') as verbatim_names
from taxa 
join taxa_crosswalk on taxa.id = taxa_crosswalk.taxon_id
where taxa.name like '"%'
and taxa_crosswalk.verbatim_name  not like '"%'
and taxa.id in (select taxon_id from samples_taxa)
group by taxa.name, taxa.taxon_group
order by taxa.name;
"""

rows = db.fetch_all_dict(sql)
len(rows)

35

In [41]:
df = create_report_df(rows)
df.head()

Unnamed: 0,name,taxon_group,verbatim_names
0,"""Amorphous organic matter""",dinoflagellates,Amorphous organic matter
1,"""Amorphous organic matter""",palynology,Amorphous organic matter
2,"""Black phytoclasts""",dinoflagellates,Black phytoclasts
3,"""Black phytoclasts""",palynology,Black phytoclasts
4,"""Black woody phytoclasts""",palynology,Black woody phytoclasts
