# Normalize LIMS taxa

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path
import hashlib
from pathlib import Path
import datetime

import pandas as pd
import numpy as np
import db as db

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    get_non_taxa_fields,
    remove_whitespace,
    normalize_abundance_codes
)
import scripts.normalize_taxa as nt
from scripts.shared_utils import (
    get_taxa_and_taxon_groups
)


In [2]:
# Per-file change log for the LIMS micropal CSVs; it is read later with
# pd.read_csv and extended with an "update_taxa_abundances" column.
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 
# metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes_4.csv'

# NOTE(review): the f-prefix below has no placeholders, so this is a plain string.
normalized_fields_path =  OUTPUT_DIR/'normalized_data'/f'eODP_unified_data_structure_2022_02_21.csv'


# Alias for the cleaned-data root (cells in this section use CLEAN_DATA_DIR directly).
clean_data_path = CLEAN_DATA_DIR

# PI-provided inputs. normalized_codes_path is the Excel workbook whose
# 'Abund.Code.NEW' and 'Group Abund Code' sheets are loaded below.
# NOTE(review): 'addtional_species.csv' is spelled this way in the path — presumably
# it matches the file on disk; verify before correcting.
PI_additional_taxa_path = RAW_DATA_DIR/'PI_processed_files'/'LIMS_Micropal_CSV_4_taxa_ADDTL_TAXA.csv'
additional_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/'addtional_species.csv'
normalized_codes_path = RAW_DATA_DIR/'PI_processed_files'/'eODP Preservation Data v2.xlsx'


# Date stamp embedded in the names of the taxa crosswalk / taxa list / PI files.
date = '2022-08-08'

crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
PI_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'



In [3]:
def log_df(df, row_count=5):
    """Print the frame's shape, then hand back its first `row_count` rows.

    Args:
        df: DataFrame to inspect.
        row_count: number of leading rows to return (default 5).

    Returns:
        The result of `df.head(row_count)`.
    """
    dimensions = df.shape
    print(dimensions)
    preview = df.head(row_count)
    return preview


## data QA 

In [4]:
def create_report_df(rows):
    """Build a report DataFrame from an iterable of row mappings.

    Each row is a mapping of field name -> value (e.g. the dict rows returned
    by a database query). Missing values are normalized to NaN so the report
    shows them uniformly.

    A value counts as missing when it is None or an empty string/container.
    Legitimate falsy values such as 0 and False are preserved; a plain
    truthiness check would replace a real count of 0 with NaN.

    Args:
        rows: iterable of mappings; may be empty.

    Returns:
        pandas.DataFrame with one row per input mapping.
    """
    def _blank_to_nan(value):
        # None and empty text/containers mean "no data"; everything else is kept.
        if value is None:
            return np.nan
        if isinstance(value, (str, bytes, list, tuple, dict, set)) and len(value) == 0:
            return np.nan
        return value

    records = [
        {field: _blank_to_nan(value) for field, value in row.items()}
        for row in rows
    ]

    return pd.DataFrame(records)

### files with multiple expeditions

In [5]:
# QA: find source files (samples.data_source_notes) whose samples resolve to more
# than one expedition. For each such file the query returns the distinct
# expedition names joined with ', ' and how many there are. Samples are linked
# to expeditions through sections -> cores -> holes -> sites.
sql = """
select samples.data_source_notes, 
 array_to_string(array_agg(distinct expeditions.name ), ', ') as exp,  
ARRAY_LENGTH( array_agg(distinct expeditions.name ), 1) as count
from  samples 
join sections on sections.id  = samples.section_id 
join cores on cores.id  = sections.core_id
join holes on holes.id  = cores.hole_id
join sites on sites.id = holes.site_id
join expeditions on expeditions.id = sites.expedition_id
group by samples.data_source_notes
having  ARRAY_LENGTH( array_agg(distinct expeditions.name ), 1)  > 1
;
"""
# `rows` is consumed by create_report_df in the next cell.
rows = db.fetch_all_dict(sql)


In [6]:
# Show the files that map to more than one expedition as a table.
create_report_df(rows)

Unnamed: 0,data_source_notes,exp,count
0,LIMS/Micropal_CSV_3/341_radiolarians_U1417A.csv,"341, 342, 343",3
1,LIMS/Micropal_CSV_4/323_U1340A_benthic_forams.csv,"323, 324, 325, 326, 327, 328, 329, 330, 331, 3...",68


## create normalized abundance values files and save to database

In [7]:
def clean_value(value):
    """Render a scalar as a SQL literal for the hand-built INSERT statements.

    Missing values (None / NaN, as detected by pandas.isna) become the bare
    keyword NULL. Anything else is converted to text and wrapped in single
    quotes, with embedded single quotes doubled ('' is the standard SQL
    escape). Doubling keeps the stored text identical to the input; rewriting
    "'" as '"' would silently alter the data.

    NOTE(review): if db.execute supports bound parameters, passing values that
    way would be preferable to building SQL text — confirm against the db module.

    Args:
        value: scalar to render; non-string scalars are converted with str().

    Returns:
        str: 'NULL' or a single-quoted, escaped SQL string literal.
    """
    if pd.isna(value):
        return 'NULL'

    escaped = str(value).replace("'", "''")
    return f"'{escaped}'"

def create_records(df):
    """Insert the rows of `df` into public.abundance_crosswalk, one INSERT per row.

    Rows whose "original_abundance" is missing are skipped. Every other row is
    rendered into an INSERT statement (values quoted by clean_value, created_at
    set to the time the statement is built) and executed through db.execute.

    Args:
        df: DataFrame with the columns original_abundance, expedition,
            taxon_group, normalized_abundance, definition, notes and type.
    """
    for _position, row in df.iterrows():
        original_code = row["original_abundance"]
        if pd.isna(original_code):
            # No source code to map from, so there is nothing to record.
            continue

        sql = f"""
        INSERT INTO public.abundance_crosswalk(original_abundance, expedition, taxon_group, 
            normalized_abundance, definition, notes, 
            type, created_at)  
        VALUES ({clean_value(row["original_abundance"])}, 
        {clean_value(row["expedition"])},
        {clean_value(row["taxon_group"])},
        {clean_value(row["normalized_abundance"])},
        {clean_value(row["definition"])},
        {clean_value(row["notes"])},
        {clean_value(row["type"])},
        '{datetime.datetime.now()}')
        """

        db.execute(sql)


### taxa codes

In [8]:
# Preview the full 'Abund.Code.NEW' sheet (all columns, inferred dtypes) to see
# which columns are worth keeping.
df = pd.read_excel(normalized_codes_path, sheet_name='Abund.Code.NEW')
log_df(df)

(1109, 7)


Unnamed: 0,code,expedition,taxon,abundance value,abundance unit,Notes,"The PBDB has freeform text fields for ""abundance"":"
0,2?,371,nannofossils,2?,"count, undefined suggest questionable",?,
1,?,342,nannofossils,?,questionable occurrence,?,
2,?,371,radiolarians,?,questionable occurrence,?,
3,?,318,diatoms,?,questionable occurrence,?,
4,??,318,diatoms,?,questionable occurrence,?,


In [9]:
# Reload the sheet keeping only the crosswalk columns; dtype=str keeps every
# cell as text rather than letting pandas infer numeric types.
cols = ['code', 'expedition', 'taxon', 'abundance value', 'abundance unit', 'Notes']
codes_df = pd.read_excel(normalized_codes_path, sheet_name='Abund.Code.NEW', usecols=cols, dtype=str)

log_df(codes_df)

(1109, 6)


Unnamed: 0,code,expedition,taxon,abundance value,abundance unit,Notes
0,2?,371,nannofossils,2?,"count, undefined suggest questionable",?
1,?,342,nannofossils,?,questionable occurrence,?
2,?,371,radiolarians,?,questionable occurrence,?
3,?,318,diatoms,?,questionable occurrence,?
4,??,318,diatoms,?,questionable occurrence,?


In [10]:
# Spreadsheet header -> abundance_crosswalk field name.
data = {
    "code": "original_abundance",
    "expedition": "expedition",
    "taxon": "taxon_group",
    "abundance value": "normalized_abundance",
    "abundance unit": "definition",
    "Notes": "notes",
}

# Rename the columns and tag every row as a taxa-level abundance code.
codes_df = codes_df.rename(columns=data).assign(type='taxa')

log_df(codes_df)

(1109, 7)


Unnamed: 0,original_abundance,expedition,taxon_group,normalized_abundance,definition,notes,type
0,2?,371,nannofossils,2?,"count, undefined suggest questionable",?,taxa
1,?,342,nannofossils,?,questionable occurrence,?,taxa
2,?,371,radiolarians,?,questionable occurrence,?,taxa
3,?,318,diatoms,?,questionable occurrence,?,taxa
4,??,318,diatoms,?,questionable occurrence,?,taxa


In [11]:
# Distinct taxon groups present in the taxa-level codes, alphabetically.
groups = sorted(codes_df['taxon_group'].unique())
groups

['benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [12]:
# create_records(codes_df)

In [13]:
# Persist the taxa-level abundance codes; this CSV is read back later when the
# per-file abundance values are normalized.
codes_df.to_csv(OUTPUT_DIR/'normalized_data'/'abundance_codes_taxa.csv', index=False)

### group codes

In [14]:
# Preview the full 'Group Abund Code' sheet (all columns, as text) before
# selecting the columns to keep.
df = pd.read_excel(normalized_codes_path, sheet_name='Group Abund Code', dtype=str)
log_df(df)

(770, 10)


Unnamed: 0,Exp,Group (from pres),Group Code,Code,Definition,"abundant', 'common', 'few', 'rare'",This is a rough placeholder for the Group (using Pres info),"Extracted from heterogeneous ""group abundance""-related Field at collection level",Unnamed: 8,"This would go in PBDB.collections.abund_in_sediment, which is standardized:"
0,323,Preservation (diatoms),20.8,,,,Depth/age in wrong spot,,,
1,323,Preservation (diatoms),30.48,,,,Depth/age in wrong spot,,,
2,382,(benthic foram),B,B,barren,,,,,
3,383,(benthic foram),B,B,barren,,,,,
4,327,Preservation (benthic foram),F,,,,I think there's something wrong w/ the Exp num...,,,


In [15]:
# Reload the sheet keeping only the columns that feed the crosswalk. The long
# 'This is a rough placeholder…' header is the sheet's free-text notes column
# (it is renamed to "notes" in the next cell).
cols = ['Exp', 'Group (from pres)', 'Group Code', 'Code', 'Definition', 
        'This is a rough placeholder for the Group (using Pres info)']
codes_df = pd.read_excel(normalized_codes_path, sheet_name='Group Abund Code', usecols=cols, dtype=str)

log_df(codes_df)

(770, 6)


Unnamed: 0,Exp,Group (from pres),Group Code,Code,Definition,This is a rough placeholder for the Group (using Pres info)
0,323,Preservation (diatoms),20.8,,,Depth/age in wrong spot
1,323,Preservation (diatoms),30.48,,,Depth/age in wrong spot
2,382,(benthic foram),B,B,barren,
3,383,(benthic foram),B,B,barren,
4,327,Preservation (benthic foram),F,,,I think there's something wrong w/ the Exp num...


In [16]:
# Spreadsheet header -> abundance_crosswalk field name for the group-level codes.
data = {
    "Exp": "expedition",
    "Group (from pres)": "taxon_group", 
    "Group Code": "original_abundance", 
    "Code": "normalized_abundance", 
    "Definition": "definition",
    "This is a rough placeholder for the Group (using Pres info)": "notes"
}
codes_df.rename(columns=data, inplace=True)
# NOTE(review): 'taxa' looks like a copy-paste from the taxa-codes section; this
# frame holds group codes, and the label-normalization cell further down
# overwrites the column with 'groups'. Confirm and set 'groups' here directly.
codes_df['type'] = 'taxa'

log_df(codes_df)

(770, 7)


Unnamed: 0,expedition,taxon_group,original_abundance,normalized_abundance,definition,notes,type
0,323,Preservation (diatoms),20.8,,,Depth/age in wrong spot,taxa
1,323,Preservation (diatoms),30.48,,,Depth/age in wrong spot,taxa
2,382,(benthic foram),B,B,barren,,taxa
3,383,(benthic foram),B,B,barren,,taxa
4,327,Preservation (benthic foram),F,,,I think there's something wrong w/ the Exp num...,taxa


In [17]:
# List the raw group labels so each one can be mapped to a canonical taxon group.
codes_df['taxon_group'].unique()

array(['Preservation (diatoms)', '(benthic foram)',
       'Preservation (benthic foram)', 'Preservation (nannos)', '(rads)',
       'Preservation (planktic foram)',
       'Diatom preservation dissolution (diatoms)', '(planktic foram)',
       'BF preservation (benthic foram)',
       'PF preservation (planktic foram)', 'Preservation (rads)',
       'Group preservation (palynology)',
       'PF Preservation (planktic foram)', '(nannos)',
       'Preservation (ebridians)', 'Preservation (silicoflag)',
       '(diatoms)', '(silicoflag)', '(ebridians)', nan, '(palynology)'],
      dtype=object)

In [18]:
# Raw group label from the spreadsheet -> canonical taxon group name.
groups_dict = {
    'Preservation (diatoms)': 'diatoms',
    '(benthic foram)': 'benthic_forams',
    'Preservation (benthic foram)': 'benthic_forams',
    'Preservation (nannos)': 'nannofossils',
    '(rads)': 'radiolarians',
    'Preservation (planktic foram)': 'planktic_forams',
    'Diatom preservation dissolution (diatoms)': 'diatoms',
    '(planktic foram)': 'planktic_forams',
    'BF preservation (benthic foram)': 'benthic_forams',
    'PF preservation (planktic foram)': 'planktic_forams',
    'Preservation (rads)': 'radiolarians',
    'Group preservation (palynology)': 'palynology',
    'PF Preservation (planktic foram)': 'planktic_forams',
    '(nannos)': 'nannofossils',
    'Preservation (ebridians)': 'ebridians',
    'Preservation (silicoflag)': 'silicoflagellates',
    '(diatoms)': 'diatoms',
    '(silicoflag)': 'silicoflagellates',
    '(ebridians)': 'ebridians',
    '(palynology)': 'palynology',
}

# These rows are group-level abundance codes.
codes_df['type'] = 'groups'

# Exact-match substitution; labels not in the mapping (including NaN) are left as-is.
codes_df['taxon_group'] = codes_df['taxon_group'].replace(groups_dict)

In [19]:
# Distinct canonical taxon groups in the group-level codes (missing labels excluded),
# alphabetically.
groups = sorted(codes_df['taxon_group'].dropna().unique())
groups

['benthic_forams',
 'diatoms',
 'ebridians',
 'nannofossils',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates']

In [20]:
# create_records(codes_df)

In [21]:
# Persist the group-level abundance codes alongside the taxa-level file.
codes_df.to_csv(OUTPUT_DIR/'normalized_data'/'abundance_codes_groups.csv', index=False)

## abundance codes QA

In [22]:
# QA: within a single (original_abundance, taxon_group, expedition, type)
# combination there should be exactly one normalized value. This lists the
# combinations that map to more than one distinct normalized_abundance.
sql = """
select original_abundance, 
array_agg(distinct normalized_abundance) as normalized_abundance, 
taxon_group,
expedition,  
type,
array_length(array_agg(distinct normalized_abundance), 1)
from abundance_crosswalk
group by original_abundance, taxon_group, expedition, type
having array_length(array_agg(distinct normalized_abundance), 1) > 1;
"""

# TODO BUG: 'A' maps to multiple abundances

rows = db.fetch_all_dict(sql)
create_report_df(rows)

Unnamed: 0,original_abundance,normalized_abundance,taxon_group,expedition,type,array_length
0,A,"[A, A-D]",palynology,339,groups,2


In [23]:
# QA: same check as above, but ignoring taxon group — codes that map to more
# than one normalized value within an (original_abundance, expedition, type).
# Note that array_agg(taxon_group) is not DISTINCT, so a group name appears once
# per contributing crosswalk row.
sql = """
select original_abundance, 
array_agg(distinct normalized_abundance) as normalized_abundance, 
array_agg(taxon_group) as taxon_group,
expedition, 
type,
array_length(array_agg(distinct normalized_abundance), 1)
from abundance_crosswalk
group by original_abundance, expedition, type
having array_length(array_agg(distinct normalized_abundance), 1) > 1;
"""

rows = db.fetch_all_dict(sql)
create_report_df(rows)

Unnamed: 0,original_abundance,normalized_abundance,taxon_group,expedition,type,array_length
0,A,"[A, A-D]","[benthic_forams, planktic_forams, benthic_fora...",339,groups,2
1,F,"[F, Freq]","[nannofossils, benthic_forams, planktic_forams]",324,taxa,2
2,F,"[F, R]","[radiolarians, benthic_forams, nannofossils]",349,groups,2
3,F,"[F, Freq]","[benthic_forams, diatoms, radiolarians]",374,groups,2
4,F,"[F, Freq]","[planktic_forams, ebridians, diatoms, radiolar...",374,taxa,2
5,P,"[P, None]","[benthic_forams, diatoms, planktic_forams]",323,groups,2
6,P,"[P, None]","[nannofossils, planktic_forams, benthic_forams]",369,groups,2
7,X,"[P, X]","[ebridians, diatoms, radiolarians, dinoflagell...",374,taxa,2
8,rw,"[*, rw]","[diatoms, ebridians]",374,taxa,2


In [24]:
# QA: same check across expeditions — codes that map to more than one
# normalized value within an (original_abundance, taxon_group, type).
# array_agg(expedition) is not DISTINCT, so expeditions repeat once per
# contributing crosswalk row.
sql = """
select original_abundance, 
array_agg(distinct normalized_abundance) as normalized_abundance, 
taxon_group,
array_agg(expedition) as expedition, 
type,
array_length(array_agg(distinct normalized_abundance), 1)
from abundance_crosswalk
group by original_abundance, taxon_group , type
having array_length(array_agg(distinct normalized_abundance), 1) > 1;
"""

rows = db.fetch_all_dict(sql)
create_report_df(rows)

Unnamed: 0,original_abundance,normalized_abundance,taxon_group,expedition,type,array_length
0,A,"[A, None]",benthic_forams,"[330, 375, 351, 374, 342, 371, 339, 339, 341, ...",groups,2
1,A,"[A, A-D]",palynology,"[339, 339]",groups,2
2,F,"[F, R, None]",benthic_forams,"[341, 374, 356, 342, 317, 323, 329, 342, 340, ...",groups,3
3,F,"[F, Freq]",diatoms,"[318, 317, 368, 323, 341, 346, 355, 346, 353, ...",groups,2
4,F,"[F, Freq]",diatoms,"[374, 346, 362, 341, 353, 361, 355, 318, 368, ...",taxa,2
5,F,"[F, Freq]",nannofossils,"[369, 368, 350, 342, 367, 324, 363, 344, 346, ...",groups,2
6,F,"[F, Freq]",nannofossils,"[320, 355, 330, 359, 371, 375, 342, 367, 368, ...",taxa,2
7,F,"[F, Freq]",radiolarians,"[371, 362, 359, 349, 374, 346, 323, 320, 342, ...",groups,2
8,F,"[F, Freq]",radiolarians,"[349, 361, 341, 356, 355, 362, 344, 323, 321, ...",taxa,2
9,P,"[P, None]",benthic_forams,"[339, 375, 341, 374, 360, 371, 371, 369, 356, ...",groups,2


## Update taxa abundances


In [25]:
# Per-file change log: one row per LIMS CSV with its relative path, taxon group
# and one flag column per cleanup step applied so far. Everything is read as text.
metadata = pd.read_csv(metadata_file, dtype=str)
log_df(metadata)


(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [26]:
# Taxa crosswalk: maps each verbatim_name found in the source files to its
# normalized name and taxon group. Everything is read as text.
taxa_df = pd.read_csv(crosswalk_file, dtype=str)
log_df(taxa_df)


(5281, 18)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments,eodp_id
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group,0
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group,1
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera,2
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,,3
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",,4


In [27]:
# Lookup built from the crosswalk by the shared helper; passed to
# normalize_abundance_codes below.
# NOTE(review): presumably keyed by verbatim name with taxon group(s) as values —
# confirm against scripts.shared_utils.get_taxa_and_taxon_groups.
verbatim_names_taxon_groups = get_taxa_and_taxon_groups(taxa_df)
len(verbatim_names_taxon_groups)

5269

In [28]:
# Distinct verbatim names in the crosswalk; the count should match the lookup above.
verbatim_names = set(taxa_df['verbatim_name'])
len(verbatim_names)

5269

In [29]:
# Sanity check: entries of the lookup that are not crosswalk verbatim names.
# An empty set means the helper did not introduce or drop names.
# (assumes iterating the lookup yields verbatim names — TODO confirm)
set(verbatim_names_taxon_groups) - set(verbatim_names)

set()

In [30]:
# Read back the taxa-level abundance codes saved earlier, keeping only the
# columns used to translate original codes into normalized ones.
cols = ['original_abundance', 'expedition', 'taxon_group', 'normalized_abundance']
codes_taxa_df = pd.read_csv(OUTPUT_DIR/'normalized_data'/'abundance_codes_taxa.csv', usecols=cols, dtype=str)

log_df(codes_taxa_df)

(1109, 4)


Unnamed: 0,original_abundance,expedition,taxon_group,normalized_abundance
0,2?,371,nannofossils,2?
1,?,342,nannofossils,?
2,?,371,radiolarians,?
3,?,318,diatoms,?
4,??,318,diatoms,?


In [31]:
# Taxon groups that have abundance codes but no file in the metadata (expect none).
set(codes_taxa_df['taxon_group']) - set(metadata['taxon_groups'])

set()

In [32]:
# Taxon groups that appear in the metadata but have no abundance codes.
set(metadata['taxon_groups']) - set(codes_taxa_df['taxon_group']) 

{'other'}

In [33]:
# For every file listed in the metadata: load the cleaned CSV, normalize its
# abundance codes, and overwrite the file in place when anything changed.
# change_columns collects one "changed" flag per metadata row, in row order,
# so it can be attached to the metadata as a new column afterwards.
change_columns = []
for index, row in metadata.iterrows():
    # row['path'] is relative to CLEAN_DATA_DIR
    path = Path(CLEAN_DATA_DIR/row['path'])
    df = pd.read_csv(path, dtype=str)
    # res is a dict; this cell reads its 'changed' flag and its 'df' frame
    res = normalize_abundance_codes(df, row['taxon_groups'], codes_taxa_df, 
                                    verbatim_names_taxon_groups, row['path']) 
    
    if res['changed']:
        # only rewrite files that were actually modified
        content = csv_cleanup(res['df'], path)
        content.to_csv(path, index=False)
        
    change_columns.append(res['changed'])

multiple expeditions:  LIMS/Micropal_CSV_4/320_U1336B_108_T10_planktic forams.csv ['320' nan]


In [34]:
# Attach the per-file "changed" flags to the metadata as the
# "update_taxa_abundances" column.
# The mapping is passed inline instead of being bound to a variable named
# `dict`: that name would shadow the built-in `dict` type for every cell run
# afterwards in the same kernel.
new_metadata = update_metadata(metadata, {"update_taxa_abundances": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral,update_taxa_abundances
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False,False
