# Normalize LIMS taxa abundance

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path
import hashlib
from pathlib import Path
import datetime
import shutil

import pandas as pd
import numpy as np
import db as db

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    remove_whitespace,
    normalize_abundance_codes,
    normalize_switched_abundance_preservation,
    normalize_abundance_codes_group
)
import scripts.normalize_taxa as nt
from scripts.shared_utils import (
    get_taxa_and_taxon_groups,
    create_df_from_db_rows,
    log_df
)

In [2]:
# Per-file change log for the LIMS micropal files: read at the start of each
# update section and written back with one extra flag column per step.
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 
# metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes_4.csv'

# Alias of CLEAN_DATA_DIR; the cells visible in this notebook use
# CLEAN_DATA_DIR directly.
clean_data_path = CLEAN_DATA_DIR

# Additional-taxa paths; not referenced by the cells visible in this notebook.
# NOTE(review): 'addtional_species.csv' looks misspelled -- confirm against the
# file on disk before renaming.
PI_additional_taxa_path = RAW_DATA_DIR/'PI_processed_files'/'LIMS_Micropal_CSV_4_taxa_ADDTL_TAXA.csv'
additional_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/'addtional_species.csv'
# Excel workbook with the PI-normalized abundance codes (sheet 'Abund.Code.NEW').
normalized_codes_path = RAW_DATA_DIR/'PI_processed_files'/'eODP Preservation Data v2.xlsx'


# Date stamp shared by the crosswalk, taxa list and PI taxa file names below.
date = '2022-08-08'
crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
PI_file = RAW_DATA_DIR/'PI_processed_files'/f'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_{date}.csv'


# Taxa-level codes CSV: written after the rename step and read back before the
# data files are updated. The group-level path is not used by the visible cells.
abundance_codes_taxa_path = OUTPUT_DIR/'normalized_data'/'LIMS'/'abundance_codes_taxa.csv'
abundance_codes_group_path = OUTPUT_DIR/'normalized_data'/'LIMS'/'abundance_codes_groups.csv'

# PI-reviewed group abundance/preservation codes, read by the review and
# update sections.
PI_abundance_codes_group_path = RAW_DATA_DIR/'PI_processed_files'/'group_abundance_preservation.csv'

## data QA 

### check for files with multiple expeditions

In [3]:
# Group samples by data_source_notes and keep only the groups whose samples
# resolve (section -> core -> hole -> site -> expedition) to more than one
# distinct expedition name; returns the names joined with ', ' and their count.
# NOTE(review): presumably data_source_notes identifies the source file -- confirm.
sql = """
select samples.data_source_notes, 
 array_to_string(array_agg(distinct expeditions.name ), ', ') as exp,  
ARRAY_LENGTH( array_agg(distinct expeditions.name ), 1) as count
from  samples 
join sections on sections.id  = samples.section_id 
join cores on cores.id  = sections.core_id
join holes on holes.id  = cores.hole_id
join sites on sites.id = holes.site_id
join expeditions on expeditions.id = sites.expedition_id
group by samples.data_source_notes
having  ARRAY_LENGTH( array_agg(distinct expeditions.name ), 1)  > 1
;
"""
rows = db.fetch_all_dict(sql)


In [4]:
create_df_from_db_rows(rows)

## create normalized abundance values files and save to database

Read the normalized abundance codes provided by the PIs, write them to a CSV file, and save them to the database.

In [5]:
def clean_value(value):
    """Render one cell value as a SQL literal for a string-built INSERT.

    Parameters
    ----------
    value : scalar
        The value to render. Anything ``pd.isna`` reports as missing
        (``None``, ``NaN``, ``pd.NA``) is treated as SQL NULL.

    Returns
    -------
    str
        The bare keyword ``NULL`` for missing values; otherwise the text of
        ``value`` wrapped in single quotes, with every embedded single quote
        doubled so the database stores the text exactly as given.
    """
    if pd.isna(value):
        return 'NULL'

    # Inside a SQL string literal a quote is written as two quotes. Replacing
    # it with '"' kept the statement valid but changed the stored text.
    escaped = str(value).replace("'", "''")
    return f"'{escaped}'"

def create_records(df):
    """Insert one abundance_crosswalk row for every row of ``df`` that has a code.

    Rows whose ``original_abundance`` is missing are skipped. Each remaining
    row becomes its own INSERT statement: the seven data columns are rendered
    with ``clean_value`` and ``created_at`` is the wall-clock time at which
    that row's statement is built. Statements are run one at a time through
    ``db.execute``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns original_abundance, expedition, taxon_group,
        normalized_abundance, definition, notes and type.
    """
    has_code = df["original_abundance"].notna()
    for _, row in df[has_code].iterrows():
        sql = f"""
        INSERT INTO public.abundance_crosswalk(original_abundance, expedition, taxon_group, 
            normalized_abundance, definition, notes, 
            type, created_at)  
        VALUES ({clean_value(row["original_abundance"])}, 
        {clean_value(row["expedition"])},
        {clean_value(row["taxon_group"])},
        {clean_value(row["normalized_abundance"])},
        {clean_value(row["definition"])},
        {clean_value(row["notes"])},
        {clean_value(row["type"])},
        '{datetime.datetime.now()}')
        """

        db.execute(sql)


### taxa codes

In [6]:
# Load the PI-normalized taxa abundance codes from the workbook. Everything is
# read as text so codes such as '2?' and expedition numbers are left as-is.
cols = ['code', 'expedition', 'taxon', 'abundance value', 'abundance unit', 'Notes']
codes_df = pd.read_excel(
    normalized_codes_path,
    sheet_name='Abund.Code.NEW',
    usecols=cols,
    dtype=str,
)

log_df(codes_df)

(1109, 6)


Unnamed: 0,code,expedition,taxon,abundance value,abundance unit,Notes
0,2?,371,nannofossils,2?,"count, undefined suggest questionable",?
1,?,342,nannofossils,?,questionable occurrence,?
2,?,371,radiolarians,?,questionable occurrence,?
3,?,318,diatoms,?,questionable occurrence,?
4,??,318,diatoms,?,questionable occurrence,?


In [7]:
# Rename the workbook headers to the abundance_crosswalk field names and tag
# every row as a taxa-level code.
column_names = {
    "code": "original_abundance",
    "expedition": "expedition",
    "taxon": "taxon_group",
    "abundance value": "normalized_abundance",
    "abundance unit": "definition",
    "Notes": "notes",
}
codes_df = codes_df.rename(columns=column_names).assign(type='taxa')

log_df(codes_df)

(1109, 7)


Unnamed: 0,original_abundance,expedition,taxon_group,normalized_abundance,definition,notes,type
0,2?,371,nannofossils,2?,"count, undefined suggest questionable",?,taxa
1,?,342,nannofossils,?,questionable occurrence,?,taxa
2,?,371,radiolarians,?,questionable occurrence,?,taxa
3,?,318,diatoms,?,questionable occurrence,?,taxa
4,??,318,diatoms,?,questionable occurrence,?,taxa


In [9]:
# Inserts the taxa codes into abundance_crosswalk via create_records.
# NOTE(review): presumably left commented out so that re-running the notebook
# does not insert the same rows a second time -- confirm.
# create_records(codes_df)

In [10]:
# Save the renamed codes; the update section reads this CSV back.
codes_df.to_csv(abundance_codes_taxa_path, index=False)

## abundance codes QA

In [11]:
# Crosswalk QA: for each (original_abundance, taxon_group, expedition, type)
# return the combinations that map to more than one distinct
# normalized_abundance.
sql = """
select original_abundance, 
array_agg(distinct normalized_abundance) as normalized_abundance, 
taxon_group,
expedition,  
type,
array_length(array_agg(distinct normalized_abundance), 1)
from abundance_crosswalk
group by original_abundance, taxon_group, expedition, type
having array_length(array_agg(distinct normalized_abundance), 1) > 1;
"""

# TODO BUG: 'A' maps to multiple abundances

rows = db.fetch_all_dict(sql)
create_df_from_db_rows(rows)

In [12]:
# Crosswalk QA across taxon groups: group by (original_abundance, expedition,
# type) only, so a code that is normalized differently for different taxon
# groups within one expedition shows up; taxon_group is returned as an array.
sql = """
select original_abundance, 
array_agg(distinct normalized_abundance) as normalized_abundance, 
array_agg(taxon_group) as taxon_group,
expedition, 
type,
array_length(array_agg(distinct normalized_abundance), 1)
from abundance_crosswalk
group by original_abundance, expedition, type
having array_length(array_agg(distinct normalized_abundance), 1) > 1;
"""

rows = db.fetch_all_dict(sql)
create_df_from_db_rows(rows)

Unnamed: 0,original_abundance,normalized_abundance,taxon_group,expedition,type,array_length
0,F,"[F, Freq]","[benthic_forams, planktic_forams, nannofossils]",324,taxa,2
1,F,"[F, Freq]","[dinoflagellates, palynology, planktic_forams,...",374,taxa,2
2,X,"[P, X]","[radiolarians, dinoflagellates, ebridians, pal...",374,taxa,2
3,rw,"[*, rw]","[ebridians, diatoms]",374,taxa,2


In [13]:
# Crosswalk QA across expeditions: group by (original_abundance, taxon_group,
# type) only, so a code that is normalized differently in different
# expeditions for one taxon group shows up; expedition is returned as an array.
sql = """
select original_abundance, 
array_agg(distinct normalized_abundance) as normalized_abundance, 
taxon_group,
array_agg(expedition) as expedition, 
type,
array_length(array_agg(distinct normalized_abundance), 1)
from abundance_crosswalk
group by original_abundance, taxon_group , type
having array_length(array_agg(distinct normalized_abundance), 1) > 1;
"""

rows = db.fetch_all_dict(sql)
create_df_from_db_rows(rows)

Unnamed: 0,original_abundance,normalized_abundance,taxon_group,expedition,type,array_length
0,F,"[F, Freq]",diatoms,"[353, 355, 361, 368, 374, 341, 346, 318, 362, ...",taxa,2
1,F,"[F, Freq]",nannofossils,"[355, 349, 367, 368, 320, 323, 330, 359, 371, ...",taxa,2
2,F,"[F, Freq]",radiolarians,"[349, 351, 320, 321, 323, 355, 362, 344, 346, ...",taxa,2
3,T,"[T, X]",diatoms,"[318, 321]",taxa,2
4,V,"[V, VA]",nannofossils,"[369, 350, 363, 352, 353, 354, 356, 355]",taxa,2
5,X,"[P, X]",planktic_forams,"[375, 361]",taxa,2
6,X,"[P, X]",silicoflagellates,"[374, 318]",taxa,2


## Update taxa abundances in LIMS data files

In [25]:
# Load the per-file change log (all columns as text); one row per data file.
metadata = pd.read_csv(metadata_file, dtype=str)
log_df(metadata)


(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [26]:
# Load the taxa crosswalk as text; its verbatim_name column holds the taxon
# headers exactly as they appear in the data files.
taxa_df = pd.read_csv(crosswalk_file, dtype=str)
log_df(taxa_df)


(5281, 18)


Unnamed: 0,Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,subspecies modifier,subspecies name,non-taxa descriptor,normalized_name,taxon_group,verbatim_name,name comment field,Comment,Notes (change to Internal only notes?),comments,eodp_id
0,,,Euuvigerina,,,,miozea,,,,Euuvigerina miozea,benthic_forams,Euuvigerina miozea (group) >100 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>100 m group,0
1,,,Euuvigerina,,,,rodleyi,,,,Euuvigerina rodleyi,benthic_forams,Euuvigerina rodleyi (group) >50 m,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",>50 m group,1
2,Foraminifera indet.,,,,,,,,,,Foraminifera indet.,benthic_forams,Others,,not a taxa,Andy,other benthic foraminifera,2
3,Pleurostomellidae indet.,,,,,,,,,,Pleurostomellidae indet.,benthic_forams,Pleurostomellids comment,,not a taxa,Andy,,3
4,Ostracoda indet.,,,,,,,,,,Ostracoda indet.,benthic_forams,Ostracoda spp.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",,4


In [27]:
# Build the lookup passed to normalize_abundance_codes below.
# NOTE(review): presumably keyed by verbatim name with the taxon group as
# value -- verify in scripts.shared_utils.
verbatim_names_taxon_groups = get_taxa_and_taxon_groups(taxa_df)
len(verbatim_names_taxon_groups)

5269

In [28]:
# Distinct verbatim names in the crosswalk, used to sanity-check the lookup
# built above (the two counts should agree).
verbatim_names = set(taxa_df['verbatim_name'])
len(verbatim_names)

5269

In [29]:
set(verbatim_names_taxon_groups) - set(verbatim_names)

set()

In [30]:
# Read back the taxa codes CSV, keeping only the columns needed to translate
# an original code into its normalized code; everything stays text.
cols = ['original_abundance', 'expedition', 'taxon_group', 'normalized_abundance']
codes_taxa_df = pd.read_csv(abundance_codes_taxa_path, usecols=cols, dtype=str)

log_df(codes_taxa_df)

(1109, 4)


Unnamed: 0,original_abundance,expedition,taxon_group,normalized_abundance
0,2?,371,nannofossils,2?
1,?,342,nannofossils,?
2,?,371,radiolarians,?
3,?,318,diatoms,?
4,??,318,diatoms,?


In [31]:
set(codes_taxa_df['taxon_group']) - set(metadata['taxon_groups'])

set()

In [32]:
set(metadata['taxon_groups']) - set(codes_taxa_df['taxon_group']) 

{'other'}

In [33]:
# Run the taxa-level abundance normalization over every file listed in the
# metadata. A file is rewritten in place only when the normalizer reports a
# change; change_columns collects that flag per file, in metadata order.
change_columns = []
for _, file_meta in metadata.iterrows():
    rel_path = file_meta['path']
    path = Path(CLEAN_DATA_DIR / rel_path)

    result = normalize_abundance_codes(
        pd.read_csv(path, dtype=str),
        file_meta['taxon_groups'],
        codes_taxa_df,
        verbatim_names_taxon_groups,
        rel_path,
    )

    changed = result['changed']
    if changed:
        # the cleaned-up frame replaces the data file on disk
        csv_cleanup(result['df'], path).to_csv(path, index=False)

    change_columns.append(changed)

multiple expeditions:  LIMS/Micropal_CSV_4/320_U1336B_108_T10_planktic forams.csv ['320' nan]


In [34]:
# Add the per-file "changed" flags from the loop above to the metadata as a
# new update_taxa_abundances column.
# The mapping is deliberately not named `dict`: in a notebook that would
# shadow the builtin for every cell run afterwards in the same kernel.
metadata_changes = {"update_taxa_abundances": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral,update_taxa_abundances
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [None]:
# Overwrite the change log with the version that has the new flag column.
new_metadata.to_csv(metadata_file, index=False)

## create file that lists the group abundance and group preservation columns

redo group abundances and preservation file

In [240]:
nontaxa = {
'342-U1408A-2H-2-W 100/102-FORAM',
'A/W',
'ADDITIONAL SPECIES',
'Additional zone name',
'Additional zone name (short)',
'Age',
'Age:',
'Aspect comment (etching)',
'BF comment',
'Bathymetry',
'Biozone name',
'Biozone name (short)',
'Bottom (cm)',
'Bottom (m CSF-A)',
'Bottom CSF-A (m)',
'Bottom Depth (m)',
'Bottom Depth (m) CSF-A',
'Bottom Depth CSF-A (m)',
'Bottom Depth [CFS m]',
'Bottom Depth [m]',
'Bottom Offset (cm) on Parent Sample',
'Bottom [cm]',
'Bottom depth CSF-B (m)',
'Bottom depth CSF-B (m):',
'Bottom interval (cm)',
'COMMENTS',
'Comment',
'Comment (general)',
'Comments',
'Core',
'Core Type',
'Core Type - Section',
'Core type',
'Core,    section',
'Core, Section',
'Core, Section, Interval',
'Core, Type, Section',
'Core, section',
'Core, section, interval',
'Core, section, interval (cm)',
'Core-Sect',
'Datum age average (Ma)',
'Datum age average [Ma]',
'Datum age maximum [Ma]',
'Datum age minimum [Ma]',
'Datum author year',
'Datum comment',
'Datum group',
'Datum group code',
'Datum name',
'Datum name generic',
'Datum region',
'Datum type',
'Depth (cm)',
'Depth (csf)',
'Depth (m) CSF-A',
'Depth CSF (m)',
'Depth CSF-A (m)',
'Depth Method',
'Depth bottom CSF-A (m)',
'Depth m (m csf)',
'Depth top CSF-A (m)',
'Dextral:Sinistral _N. acostaensis_',
'Dextral:Sinistral _P. finalis_',
'Dextral:Sinistral _P. obliquiloculata_',
'Dextral:Sinistral _P. praecursor_',
'Dextral:Sinistral _P. praespectabilis_',
'Dextral:Sinistral _P. primalis_',
'Dextral:Sinistral _P. spectabilis_',
'Diatom Zone (NPD) in Yanagisawa and Akiba (1998)',
'Diatom Zone (Yanagisawa and Akiba, 1998)',
'Diatoms and siliceous plankton comment',
'Exotic',
'Exp',
'Expedition',
'Expedition ',
'Expedition, site, hole, core, section, interval (cm):',
'Extra Sample ID Data',
'File Data',
'Gen. et sp. indet',
'General comment',
'Genus/species (upper zone)',
'Genus/species lower zone)',
'Half',
'Hole',
'Hole, Core, Section',
'Hole.1',
'IRD',
'Interval (bottom)',
'Interval (top)',
'Interval Bot (cm) on SHLF',
'Interval Top (cm) on SHLF',
'Labl ID',
'Lower boundary age av. [Ma]',
'Marine',
'Martini (1971) Zone',
'Miscellaneous',
'Mixing',
'Nannofossil Zone',
'Nannofossil comment',
'Oberservations',
'Observations',
'Organic matter',
'Original Bottom Depth (m)',
'Original Top Depth (m)',
'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
'PF Zone',
'Piece',
'Pyrite',
'REMARKS',
'Radiolarian zone',
'Radiolarian zone/subzone',
'Remarks',
'Reworking comment (1= <1%, 2= light 1-10%, 3= >10%)',
'Reworking comment (1= <1%, 2=light 1-10%, 3= >10%)',
'Sample',
'Sample comment',
'Sample preparation comment',
'Section',
'Section Half',
'Secton Half',
'Ship File Links',
'Shore File Links',
'Silicoflagellate Zone in Ling (1992)',
'Site',
'Temperature Range',
'Terrestrial organic matter',
'Tintinids',
'Top (cm)',
'Top (m CSF-A)',
'Top CSF-A (m)',
'Top Depth (CSF m)',
'Top Depth (m)',
'Top Depth (m) CSF-A',
'Top Depth CFS (m)',
'Top Depth CSF-A (m)',
'Top Depth [CFS m]',
'Top Depth [CSF m]',
'Top Depth [m]',
'Top Offset (cm) on Parent Sample',
'Top [cm]',
'Top depth CSF (m)',
'Top depth CSF-B (m)',
'Top depth CSF-B (m):',
'Top depth [CSF m]',
'Top interval (cm)',
'Type',
'Type (lower zone)',
'Type (upper zone)',
'Unnamed: 13',
'Unnamed: 14',
'Unnamed: 148',
'Unnamed: 179',
'Unnamed: 21',
'Unnamed: 3',
'Unnamed: 61',
'Unnamed: 81',
'Upper boundary age av. [Ma]',
'Upper boundary age min [Ma]',
'XBroken',
'XCorroded',
'XCrumpled',
'Zone',
'Zone author (year)',
'Zone comment',
'Zone group',
'Zone in Ling (1992)',
'Zone name',
'Zone name (short)',
'Zone name [short]',
'Zone/Subzone',
'bottom (cm)',
'bottom interval (cm)',
'comments',
'constituent',
'core, section',
'depth Bottom (m CSF-A)',
'depth Bottom (m)',
'depth Bottom CSF-A (m)',
'depth CSF-A',
'depth CSF-A (m)',
'depth CSF-A Bottom (m)',
'depth CSF-A Top (m)',
'depth Top (m CSF-A)',
'depth Top (m)',
'depth Top CSF-A (m)',
'dupes and comments',
'eodp_id',
'fossil',
'fossil_group',
'interval (cm)',
'mean depth (mbsf)',
'path',
'pc_fossil_group',
'pc_fossil_name',
'physical_constituent_name',
'section',
'top (cm)',
'top interval (cm)',
'Fragmentation',
'Fragmentation rank [auto-pop]',
'Planktonic Benthic ratio (P:B)',
'Diatom preservation - pyritization2',
'pc_abundance_name_mode',
'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',

}

# Column headers that hold group-level preservation codes. The misspelled
# variants are real headers found in the data files and must stay as-is.
preservation = {
    # generic
    "Preservation",
    "Presevation",
    "preservation",
    "Group preservation",
    "Preservation palynofacies",
    # foraminifera
    "Foraminferal preservation",
    "Foraminiferal preservation",
    "BF Preservation",
    "BF preservation",
    "PF Preservation",
    "PF preservation",
    # diatoms
    "Diatom preservation dissolution",
    "Diatom preservation fragmentation",
}

# Column headers that hold group-level abundance values (codes, counts or
# percentages). Spelling variants are real headers found in the data files.
abundance = {
    # generic
    "Abundance",
    "Abundance (%)",
    "Abundances",
    "Group Abundance",
    "Group abundance",
    "Group abundance (%)",
    # foraminifera
    "Foram abundance",
    "Foraminiferal abundance",
    "Benthic abundance",
    "BF Group abundance",
    "PF Group Abundance",
    "PF group abundance",
    "Planktic foraminiferal %",
    "Planktic foraminiferal (%)",
    "% Planktic Foraminifera within whole sample",
    "Percentage of planktic forams in total foram assemblage [%]",
    "Percentage of benthic forams in total foram assemblage [%]",
    "Large Benthic Forams [%]",
    # siliceous groups
    "Diatom abundance",
    "Diatoms group abundance",
    "Silicoflagellates group abundance",
    "Sillicoflagellate abundance",
    "Total radiolarians",
    "Chrysophyte cyst group abundance",
    "Ebridian group abundance",
    # other groups
    "Nannofossil abundance",
    "Total pollen",
    "Total in situ dinocysts",
    "Pteropod group abundance",
}

# Remaining headers that are reviewed together with the abundance and
# preservation columns but belong to neither set.
other = {
    "Other observations",
    "Other fossil material",
    "Other taxa",
    "No. specimens/tray",
    "count",
    "count_type",
}

In [241]:
nontaxa.intersection(other)

set()

In [248]:
 preservation.union(abundance).union(other)

{'% Planktic Foraminifera within whole sample',
 'Abundance',
 'Abundance (%)',
 'Abundances',
 'BF Group abundance',
 'BF Preservation',
 'BF preservation',
 'Benthic abundance',
 'Chrysophyte cyst group abundance',
 'Diatom abundance',
 'Diatom preservation dissolution',
 'Diatom preservation fragmentation',
 'Diatoms group abundance',
 'Ebridian group abundance',
 'Foram abundance',
 'Foraminferal preservation',
 'Foraminiferal abundance',
 'Foraminiferal preservation',
 'Group Abundance',
 'Group abundance',
 'Group abundance (%)',
 'Group preservation',
 'Large Benthic Forams [%]',
 'Nannofossil abundance',
 'No. specimens/tray',
 'Other fossil material',
 'Other observations',
 'Other taxa',
 'PF Group Abundance',
 'PF Preservation',
 'PF group abundance',
 'PF preservation',
 'Percentage of benthic forams in total foram assemblage [%]',
 'Percentage of planktic forams in total foram assemblage [%]',
 'Planktic foraminiferal %',
 'Planktic foraminiferal (%)',
 'Preservation',
 'P

In [242]:
# Re-read the change log from disk so this section does not depend on the
# metadata frame left over from an earlier section.
metadata = pd.read_csv(metadata_file, dtype=str)
log_df(metadata)

(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [243]:
# Known taxon headers: the verbatim names from the crosswalk.
# Read as text, like the other crosswalk load in this notebook, so that
# numeric-looking names are not type-inferred into numbers and stay
# comparable with the string column headers of the data files.
taxa_df = pd.read_csv(crosswalk_file, dtype=str)
taxa = set(taxa_df['verbatim_name'])

In [244]:
# Collect every non-empty column header that is not already classified as
# non-taxa, a taxon, a preservation column or an "other" column. Whatever is
# left should be covered by the abundance set (checked in the next cell).
known_headers = nontaxa | taxa | preservation | other  # same for every file

my_abundance = set()
for _, row in metadata.iterrows():
    df = pd.read_csv(CLEAN_DATA_DIR / row['path'], dtype=str)
    # Columns without a single value are ignored. Dropping all-empty rows
    # first cannot change which columns are all-empty, so it is not needed.
    df = df.dropna(axis=1, how="all")

    my_abundance.update(set(df.columns) - known_headers)
                      

In [245]:
my_abundance - abundance

set()

In [262]:
# Build one record per non-missing value found in any preservation, abundance
# or "other" column of every data file.
review_headers = preservation.union(abundance).union(other)  # same for every file

rows = []
for _, row in metadata.iterrows():
    df = pd.read_csv(CLEAN_DATA_DIR / row['path'], dtype=str)
    df = df.dropna(axis=0, how="all")
    df = df.dropna(axis=1, how="all")

    review_cols = set(df.columns).intersection(review_headers)
    if not review_cols:
        # nothing to report; 'Exp' is never looked up for such files
        continue

    # The expedition of the first remaining row labels the whole file.
    # Positional lookup on purpose: dropna keeps the original index labels, so
    # label 0 is gone when the first row was empty and df.loc[0, 'Exp'] would
    # raise KeyError.
    # NOTE(review): a file containing several expeditions is still reported
    # under the first one only -- confirm that is intended.
    exp = df['Exp'].iloc[0]

    for col in review_cols:
        for value in df[col].dropna():
            # NOTE(review): the saved outputs and the PI-reviewed file call
            # this last column 'abundance_code', not 'value' -- confirm which
            # name the consumers of the exported CSV expect.
            rows.append({
                'file': row['file'],
                'Exp': exp,
                'original_header': col,
                'taxon_group': row['taxon_groups'],
                'value': value,
            })

                      

In [263]:
# One DataFrame row per collected value.
df = pd.DataFrame(rows)
log_df(df)
# 72176 rows at the last recorded run

(72176, 5)


Unnamed: 0,file,Exp,original_header,taxon_group,abundance_code
0,363-U1482A-Benthic_Forams.csv,363,Preservation,benthic_forams,E [P46]
1,363-U1482A-Benthic_Forams.csv,363,Preservation,benthic_forams,E [P46]
2,363-U1482A-Benthic_Forams.csv,363,Preservation,benthic_forams,E [P46]
3,363-U1482A-Benthic_Forams.csv,363,Preservation,benthic_forams,E [P46]
4,363-U1482A-Benthic_Forams.csv,363,Preservation,benthic_forams,E [P46]


In [266]:
# Collapse to one row per (Exp, original_header, taxon_group, value). The
# file names of all contributing rows are joined with '; ', so a file name is
# repeated once for every row in which the value occurs.
group_df = df.groupby(['Exp', 'original_header', 'taxon_group',  'value'], as_index=False).agg({'file': '; '.join})
log_df(group_df)
# 2085 rows at the last recorded run

(2085, 5)


Unnamed: 0,Exp,original_header,taxon_group,abundance_code,file
0,317,Abundance,diatoms,B,317_U1353_diatoms.csv; 317_U1353_diatoms.csv; ...
1,317,Abundance,diatoms,C,317_U1353_diatoms.csv; 317_U1353_diatoms.csv
2,317,Abundance,diatoms,X,317_U1353_diatoms.csv; 317_U1353_diatoms.csv; ...
3,317,Abundance,nannofossils,A,317_U1351_nannofossils.csv; 317_U1351_nannofos...
4,317,Abundance,nannofossils,B,317_U1351_nannofossils.csv; 317_U1351_nannofos...


In [267]:
# Export the grouped values to the tmp output folder.
group_df.to_csv(OUTPUT_DIR/'tmp'/'group_abundance_preservation.csv', index=False)

## get files that might have abundance and preservation switched

copy files so that PIs can review them

In [14]:
# Re-read the change log; it maps each file name to its path under
# CLEAN_DATA_DIR, which is needed for the copy below.
metadata = pd.read_csv(metadata_file, dtype=str)
log_df(metadata)

(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [15]:
# Load the PI-reviewed group abundance/preservation codes, all as text.
df = pd.read_csv(PI_abundance_codes_group_path, dtype=str)
log_df(df)

(2031, 8)


Unnamed: 0,Exp,original_header,taxon_group,abundance_code,preservation in andance column,harmonized_code,abundance_preservation_definition,file
0,320,% Planktic Foraminifera within whole sample,planktic_forams,0.0,,0.0,% Planktic Foraminifera within whole sample,320_U1331B_Planktic_Forams.csv; 320_U1331B_Pla...
1,321,% Planktic Foraminifera within whole sample,planktic_forams,0.0,,0.0,% Planktic Foraminifera within whole sample,321_U1338A_Planktic_forams.csv; 321_U1338A_Pla...
2,321,% Planktic Foraminifera within whole sample,planktic_forams,0.0,,0.0,% Planktic Foraminifera within whole sample,321_U1337A_Planktic_forams.csv; 321_U1337A_Pla...
3,321,% Planktic Foraminifera within whole sample,planktic_forams,0.5,,0.5,% Planktic Foraminifera within whole sample,321_U1337A_Planktic_forams.csv
4,320,% Planktic Foraminifera within whole sample,planktic_forams,1.0,,1.0,% Planktic Foraminifera within whole sample,320_U1332A_Planktic_Forams.csv


In [16]:
# Keep the rows where the PIs filled in the 'preservation in andance column'
# flag (the header is spelled that way in the PI file).
# NOTE(review): presumably the flag marks abundance columns that actually
# hold preservation codes -- confirm with the PIs' notes.
filter_df = df[df['preservation in andance column'].notna()]
log_df(filter_df)

(48, 8)


Unnamed: 0,Exp,original_header,taxon_group,abundance_code,preservation in andance column,harmonized_code,abundance_preservation_definition,file
206,320,Abundance,radiolarians,G,X,G,"good (majority of specimens complete, with min...",320_U1331C_Radiolarians_3.csv; 320_U1331C_Radi...
207,323,Abundance,benthic_forams,G,X,G,good (only very minor dissolution and no recry...,323_U1344D_benthic_forams.csv; 323_U1343AB_ben...
208,320,Abundance,radiolarians,M,X,M,"moderate (minor but common dissolution, with a...",320_U1331C_Radiolarians_3.csv
209,323,Abundance,benthic_forams,M,X,M,moderate (frequent etching and partial breakag...,323_U1344A_benthic_forams.csv
210,323,Abundance,diatoms,M,X,M,moderate (finely silicified forms are present ...,323_U1343A_diatoms.csv; 323_U1343A_diatoms.csv...


In [17]:
# Unique file names mentioned in the flagged rows; each 'file' cell is a
# '; '-separated list of names.
files = {
    name
    for file_list in filter_df['file']
    for name in file_list.split('; ')
}

len(files)

42

In [18]:
# Copy every flagged data file into one folder that can be handed to the PIs.
temp_path = OUTPUT_DIR / 'tmp' / 'swapped_abundance_preservation'
# exist_ok makes the cell safe to re-run without a separate existence check
# (and without the gap between checking and creating).
os.makedirs(temp_path, exist_ok=True)

# NOTE(review): files are copied under their bare file name, so two flagged
# files with the same name in different source folders would overwrite each
# other here -- confirm file names are unique.
for path in metadata.loc[metadata['file'].isin(files), 'path']:
    shutil.copy(CLEAN_DATA_DIR / path, temp_path)

    

## update group abundances in data files

In [19]:
# Re-read the change log so the group update starts from what is on disk.
metadata = pd.read_csv(metadata_file, dtype=str)
log_df(metadata)

(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [20]:
# Load the PI-reviewed group codes and drop the rows that have no
# abundance_preservation_definition. At the last recorded run the shape was
# the same as the raw file's, i.e. no rows were dropped.
codes_df = pd.read_csv(PI_abundance_codes_group_path, dtype=str)
codes_df = codes_df.dropna(subset=['abundance_preservation_definition'])

log_df(codes_df)

(2031, 8)


Unnamed: 0,Exp,original_header,taxon_group,abundance_code,preservation in andance column,harmonized_code,abundance_preservation_definition,file
0,320,% Planktic Foraminifera within whole sample,planktic_forams,0.0,,0.0,% Planktic Foraminifera within whole sample,320_U1331B_Planktic_Forams.csv; 320_U1331B_Pla...
1,321,% Planktic Foraminifera within whole sample,planktic_forams,0.0,,0.0,% Planktic Foraminifera within whole sample,321_U1338A_Planktic_forams.csv; 321_U1338A_Pla...
2,321,% Planktic Foraminifera within whole sample,planktic_forams,0.0,,0.0,% Planktic Foraminifera within whole sample,321_U1337A_Planktic_forams.csv; 321_U1337A_Pla...
3,321,% Planktic Foraminifera within whole sample,planktic_forams,0.5,,0.5,% Planktic Foraminifera within whole sample,321_U1337A_Planktic_forams.csv
4,320,% Planktic Foraminifera within whole sample,planktic_forams,1.0,,1.0,% Planktic Foraminifera within whole sample,320_U1332A_Planktic_Forams.csv


In [21]:
# Rows where the PIs filled in the 'preservation in andance column' flag
# (header spelled as in the PI file).
switched_df = codes_df[codes_df['preservation in andance column'].notna()]
log_df(switched_df)

(48, 8)


Unnamed: 0,Exp,original_header,taxon_group,abundance_code,preservation in andance column,harmonized_code,abundance_preservation_definition,file
206,320,Abundance,radiolarians,G,X,G,"good (majority of specimens complete, with min...",320_U1331C_Radiolarians_3.csv; 320_U1331C_Radi...
207,323,Abundance,benthic_forams,G,X,G,good (only very minor dissolution and no recry...,323_U1344D_benthic_forams.csv; 323_U1343AB_ben...
208,320,Abundance,radiolarians,M,X,M,"moderate (minor but common dissolution, with a...",320_U1331C_Radiolarians_3.csv
209,323,Abundance,benthic_forams,M,X,M,moderate (frequent etching and partial breakag...,323_U1344A_benthic_forams.csv
210,323,Abundance,diatoms,M,X,M,moderate (finely silicified forms are present ...,323_U1343A_diatoms.csv; 323_U1343A_diatoms.csv...


In [22]:
# Unique file names from the flagged rows; each 'file' cell is a
# '; '-separated list of names. These files take the "switched" code path in
# the update loop.
switched_files = set()
for file in switched_df['file']:
    switched_files.update(file.split('; '))
    
len(switched_files)

42

In [24]:
# Run the group-level abundance/preservation normalization over every file in
# the metadata. Files flagged as switched are normalized against their
# PI-corrected copy; all others use the plain group normalizer. A file is
# rewritten in place only when the normalizer reports a change, and
# change_columns collects that flag per file, in metadata order.
fixed_dir = RAW_DATA_DIR / 'PI_processed_files' / 'swapped_abundance_preservation'

change_columns = []
for _, file_meta in metadata.iterrows():
    file_name = file_meta['file']
    taxon_group = file_meta['taxon_groups']
    path = Path(CLEAN_DATA_DIR / file_meta['path'])
    df = pd.read_csv(path, dtype=str)

    if file_name in switched_files:
        fixed_df = pd.read_csv(Path(fixed_dir / file_name), dtype=str)
        result = normalize_switched_abundance_preservation(
            df, codes_df, taxon_group, fixed_df, file_name
        )
    else:
        result = normalize_abundance_codes_group(df, codes_df, taxon_group, file_name)

    changed = result['changed']
    if changed:
        # the cleaned-up frame replaces the data file on disk
        csv_cleanup(result['df'], path).to_csv(path, index=False)

    change_columns.append(changed)

In [25]:
# Add the per-file "changed" flags from the loop above to the metadata as a
# new update_group_abundances column.
# The mapping is deliberately not named `dict`: in a notebook that would
# shadow the builtin for every cell run afterwards in the same kernel.
metadata_changes = {"update_group_abundances": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral,update_group_abundances
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,True
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,True
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [26]:
# Overwrite the change log with the version that has the new flag column.
new_metadata.to_csv(metadata_file, index=False)