# Normalize LIMS taxa

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path
import hashlib

import pandas as pd
import numpy as np

from config import CLEAN_DATA_DIR, OUTPUT_DIR

from scripts.normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    get_non_taxa_fields,
    delete_duplicate_colums_with_spaces
)

In [2]:
# Input tables: the PI-normalized taxa list and the unified field-name table.
taxa_list_path = OUTPUT_DIR /'taxa'/'LIMS'/'PI_normalized_taxa_list_with_pbdb_2022-02-22.csv'
normalized_fields_path =  OUTPUT_DIR/'normalized_data'/'eODP_unified_data_structure_2022_02_21.csv'
# Change log with one row per data file; it is read and rewritten by each step below.
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 
# Directory that the `path` values in the change log are joined onto.
clean_data_path = CLEAN_DATA_DIR

## Clean up taxa values

Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [10]:
# Load the per-file change log; its `path` column drives the file loop below.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False


In [12]:
# dtype=str reads every column as text, so no value is converted to a number.
taxa_df = pd.read_csv(taxa_list_path, dtype=str)
taxa_df.head()


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,unranked clade_taxon_id,unranked clade_taxon_name
0,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
1,benthic_forams,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi (group) >50 m,Euuvigerina rodleyi,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
2,benthic_forams,Others,Others,Foraminifera indet.,,not a taxa,Andy,Foraminifera indet.,,,...,,,,,,,212476,Rhizaria,,
3,benthic_forams,Pleurostomellids comment,Pleurostomellids comment,Pleurostomellia indet.,,not a taxa,Andy,Pleurostomellidae indet.,,,...,,,,,288974.0,Foraminifera,212476,Rhizaria,,
4,benthic_forams,Ostracoda spp.,Ostracoda spp.,Ostracoda indet.,Ostracoda spp.,group abundance,"Summer 2020: go in as Ostracoda indet, and the...",Ostracoda indet.,,,...,,,,,18891.0,Arthropoda,325038,Animalia,,


In [13]:
# Distinct verbatim taxon names; used below to recognise taxa columns by header.
taxa_list = {name for name in taxa_df['verbatim_name'].unique()}
len(taxa_list)

4588

In [14]:
def clean_up_taxa_values(file):
    """Strip bracketed text ("code [extra text]" -> "code") from the taxa columns of one CSV.

    Taxa columns are the columns of the file whose header appears in
    ``taxa_list``. The file is rewritten in place only when at least one
    value actually changed.

    Parameters
    ----------
    file : str
        Path of the CSV, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if any taxa value was modified (and the file rewritten),
        False otherwise.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    # Select with a list, not a set: pandas 2.x refuses set indexers. Building
    # the list from content.columns also keeps the file's own column order.
    taxa_cols = [col for col in content.columns if col in taxa_list]
    original = content[taxa_cols]

    # Raw string so that \[ and \] reach the regex engine without Python's
    # invalid-escape warning. `.*` is greedy: everything from the first '['
    # to the last ']' in a value is removed, plus surrounding spaces.
    cleaned = original.replace(to_replace=r' *\[.*\] *', value='', regex=True)

    # get rid of NAs in order to compare two dataframes
    changed = not original.fillna('').equals(cleaned.fillna(''))

    if changed:
        content[taxa_cols] = cleaned
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# Run the cleanup on every file in the change log; yields one True/False per
# file, in the same order as the metadata rows.
change_columns = [clean_up_taxa_values(file) for file in metadata['path']]



### Update metadata

In [15]:
# Record, per file, whether this step changed it. The mapping is keyed by the
# step name; the variable is deliberately not called `dict`, which would
# shadow the built-in for the rest of the kernel session.
taxa_value_changes = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, taxa_value_changes)
new_metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False,True
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False,True
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False,True


In [16]:
# Overwrite the change log with the updated table; index=False leaves the row index out of the file.
new_metadata.to_csv(metadata_file, index=False)

## Clean up non-taxa metadata values

Look for non-taxa metadata columns that have "code [extra text]", and remove "[extra text]".

In [3]:
# Reload the change log from disk so this section starts from the saved state.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False,True,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False,True,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False,True,False,False


In [4]:
# Only two columns of the unified field table are loaded: the normalized field
# name and the 'taxa_317-present' column of matching file headers.
cols = ['normalized', 'taxa_317-present']
# header=5: pandas takes the column names from row 5 (0-indexed) of the file.
non_taxa_df = pd.read_csv(normalized_fields_path, dtype=str, header=5, usecols=cols)
non_taxa_df.head()


Unnamed: 0,normalized,taxa_317-present
0,Sample,Label ID; Sample
1,Expedition,Exp
2,Site,Site
3,Hole,Hole
4,Core,Core


In [5]:
# get_non_taxa_fields is a project helper whose implementation is not shown here.
# Only the keys of its result are used: they are the non-taxa column headers
# that each file's columns are matched against below.
non_taxa_dict = get_non_taxa_fields(non_taxa_df, 'taxa_317-present')
non_taxa_list = set(non_taxa_dict.keys())

In [7]:
# Show the collected headers for a visual check.
non_taxa_list

{'% Planktic Foraminifera within whole sample',
 'A/W',
 'Abundance',
 'Additional zone name',
 'Additional zone name (short)',
 'Age',
 'Aspect comment (etching)',
 'BF Group abundance',
 'BF Preservation',
 'BF comment',
 'BF preservation',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom depth [m]',
 'Bottom offset [cm]',
 'Bottom[cm] [cm]',
 'COMMENTS',
 'Comment',
 'Comment (general)',
 'Comments',
 'Core',
 'Datum age average [Ma]',
 'Datum age maximum (Ma)',
 'Datum age minimum (Ma)',
 'Datum author year',
 'Datum name',
 'Datum name generic',
 'Datum region',
 'Datum type',
 'Diatom abundance',
 'Diatom preservation dissolution',
 'Diatom preservation fragmentation',
 'Diatoms group abundance',
 'Exp',
 'Extra Sample ID Data',
 'Foram abundance',
 'General comment',
 'Genus/species (upper zone)',
 'Genus/species lower zone)',
 'Group Abundance',
 'Group abundance',
 'Group abundance (%)',
 'Group preservation',
 'Hole',
 'Label ID',
 'Large Benthic Forams [

In [20]:
def clean_up_taxa_meta_values(file):
    """Strip bracketed text ("code [extra text]" -> "code") from the non-taxa columns of one CSV.

    Non-taxa columns are the columns of the file whose header appears in
    ``non_taxa_list``. The file is rewritten in place only when at least one
    value actually changed.

    Parameters
    ----------
    file : str
        Path of the CSV, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if any non-taxa value was modified (and the file rewritten),
        False otherwise.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    # Select with a list, not a set: pandas 2.x refuses set indexers. Building
    # the list from content.columns also keeps the file's own column order.
    nontaxa_cols = [col for col in content.columns if col in non_taxa_list]
    original = content[nontaxa_cols]

    # Raw string so that \[ and \] reach the regex engine without Python's
    # invalid-escape warning. `.*` is greedy: everything from the first '['
    # to the last ']' in a value is removed, plus surrounding spaces.
    cleaned = original.replace(to_replace=r' *\[.*\] *', value='', regex=True)

    # get rid of NAs in order to compare two dataframes
    changed = not original.fillna('').equals(cleaned.fillna(''))

    if changed:
        content[nontaxa_cols] = cleaned
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# Run the cleanup on every file in the change log; yields one True/False per
# file, in the same order as the metadata rows.
change_columns = [clean_up_taxa_meta_values(file) for file in metadata['path']]

### Update metadata

In [21]:
# Record, per file, whether this step changed it. The mapping is keyed by the
# step name; the variable is deliberately not called `dict`, which would
# shadow the built-in for the rest of the kernel session.
taxa_metadata_changes = {"clean_up_taxa_metadata_values": change_columns}
new_metadata = update_metadata(metadata, taxa_metadata_changes)
new_metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False,False,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False,True,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False,True,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False,True,False


In [22]:
# Overwrite the change log with the updated table; index=False leaves the row index out of the file.
new_metadata.to_csv(metadata_file, index=False)

## Add eodp_id

In [23]:
# Reload the change log from disk; only its `path` column is used in this section.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False,False,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False,True,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False,True,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False,True,False


In [24]:
def add_eodp_id(file):
    """Add (or overwrite) an ``eodp_id`` column in one CSV and save it in place.

    Rows that are entirely empty are dropped first. Each remaining row gets
    the MD5 hex digest of the relative file path concatenated with the row's
    index label, so ids differ between files and between rows of a file.

    Parameters
    ----------
    file : str
        Path of the CSV, relative to ``clean_data_path``.
    """
    csv_path = f"{clean_data_path}/{file}"
    table = pd.read_csv(csv_path, dtype=str)
    table.dropna(how='all', axis='rows', inplace=True)

    # Index labels are the row positions from the file as read, so dropping
    # empty rows above does not renumber the rows that remain.
    table['eodp_id'] = [
        hashlib.md5(f'{file}{row_label}'.encode()).hexdigest()
        for row_label in table.index
    ]

    table.to_csv(csv_path, index=False)

# add_eodp_id returns nothing, so `res` is a list of None; the loop is run
# only for its side effect of rewriting each file.
res = [add_eodp_id(file) for file in metadata['path']]

## Split Dextral:Sinistral taxa

Split each Dextral:Sinistral column into two columns, one for the dextral value and one for the sinistral value.

In [25]:
# Maps each combined "Dextral:Sinistral _<abbreviated species>_" column header
# to the pair of column names [dextral, sinistral] that replace it. The
# entries are generated from (genus abbreviation, genus, species) triples so
# the two spellings of each species cannot drift apart.
target_taxa = {
    f"Dextral:Sinistral _{abbrev} {species}_": [
        f"{genus} {species} (dextral)",
        f"{genus} {species} (sinistral)",
    ]
    for abbrev, genus, species in [
        ("N.", "Neogloboquadrina", "acostaensis"),
        ("P.", "Pulleniatina", "finalis"),
        ("P.", "Pulleniatina", "obliquiloculata"),
        ("P.", "Pulleniatina", "praecursor"),
        ("P.", "Pulleniatina", "praespectabilis"),
        ("P.", "Pulleniatina", "primalis"),
        ("P.", "Pulleniatina", "spectabilis"),
    ]
}

In [26]:
# Reload the change log from disk so this section starts from the saved state.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False,False,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False,True,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False,True,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False,True,False


In [27]:
def split_dextral_sinistral(file):
    """Replace each combined Dextral:Sinistral column of one CSV with two columns.

    For every column whose header is a key of ``target_taxa``, the values are
    split on ':' and stored under the two column names listed for that key;
    the combined column is then removed and the file is rewritten in place.
    Files without any such column are left untouched.

    Parameters
    ----------
    file : str
        Path of the CSV, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if the file contained at least one target column (and was
        rewritten), False otherwise.
    """
    path = f"{clean_data_path}/{file}"

    # nrows=0 reads only the header, which is enough to decide whether the
    # file needs to be loaded in full.
    header = pd.read_csv(path, dtype=str, nrows=0)

    # Walk the headers in file order rather than through a set: set iteration
    # order for strings can differ between interpreter runs, which would make
    # the order of the appended columns irreproducible.
    file_target_taxa = [col for col in header.columns if col in target_taxa]
    if not file_target_taxa:
        return False

    df = pd.read_csv(path, dtype=str)
    for taxon in file_target_taxa:
        new_columns = target_taxa[taxon]
        split_columns = df[taxon].str.split(':', expand=True)
        if split_columns.shape[1] < 2:
            # Fewer than two resulting columns means no value contained ':',
            # so there is nothing to distribute; both new columns are created
            # empty (this also avoids a length-mismatch error on assignment).
            for col in new_columns:
                df[col] = np.nan
        else:
            df[new_columns] = split_columns

        del df[taxon]

    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)

    return True
            

# Run the split on every file in the change log; yields one True/False per
# file, in the same order as the metadata rows.
change_columns = [split_dextral_sinistral(file) for file in metadata['path']]

In [28]:
# Record, per file, whether this step changed it. The mapping is keyed by the
# step name; the variable is deliberately not called `dict`, which would
# shadow the built-in for the rest of the kernel session.
dextral_sinistral_changes = {"split_dextral_sinistral": change_columns}
new_metadata = update_metadata(metadata, dextral_sinistral_changes)
new_metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False,True,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False,True,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False,True,False,False


In [29]:
# Overwrite the change log with the updated table; index=False leaves the row index out of the file.
new_metadata.to_csv(metadata_file, index=False)