# Normalize Micropal CSVs

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name
)

In [2]:
# Path to the taxa list CSV (not referenced by the cells visible in this notebook).
taxa_list_path = 'cleaned_data/taxa_list.csv'
# Path to the CSV whose 'field' and 'type' columns are read below to
# identify non-taxa fields.
non_taxa_fields_path = 'cleaned_data/non_taxa_fields.csv'

In [3]:
# Each dataset is a directory of data CSVs paired with a metadata CSV.
# One pair is selected in the next cell.
micropal_1 = 'cleaned_data/Micropal_CSV_1'
micropal_meta_1 = 'cleaned_data/metadata/Micropal_1_changes.csv'

micropal_2 = 'cleaned_data/Micropal_CSV_2'
micropal_meta_2 = 'cleaned_data/metadata/Micropal_2_changes.csv'

micropal_3 = 'cleaned_data/Micropal_CSV_3'
micropal_meta_3 = 'cleaned_data/metadata/Micropal_3_changes.csv'

In [4]:
# Select the dataset every later cell operates on. Point both names at the
# matching *_1 / *_2 pair to process a different dataset.
metadata_file = micropal_meta_3
clean_data_path = micropal_3

In [5]:
# Load the per-file metadata; its 'file' column lists the CSVs that the
# cleanup steps below iterate over.
metadata = pd.read_csv(metadata_file)
# Last expression of the cell: displays the first rows in the notebook.
metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,fix_expedition_aw_cols,add_extra_sample_data,clean_up_taxa_metadata_values
0,339_benthic_forams_U1388B_5.csv,benthic_forams,False,False,True,False,False,False,False,False
1,324_U1348A_benthic_forams.csv,benthic_forams,False,True,True,False,True,False,False,True
2,339_planktic_forams_U1387C.csv,planktic_forams,False,False,True,False,False,False,False,False
3,339_benthic_forams_U1390A_6.csv,benthic_forams,False,False,True,False,False,False,False,False
4,341_radiolarians_U1419D.csv,radiolarians,True,False,False,False,True,True,False,True


In [6]:
# Load the non-taxa field definitions and collect the distinct values of
# their 'field' column.
non_taxa_content = pd.read_csv(non_taxa_fields_path)
non_taxa_columns = set(non_taxa_content.loc[:, 'field'])

In [7]:
# Field names whose 'type' is exactly 'taxonomy metadata'.
_is_taxonomy_metadata = non_taxa_content['type'] == 'taxonomy metadata'
taxa_metadata_columns = set(non_taxa_content.loc[_is_taxonomy_metadata, 'field'])

## Clean up taxa values

Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [7]:
def clean_up_taxa_values(file):
    """Strip bracketed text from the taxa columns of one data CSV.

    Reads ``{clean_data_path}/{file}``, removes every ``[...]`` span (and the
    spaces directly around it) from the string values of the taxa columns,
    and rewrites the file only when at least one value changed.

    Args:
        file: Name of the CSV file inside ``clean_data_path``.

    Returns:
        True if any taxa value was modified (the file was rewritten),
        False otherwise (the file is left untouched).
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    taxa_cols = get_taxonomy_columns(content.columns, non_taxa_columns)
    # Keep the pre-replacement values for the comparison below.
    taxa_df = content[taxa_cols]
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences (SyntaxWarning on Python 3.12+). The pattern is unchanged.
    # Note the greedy '.*' removes everything from the first '[' to the
    # last ']' within a value.
    content[taxa_cols] = taxa_df.replace(
        to_replace=r' *\[.*\] *', value='', regex=True
    )

    # get rid of NAs in order to compare two dataframes
    # (NaN != NaN would otherwise be a concern when comparing values).
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        # csv_cleanup is a project helper from normalize_data; its result is
        # what gets written back.
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# Run the cleanup on every file listed in the metadata, collecting one
# changed/unchanged flag per file in metadata order.
change_columns = list(map(clean_up_taxa_values, metadata['file']))

### Update metadata

In [8]:
# Map the metadata column name to the per-file flags computed above.
# Deliberately not named `dict`: that would shadow the builtin for every
# later cell in the kernel session.
metadata_changes = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
# Last expression of the cell: displays the first rows in the notebook.
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,339_benthic_forams_U1388B_5.csv,benthic_forams,False,False,True,False,False
1,324_U1348A_benthic_forams.csv,benthic_forams,False,True,True,False,True
2,339_planktic_forams_U1387C.csv,planktic_forams,False,False,True,False,False
3,339_benthic_forams_U1390A_6.csv,benthic_forams,False,False,True,False,False
4,341_radiolarians_U1419D.csv,radiolarians,True,False,False,False,True


In [9]:
# Overwrite the metadata CSV that was loaded above with the updated flags;
# the DataFrame index is not written.
new_metadata.to_csv(metadata_file, index=False)

## Clean up taxa metadata values

Look for taxa metadata columns that have "code [extra text]", and remove "[extra text]".

In [20]:
def clean_up_taxa_meta_values(file):
    """Strip bracketed text from the taxa metadata columns of one data CSV.

    Reads ``{clean_data_path}/{file}``, removes every ``[...]`` span (and the
    spaces directly around it) from the string values of the columns listed
    in ``taxa_metadata_columns``, and rewrites the file only when at least
    one value changed.

    Args:
        file: Name of the CSV file inside ``clean_data_path``.

    Returns:
        True if any taxa metadata value was modified (the file was
        rewritten), False otherwise (the file is left untouched).
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    # Preserve the file's own column order instead of relying on the
    # arbitrary ordering of a set intersection.
    available_metadata = [
        column for column in content.columns if column in taxa_metadata_columns
    ]
    if not available_metadata:
        # No taxa metadata columns in this file, so nothing can change.
        return False

    # Keep the pre-replacement values for the comparison below.
    taxa_meta_df = content[available_metadata]
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences (SyntaxWarning on Python 3.12+). The pattern is unchanged.
    # Note the greedy '.*' removes everything from the first '[' to the
    # last ']' within a value.
    content[available_metadata] = taxa_meta_df.replace(
        to_replace=r' *\[.*\] *', value='', regex=True
    )

    # get rid of NAs in order to compare two dataframes
    changed = not taxa_meta_df.fillna('').equals(
        content[available_metadata].fillna('')
    )

    if changed:
        # csv_cleanup is a project helper from normalize_data; its result is
        # what gets written back.
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# Run the cleanup on every file listed in the metadata, collecting one
# changed/unchanged flag per file in metadata order.
change_columns = list(map(clean_up_taxa_meta_values, metadata['file']))

### Update metadata

In [21]:
# Map the metadata column name to the per-file flags computed above.
# Deliberately not named `dict`: that would shadow the builtin for every
# later cell in the kernel session.
metadata_changes = {"clean_up_taxa_metadata_values": change_columns}
# NOTE(review): this passes `metadata` as loaded near the top of the
# notebook, not the `new_metadata` produced by the earlier update step.
# Confirm update_metadata does not lose that earlier column update before
# the result is written back to disk.
new_metadata = update_metadata(metadata, metadata_changes)
# Last expression of the cell: displays the first rows in the notebook.
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,fix_expedition_aw_cols,add_extra_sample_data,clean_up_taxa_metadata_values
0,339_benthic_forams_U1388B_5.csv,benthic_forams,False,False,True,False,False,False,False,False
1,324_U1348A_benthic_forams.csv,benthic_forams,False,True,True,False,True,False,False,True
2,339_planktic_forams_U1387C.csv,planktic_forams,False,False,True,False,False,False,False,False
3,339_benthic_forams_U1390A_6.csv,benthic_forams,False,False,True,False,False,False,False,False
4,341_radiolarians_U1419D.csv,radiolarians,True,False,False,False,True,True,False,True


In [22]:
# Overwrite the metadata CSV that was loaded above with the updated flags;
# the DataFrame index is not written.
new_metadata.to_csv(metadata_file, index=False)