# Normalize Micropal CSVs

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    fetch_unique_column_names,
    append_set,
    normalize_columns,
    add_missing_columns
)

In [2]:
# Lookup tables shared by every dataset: the list of known taxa and the list
# of column names that are NOT taxa.
taxa_list_path = "cleaned_data/taxa/taxa_list.csv"
non_taxa_fields_path = "cleaned_data/taxa/non_taxa_fields.csv"

In [3]:
# Each LIMS micropal export directory is paired with the CSV that logs the
# changes applied to the files in that directory.
micropal_1 = "cleaned_data/Micropal_CSV_1"
micropal_meta_1 = "cleaned_data/metadata/lims_micropal_1_changes.csv"

micropal_2 = "cleaned_data/Micropal_CSV_2"
micropal_meta_2 = "cleaned_data/metadata/lims_micropal_2_changes.csv"

micropal_3 = "cleaned_data/Micropal_CSV_3"
micropal_meta_3 = "cleaned_data/metadata/lims_micropal_3_changes.csv"

micropal_4 = "cleaned_data/Micropal_CSV_revised"
micropal_meta_4 = "cleaned_data/metadata/lims_micropal_revised_changes.csv"

# Every export directory, in numeric order.
all_LIMS = [micropal_1, micropal_2, micropal_3, micropal_4]

In [4]:
# Select the dataset this run operates on: the "revised" export directory and
# its matching change log. Every later cell reads these two names.
clean_data_path, metadata_file = micropal_4, micropal_meta_4

In [5]:
# Load the table of non-taxa fields and collect the distinct values of its
# 'field' column.
non_taxa_content = pd.read_csv(filepath_or_buffer=non_taxa_fields_path)
non_taxa_columns = set(non_taxa_content.loc[:, 'field'])

In [6]:
# Keep only the fields whose 'type' is exactly 'taxonomy metadata'.
is_taxa_metadata = non_taxa_content['type'] == 'taxonomy metadata'
taxa_metadata_columns = set(non_taxa_content.loc[is_taxa_metadata, 'field'])

## Normalize zone columns

Normalize all the zone columns to have the same names.

In [7]:
# Load the change log for the selected dataset and preview its first rows.
metadata = pd.read_csv(filepath_or_buffer=metadata_file)
metadata.head(n=5)

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,add_expedition_section_cols,rename_label_id,add_sample_name,update_top_bottom,add_missing_cols,update_zones,add_missing_zone,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,False,True,False,False,False,False,False,True,True,True


### Get zone columns

In [8]:
# Accumulator handed to fetch_unique_column_names once per data file.
# NOTE(review): presumably the helper adds each file's header names to this set
# in place; the zone sets derived from it below are non-empty.
columns_all = set()

res = [
    fetch_unique_column_names(csv_path, columns_all)
    for csv_path in (f"{clean_data_path}/{name}" for name in metadata['file'])
]

In [9]:
# Each set is created and then handed to append_set together with a pattern
# and the collected column names.
# NOTE(review): the lowercase patterns end up matching 'Zone name' (see the
# cell outputs below), so append_set presumably matches case-insensitively.

# Any column that mentions "zone".
zone_all = set()
append_set(zone_all, r".*?zone.*?", columns_all)

# Spellings of the full zone-name column.
zone_name = set()
append_set(zone_name, r"zone[_ ]name$|zone$", columns_all)

# Spellings of the short zone-name column.
zone_name_short = set()
append_set(zone_name_short, r"zone_name_short|zone name [\[(]short[\])]", columns_all)


In [10]:
# Display the column names collected by the broad "zone" pattern.
zone_all

{'Zone name', 'Zone name (short)'}

In [11]:
# Display the column names collected as spellings of the full zone name.
zone_name

{'Zone name'}

In [12]:
# Display the column names collected as spellings of the short zone name.
zone_name_short

{'Zone name (short)'}

### Normalize zone columns

In [13]:
def normalize_zone(file):
    """Rename the zone columns of one CSV to their canonical names.

    Reads ``{clean_data_path}/{file}`` with every column as ``str`` and runs
    the header through ``normalize_columns`` twice: once with the collected
    ``zone_name`` spellings and the target 'Zone name', then with the
    ``zone_name_short`` spellings and the target 'Zone name (short)'. The file
    is rewritten (after ``csv_cleanup``) only when the header changed.

    Args:
        file: CSV file name relative to ``clean_data_path``.

    Returns:
        The result of comparing the original header with the normalized one:
        truthy when the header changed and the file was rewritten.
    """
    csv_path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(csv_path, dtype=str)
    original_names = list(frame.columns)

    # Apply each (known spellings, canonical name) pair in turn, feeding the
    # output of one pass into the next.
    renamed = original_names
    for spellings, canonical in (
        (zone_name, 'Zone name'),
        (zone_name_short, 'Zone name (short)'),
    ):
        renamed = normalize_columns(spellings, canonical, renamed)

    changed = original_names != renamed
    if not changed:
        # Header already canonical: leave the file on disk untouched.
        return changed

    frame.columns = renamed
    frame = csv_cleanup(frame, csv_path)
    frame.to_csv(csv_path, index=False)
    return changed

# Normalize every file listed in the change log; one result per file, in order.
change_columns = list(map(normalize_zone, metadata['file']))

In [14]:
# Hand the per-file results of the zone renaming to update_metadata under the
# 'update_zones' key. The mapping is passed inline instead of being bound to
# the name `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(metadata, {"update_zones": change_columns})

In [15]:
# Write the updated change log to metadata_file, without the index column.
new_metadata.to_csv(path_or_buf=metadata_file, index=False)

## Add missing columns

In [16]:
# Reload the change log from disk and preview what was actually read back,
# rather than the in-memory frame left over from the previous section.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,add_expedition_section_cols,rename_label_id,add_sample_name,update_top_bottom,add_missing_cols,update_zones,add_missing_zone,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,False,True,False,False,False,False,False,True,True,True


In [17]:
# Canonical zone column names passed to add_missing_columns for every file.
normalized_columns = ["Zone name", "Zone name (short)"]

In [18]:
# Call add_missing_columns once per listed file with the canonical zone column
# names; keep one result per file, in order.
change_columns = [
    add_missing_columns(csv_path, normalized_columns)
    for csv_path in (f"{clean_data_path}/{name}" for name in metadata['file'])
]

In [19]:
# Hand the per-file results of the add-missing-columns step to update_metadata
# under the 'add_missing_zone' key. The mapping is passed inline instead of
# being bound to the name `dict`, which would shadow the builtin for the rest
# of the session.
new_metadata = update_metadata(metadata, {"add_missing_zone": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,add_expedition_section_cols,rename_label_id,add_sample_name,update_top_bottom,add_missing_cols,update_zones,add_missing_zone,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,False,True,False,False,False,False,False,True,True,True


In [20]:
# Write the updated change log to metadata_file, without the index column.
new_metadata.to_csv(path_or_buf=metadata_file, index=False)

## Clean up taxa values
Look for taxa columns whose values have the form "code [extra text]", and remove the "[extra text]" part.

In [21]:
# Reload the change log from disk before the next processing step.
metadata = pd.read_csv(filepath_or_buffer=metadata_file)

In [22]:
def clean_up_taxa_values(file):
    """Strip bracketed annotations from the taxa columns of one CSV.

    Reads ``{clean_data_path}/{file}`` with every column as ``str``, selects
    the taxa columns with ``get_taxonomy_columns`` and removes any ``[...]``
    span together with the spaces around it, so ``"code [extra text]"``
    becomes ``"code"``. The file is rewritten (after ``csv_cleanup``) only
    when at least one value changed.

    Args:
        file: CSV file name relative to ``clean_data_path``.

    Returns:
        bool: True if any taxa value changed and the file was rewritten,
        False otherwise.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path, dtype=str)

    taxa_cols = get_taxonomy_columns(content.columns, non_taxa_columns)
    taxa_df = content[taxa_cols]
    # Raw string: '\[' and '\]' are invalid escape sequences in a normal string
    # literal and raise a SyntaxWarning on recent Python versions.
    # NOTE(review): '.*' is greedy, so a value holding several bracketed spans
    # loses everything from the first '[' to the last ']'.
    content[taxa_cols] = taxa_df.replace(
        to_replace=r' *\[.*\] *', value='', regex=True
    )

    # Fill NAs on both sides so missing values do not register as a difference.
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# Clean the taxa values of every listed file; one result per file, in order.
change_columns = list(map(clean_up_taxa_values, metadata['file']))

In [23]:
# Hand the per-file results of the taxa value clean-up to update_metadata
# under the 'clean_up_taxa_values' key. The mapping is passed inline instead
# of being bound to the name `dict`, which would shadow the builtin for the
# rest of the session.
new_metadata = update_metadata(metadata, {"clean_up_taxa_values": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,add_expedition_section_cols,rename_label_id,add_sample_name,update_top_bottom,add_missing_cols,update_zones,add_missing_zone,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,False,True,False,False,False,False,False,True,True,True


In [24]:
# Write the updated change log to metadata_file, without the index column.
new_metadata.to_csv(path_or_buf=metadata_file, index=False)

## Clean up taxa metadata values
Look for taxa metadata columns whose values have the form "code [extra text]", and remove the "[extra text]" part.

In [25]:
# Reload the change log from disk before the next processing step.
metadata = pd.read_csv(filepath_or_buffer=metadata_file)

In [26]:
def clean_up_taxa_meta_values(file):
    """Strip bracketed annotations from the taxa metadata columns of one CSV.

    Reads ``{clean_data_path}/{file}`` with every column as ``str``, picks the
    columns that are listed in ``taxa_metadata_columns`` and removes any
    ``[...]`` span together with the spaces around it, so
    ``"code [extra text]"`` becomes ``"code"``. The file is rewritten (after
    ``csv_cleanup``) only when at least one value changed.

    Args:
        file: CSV file name relative to ``clean_data_path``.

    Returns:
        bool: True if any taxa metadata value changed and the file was
        rewritten, False otherwise.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path, dtype=str)

    # Keep the file's own column order instead of the arbitrary order of a set
    # intersection, so the selection is deterministic between runs.
    available_metadata = [
        column for column in content.columns if column in taxa_metadata_columns
    ]

    taxa_meta_df = content[available_metadata]
    # Raw string: '\[' and '\]' are invalid escape sequences in a normal string
    # literal and raise a SyntaxWarning on recent Python versions.
    # NOTE(review): '.*' is greedy, so a value holding several bracketed spans
    # loses everything from the first '[' to the last ']'.
    content[available_metadata] = taxa_meta_df.replace(
        to_replace=r' *\[.*\] *', value='', regex=True
    )

    # Fill NAs on both sides so missing values do not register as a difference.
    changed = not taxa_meta_df.fillna('').equals(
        content[available_metadata].fillna('')
    )

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# Clean the taxa metadata values of every listed file; one result per file,
# in order.
change_columns = list(map(clean_up_taxa_meta_values, metadata['file']))

In [27]:
# Hand the per-file results of the taxa metadata clean-up to update_metadata
# under the 'clean_up_taxa_metadata_values' key. The mapping is passed inline
# instead of being bound to the name `dict`, which would shadow the builtin
# for the rest of the session.
new_metadata = update_metadata(
    metadata, {"clean_up_taxa_metadata_values": change_columns}
)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,add_expedition_section_cols,rename_label_id,add_sample_name,update_top_bottom,add_missing_cols,update_zones,add_missing_zone,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,False,True,False,False,False,False,False,True,True,True


In [28]:
# Write the updated change log to metadata_file, without the index column.
new_metadata.to_csv(path_or_buf=metadata_file, index=False)