# Normalize Micropal CSVs

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    fetch_unique_column_names,
    append_set,
    normalize_columns,
    add_missing_columns,
    get_columns_from_file_or_disk,
    get_common_columns,
)

In [2]:
# Top-level data directories, given relative to the working directory.
base_directory = 'cleaned_data'
raw_data_directory = 'raw_data'

# Taxa-related inputs.
taxa_file = os.path.join(base_directory, 'taxa', 'approved_eodp_taxa_list.csv')
non_taxa_fields_path = os.path.join(
    base_directory, 'taxa', 'draft', 'LIMS', 'non_taxa_fields.csv'
)

# LIMS metadata files: the column-name list and the per-file change table.
columns_file = os.path.join(base_directory, 'metadata', 'LIMS', 'columns_list.csv')
metadata_file = os.path.join(
    base_directory, 'metadata', 'LIMS', 'Micropal_changes.csv'
)

## Set taxa metadata columns

In [3]:
# Load the table of non-taxa fields and keep the distinct values of its
# 'field' column as a set for fast membership checks.
non_taxa_content = pd.read_csv(non_taxa_fields_path)
non_taxa_columns = set(non_taxa_content['field'])

In [4]:
# Field names whose 'type' is exactly 'taxonomy metadata'.
is_taxonomy_metadata = non_taxa_content['type'] == 'taxonomy metadata'
taxa_metadata_columns = set(non_taxa_content.loc[is_taxonomy_metadata, 'field'])

## Normalize columns

Find all variants of a column name and set all the variants to use the same column name.

In [5]:
# Load the per-file change table: one row per CSV ('file', 'path',
# 'taxon_group') plus boolean columns named after clean-up steps.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [6]:
metadata.shape

(1116, 14)

### Get column variants

In [8]:
# Gather the 'taxa' column names for the files listed in metadata.
# NOTE(review): presumably served from columns_file when it exists and
# otherwise built by scanning raw_data_directory - confirm in normalize_data.
# Expected size: 4708.
columns_all = get_columns_from_file_or_disk(
    columns_file=columns_file,
    metadata=metadata,
    data_directory=raw_data_directory,
    column_type='taxa',
)
len(columns_all)

4708

In [10]:
# Drop the names returned by get_common_columns() from the collected
# column names; 4686 names are expected to remain.
columns_all = columns_all - get_common_columns()
len(columns_all)

4686

In [11]:
# Each set is filled by append_set with the names from columns_all that match
# its pattern (matching appears to be case-insensitive, judging by the
# results displayed in the following cells).

# Any column name containing "zone".
zone_all = set()
append_set(zone_all, r".*?zone.*?", columns_all)

# Names ending in "zone name" / "zone_name", or in plain "zone".
zone_name = set()
append_set(zone_name, r"zone[_ ]name$|zone$", columns_all)

# Short-form zone names: "zone_name_short", "zone name (short)" or
# "zone name [short]".
zone_name_short = set()
append_set(zone_name_short, r"zone_name_short|zone name [\[(]short[\])]", columns_all)


In [12]:
zone_all

{'Additional zone name',
 'Additional zone name (short)',
 'Genus/species (upper zone)',
 'Genus/species lower zone)',
 'PF Zone',
 'Siphonina pozonensis',
 'Type (lower zone)',
 'Type (upper zone)',
 'Zone',
 'Zone author (year)',
 'Zone comment',
 'Zone group',
 'Zone name',
 'Zone name (short)',
 'Zone name [short]'}

In [13]:
zone_name

{'Zone', 'Zone name'}

In [14]:
zone_name_short

{'Zone name (short)', 'Zone name [short]'}

### Get columns that need to be standardized

In [15]:
def get_taxa_columns():
    """Return the distinct 'verbatim_name' values from the taxa file as a set.

    Only the 'verbatim_name' column of ``taxa_file`` is read.
    """
    verbatim_names = pd.read_csv(taxa_file, usecols=['verbatim_name'])['verbatim_name']
    return set(verbatim_names)

In [16]:
# Verbatim names from the approved taxa list; expected size: 4581.
taxa_cols = get_taxa_columns()
len(taxa_cols)

4581

In [17]:
# Column names that are not in the approved taxa list and therefore still
# need attention; expected count: 106.
# `columns_all` is used here because no `columns_filtered` variable is
# defined in this notebook: the set with the common columns removed is
# stored back into `columns_all`.
not_standardized = columns_all - taxa_cols
len(not_standardized)


106

In [19]:
# not_standardized 

### Normalize columns

In [None]:
# Reload the change table from disk so this section starts from the saved state.
metadata = pd.read_csv(metadata_file)

In [54]:
def normalize_columns(file):
    """Rename zone-name header variants in one CSV to their canonical names.

    Headers found in ``zone_name`` are mapped to 'Zone name' and headers found
    in ``zone_name_short`` are mapped to 'Zone name (short)'. When the header
    list changes, the data is passed through ``csv_cleanup`` and the file is
    rewritten in place.

    Args:
        file: Path of the CSV relative to ``base_directory``.

    Returns:
        True if the header list differs after normalization (the file was
        rewritten), otherwise False.
    """
    # This function has the same name as the helper imported from
    # normalize_data, so inside this module the bare name refers to this
    # function. Calling it with three arguments would raise a TypeError.
    # Importing the helper under an alias here keeps the fix working even
    # when the cell is re-run.
    from normalize_data import normalize_columns as rename_column_variants

    path = f"{base_directory}/{file}"
    # dtype=str keeps every value as text so nothing is reformatted on rewrite.
    content = pd.read_csv(path, dtype=str)
    columns = list(content.columns)

    normalized_cols = rename_column_variants(zone_name, 'Zone name', columns)
    normalized_cols = rename_column_variants(
        zone_name_short, 'Zone name (short)', normalized_cols
    )

    changed = columns != normalized_cols

    if changed:
        content.columns = normalized_cols
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed


# One boolean per file, in the same order as metadata['path'].
change_columns = [normalize_columns(file) for file in metadata['path']]

In [55]:
# Record which files had zone headers renamed.
# The flag is stored under "update_zones", the column name shown in this
# cell's output and in the saved metadata table; "update_taxa" does not
# appear there. The mapping is no longer named `dict`, which would shadow
# the builtin for the rest of the session.
metadata_updates = {"update_zones": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,False,True,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_forams,False,False,False,False,True,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False


In [56]:
# Overwrite the change table on disk; index=False leaves the row index out.
new_metadata.to_csv(metadata_file, index=False)

## Clean up taxa values
Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [4]:
# Reload the change table from disk before running the taxa-value clean-up.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [33]:
def clean_up_taxa_values(file):
    """Strip bracketed text from the taxon columns of one CSV.

    Taxon columns are chosen by ``get_taxonomy_columns`` from the file's
    headers and ``non_taxa_columns``. In those columns every "[...]" segment
    and the spaces around it are removed. When a value changes, the data is
    passed through ``csv_cleanup`` and the file is rewritten in place.

    Args:
        file: Path of the CSV relative to ``base_directory``.

    Returns:
        True if at least one value changed (the file was rewritten),
        otherwise False.
    """
    path = f"{base_directory}/{file}"
    content = pd.read_csv(path, dtype=str)

    # Materialize as a list so the selection is a valid column indexer
    # regardless of which iterable type the helper returns.
    taxa_cols = list(get_taxonomy_columns(content.columns, non_taxa_columns))
    if not taxa_cols:
        # No taxon columns, so there is nothing to clean or rewrite.
        return False

    # Selecting with a list of columns yields a new frame, so this is a
    # snapshot of the values before cleaning.
    taxa_df = content[taxa_cols]
    # Raw string: '\[' is not a valid escape in a normal string literal.
    # The pattern is greedy, so it removes everything from the first '['
    # to the last ']' in a cell.
    content[taxa_cols] = taxa_df.replace(
        to_replace=r' *\[.*\] *', value='', regex=True
    )

    # Replace NAs with '' on both sides so missing values compare as equal.
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed


# One boolean per file, in the same order as metadata['path'].
change_columns = [clean_up_taxa_values(file) for file in metadata['path']]

In [37]:
# Store the per-file result of the taxa-value clean-up in the change table.
# The mapping is not named `dict`, which would shadow the builtin for the
# rest of the session.
metadata_updates = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,False,True,False,False,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_forams,False,False,False,False,True,False,False,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True


In [38]:
# Overwrite the change table on disk; index=False leaves the row index out.
new_metadata.to_csv(metadata_file, index=False)

## Clean up taxa metadata values
Look for taxa metadata columns that have "code [extra text]", and remove "[extra text]".

In [62]:
# Reload the change table from disk before cleaning taxa metadata values.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_forams,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [41]:
def clean_up_taxa_meta_values(file):
    """Strip bracketed text from the taxa metadata columns of one CSV.

    Only the columns of the file that also appear in
    ``taxa_metadata_columns`` are touched. In those columns every "[...]"
    segment and the spaces around it are removed. When a value changes, the
    data is passed through ``csv_cleanup`` and the file is rewritten in place.

    Args:
        file: Path of the CSV relative to ``base_directory``.

    Returns:
        True if at least one value changed (the file was rewritten),
        otherwise False.
    """
    path = f"{base_directory}/{file}"
    content = pd.read_csv(path, dtype=str)

    # Keep the file's own column order so the selection is deterministic.
    available_metadata = [
        column for column in content.columns if column in taxa_metadata_columns
    ]
    if not available_metadata:
        # The file has no taxa metadata columns, so nothing to clean or rewrite.
        return False

    # Selecting with a list of columns yields a new frame, so this is a
    # snapshot of the values before cleaning.
    taxa_meta_df = content[available_metadata]
    # Raw string: '\[' is not a valid escape in a normal string literal.
    # The pattern is greedy, so it removes everything from the first '['
    # to the last ']' in a cell.
    content[available_metadata] = taxa_meta_df.replace(
        to_replace=r' *\[.*\] *', value='', regex=True
    )

    # Replace NAs with '' on both sides so missing values compare as equal.
    changed = not taxa_meta_df.fillna('').equals(
        content[available_metadata].fillna('')
    )

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed


# One boolean per file, in the same order as metadata['path'].
change_columns = [clean_up_taxa_meta_values(file) for file in metadata['path']]

In [42]:
# Store the per-file result of the taxa-metadata clean-up in the change table.
# The mapping is not named `dict`, which would shadow the builtin for the
# rest of the session.
metadata_updates = {"clean_up_taxa_metadata_values": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,False,True,False,False,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_forams,False,False,False,False,True,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False


In [43]:
# Overwrite the change table on disk; index=False leaves the row index out.
new_metadata.to_csv(metadata_file, index=False)

## Add missing columns

In [64]:
# Reload the change table from disk before adding the missing zone columns.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_forams,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [58]:
# Canonical zone headers passed to add_missing_columns for every file.
normalized_columns = ['Zone name', 'Zone name (short)']

In [59]:
# Call add_missing_columns once per file with the canonical zone headers and
# collect its return values in the same order as metadata['path'].
change_columns = [
    add_missing_columns(f"{base_directory}/{file}", normalized_columns)
    for file in metadata['path']
]

In [60]:
# Store the per-file result of the add-missing-columns step in the change
# table. The mapping is not named `dict`, which would shadow the builtin for
# the rest of the session.
metadata_updates = {"add_missing_zone": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_forams,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [61]:
# Overwrite the change table on disk; index=False leaves the row index out.
new_metadata.to_csv(metadata_file, index=False)