# Normalize Micropal CSVs

In [2]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
    fetch_unique_column_names,
    append_set,
    normalize_columns,
    add_missing_columns,
    get_columns_from_file_or_disk,
    get_common_columns,
)

In [3]:
# Directory names that the paths in this notebook are built from.
base_directory = "cleaned_data"
raw_data_directory = "raw_data"

# Input tables located under base_directory.
taxa_file = os.path.join(
    base_directory, "taxa", "approved_eodp_taxa_list.csv"
)
columns_file = os.path.join(
    base_directory, "metadata", "LIMS", "columns_list.csv"
)
non_taxa_fields_path = os.path.join(
    base_directory, "taxa", "draft", "LIMS", "non_taxa_fields.csv"
)

# Per-file change log; it is read and written back by later cells.
metadata_file = os.path.join(
    base_directory, "metadata", "LIMS", "Micropal_changes.csv"
)

## Set taxa metadata columns

In [4]:
# Load the non-taxa fields table; its 'field' column holds the column names.
non_taxa_content = pd.read_csv(non_taxa_fields_path)
# Every field name listed in that table, as a set for fast membership tests.
non_taxa_columns = set(non_taxa_content['field'])

In [5]:
# Field names of the rows whose 'type' is exactly 'taxonomy metadata'.
taxa_metadata_columns = set(
    non_taxa_content.loc[
        non_taxa_content['type'] == 'taxonomy metadata', 'field'
    ]
)

## Normalize columns

Find all variants of a column name and set all the variants to use the same column name.

In [6]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [7]:
metadata.shape

(1116, 14)

### Get column variants

In [8]:
# Collect the 'taxa' column names; 4707 names when this cell was last run.
columns_all = get_columns_from_file_or_disk(
    columns_file=columns_file,
    metadata=metadata,
    data_directory=raw_data_directory,
    column_type='taxa',
)
len(columns_all)

4707

In [9]:
# Drop the names returned by get_common_columns() from the candidate set;
# 4685 names remained when this cell was last run.
columns_all = columns_all - get_common_columns()
len(columns_all)

4685

In [10]:
# One empty set per family of "zone" header variants.
zone_all, zone_name, zone_name_short = set(), set(), set()

# Each call pairs a target set with the regex describing that family.
# NOTE(review): append_set presumably adds the matching names from
# columns_all to the set in place -- confirm in normalize_data.
append_set(zone_all, r"zone", columns_all)
append_set(zone_name, r"zone[_ ]name$|zone$", columns_all)
append_set(
    zone_name_short,
    r"zone_name_short|zone name [\[(]short[\])]",
    columns_all,
)


In [11]:
zone_all

{'Zone',
 'Zone author (year)',
 'Zone comment',
 'Zone group',
 'Zone name',
 'Zone name (short)',
 'Zone name [short]'}

In [12]:
zone_name

{'Zone', 'Zone name'}

In [13]:
zone_name_short

{'Zone name (short)', 'Zone name [short]'}

### Get columns that need to be standardized

In [14]:
def get_taxa_columns():
    """Return the distinct 'verbatim_name' values stored in `taxa_file`.

    Only the 'verbatim_name' column is read from the CSV.

    Returns:
        set: the values of the 'verbatim_name' column.
    """
    verbatim_names = pd.read_csv(taxa_file, usecols=['verbatim_name'])['verbatim_name']
    return set(verbatim_names)

In [15]:
# Verbatim taxon names from the taxa file; 4581 names when this cell was
# last run.
taxa_cols = get_taxa_columns()
len(taxa_cols)

4581

In [16]:
# Column names that are not in the taxa list; 105 names when this cell was
# last run.
not_standardized = columns_all - taxa_cols
len(not_standardized)


105

In [17]:
not_standardized 

{'% Planktic Foraminifera within whole sample',
 '342-U1408A-2H-2-W 100/102-FORAM',
 'Abundance',
 'Additional zone name',
 'Additional zone name (short)',
 'Age',
 'Aspect comment (etching)',
 'BF Group abundance',
 'BF Preservation',
 'BF comment',
 'BF preservation',
 'COMMENTS',
 'Chrysophyte cyst group abundance',
 'Comment',
 'Comment (general)',
 'Comments',
 'Datum age average [Ma]',
 'Datum age maximum [Ma]',
 'Datum age minimum [Ma]',
 'Datum author year',
 'Datum comment',
 'Datum group',
 'Datum group code',
 'Datum name',
 'Datum name generic',
 'Datum region',
 'Datum type',
 'Diatom abundance',
 'Diatom preservation - pyritization2',
 'Diatom preservation dissolution',
 'Diatom preservation fragmentation',
 'Diatoms and siliceous plankton comment',
 'Diatoms group abundance',
 'Ebridian group abundance',
 'Exotic',
 'File Data',
 'Foram abundance',
 'Fragmentation',
 'Fragmentation rank [auto-pop]',
 'General comment',
 'Genus/species (upper zone)',
 'Genus/species lower

### Normalize columns

In [17]:
metadata = pd.read_csv(metadata_file)

In [18]:
def process_file(file):
    """Rewrite one CSV with normalized column names and report if it changed.

    NOTE: the two normalize_columns calls below are commented out, so
    `normalized_cols` is the very same list as `columns`, `changed` is always
    False, and no file is rewritten while they stay disabled.

    Args:
        file: path of the CSV relative to `base_directory` (the caller passes
            the values of metadata['path']).

    Returns:
        bool: True when the column names differ after normalization.
    """
    path = f"{base_directory}/{file}"
    # Read every cell as a string.
    content = pd.read_csv(path, dtype=str)
    columns = list(content.columns)
    
    # Disabled zone-name normalization, kept for reference:
#     normalized_cols = normalize_columns(zone_name, 'Zone name', columns)
#     normalized_cols = normalize_columns(zone_name_short, 'Zone name (short)', normalized_cols)
    normalized_cols = columns

    # List comparison: True only if at least one header was renamed.
    changed = columns != normalized_cols
    
    if changed:
        content.columns = normalized_cols
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed

# One boolean per row of metadata, in row order.
change_columns = [process_file(file) for file in metadata['path']] 

In [19]:
# Store the per-file "changed" flags under the 'update_taxa' key.
# The mapping is named `metadata_updates` rather than `dict` so the builtin
# `dict` type is not shadowed for the rest of the session.
metadata_updates = {"update_taxa": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [20]:
new_metadata.to_csv(metadata_file, index=False)

## Clean up taxa values
Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [21]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [22]:
def process_file(file):
    """Strip bracketed text from the taxa columns of one CSV.

    The file is read with every cell as a string. In the columns selected by
    get_taxonomy_columns, every match of the pattern below is replaced by an
    empty string. The file is rewritten only if a value changed.

    Args:
        file: path of the CSV relative to `base_directory` (the caller passes
            the values of metadata['path']).

    Returns:
        bool: True when at least one taxa value was modified.
    """
    path = f"{base_directory}/{file}"
    content = pd.read_csv(path, dtype=str)

    taxa_cols = get_taxonomy_columns(content.columns, non_taxa_columns)
    taxa_df = content[taxa_cols]

    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on recent Python versions. The pattern value itself
    # is unchanged. `.*` is greedy, so the match runs from the first '[' to
    # the last ']' in a cell, plus any spaces around it.
    bracketed_text = r' *\[.*\] *'
    content[taxa_cols] = taxa_df.replace(
        to_replace=bracketed_text, value='', regex=True
    )

    # Fill missing values before comparing the before/after frames.
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed


# One boolean per row of metadata, in row order.
change_columns = [process_file(file) for file in metadata['path']]

In [23]:
# Store the per-file "changed" flags under the 'clean_up_taxa_values' key.
# The mapping is named `metadata_updates` rather than `dict` so the builtin
# `dict` type is not shadowed for the rest of the session.
metadata_updates = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [25]:
new_metadata.to_csv(metadata_file, index=False)

## Clean up taxa metadata values
Look for taxa metadata columns that have "code [extra text]", and remove "[extra text]".

In [26]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [27]:
def process_file(file):
    """Strip bracketed text from the taxa metadata columns of one CSV.

    The file is read with every cell as a string. In the columns that appear
    both in `taxa_metadata_columns` and in the file, every match of the
    pattern below is replaced by an empty string. The file is rewritten only
    if a value changed.

    Args:
        file: path of the CSV relative to `base_directory` (the caller passes
            the values of metadata['path']).

    Returns:
        bool: True when at least one metadata value was modified.
    """
    path = f"{base_directory}/{file}"
    content = pd.read_csv(path, dtype=str)

    # Only the metadata columns actually present in this file.
    available_metadata = list(taxa_metadata_columns.intersection(set(content.columns)))

    taxa_meta_df = content[available_metadata]

    # Raw string: in a plain literal '\[' is an invalid escape sequence and
    # triggers a warning on recent Python versions. The pattern value itself
    # is unchanged. `.*` is greedy, so the match runs from the first '[' to
    # the last ']' in a cell, plus any spaces around it.
    bracketed_text = r' *\[.*\] *'
    content[available_metadata] = taxa_meta_df.replace(
        to_replace=bracketed_text, value='', regex=True
    )

    # Fill missing values before comparing the before/after frames.
    changed = not taxa_meta_df.fillna('').equals(content[available_metadata].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed


# One boolean per row of metadata, in row order.
change_columns = [process_file(file) for file in metadata['path']]

In [28]:
# Store the per-file "changed" flags under the
# 'clean_up_taxa_metadata_values' key.
# The mapping is named `metadata_updates` rather than `dict` so the builtin
# `dict` type is not shadowed for the rest of the session.
metadata_updates = {"clean_up_taxa_metadata_values": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [29]:
new_metadata.to_csv(metadata_file, index=False)

## Add missing columns

In [30]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [32]:
# Column names passed to add_missing_columns for every file.
normalized_columns = ["Zone name", "Zone name (short)"]

In [33]:
# Call add_missing_columns on every file listed in metadata; the results are
# collected in row order.
change_columns = [
    add_missing_columns(f"{base_directory}/{relative_path}", normalized_columns)
    for relative_path in metadata['path']
]

In [34]:
# Store the per-file results under the 'add_missing_zone' key.
# The mapping is named `metadata_updates` rather than `dict` so the builtin
# `dict` type is not shadowed for the rest of the session.
metadata_updates = {"add_missing_zone": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone,update_taxa
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True,False
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True,False
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True,False
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True,False
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True,False


In [35]:
new_metadata.to_csv(metadata_file, index=False)