# Normalize Lithology CSVs

In [1]:
import sys
import os
sys.path.append('../scripts/')
import glob
import re

import pandas as pd
import numpy as np

from normalize_data import (
    normalize_columns,
    csv_cleanup,
    update_metadata,
    append_set,
    filter_existing_set,
    add_missing_columns,
    get_columns_from_file_or_disk,
    get_common_columns,
)

In [2]:
# Root folders. Cleaned CSVs under base_directory are read and rewritten by
# the cells below; raw_data_directory is passed to the column-discovery helper.
base_directory = 'cleaned_data'
raw_data_directory = 'raw_data'

# Both bookkeeping files live in the same LIMS metadata folder.
lims_metadata_directory = os.path.join(base_directory, 'metadata', 'LIMS')
metadata_file = os.path.join(lims_metadata_directory, 'Lithology_changes.csv')
columns_file = os.path.join(lims_metadata_directory, 'columns_list.csv')

## Normalize columns

Find all variants of a column name and set all the variants to use the same column name.

In [3]:
# Load the per-file change log; its `path` column is used below to locate
# each lithology CSV.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


### Get column variants

In [4]:
# Gather every lithology column name (480 at the time of the last run).
# NOTE(review): presumably the helper reads the cached list in columns_file
# when it exists and otherwise scans the CSVs on disk — confirm in normalize_data.
columns_all = get_columns_from_file_or_disk(
    columns_file=columns_file,
    metadata=metadata,
    data_directory=raw_data_directory,
    column_type='lithology',
)
len(columns_all)

480

In [5]:
# Remove the names returned by get_common_columns(), leaving 457 names at the
# time of the last run.
# NOTE(review): presumably these are the columns shared by all data types —
# confirm in normalize_data.
columns_all = columns_all - get_common_columns()
len(columns_all)

457

In [6]:
def remove_symbols_from_set(old_set):
    """Filter ``old_set`` against the pattern for names containing ``%`` or ``+``.

    The work is delegated to ``filter_existing_set``. Judging by the outputs
    shown in this notebook, matching names (abundance percentages and
    combined "prefix+name" columns) are dropped and the rest returned.
    """
    symbol_pattern = r"(.*?%)|(.*?\+)"
    return filter_existing_set(old_set, symbol_pattern)

def remove_minor_from_set(old_set):
    """Filter ``old_set`` against the pattern for minor-lithology column names.

    The pattern covers names containing "min " or "minor " and names that
    start with "Lithology 2". The work is delegated to
    ``filter_existing_set``; judging by the outputs shown in this notebook,
    matching names are dropped and the rest returned.
    """
    minor_pattern = r"(.*?min .*?)|(.*?minor .*?)|(Lithology 2.*?)"
    return filter_existing_set(old_set, minor_pattern)

In [7]:
# One bucket per kind of lithology column. append_set fills each bucket in
# place with the names from columns_all that match the given pattern.
lith_all, prefix_all, suffix_all = set(), set(), set()
principal_all, modifier_all = set(), set()
minor_all, color_all = set(), set()

# (bucket, pattern) pairs, applied in order; a bucket may be fed by
# several patterns.
bucket_patterns = [
    (lith_all, r".*?lith.*?"),
    (modifier_all, r".*?modifier.*?"),
    (suffix_all, r".*?suffix.*?"),
    (suffix_all, r".*?modifier after.*?"),
    (prefix_all, r".*?prefix.*?"),
    (prefix_all, r".*?modifier before.*?"),
    (principal_all, r".*?principal.*?"),
    (principal_all, r"^Lithology$"),
    (minor_all, r"(.*?min lith.*?)|(.*?minor modifier.*?)|(.*?minor suffix.*?)|(Lithology 2.*?)"),
    (color_all, r".*?color.*?"),
]
for bucket, pattern in bucket_patterns:
    append_set(bucket, pattern, columns_all)


In [8]:
minor_prefix, minor_principal, minor_suffix = set(), set(), set()

# Major-lithology variants: filter out symbol columns (% / +) first, then
# the minor-lithology columns.
suffix = remove_minor_from_set(remove_symbols_from_set(suffix_all))
prefix = remove_minor_from_set(remove_symbols_from_set(prefix_all))
# "... modifier before/after principal" names mention "principal" but are
# prefix/suffix columns, so they are filtered out of the principal bucket.
principal = filter_existing_set(
    remove_minor_from_set(remove_symbols_from_set(principal_all)),
    r".*?modifier.*?",
)

# Minor-lithology variants: only the symbol columns are filtered out.
minor = remove_symbols_from_set(minor_all)

for bucket, pattern in (
    (minor_prefix, r".*?prefix.*?"),
    (minor_principal, r".*?principal.*?"),
    (minor_suffix, r"(.*?suffix.*?)|(.*?modifier after.*?)"),
):
    append_set(bucket, pattern, minor)
minor_principal = filter_existing_set(minor_principal, r".*?modifier.*?")


In [9]:
# lith_all

In [10]:
# Inspect the column names collected with the `.*?modifier.*?` pattern.
modifier_all

{'Major  prefix modifier before principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'Matrix modifier',
 'Minor suffix modifier after principal'}

In [11]:
# Inspect every prefix candidate (the `prefix` and `modifier before` patterns), before filtering.
prefix_all

{'LITH 1 prefix+name [read only]',
 'LITH 1 prefix+name+suffix [read only]',
 'LITH 2 prefix+name [read only]',
 'LITH 2 prefix+name+suffix [read only]',
 'Lith. prefix+name+suffix',
 'Lithology 1 prefix',
 'Lithology 2 prefix',
 'Lithology Prefix',
 'Lithology prefix',
 'Lithology prefix + name',
 'Lithology prefix+name [read only]',
 'Lithology prefix+name+suffix [read only]',
 'MAJ Lith. Prefix',
 'MAJ Lith. Prefix abundance Plot [%]',
 'MAJ Lith. prefix',
 'MAJ Lith. prefix + name',
 'MAJ Lith. prefix abundance plot [%]',
 'MAJ Lith. prefix+name [read only]',
 'MAJ Lith. prefix+name+suffix',
 'MAJ Lith. prefix+name+suffix [read only]',
 'MAJ Lithology prefix A',
 'MIN Lith. Prefix',
 'MIN Lith. prefix',
 'MIN Lith. prefix + name',
 'MIN Lith. prefix+name+suffix',
 'Major  prefix modifier before principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'PREFIX',
 'Prefix'}

In [12]:
# Inspect the prefix variants left after the symbol and minor filters; process_file renames these to 'Lithology Prefix'.
prefix

{'Lithology 1 prefix',
 'Lithology Prefix',
 'Lithology prefix',
 'MAJ Lith. Prefix',
 'MAJ Lith. prefix',
 'MAJ Lithology prefix A',
 'Major  prefix modifier before principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'PREFIX',
 'Prefix'}

In [13]:
# Inspect every suffix candidate (the `suffix` and `modifier after` patterns), before filtering.
suffix_all

{'LITH 1 prefix+name+suffix [read only]',
 'LITH 2 prefix+name+suffix [read only]',
 'Lith. prefix+name+suffix',
 'Lithology 1 suffix',
 'Lithology 2 suffix',
 'Lithology SUFFIX',
 'Lithology Suffix',
 'Lithology name+suffix',
 'Lithology name+suffix [read only]',
 'Lithology prefix+name+suffix [read only]',
 'Lithology suffix',
 'MAJ Lith. Suffix',
 'MAJ Lith. name+suffix',
 'MAJ Lith. prefix+name+suffix',
 'MAJ Lith. prefix+name+suffix [read only]',
 'MAJ Lith. suffix',
 'MIN Lith. Suffix',
 'MIN Lith. name+suffix',
 'MIN Lith. prefix+name+suffix',
 'MIN Lith. suffix',
 'Minor suffix modifier after principal',
 'Suffix'}

In [14]:
# Inspect the suffix variants left after the symbol and minor filters; process_file renames these to 'Lithology Suffix'.
suffix 

{'Lithology 1 suffix',
 'Lithology SUFFIX',
 'Lithology Suffix',
 'Lithology suffix',
 'MAJ Lith. Suffix',
 'MAJ Lith. suffix',
 'Suffix'}

In [15]:
# Inspect every principal-name candidate (the `principal` and `^Lithology$` patterns), before filtering.
principal_all

{'LITHOLOGY',
 'Lithology',
 'Lithology 1 principal name',
 'Lithology 2 principal name',
 'Lithology Principal name',
 'Lithology principal name',
 'MAJ Lith. Principal name',
 'MAJ Lith. Principal name abundance Plot [%]',
 'MAJ Lith. principal name',
 'MAJ Lith. principal name abundance plot [%]',
 'MAJ Principal lithology B',
 'MIN Lith. Principal name',
 'MIN Lith. principal name',
 'Major  prefix modifier before principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'Minor suffix modifier after principal',
 'PRINCIPAL Lithology',
 'PRINCIPALLITHOLOGY',
 'Principal  lithology',
 'Principal Lithology',
 'Principal lithology',
 'Principal lithology name'}

In [16]:
# Inspect the principal variants left after the symbol, minor and modifier filters; process_file renames these to 'Lithology Principal Name'.
principal

{'LITHOLOGY',
 'Lithology',
 'Lithology 1 principal name',
 'Lithology Principal name',
 'Lithology principal name',
 'MAJ Lith. Principal name',
 'MAJ Lith. principal name',
 'MAJ Principal lithology B',
 'PRINCIPAL Lithology',
 'PRINCIPALLITHOLOGY',
 'Principal  lithology',
 'Principal Lithology',
 'Principal lithology',
 'Principal lithology name'}

In [17]:
# Inspect every minor-lithology candidate, before the symbol filter.
minor_all

{'Lithology 2 prefix',
 'Lithology 2 principal name',
 'Lithology 2 suffix',
 'MIN Lith. Prefix',
 'MIN Lith. Principal name',
 'MIN Lith. Suffix',
 'MIN Lith. abundance [%]',
 'MIN Lith. ave. grain size',
 'MIN Lith. ave. grain size rank',
 'MIN Lith. color',
 'MIN Lith. max. grain size',
 'MIN Lith. max. grain size rank',
 'MIN Lith. name+suffix',
 'MIN Lith. prefix',
 'MIN Lith. prefix + name',
 'MIN Lith. prefix+name+suffix',
 'MIN Lith. principal name',
 'MIN Lith. suffix',
 'MIN lith color (simple)',
 'MIN lithology comment',
 'Minor suffix modifier after principal'}

In [18]:
# Inspect the minor-lithology candidates left after the symbol (% / +) filter.
minor

{'Lithology 2 prefix',
 'Lithology 2 principal name',
 'Lithology 2 suffix',
 'MIN Lith. Prefix',
 'MIN Lith. Principal name',
 'MIN Lith. Suffix',
 'MIN Lith. ave. grain size',
 'MIN Lith. ave. grain size rank',
 'MIN Lith. color',
 'MIN Lith. max. grain size',
 'MIN Lith. max. grain size rank',
 'MIN Lith. prefix',
 'MIN Lith. principal name',
 'MIN Lith. suffix',
 'MIN lith color (simple)',
 'MIN lithology comment',
 'Minor suffix modifier after principal'}

In [19]:
# Inspect the variants that process_file renames to 'Minor Lithology Prefix'.
minor_prefix

{'Lithology 2 prefix', 'MIN Lith. Prefix', 'MIN Lith. prefix'}

In [20]:
# Inspect the variants that process_file renames to 'Minor Lithology Name'.
minor_principal

{'Lithology 2 principal name',
 'MIN Lith. Principal name',
 'MIN Lith. principal name'}

In [21]:
# Inspect the variants that process_file renames to 'Minor Lithology Suffix'.
minor_suffix

{'Lithology 2 suffix',
 'MIN Lith. Suffix',
 'MIN Lith. suffix',
 'Minor suffix modifier after principal'}

In [22]:
# Inspect the column names collected with the `.*?color.*?` pattern (not used further in the cells shown here).
color_all

{'COLOR',
 'Clast color',
 'Color',
 'Color (name)',
 'Color code',
 'Color(name)',
 'LITH 1 color',
 'LITH 2 color',
 'Lithology color',
 'Lithology color (Munsell)',
 'Lithology color (simple)',
 'MAJ Lith. color',
 'MAJ Lithology color',
 'MAJ lith color (simple)',
 'MIN Lith. color',
 'MIN lith color (simple)'}

### Get columns that need to be standardized

In [23]:
# 415
not_standardized = (
    columns_all
    - prefix
    - principal
    - suffix
    - minor_prefix
    - minor_principal
    - minor_suffix
)
len(not_standardized)

415

In [24]:
# not_standardized

### Normalize Lithology columns

In [25]:
# Reload the change log from disk so this section starts from the saved state.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


In [26]:
def process_file(file):
    """Rename lithology column variants in one cleaned CSV to the standard names.

    Parameters
    ----------
    file : str
        Path of the CSV relative to ``base_directory``.

    Returns
    -------
    bool
        True when at least one header was renamed. In that case the frame is
        passed through ``csv_cleanup`` and written back over the same file;
        otherwise the file is left untouched.
    """
    path = f"{base_directory}/{file}"
    # Read everything as text so values are not reinterpreted on the round trip.
    content = pd.read_csv(path, dtype=str)
    original_cols = list(content.columns)

    # (known variants, standard name) pairs, applied in order.
    rename_table = (
        (prefix, 'Lithology Prefix'),
        (principal, 'Lithology Principal Name'),
        (suffix, 'Lithology Suffix'),
        (minor_prefix, 'Minor Lithology Prefix'),
        (minor_principal, 'Minor Lithology Name'),
        (minor_suffix, 'Minor Lithology Suffix'),
    )
    renamed_cols = original_cols
    for variants, standard_name in rename_table:
        renamed_cols = normalize_columns(variants, standard_name, renamed_cols)

    changed = original_cols != renamed_cols
    if not changed:
        return changed

    content.columns = renamed_cols
    content = csv_cleanup(content, path)
    content.to_csv(path, index=False)
    return changed

# Normalize every file listed in the change log; one changed/unchanged flag per row.
change_columns = [
    process_file(relative_path)
    for relative_path in metadata['path']
]

In [27]:
# Record which files had their lithology headers renamed in this run.
# The mapping is deliberately not named `dict`: that would shadow the builtin
# for every later cell in the kernel session.
lithology_changes = {"update_lithology": change_columns}
new_metadata = update_metadata(metadata, lithology_changes)
new_metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


In [28]:
# Save the updated change log over the original file (row index not written).
new_metadata.to_csv(metadata_file, index=False)

## Add missing Lithology columns

In [29]:
# Reload the change log that was just saved.
metadata = pd.read_csv(metadata_file)

In [30]:
# Standard lithology column names passed to add_missing_columns for every file.
# These must match the target names used in process_file.
normalized_columns = [
    'Lithology Prefix',
    'Lithology Principal Name',
    'Lithology Suffix',
    'Minor Lithology Prefix',
    'Minor Lithology Name',
    'Minor Lithology Suffix',
]

In [31]:
# Call add_missing_columns on every file in the change log, keeping one result per row.
change_columns = [
    add_missing_columns(f"{base_directory}/{relative_path}", normalized_columns)
    for relative_path in metadata['path']
]

In [32]:
# Record the per-file results of the add_missing_columns pass.
# The mapping is deliberately not named `dict`: that would shadow the builtin
# for every later cell in the kernel session.
missing_column_changes = {"add_missing_lith_cols": change_columns}
new_metadata = update_metadata(metadata, missing_column_changes)
new_metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


In [33]:
# Save the updated change log over the original file (row index not written).
new_metadata.to_csv(metadata_file, index=False)