# Normalize column names, part 2

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re

import pandas as pd
import numpy as np

from normalize_data import normalize_sample_col, normalize_expedition_section_cols

In [2]:
# CSV holding one row per data file, with flags for the clean-up steps applied to it.
metadata_file = 'cleaned_data/metadata/Lithology_changes.csv'
# Directory that the file names listed in the metadata are resolved against.
clean_data_path = 'cleaned_data/Lithology_CSV'

In [3]:
# Load the per-file change log and preview its first rows.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols,update_sample_col,update_top_bottom,update_lithology
0,361_macroscopic_U1474D.csv,361,True,False,True,False,True
1,323 Core Description Template_U1341A.csv,323,True,False,False,False,True
2,361_macroscopic_U1479C.csv,361,True,False,True,False,True
3,340_sediment_U1393A.csv,340,True,False,True,False,True
4,339_sediment_U1386A.csv,339,True,False,False,False,True


## Get column names

In [4]:
# Every distinct column name seen across the data files listed in the metadata.
columns_all = set()

def find_field_names(file):
    """Add the column names of one data file to the global ``columns_all`` set.

    Only the header row is parsed (``nrows=0``): the data rows are not needed
    to learn the column names, so reading them would be wasted work.

    Parameters
    ----------
    file : str
        File name, resolved against ``clean_data_path``.

    Returns
    -------
    None
        The function is used for its side effect on ``columns_all``.
    """
    path = f"{clean_data_path}/{file}"
    header = pd.read_csv(path, nrows=0)
    columns_all.update(header.columns)

# A plain loop: the calls are made for their side effect only, so there is no
# result worth collecting in a list.
for file in metadata['file']:
    find_field_names(file)

In [5]:
def append_set(my_set,  regex, cols):
    """Add to ``my_set`` every name in ``cols`` that matches ``regex``.

    Matching uses ``re.match``, so the pattern is anchored at the start of the
    name, and it is case-insensitive.

    Parameters
    ----------
    my_set : set
        Set that is extended in place.
    regex : str
        Regular expression tested against each name.
    cols : iterable of str
        Candidate names.

    Returns
    -------
    None
    """
    # set.update consumes the generator directly; no throwaway list is built.
    my_set.update(col for col in cols if re.match(regex, col, re.IGNORECASE))

def remove_from_set(old_set, regex):
    """Return a new set with the items of ``old_set`` that do NOT match ``regex``.

    Matching uses ``re.match`` (anchored at the start of the item) and ignores
    case. ``old_set`` itself is left untouched.
    """
    kept = set()
    for name in old_set:
        if re.match(regex, name, re.IGNORECASE) is None:
            kept.add(name)
    return kept

def remove_symbols_from_set(old_set):
    """Return a copy of ``old_set`` without the names that contain ``%`` or ``+``."""
    symbol_pattern = r"(.*?%)|(.*?\+)"
    return remove_from_set(old_set, symbol_pattern)

def remove_minor_from_set(old_set):
    """Return a copy of ``old_set`` without the minor-lithology names.

    A name is dropped when it contains "min " or "minor " anywhere, or starts
    with "Lithology 2" (case-insensitive).
    """
    minor_pattern = r"(.*?min .*?)|(.*?minor .*?)|(Lithology 2.*?)"
    return remove_from_set(old_set, minor_pattern)

In [6]:
# Broad buckets: each set collects every raw column name that matches its
# pattern(s), case-insensitively. They are narrowed down in the next cell.
top_all, bottom_all = set(), set()
lith_all, modifier_all = set(), set()
prefix_all, suffix_all = set(), set()
principal_all, minor_all = set(), set()

for target_set, pattern in [
    (top_all, r".*?top.*?"),
    (bottom_all, r".*?bottom.*?"),
    (lith_all, r".*?lith.*?"),
    (modifier_all, r".*?modifier.*?"),
    (suffix_all, r".*?suffix.*?"),
    (suffix_all, r".*?modifier after.*?"),
    (prefix_all, r".*?prefix.*?"),
    (prefix_all, r".*?modifier before.*?"),
    (principal_all, r".*?principal.*?"),
    (principal_all, r"^Lithology$"),
    (minor_all, r"(.*?min lith.*?)|(.*?minor modifier.*?)|(.*?minor suffix.*?)|(Lithology 2.*?)"),
]:
    append_set(target_set, pattern, columns_all)



In [7]:
# Refined buckets: the exact sets of raw names that get mapped onto each
# standard column name.
top, top_depth = set(), set()
bottom, bottom_depth = set(), set()
minor_prefix, minor_principal, minor_suffix = set(), set(), set()

# Offsets and depths are told apart by how the name starts.
append_set(top, r"top offset|top ?\[", columns_all)
append_set(top_depth, r"top depth", columns_all)
append_set(bottom, r"bottom offset|bottom ?\[", columns_all)
append_set(bottom_depth, r"bottom depth", columns_all)

# Major lithology parts: drop names with % or +, then the minor-lithology names.
suffix = remove_minor_from_set(remove_symbols_from_set(suffix_all))
prefix = remove_minor_from_set(remove_symbols_from_set(prefix_all))
principal = remove_from_set(
    remove_minor_from_set(remove_symbols_from_set(principal_all)),
    r".*?modifier.*?",
)

# Minor lithology parts are picked out of the symbol-free minor bucket.
minor = remove_symbols_from_set(minor_all)

append_set(minor_prefix, r".*?prefix.*?", minor)
append_set(minor_suffix, r"(.*?suffix.*?)|(.*?modifier after.*?)", minor)
append_set(minor_principal, r".*?principal.*?", minor)
minor_principal = remove_from_set(minor_principal, r".*?modifier.*?")


In [8]:
# Every distinct column name collected from the data files.
columns_all

{' Lamination',
 '2ND crystal roundness',
 '2ND lithic roundness',
 '2ND vitric roundness',
 '2ND volcanic clast roundness',
 '2nd sedimentary structure',
 '3RD crystal roundness',
 '3RD lithic roundness',
 '3RD vitric roundness',
 '3RD volcanic clast roundness',
 '3rd sedimentary structure',
 'A/W',
 'Accessory mineral (comments)',
 'Accessory mineral 1 (<1 Vol. %)',
 'Accessory mineral 2 (<1 Vol. %)',
 'Actinolite abundance',
 'Altered glass (palagonite) abundance [%]',
 'Amount of ash in lithology',
 'Amount of ash in lithology rank [read only]',
 'Apparent dip angle 1 [deg]',
 'Apparent dip angle 2 [deg]',
 'Apparent dip azimuth 1 [deg]',
 'Apparent dip azimuth 2 [deg]',
 'Bedding thickness',
 'Bedding thickness of MAJOR lithology',
 'Biogenic carbonate ',
 'Biogenic material',
 'Biotite abundance ',
 'Bioturbation intensity',
 'Bioturbation intensity rank',
 'Bioturbation intensity rank [read only]',
 'Bioturbation type',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]'

In [9]:
# Column names containing "top" (case-insensitive).
top_all

{'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top [cm]',
 'Top depth [m]',
 'Top offset [cm]',
 'Top[cm] [cm]'}

In [10]:
# Top offset variants: names starting with "top offset" or "top [" / "top[".
top

{'Top [cm]', 'Top offset [cm]', 'Top[cm] [cm]'}

In [11]:
# Top depth variants: names starting with "top depth" (case-insensitive).
top_depth

{'Top Depth [m]', 'Top Depth[m] [m]', 'Top depth [m]'}

In [12]:
# Column names containing "bottom" (case-insensitive).
bottom_all

{'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom contact attitude',
 'Bottom contact definition',
 'Bottom contact geometry',
 'Bottom contact or boundary attitude',
 'Bottom contact or boundary definition',
 'Bottom contact or boundary geometry',
 'Bottom contact or boundary geometry+definition [read only]',
 'Bottom contact or boundary type',
 'Bottom depth [m]',
 'Bottom offset [cm]',
 'Bottom[cm] [cm]'}

In [13]:
# Bottom offset variants: names starting with "bottom offset" or "bottom [" / "bottom[".
bottom

{'Bottom [cm]', 'Bottom offset [cm]', 'Bottom[cm] [cm]'}

In [14]:
# Bottom depth variants: names starting with "bottom depth" (case-insensitive).
bottom_depth

{'Bottom Depth [m]', 'Bottom Depth[m] [m]', 'Bottom depth [m]'}

In [15]:
# Column names containing "lith" (case-insensitive).
lith_all

{'2ND lithic roundness',
 '3RD lithic roundness',
 'Amount of ash in lithology',
 'Amount of ash in lithology rank [read only]',
 'Bedding thickness of MAJOR lithology',
 'Clast 3 -other lithology [%]',
 'Clast 3 -other lithology grain size',
 'Clast 7 -other lithology [%]',
 'Clast 7 -other lithology grain size',
 'Clast lith. abund. sum [%]',
 'Clast lithology',
 'Clast lithology comment',
 'Complete lithology name',
 'DOMINANT lithic roundness',
 'GRAVEL SIZE CLASTLITHOLOGY',
 'Interbed lithology',
 'LITH 1 Ash abundance name',
 'LITH 1 Ash abundance rank (read only)',
 'LITH 1 Bioturbation intensity',
 'LITH 1 Bioturbation intensity rank [read only]',
 'LITH 1 Clast 1 igneous -  grain size',
 'LITH 1 Clast 1 igneous -  lithology',
 'LITH 1 Clast 1 igneous -  roundness',
 'LITH 1 Clast 1 igneous [%]',
 'LITH 1 Clast 2 sediment - grain size',
 'LITH 1 Clast 2 sediment - roundness',
 'LITH 1 Clast 2 sediment [%]',
 'LITH 1 Clast 3 -other LITH [%]',
 'LITH 1 Clast 3 metamorphic - grain

In [16]:
# Column names containing "modifier" (case-insensitive).
modifier_all

{'Major  prefix modifier before principal',
 'Major ModifierPrefixModifier Before Principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'Matrix modifier',
 'Minor ModifierSuffixModifier After Principal',
 'Minor modifier after principal',
 'Minor suffix modifier after principal'}

In [17]:
# Column names containing "prefix" or "modifier before" (case-insensitive).
prefix_all

{'LITH 1 prefix+name [read only]',
 'LITH 1 prefix+name+suffix [read only]',
 'LITH 2 prefix+name [read only]',
 'LITH 2 prefix+name+suffix [read only]',
 'Lith. prefix+name+suffix',
 'Lithology 1 prefix',
 'Lithology 2 prefix',
 'Lithology PREFIX',
 'Lithology Prefix',
 'Lithology prefix',
 'Lithology prefix + name',
 'Lithology prefix+name [read only]',
 'Lithology prefix+name+suffix [read only]',
 'MAJ Lith. Prefix',
 'MAJ Lith. Prefix abundance Plot [%]',
 'MAJ Lith. prefix',
 'MAJ Lith. prefix + name',
 'MAJ Lith. prefix abundance plot [%]',
 'MAJ Lith. prefix+name [read only]',
 'MAJ Lith. prefix+name+suffix',
 'MAJ Lith. prefix+name+suffix [read only]',
 'MAJ Lithology prefix A',
 'MIN Lith. Prefix',
 'MIN Lith. prefix',
 'MIN Lith. prefix + name',
 'MIN Lith. prefix+name+suffix',
 'Major  prefix modifier before principal',
 'Major ModifierPrefixModifier Before Principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'PREFIX',
 'Prefix'}

In [18]:
# prefix_all without the names containing % or + and without the minor-lithology names.
prefix

{'Lithology 1 prefix',
 'Lithology PREFIX',
 'Lithology Prefix',
 'Lithology prefix',
 'MAJ Lith. Prefix',
 'MAJ Lith. prefix',
 'MAJ Lithology prefix A',
 'Major  prefix modifier before principal',
 'Major ModifierPrefixModifier Before Principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'PREFIX',
 'Prefix'}

In [19]:
# Column names containing "suffix" or "modifier after" (case-insensitive).
suffix_all

{'LITH 1 prefix+name+suffix [read only]',
 'LITH 2 prefix+name+suffix [read only]',
 'Lith. prefix+name+suffix',
 'Lithology 1 suffix',
 'Lithology 2 suffix',
 'Lithology SUFFIX',
 'Lithology Suffix',
 'Lithology name+suffix',
 'Lithology name+suffix [read only]',
 'Lithology prefix+name+suffix [read only]',
 'Lithology suffix',
 'MAJ Lith. Suffix',
 'MAJ Lith. name+suffix',
 'MAJ Lith. prefix+name+suffix',
 'MAJ Lith. prefix+name+suffix [read only]',
 'MAJ Lith. suffix',
 'MAJ Lithology suffix C',
 'MIN Lith. Suffix',
 'MIN Lith. name+suffix',
 'MIN Lith. prefix+name+suffix',
 'MIN Lith. suffix',
 'Minor ModifierSuffixModifier After Principal',
 'Minor modifier after principal',
 'Minor suffix modifier after principal',
 'SUFFIX',
 'Suffix'}

In [20]:
# suffix_all without the names containing % or + and without the minor-lithology names.
suffix 

{'Lithology 1 suffix',
 'Lithology SUFFIX',
 'Lithology Suffix',
 'Lithology suffix',
 'MAJ Lith. Suffix',
 'MAJ Lith. suffix',
 'MAJ Lithology suffix C',
 'SUFFIX',
 'Suffix'}

In [21]:
# Column names containing "principal", plus the bare name "Lithology" (case-insensitive).
principal_all

{'LITHOLOGY',
 'Lithology',
 'Lithology 1 principal name',
 'Lithology 2 principal name',
 'Lithology Principal name',
 'Lithology principal name',
 'MAJ Lith. Principal name',
 'MAJ Lith. Principal name abundance Plot [%]',
 'MAJ Lith. principal name',
 'MAJ Lith. principal name abundance plot [%]',
 'MAJ Principal lithology B',
 'MIN Lith. Principal name',
 'MIN Lith. principal name',
 'Major  prefix modifier before principal',
 'Major ModifierPrefixModifier Before Principal',
 'Major modifier before principal',
 'Major prefix modifier before principal',
 'Minor ModifierSuffixModifier After Principal',
 'Minor modifier after principal',
 'Minor suffix modifier after principal',
 'PRINCIPAL Lithology',
 'PRINCIPALLITHOLOGY',
 'Principal  lithology',
 'Principal Lithology',
 'Principal lithology',
 'Principal lithology name',
 'PrincipalLithology Name'}

In [22]:
# principal_all without names containing % or +, minor-lithology names, or "modifier" names.
principal

{'LITHOLOGY',
 'Lithology',
 'Lithology 1 principal name',
 'Lithology Principal name',
 'Lithology principal name',
 'MAJ Lith. Principal name',
 'MAJ Lith. principal name',
 'MAJ Principal lithology B',
 'PRINCIPAL Lithology',
 'PRINCIPALLITHOLOGY',
 'Principal  lithology',
 'Principal Lithology',
 'Principal lithology',
 'Principal lithology name',
 'PrincipalLithology Name'}

In [23]:
# Names matching the minor-lithology patterns ("min lith", "minor modifier", "minor suffix", "Lithology 2...").
minor_all

{'Lithology 2 prefix',
 'Lithology 2 principal name',
 'Lithology 2 suffix',
 'MIN Lith. Prefix',
 'MIN Lith. Principal name',
 'MIN Lith. Suffix',
 'MIN Lith. abundance [%]',
 'MIN Lith. ave. grain size ',
 'MIN Lith. ave. grain size rank',
 'MIN Lith. color',
 'MIN Lith. max. grain size ',
 'MIN Lith. max. grain size rank',
 'MIN Lith. name+suffix',
 'MIN Lith. prefix',
 'MIN Lith. prefix + name',
 'MIN Lith. prefix+name+suffix',
 'MIN Lith. principal name',
 'MIN Lith. suffix',
 'MIN lith color (simple)',
 'MIN lithology comment',
 'Minor ModifierSuffixModifier After Principal',
 'Minor modifier after principal',
 'Minor suffix modifier after principal'}

In [24]:
# minor_all without the names containing % or +.
minor

{'Lithology 2 prefix',
 'Lithology 2 principal name',
 'Lithology 2 suffix',
 'MIN Lith. Prefix',
 'MIN Lith. Principal name',
 'MIN Lith. Suffix',
 'MIN Lith. ave. grain size ',
 'MIN Lith. ave. grain size rank',
 'MIN Lith. color',
 'MIN Lith. max. grain size ',
 'MIN Lith. max. grain size rank',
 'MIN Lith. prefix',
 'MIN Lith. principal name',
 'MIN Lith. suffix',
 'MIN lith color (simple)',
 'MIN lithology comment',
 'Minor ModifierSuffixModifier After Principal',
 'Minor modifier after principal',
 'Minor suffix modifier after principal'}

In [25]:
# Members of `minor` containing "prefix".
minor_prefix

{'Lithology 2 prefix', 'MIN Lith. Prefix', 'MIN Lith. prefix'}

In [26]:
# Members of `minor` containing "principal", excluding names that contain "modifier".
minor_principal

{'Lithology 2 principal name',
 'MIN Lith. Principal name',
 'MIN Lith. principal name'}

In [27]:
# Members of `minor` containing "suffix" or "modifier after".
minor_suffix

{'Lithology 2 suffix',
 'MIN Lith. Suffix',
 'MIN Lith. suffix',
 'Minor ModifierSuffixModifier After Principal',
 'Minor modifier after principal',
 'Minor suffix modifier after principal'}

## Demo search for phrases in file

In [28]:
def read_file(file):
    """Print the file name, then load that file from ``clean_data_path``.

    Parameters
    ----------
    file : str
        File name, resolved against ``clean_data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    print(file)
    return pd.read_csv(f"{clean_data_path}/{file}")

In [29]:
def normalize_columns(old_columns, new_column, all_columns):
    """Return ``all_columns`` as a list, with known aliases replaced by one name.

    Parameters
    ----------
    old_columns : container of str
        Names that should be replaced.
    new_column : str
        Name written in place of every entry found in ``old_columns``.
    all_columns : iterable of str
        Column names to process; the input is not modified.

    Returns
    -------
    list of str
        Same length and order as ``all_columns``.
    """
    renamed = []
    for name in all_columns:
        renamed.append(new_column if name in old_columns else name)
    return renamed
    

In [30]:
# Load one file from the metadata list to try the renaming on.
demo_file = metadata['file'][8]
content = read_file(demo_file)
cols = list(content.columns)

339_sediment_U1390A.csv


In [31]:
# Apply every alias-set -> standard-name replacement in turn to the demo columns.
normalized_cols = cols
for alias_set, standard_name in [
    (top, 'Top [cm]'),
    (bottom, 'Bottom [cm]'),
    (top_depth, 'Top Depth [m]'),
    (bottom_depth, 'Bottom Depth [m]'),
    (prefix, 'Lithology Prefix'),
    (principal, 'Lithology Principal Name'),
    (suffix, 'Lithology Suffix'),
    (minor_prefix, 'Minor Lithology Prefix'),
    (minor_principal, 'Minor Lithology Name'),
    (minor_suffix, 'Minor Lithology Suffix'),
]:
    normalized_cols = normalize_columns(alias_set, standard_name, normalized_cols)

In [32]:
# Column names of the demo file after the replacements above.
normalized_cols

['Sample',
 'Exp',
 'Site',
 'Hole',
 'Core',
 'Core-Sect',
 'Type',
 'Section',
 'A/W',
 'Top [cm]',
 'Bottom [cm]',
 'Top Depth [m]',
 'Bottom Depth [m]',
 'Lithology Prefix',
 'Lithology Principal Name',
 'Lithology Suffix',
 'MAJ Lith. prefix + name',
 'MAJ Lith. prefix+name+suffix',
 'MAJ Lith. name+suffix',
 'MAJ Lith. abundance [%]',
 'MAJ Lith. color',
 'MAJ Lith. ave. grain size ',
 'MAJ Lith. ave. grain size rank',
 'MAJ Lith. max. grain size',
 'MAJ Lith. max. grain size rank',
 'Minor Lithology Prefix',
 'Minor Lithology Name',
 'Minor Lithology Suffix',
 'MIN Lith. prefix + name',
 'MIN Lith. prefix+name+suffix',
 'MIN Lith. name+suffix',
 'MIN Lith. abundance [%]',
 'MIN Lith. color',
 'MIN Lith. ave. grain size ',
 'MIN Lith. ave. grain size rank',
 'MIN Lith. max. grain size ',
 'MIN Lith. max. grain size rank',
 'Lith. abundance sum [%]',
 'Bottom contact or boundary type',
 'Layer, boundary, lamination, grading',
 'Layer or bedding thickness',
 'Clasts abundance',
 'D

## Normalize Top and Bottom columns

Normalize all the Top, Top Depth, Bottom, and Bottom Depth columns to have the same names.

In [33]:
def normalize_top_bottom(file):
    """Rename the top/bottom columns of one data file to the standard names.

    The CSV is rewritten in place only when at least one column name changed.

    Parameters
    ----------
    file : str
        File name, resolved against ``clean_data_path``.

    Returns
    -------
    bool
        True when the file was rewritten, False when its names were already
        standard.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    columns = list(content.columns)

    renames = (
        (top, 'Top [cm]'),
        (bottom, 'Bottom [cm]'),
        (top_depth, 'Top Depth [m]'),
        (bottom_depth, 'Bottom Depth [m]'),
    )
    normalized_cols = columns
    for alias_set, standard_name in renames:
        normalized_cols = normalize_columns(alias_set, standard_name, normalized_cols)

    changed = columns != normalized_cols
    if not changed:
        return changed

    content.columns = normalized_cols
    content.to_csv(path, index=False)
    return changed

change_columns = [normalize_top_bottom(file) for file in metadata['file']] 

## Update metadata

In [34]:
# One flag per data file: was the file rewritten by the top/bottom renaming?
# The mapping gets its own name so the builtin `dict` is not shadowed.
top_bottom_flags = {"update_top_bottom": change_columns}
new_metadata = pd.DataFrame(top_bottom_flags)
new_metadata.head()

Unnamed: 0,update_top_bottom
0,False
1,False
2,False
3,False
4,False


In [35]:
# Re-read the change log and attach the new flag column.
# DataFrame.join raises when both frames share a column name, which happens as
# soon as the notebook is re-run against an already-updated metadata file, so
# any stale copy of the flag column is dropped first.
metadata = pd.read_csv(metadata_file)
merged_metadata = (
    metadata
    .drop(columns=list(new_metadata.columns), errors='ignore')
    .join(new_metadata)
)
merged_metadata.head()

In [36]:
# Overwrite the metadata file with the merged table (row index not written).
merged_metadata.to_csv(metadata_file, index=False)

## Normalize Lithology columns

Normalize all the Lithology Prefix, Principal Name, Suffix, Minor Prefix, Minor Name, Minor Suffix columns to have the same names.

In [35]:
def normalize_lithology(file):
    """Rename the lithology columns of one data file to the standard names.

    Covers the major prefix / principal name / suffix columns and their
    minor-lithology counterparts. The CSV is rewritten in place only when at
    least one column name changed.

    This function has its own name so that it no longer redefines (and
    silently replaces) ``normalize_top_bottom``, which handles the top/bottom
    columns.

    Parameters
    ----------
    file : str
        File name, resolved against ``clean_data_path``.

    Returns
    -------
    bool
        True when the file was rewritten, False otherwise.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    columns = list(content.columns)

    normalized_cols = normalize_columns(prefix, 'Lithology Prefix', columns)
    normalized_cols = normalize_columns(principal, 'Lithology Principal Name', normalized_cols)
    normalized_cols = normalize_columns(suffix, 'Lithology Suffix', normalized_cols)

    normalized_cols = normalize_columns(minor_prefix, 'Minor Lithology Prefix', normalized_cols)
    normalized_cols = normalize_columns(minor_principal, 'Minor Lithology Name', normalized_cols)
    normalized_cols = normalize_columns(minor_suffix, 'Minor Lithology Suffix', normalized_cols)

    changed = columns != normalized_cols

    if changed:
        content.columns = normalized_cols
        content.to_csv(path, index=False)

    return changed

change_columns = [normalize_lithology(file) for file in metadata['file']]

## Update metadata

In [36]:
# One flag per data file: was the file rewritten by the lithology renaming?
# The mapping gets its own name so the builtin `dict` is not shadowed.
lithology_flags = {"update_lithology": change_columns}
new_metadata = pd.DataFrame(lithology_flags)
new_metadata.head()

Unnamed: 0,update_lithology
0,True
1,True
2,True
3,True
4,True


In [39]:
# Re-read the change log and attach the new flag column.
# DataFrame.join raises when both frames share a column name, which happens as
# soon as the notebook is re-run against an already-updated metadata file, so
# any stale copy of the flag column is dropped first.
metadata = pd.read_csv(metadata_file)
merged_metadata = (
    metadata
    .drop(columns=list(new_metadata.columns), errors='ignore')
    .join(new_metadata)
)
merged_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols,update_sample_col,update_top_bottom,update_lithology
0,361_macroscopic_U1474D.csv,361,True,False,True,False,True
1,323 Core Description Template_U1341A.csv,323,True,False,False,False,True
2,361_macroscopic_U1479C.csv,361,True,False,True,False,True
3,340_sediment_U1393A.csv,340,True,False,True,False,True
4,339_sediment_U1386A.csv,339,True,False,False,False,True


In [40]:
# Overwrite the metadata file with the merged table (row index not written).
merged_metadata.to_csv(metadata_file, index=False)

## Add missing columns

In [39]:
# Reload the change log from disk so it includes the columns written above.
metadata = pd.read_csv(metadata_file)

In [40]:
# Standard column names produced by the renaming steps above; every data file
# is checked against this list when missing columns are added.
normalized_columns = [
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]', 
    'Lithology Prefix',
    'Lithology Principal Name',
    'Lithology Suffix',
    'Minor Lithology Prefix',
    'Minor Lithology Name',
    'Minor Lithology Suffix',
]

In [41]:
def add_columns(file):
    """Append the standard columns that one data file lacks, as empty columns.

    Missing columns are added after the existing ones, in the order in which
    they appear in ``normalized_columns``. The CSV is rewritten in place only
    when something was added.

    Parameters
    ----------
    file : str
        File name, resolved against ``clean_data_path``.

    Returns
    -------
    bool
        True when columns were added and the file was rewritten.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    columns = list(content.columns)

    # Keep the order of `normalized_columns`. A set difference would leave the
    # order of the added columns to string hashing, which changes from one
    # kernel session to the next and makes the output files irreproducible.
    missing_columns = [name for name in normalized_columns if name not in columns]
    changed = bool(missing_columns)

    if changed:
        content = content.reindex(columns=columns + missing_columns)
        content.to_csv(path, index=False)

    return changed

change_columns = [add_columns(file) for file in metadata['file']]

## Update metadata

In [42]:
# One flag per data file: did the file receive any of the missing standard columns?
# The mapping gets its own name so the builtin `dict` is not shadowed.
added_column_flags = {"add_top_bottom_lith_cols": change_columns}
new_metadata = pd.DataFrame(added_column_flags)
new_metadata.head()

Unnamed: 0,add_top_bottom_lith_cols
0,True
1,True
2,True
3,True
4,False


In [43]:
# Attach the new flag column to the change log.
# DataFrame.join raises when both frames share a column name, which happens as
# soon as the notebook is re-run against an already-updated metadata file, so
# any stale copy of the flag column is dropped first.
merged_metadata = (
    metadata
    .drop(columns=list(new_metadata.columns), errors='ignore')
    .join(new_metadata)
)
merged_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols,update_sample_col,update_top_bottom,update_lithology,add_top_bottom_lith_cols
0,361_macroscopic_U1474D.csv,361,True,False,True,False,True,True
1,323 Core Description Template_U1341A.csv,323,True,False,False,False,True,True
2,361_macroscopic_U1479C.csv,361,True,False,True,False,True,True
3,340_sediment_U1393A.csv,340,True,False,True,False,True,True
4,339_sediment_U1386A.csv,339,True,False,False,False,True,False


In [44]:
# Overwrite the metadata file with the merged table (row index not written).
merged_metadata.to_csv(metadata_file, index=False)