# Normalize CSV columns

Normalize the columns for the eODP CSVs.

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re

import pandas as pd
import numpy as np

from normalize_data import (
    normalize_sample_col, 
    normalize_expedition_section_cols, 
    csv_cleanup,
    update_metadata,
    fetch_unique_column_names,
    append_set,
    filter_existing_set,
    normalize_columns,
    add_missing_columns
)

In [2]:
# Lithology data set: folder holding the cleaned CSVs and the CSV that logs
# which normalization steps changed each file.
# NOTE(review): the next cell assigns the same two names for Micropal, so
# whichever of the two cells is run last decides what the notebook processes.
clean_data_path = 'cleaned_data/Lithology_CSV'
metadata_file = 'cleaned_data/metadata/Lithology_changes.csv'

In [2]:
# Micropal data set: folder holding the cleaned CSVs and the CSV that logs
# which normalization steps changed each file.
# NOTE(review): this rebinds the names set by the Lithology cell above; run
# only the cell for the data set that should be processed.
clean_data_path = 'cleaned_data/Micropal_CSV'
metadata_file = 'cleaned_data/metadata/Micropal_changes.csv'

## Normalize expedition..section columns 

Read each Lithology CSV and check whether the expedition...section columns exist. If any of these columns need to be added, overwrite the existing Lithology CSV with the updated version.

### Read metadata file

In [3]:
# Load the change-log CSV named by `metadata_file` and preview its first rows.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the file was once written with its index included - confirm upstream.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,expedition,filename_has_exp
0,361_macroscopic_U1474D.csv,361,True
1,323 Core Description Template_U1341A.csv,323,True
2,361_macroscopic_U1479C.csv,361,True
3,340_sediment_U1393A.csv,340,True
4,339_sediment_U1386A.csv,339,True


In [4]:
# Show the metadata table's dimensions as (rows, columns).
metadata.shape

(518, 3)

### Normalize columns

In [5]:
def process_filename(file):
    """Normalize the expedition..section columns of a single CSV file.

    The file at ``{clean_data_path}/{file}`` is read and passed through
    ``normalize_expedition_section_cols``. Only when the column labels differ
    afterwards is the result run through ``csv_cleanup`` and written back
    over the same file.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if the column labels changed (and the file was rewritten),
        False otherwise.
    """
    csv_path = f"{clean_data_path}/{file}"
    table = pd.read_csv(csv_path)

    # Snapshot the labels first so the before/after states can be compared.
    labels_before = list(table.columns)
    table = normalize_expedition_section_cols(table)

    if labels_before == list(table.columns):
        # Nothing was added or renamed; leave the file on disk untouched.
        return False

    csv_cleanup(table).to_csv(csv_path, index=False)
    return True
    
# One boolean per metadata row, in row order: was that file rewritten?
change_columns = list(map(process_filename, metadata['file']))


### Update metadata

In [6]:
# Record, per file, whether this step changed the CSV's columns.
# The mapping gets a descriptive name rather than `dict`, which would shadow
# the builtin type for every cell executed afterwards.
metadata_changes = {"add_expedition_section_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols
0,361_macroscopic_U1474D.csv,361,True,False
1,323 Core Description Template_U1341A.csv,323,True,False
2,361_macroscopic_U1479C.csv,361,True,False
3,340_sediment_U1393A.csv,340,True,False
4,339_sediment_U1386A.csv,339,True,False


In [7]:
# Overwrite the metadata CSV with the updated table; the index is not written.
new_metadata.to_csv(metadata_file, index=False)

## Normalize Sample column

Read each Lithology CSV and check whether the Sample column needs to be updated: rename 'Label ID' to 'Sample', or, if the file has neither a 'Sample' nor a 'Label ID' column, add a 'Sample' column built from the expedition...section columns. Overwrite the existing Lithology CSV if the Sample column was updated.

### Read metadata file

In [10]:
# Reload the metadata CSV so this section starts from what is saved on disk.
metadata = pd.read_csv(metadata_file)

In [11]:
# Show the metadata table's dimensions as (rows, columns).
metadata.shape

(518, 4)

### Normalize columns

In [12]:
def process_filename(file):
    """Normalize the Sample column of a single CSV file.

    The file at ``{clean_data_path}/{file}`` is read and handed to
    ``normalize_sample_col``. Only when the column labels differ afterwards
    is the table run through ``csv_cleanup`` and written back over the same
    file.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if the column labels changed (and the file was rewritten),
        False otherwise.
    """
    csv_path = f"{clean_data_path}/{file}"
    table = pd.read_csv(csv_path)

    labels_before = list(table.columns)
    # The helper is relied on to modify `table` in place; its return value
    # is not used.
    normalize_sample_col(table)

    if labels_before == list(table.columns):
        # Labels are unchanged; leave the file on disk untouched.
        return False

    csv_cleanup(table).to_csv(csv_path, index=False)
    return True
    
# One boolean per metadata row, in row order: was that file rewritten?
change_columns = list(map(process_filename, metadata['file']))

### Update metadata

In [13]:
# Record, per file, whether this step changed the CSV's columns.
# The mapping gets a descriptive name rather than `dict`, which would shadow
# the builtin type for every cell executed afterwards.
metadata_changes = {"update_sample_col": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols,update_sample_col
0,361_macroscopic_U1474D.csv,361,True,False,True
1,323 Core Description Template_U1341A.csv,323,True,False,False
2,361_macroscopic_U1479C.csv,361,True,False,True
3,340_sediment_U1393A.csv,340,True,False,True
4,339_sediment_U1386A.csv,339,True,False,False


In [14]:
# Overwrite the metadata CSV with the updated table; the index is not written.
new_metadata.to_csv(metadata_file, index=False)

## Normalize Top and Bottom columns

Normalize all the Top, Top Depth, Bottom, and Bottom Depth columns so that each kind of column has the same name in every file.

In [15]:
# Reload the metadata CSV so this section starts from what is saved on disk.
metadata = pd.read_csv(metadata_file)

### Get top bottom columns

In [16]:
# Collects the column labels found across every CSV listed in the metadata.
columns_all = set()

# fetch_unique_column_names is called for its effect on `columns_all`, so a
# plain loop is used instead of building a throwaway list of return values.
for file in metadata['file']:
    fetch_unique_column_names(f"{clean_data_path}/{file}", columns_all)

In [17]:
# Target sets for the Top/Bottom column-name variants found in `columns_all`.
# The *_all sets use a broad pattern and are only inspected below; the other
# four feed the renaming step.
top_all, top, top_depth = set(), set(), set()
bottom_all, bottom, bottom_depth = set(), set(), set()

# (set to fill, regular expression selecting the labels that belong in it)
for target, pattern in (
    (top_all, r".*?top.*?"),
    (top_depth, r"top depth"),
    (top, r"top offset|top ?\["),
    (bottom_all, r".*?bottom.*?"),
    (bottom_depth, r"bottom depth"),
    (bottom, r"bottom offset|bottom ?\["),
):
    append_set(target, pattern, columns_all)

In [18]:
# Display every label collected by the broad "top" pattern.
top_all

{'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top [cm]',
 'Top depth [m]',
 'Top offset [cm]',
 'Top[cm] [cm]'}

In [19]:
# Display the labels that will be renamed to 'Top [cm]'.
top

{'Top [cm]', 'Top offset [cm]', 'Top[cm] [cm]'}

In [20]:
# Display the labels that will be renamed to 'Top Depth [m]'.
top_depth

{'Top Depth [m]', 'Top Depth[m] [m]', 'Top depth [m]'}

In [21]:
# Display every label collected by the broad "bottom" pattern. The output
# also lists 'Bottom contact ...' columns; this set is for inspection only and
# is not passed to the renaming step.
bottom_all

{'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom contact attitude',
 'Bottom contact definition',
 'Bottom contact geometry',
 'Bottom contact or boundary attitude',
 'Bottom contact or boundary definition',
 'Bottom contact or boundary geometry',
 'Bottom contact or boundary geometry+definition [read only]',
 'Bottom contact or boundary type',
 'Bottom depth [m]',
 'Bottom offset [cm]',
 'Bottom[cm] [cm]'}

In [22]:
# Display the labels that will be renamed to 'Bottom [cm]'.
bottom

{'Bottom [cm]', 'Bottom offset [cm]', 'Bottom[cm] [cm]'}

In [23]:
# Display the labels that will be renamed to 'Bottom Depth [m]'.
bottom_depth

{'Bottom Depth [m]', 'Bottom Depth[m] [m]', 'Bottom depth [m]'}

### Normalize top bottom columns

In [24]:
def normalize_top_bottom(file):
    """Give the Top/Bottom columns of a single CSV their standard labels.

    The file at ``{clean_data_path}/{file}`` is read and its column labels
    are passed through ``normalize_columns`` once for each group of variants
    (``top``, ``bottom``, ``top_depth``, ``bottom_depth``). Only when the
    resulting labels differ from the originals are they assigned to the
    table, which is then run through ``csv_cleanup`` and written back over
    the same file.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if the column labels changed (and the file was rewritten),
        False otherwise.
    """
    csv_path = f"{clean_data_path}/{file}"
    table = pd.read_csv(csv_path)
    labels_before = list(table.columns)

    # (set of variant labels, standard label), applied in this order.
    renames = (
        (top, 'Top [cm]'),
        (bottom, 'Bottom [cm]'),
        (top_depth, 'Top Depth [m]'),
        (bottom_depth, 'Bottom Depth [m]'),
    )
    labels_after = labels_before
    for variants, standard in renames:
        labels_after = normalize_columns(variants, standard, labels_after)

    if labels_before == labels_after:
        # Labels already use the standard names; leave the file untouched.
        return False

    table.columns = labels_after
    csv_cleanup(table).to_csv(csv_path, index=False)
    return True

# One boolean per metadata row, in row order: was that file rewritten?
change_columns = list(map(normalize_top_bottom, metadata['file']))

### Update metadata

In [25]:
# Record, per file, whether this step changed the CSV's columns.
# The mapping gets a descriptive name rather than `dict`, which would shadow
# the builtin type for every cell executed afterwards.
metadata_changes = {"update_top_bottom": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols,update_sample_col,update_top_bottom
0,361_macroscopic_U1474D.csv,361,True,False,True,False
1,323 Core Description Template_U1341A.csv,323,True,False,False,False
2,361_macroscopic_U1479C.csv,361,True,False,True,False
3,340_sediment_U1393A.csv,340,True,False,True,False
4,339_sediment_U1386A.csv,339,True,False,False,False


In [26]:
# Overwrite the metadata CSV with the updated table; the index is not written.
new_metadata.to_csv(metadata_file, index=False)

## Add missing columns

In [27]:
# Reload the metadata CSV so this section starts from what is saved on disk.
metadata = pd.read_csv(metadata_file)

In [28]:
# Standard column labels handed to add_missing_columns for every file:
# first the four Top/Bottom labels, then the Sample and
# expedition...section labels.
normalized_columns = (
    ['Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]']
    + ['Sample', 'Exp', 'Site', 'Hole', 'Core', 'Type', 'Section', 'A/W']
)

In [29]:
# Run add_missing_columns on every file listed in the metadata, keeping one
# result per row in row order.
change_columns = [
    add_missing_columns(f"{clean_data_path}/{csv_name}", normalized_columns)
    for csv_name in metadata['file']
]

In [30]:
# Record, per file, the result of the add-missing-columns step.
# The mapping gets a descriptive name rather than `dict`, which would shadow
# the builtin type for every cell executed afterwards.
metadata_changes = {"add_missing_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,361_macroscopic_U1474D.csv,361,True,False,True,False,False
1,323 Core Description Template_U1341A.csv,323,True,False,False,False,False
2,361_macroscopic_U1479C.csv,361,True,False,True,False,False
3,340_sediment_U1393A.csv,340,True,False,True,False,False
4,339_sediment_U1386A.csv,339,True,False,False,False,False


In [31]:
# Overwrite the metadata CSV with the updated table; the index is not written.
new_metadata.to_csv(metadata_file, index=False)