# Normalize CSV columns

Normalize the columns for the eODP CSVs.

In [67]:
import sys
import os
import glob
sys.path.append('../scripts/')

import pandas as pd

from normalize_data import (
    add_sample_col, 
    add_expedition_aw_cols, 
    csv_cleanup,
    update_metadata,
    append_set,
    filter_existing_set,
    normalize_columns,
    add_missing_columns,
    create_sample_name_for_row,
    check_duplicate_columns
)

In [68]:
# Root folders referenced by the rest of the notebook.
base_directory = 'cleaned_data'
raw_data_directory = 'raw_data'

# All LIMS bookkeeping files share the same <base>/metadata/LIMS prefix.
_lims_metadata_parts = (base_directory, 'metadata', 'LIMS')

# Per-dataset change-tracking tables and the list of column names.
lithology_meta = os.path.join(*_lims_metadata_parts, 'Lithology_changes.csv')
micropal_meta = os.path.join(*_lims_metadata_parts, 'Micropal_changes.csv')
columns_file = os.path.join(*_lims_metadata_parts, 'columns_list.csv')

# Dataset type labels.
lithology_type = 'lithology'
taxa_type = 'taxa'

In [69]:
# Select the dataset this run works on: its type label and its tracking table.
data_type = lithology_type
metadata_file = lithology_meta

# Output file listing duplicate-named columns that need manual review.
# NOTE: the misspelling "reivew" is part of the on-disk filename and is kept as is.
duplicate_columns_file = os.path.join(
    base_directory,
    'metadata',
    'LIMS',
    f'duplicate_columns_needs_reivew_{data_type}.csv',
)

## 1. Remove empty rows and columns

Remove rows and columns that are completely empty (no header and no data).

In [66]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


In [5]:
def process_filename(file_path):
    """Strip completely empty rows and columns from one CSV file.

    The file is read without a header row, so the header line is treated like
    any other row and a column is dropped only when every cell in it,
    including the header cell, is blank. The file is rewritten in place only
    when something was dropped.

    Args:
        file_path: Path of the CSV relative to ``base_directory``.

    Returns:
        True if the table's shape changed and the file was rewritten,
        otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    raw = pd.read_csv(path, dtype=str, header=None)

    # Drop all-empty rows first, then all-empty columns.
    trimmed = raw.dropna(axis="index", how="all").dropna(axis="columns", how="all")

    if trimmed.shape == raw.shape:
        return False

    trimmed.to_csv(path, index=False, header=False)
    return True


# One flag per file listed in the metadata table, in the same order.
change_columns = [process_filename(csv_path) for csv_path in metadata['path']]

In [6]:
# Per-file flags for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"empty_rows_columns": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [7]:
new_metadata.to_csv(metadata_file, index=False)

## 2. Remove duplicate identical rows
Remove rows that are exact duplicates of an earlier row, keeping the first occurrence.

In [8]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False


In [9]:
def process_filename(file_path):
    """Drop exact duplicate rows from one CSV file.

    The file is rewritten (after passing through ``csv_cleanup``) only when
    at least one duplicate row was removed.

    Args:
        file_path: Path of the CSV relative to ``base_directory``.

    Returns:
        True if the row count changed and the file was rewritten,
        otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    table = pd.read_csv(path, dtype=str)

    rows_before = len(table)
    table.drop_duplicates(inplace=True)

    if len(table) == rows_before:
        return False

    table = csv_cleanup(table, path)
    table.to_csv(path, index=False)
    return True


# One flag per file listed in the metadata table, in the same order.
change_columns = [process_filename(csv_path) for csv_path in metadata['path']]

In [10]:
# Per-file flags for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"remove_identical_rows": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [11]:
new_metadata.to_csv(metadata_file, index=False)

## 3. Duplicate column names

Check whether each CSV has duplicate column names. If the headers and the values are the same, delete the duplicate column. If the headers are the same but the values are different, log the error.

In [12]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False


In [13]:
# Shared list handed to check_duplicate_columns for every file; it is written
# to the review CSV later in the notebook.
needs_review = []


def process_filename(file_path):
    """Resolve duplicate-named columns in one CSV file.

    Delegates to ``check_duplicate_columns``, which receives the table, the
    relative file path and the shared ``needs_review`` list. The file is
    rewritten (after ``csv_cleanup``) only when the number of columns changed.
    NOTE(review): this relies on ``check_duplicate_columns`` modifying the
    table in place — confirm against its implementation.

    Args:
        file_path: Path of the CSV relative to ``base_directory``.

    Returns:
        True if the column count changed and the file was rewritten,
        otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    table = pd.read_csv(path, dtype=str)

    column_count_before = len(table.columns)
    check_duplicate_columns(table, file_path, needs_review)

    if len(table.columns) == column_count_before:
        return False

    table = csv_cleanup(table, path)
    table.to_csv(path, index=False)
    return True


# One flag per file listed in the metadata table, in the same order.
change_columns = [process_filename(csv_path) for csv_path in metadata['path']]

Lithology_CSV/323 Core Description Template_U1341A.csv, GRAVEL SIZE CLAST: duplicate columns
                        have different values


In [14]:
# Per-file flags for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"remove_identical_columns": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [15]:
new_metadata.to_csv(metadata_file, index=False)

### Create file for columns that need to be reviewed

In [16]:
df = pd.DataFrame(needs_review)
df.to_csv(duplicate_columns_file, index=False)

## 4. Standardize headers

Find all variants for specified headers, and convert all the variants to the same term.

In [70]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


### Find all variants for specified headers

In [22]:
# Collect the column names for the selected data type.
# NOTE(review): `get_columns_from_file_or_disk` is not among the names imported
# from `normalize_data` at the top of this notebook — confirm where it is
# defined before re-running from a fresh kernel.
columns_all = get_columns_from_file_or_disk(
    columns_file=columns_file,
    metadata=metadata,
    data_directory=raw_data_directory,
    column_type=data_type,
)

len(columns_all)

564

In [27]:
def _collect_headers(pattern):
    """Return a new set filled by ``append_set`` for ``pattern`` over ``columns_all``."""
    matches = set()
    append_set(matches, pattern, columns_all)
    return matches


# Broad "*_all" sets are for eyeballing every candidate header; the narrower
# sets (sample, top, top_depth, bottom, bottom_depth) are the ones used for
# renaming further down.
sample_all = _collect_headers(r".*?(sample|label|labl).*?")
sample = _collect_headers(r"^(sample|labe?l id)$")

exp_all = _collect_headers(r".*?exp.*?")
site_all = _collect_headers(r".*?site.*?")
hole_all = _collect_headers(r".*?hole.*?")
core_all = _collect_headers(r".*?core.*?")
type_all = _collect_headers(r".*?type.*?")
section_all = _collect_headers(r".*?sect.*?")
aw_all = _collect_headers(r".*?a/?w.*?")
extra_all = _collect_headers(r".*?extra.*?")

top_all = _collect_headers(r".*?top.*?")
top_depth = _collect_headers(r"top depth")
top = _collect_headers(r"top offset|top ?\[")

bottom_all = _collect_headers(r".*?bottom.*?")
bottom_depth = _collect_headers(r"bottom depth")
bottom = _collect_headers(r"bottom offset|bottom ?\[")

In [28]:
sample_all

{'Extra Sample ID Data',
 'Label ID',
 'Sample',
 'Sample domain name (if >1 domain)',
 'Sample domain number (if >1 domain)'}

In [29]:
sample

{'Label ID', 'Sample'}

In [30]:
exp_all

{'Exp'}

In [31]:
site_all

{'Site'}

In [32]:
hole_all

{'Hole'}

In [33]:
core_all

{'Core', 'Core-Sect'}

In [34]:
type_all

{'Bioturbation type',
 'Bottom contact or boundary type',
 'Contact or boundary type',
 'Contact type',
 'Drilling disturbance type',
 'TEPHRA type',
 'Type',
 'Type and sense of shear'}

In [35]:
section_all

{'Core-Sect', 'Section'}

In [36]:
aw_all

{'A/W'}

In [37]:
extra_all

{'Extra Sample ID Data'}

In [38]:
top_all

{'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top [cm]',
 'Top depth [m]',
 'Top offset [cm]',
 'Top[cm] [cm]'}

In [39]:
top

{'Top [cm]', 'Top offset [cm]', 'Top[cm] [cm]'}

In [40]:
top_depth

{'Top Depth [m]', 'Top Depth[m] [m]', 'Top depth [m]'}

In [41]:
bottom_all

{'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom contact attitude',
 'Bottom contact definition',
 'Bottom contact geometry',
 'Bottom contact or boundary attitude',
 'Bottom contact or boundary definition',
 'Bottom contact or boundary geometry',
 'Bottom contact or boundary geometry+definition [read only]',
 'Bottom contact or boundary type',
 'Bottom depth [m]',
 'Bottom offset [cm]',
 'Bottom[cm] [cm]'}

In [42]:
bottom

{'Bottom [cm]', 'Bottom offset [cm]', 'Bottom[cm] [cm]'}

In [43]:
bottom_depth

{'Bottom Depth [m]', 'Bottom Depth[m] [m]', 'Bottom depth [m]'}

### Standardize header names

In [44]:
def process_filename(file_path):
    """Rename header variants in one CSV file to their canonical names.

    Each variant set built earlier in the notebook (``sample``, ``top``,
    ``bottom``, ``top_depth``, ``bottom_depth``) is passed to
    ``normalize_columns`` together with the canonical header it maps to. The
    file is rewritten (after ``csv_cleanup``) only when the resulting header
    list differs from the original one.

    Args:
        file_path: Path of the CSV relative to ``base_directory``.

    Returns:
        The result of comparing the original and normalized header lists:
        True when they differ (file rewritten), otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    content = pd.read_csv(path, dtype=str)
    columns = list(content.columns)

    # (variant set, canonical header) pairs, applied in this order.
    renames = (
        (sample, 'Sample'),
        (top, 'Top [cm]'),
        (bottom, 'Bottom [cm]'),
        (top_depth, 'Top Depth [m]'),
        (bottom_depth, 'Bottom Depth [m]'),
    )

    normalized_cols = columns
    for variants, canonical in renames:
        normalized_cols = normalize_columns(variants, canonical, normalized_cols)

    headers_changed = columns != normalized_cols
    if headers_changed:
        content.columns = normalized_cols
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return headers_changed


# One flag per file listed in the metadata table, in the same order.
change_columns = [process_filename(csv_path) for csv_path in metadata['path']]

In [45]:
# Per-file flags for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"standardize_headers": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [46]:
new_metadata.to_csv(metadata_file, index=False)

## 5. Convert Sample to missing expedition...A/W columns

If the expedition...A/W columns do not exist, convert the Sample string into expedition...A/W columns.

In [60]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,update_lithology,add_missing_lith_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True,True,True,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False,True,True,True
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True,True,True,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False,True,True,True
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False,True,True,False


In [48]:
def process_filename(file_path):
    """Run ``add_expedition_aw_cols`` on one CSV file.

    The table returned by ``add_expedition_aw_cols`` replaces the loaded one.
    The file is rewritten (after ``csv_cleanup``) only when the list of
    column names differs from the one read from disk.

    Args:
        file_path: Path of the CSV relative to ``base_directory``.

    Returns:
        True if the column list changed and the file was rewritten,
        otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    content = pd.read_csv(path, dtype=str)

    headers_before = list(content.columns)
    content = add_expedition_aw_cols(content)

    if headers_before == list(content.columns):
        return False

    content = csv_cleanup(content, path)
    content.to_csv(path, index=False)
    return True


# One flag per file listed in the metadata table, in the same order.
change_columns = [process_filename(csv_path) for csv_path in metadata['path']]


In [49]:
# Per-file flags for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"add_expedition_aw_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [50]:
new_metadata.to_csv(metadata_file, index=False)

## 6. Add Sample column

If there is no 'Sample' column, add one based on the expedition...A/W columns.

In [51]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False


In [52]:
def process_filename(file_path):
    """Run ``add_sample_col`` on one CSV file.

    ``add_sample_col`` is called for its effect on the loaded table; its
    return value is not used. The file is rewritten (after ``csv_cleanup``)
    only when the list of column names differs afterwards.
    NOTE(review): this relies on ``add_sample_col`` modifying the table in
    place — confirm against its implementation.

    Args:
        file_path: Path of the CSV relative to ``base_directory``.

    Returns:
        True if the column list changed and the file was rewritten,
        otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    content = pd.read_csv(path, dtype=str)

    headers_before = list(content.columns)
    add_sample_col(content)

    if headers_before == list(content.columns):
        return False

    content = csv_cleanup(content, path)
    content.to_csv(path, index=False)
    return True


# One flag per file listed in the metadata table, in the same order.
change_columns = [process_filename(csv_path) for csv_path in metadata['path']]

In [53]:
# Per-file flags for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"add_sample_column": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [54]:
new_metadata.to_csv(metadata_file, index=False)

## 7. Add missing columns

In [55]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column
0,361_macroscopic_U1474D.csv,Lithology_CSV/361_macroscopic_U1474D.csv,False,False,False,False,False,True
1,323 Core Description Template_U1341A.csv,Lithology_CSV/323 Core Description Template_U1...,False,False,False,False,False,False
2,361_macroscopic_U1479C.csv,Lithology_CSV/361_macroscopic_U1479C.csv,False,False,False,False,False,True
3,340_sediment_U1393A.csv,Lithology_CSV/340_sediment_U1393A.csv,False,False,False,True,False,False
4,339_sediment_U1386A.csv,Lithology_CSV/339_sediment_U1386A.csv,False,False,False,False,False,False


In [56]:
# Canonical headers passed to add_missing_columns for every CSV.
# Order is kept exactly as listed.
normalized_columns = [
    # top/bottom offsets and depths
    'Top [cm]', 'Bottom [cm]',
    'Top Depth [m]', 'Bottom Depth [m]',
    # sample label
    'Sample',
    # remaining identifier columns
    'Exp', 'Site', 'Hole', 'Core', 'Type', 'Section', 'A/W',
    'Extra Sample ID Data',
]

In [57]:
# Call add_missing_columns once per file listed in the metadata table and
# keep its return values in file order.
change_columns = [
    add_missing_columns(f"{base_directory}/{file}", normalized_columns)
    for file in metadata['path']
]

In [58]:
# Per-file results for this step, keyed by the metadata column they belong to.
# Named `metadata_updates` rather than `dict` so the builtin `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"add_missing_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [59]:
new_metadata.to_csv(metadata_file, index=False)