# Normalize CSV columns

Normalize the columns for the eODP CSVs.

In [1]:
import sys
import glob
sys.path.append('../scripts/')

import pandas as pd

from normalize_data import (
    normalize_sample_col, 
    normalize_expedition_section_cols, 
    csv_cleanup,
    update_metadata,
    fetch_unique_column_names,
    append_set,
    filter_existing_set,
    normalize_columns,
    add_missing_columns,
    check_duplicate_columns
)

In [2]:
# Each LIMS dataset is described by a pair of paths: the directory holding its
# cleaned CSVs, and the metadata CSV that records which normalization steps
# changed each file.
lithology, lithology_meta = (
    'cleaned_data/Lithology_CSV',
    'cleaned_data/metadata/lims_lithology_changes.csv',
)

micropal_1, micropal_meta_1 = (
    'cleaned_data/Micropal_CSV_1',
    'cleaned_data/metadata/lims_micropal_1_changes.csv',
)

micropal_2, micropal_meta_2 = (
    'cleaned_data/Micropal_CSV_2',
    'cleaned_data/metadata/lims_micropal_2_changes.csv',
)

micropal_3, micropal_meta_3 = (
    'cleaned_data/Micropal_CSV_3',
    'cleaned_data/metadata/lims_micropal_3_changes.csv',
)

micropal_4, micropal_meta_4 = (
    'cleaned_data/Micropal_CSV_revised',
    'cleaned_data/metadata/lims_micropal_revised_changes.csv',
)

# Every dataset directory defined above.
all_LIMS = [lithology, micropal_1, micropal_2, micropal_3, micropal_4]

In [3]:
# Select the dataset (CSV directory + metadata file) that every later cell
# operates on.
clean_data_path, metadata_file = micropal_4, micropal_meta_4

## Remove empty rows and columns

Remove rows and columns that are completely empty (no header and no data).

In [4]:
# Load the change-tracking metadata; its 'file' column lists the CSVs that the
# following cells process.
metadata = pd.read_csv(metadata_file)
metadata.head()  # preview the first rows

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [5]:
def process_filename(file):
    """Strip fully empty rows and columns from one CSV, rewriting it if needed.

    The file is read without a header row (``header=None``), so the header
    line is treated as ordinary data: a column is dropped only when every
    cell in it, header included, is empty.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if the table's shape changed and the file was overwritten,
        otherwise False.
    """
    path = f"{clean_data_path}/{file}"
    raw = pd.read_csv(path, dtype=str, header=None)

    trimmed = raw.dropna(axis="index", how="all").dropna(axis="columns", how="all")
    changed = raw.shape != trimmed.shape

    if changed:
        trimmed.to_csv(path, index=False, header=False)

    return changed


# One changed/unchanged flag per file listed in the metadata, in the same order.
change_columns = list(map(process_filename, metadata['file']))

In [6]:
# Hand the per-file flags to update_metadata under the "empty_rows_columns"
# key (update_metadata is a project helper; presumably it writes them into the
# metadata column of that name -- confirm in normalize_data).
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(metadata, {"empty_rows_columns": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [7]:
# Write the updated metadata back to the same CSV, omitting the index column.
new_metadata.to_csv(metadata_file, index=False)

## Remove duplicate identical rows
Remove rows that are exact duplicates of an earlier row, keeping the first occurrence.

In [8]:
# Reload the metadata saved by the previous step so this step starts from the
# on-disk state.
metadata = pd.read_csv(metadata_file)
metadata.head()  # preview the first rows

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [9]:
def process_filename(file):
    """Drop exact duplicate rows from one CSV, rewriting it if any were found.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if at least one row was removed and the file was overwritten,
        otherwise False.
    """
    path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(path, dtype=str)

    deduped = frame.drop_duplicates()
    changed = len(frame) != len(deduped)

    if changed:
        # csv_cleanup is a project helper; what it tidies is not visible here.
        deduped = csv_cleanup(deduped, path)
        deduped.to_csv(path, index=False)

    return changed


# One changed/unchanged flag per file listed in the metadata, in the same order.
change_columns = list(map(process_filename, metadata['file']))

In [10]:
# Hand the per-file flags to update_metadata under the "remove_identical_rows"
# key (update_metadata is a project helper; presumably it writes them into the
# metadata column of that name -- confirm in normalize_data).
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(metadata, {"remove_identical_rows": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [11]:
# Write the updated metadata back to the same CSV, omitting the index column.
new_metadata.to_csv(metadata_file, index=False)

## Duplicate column names

Check whether any CSV has duplicate column names.

In [12]:
def duplicate_columns(directory):
    """Run the duplicate-column check on every CSV directly inside *directory*.

    Columns that hold no data at all are dropped before each check. Prints
    ``done`` once every file has been processed.

    Args:
        directory: Folder searched (non-recursively) for ``*.csv`` files.
    """
    for csv_path in glob.glob(f"{directory}/*.csv"):
        frame = pd.read_csv(csv_path, dtype=str)
        populated = frame.dropna(axis='columns', how='all')

        # Reporting is delegated to the project helper; how it signals a
        # duplicate is not visible from this notebook.
        check_duplicate_columns(populated, csv_path)
    print('done')
            
# Check every CSV of the currently selected dataset.
duplicate_columns(clean_data_path)

done


## Find all expedition...section variants

Check whether the expedition...section columns appear under variant names.

In [13]:
# Gather every distinct column name used by the CSVs in the selected
# directory. Only one data row per file is read because just the header is
# needed.
raw_csvs = glob.glob(f"{clean_data_path}/*.csv")

columns_all = set()
for path in raw_csvs:
    df = pd.read_csv(path, nrows=1)
    columns_all |= set(df.columns)

len(columns_all)

179

In [14]:
# One result set per sample-identifier column. append_set is a project helper;
# presumably it adds to the given set every name in columns_all that matches
# the pattern -- confirm in normalize_data.
# NOTE: `type` shadows the builtin; the name is kept because a later cell
# displays the set under that name.
exp, site, hole, core, type, section, aw, extra = (set() for _ in range(8))

for _variants, _pattern in (
    (exp, r"exp"),
    (site, r"site"),
    (hole, r"hole"),
    (core, r"core"),
    (type, r"type"),
    (section, r"sect"),
    (aw, r"a/?w"),
    (extra, r"extra"),
):
    append_set(_variants, _pattern, columns_all)

In [15]:
# Column names collected for the "exp" pattern.
exp

{'Exp'}

In [16]:
# Column names collected for the "site" pattern.
site

{'Site'}

In [17]:
# Column names collected for the "hole" pattern.
hole

{'Hole'}

In [18]:
# Column names collected for the "core" pattern.
core

{'Core'}

In [19]:
# Column names collected for the "type" pattern (this variable shadows the
# builtin `type`).
type

{'Type'}

In [20]:
# Column names collected for the "sect" pattern.
section

{'Section'}

In [21]:
# Column names collected for the "a/?w" pattern.
aw

{'A/W'}

In [22]:
# Column names collected for the "extra" pattern.
extra

{'Extra Sample ID Data'}

## Normalize expedition..section columns 

Read each CSV listed in the metadata to check whether the expedition...section columns (Exp, Site, Hole, Core, Type, Section, A/W) exist. Overwrite the existing CSV if columns need to be added.

In [23]:
# Reload the metadata saved by the previous step so this step starts from the
# on-disk state.
metadata = pd.read_csv(metadata_file)
metadata.head()  # preview the first rows

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [24]:
def process_filename(file):
    """Normalize the expedition...section columns of one CSV.

    The file is rewritten only when the list of column names differs after
    ``normalize_expedition_section_cols`` has been applied.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if the column list changed and the file was overwritten,
        otherwise False.
    """
    path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(path, dtype=str)

    columns_before = list(frame.columns)
    frame = normalize_expedition_section_cols(frame)

    if columns_before == list(frame.columns):
        return False

    # csv_cleanup is a project helper; what it tidies is not visible here.
    frame = csv_cleanup(frame, path)
    frame.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file listed in the metadata, in the same order.
change_columns = list(map(process_filename, metadata['file']))


In [25]:
# Hand the per-file flags to update_metadata under the
# "add_expedition_section_cols" key (update_metadata is a project helper;
# presumably it writes them into the metadata column of that name -- confirm
# in normalize_data).
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(
    metadata, {"add_expedition_section_cols": change_columns}
)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [26]:
# Write the updated metadata back to the same CSV, omitting the index column.
new_metadata.to_csv(metadata_file, index=False)

## Normalize Sample column

Read each CSV listed in the metadata to check whether the Sample column needs to be updated. Rename 'Label ID' to 'Sample'. If a file has neither 'Sample' nor 'Label ID', add a 'Sample' column built from the expedition...section columns. Overwrite the existing CSV if the Sample column was updated.

In [27]:
# Reload the metadata saved by the previous step.
metadata = pd.read_csv(metadata_file)

In [28]:
# (rows, columns) of the metadata table.
metadata.shape

(1, 9)

In [29]:
def process_filename(file):
    """Normalize the Sample column of one CSV.

    ``normalize_sample_col`` is called for its effect on the frame (its return
    value is ignored); the file is rewritten only when the list of column
    names differs afterwards.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if the column list changed and the file was overwritten,
        otherwise False.
    """
    path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(path, dtype=str)

    columns_before = list(frame.columns)
    normalize_sample_col(frame)

    if columns_before == list(frame.columns):
        return False

    # csv_cleanup is a project helper; what it tidies is not visible here.
    frame = csv_cleanup(frame, path)
    frame.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file listed in the metadata, in the same order.
change_columns = list(map(process_filename, metadata['file']))

In [30]:
# Hand the per-file flags to update_metadata under the "update_sample_col"
# key (update_metadata is a project helper; presumably it writes them into the
# metadata column of that name -- confirm in normalize_data).
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(metadata, {"update_sample_col": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [31]:
# Write the updated metadata back to the same CSV, omitting the index column.
new_metadata.to_csv(metadata_file, index=False)

## Normalize Top and Bottom columns

Normalize all the Top, Top Depth, Bottom, and Bottom Depth columns so that they have the same names in every file.

In [32]:
# Reload the metadata saved by the previous step.
metadata = pd.read_csv(metadata_file)

### Get top bottom columns

In [33]:
# Rebuild the set of column names from the files listed in the metadata.
# fetch_unique_column_names is a project helper that receives the set;
# presumably it adds each file's column names to it -- confirm in
# normalize_data. Its per-file return values are kept in `res`.
columns_all = set()

res = [
    fetch_unique_column_names(f"{clean_data_path}/{name}", columns_all)
    for name in metadata['file']
]

In [34]:
# Result sets for the top/bottom column variants. The *_all sets use a broad
# "contains" pattern and serve as a cross-check; the narrower sets are the
# ones consumed by the normalization step.
top_all, top, top_depth = set(), set(), set()
bottom_all, bottom, bottom_depth = set(), set(), set()

extra = set()

# append_set is a project helper; presumably it adds to the given set every
# name in columns_all that matches the pattern -- confirm in normalize_data.
for _matches, _pattern in (
    (top_all, r".*?top.*?"),
    (top_depth, r"top depth"),
    (top, r"top offset|top ?\["),
    (bottom_all, r".*?bottom.*?"),
    (bottom_depth, r"bottom depth"),
    (bottom, r"bottom offset|bottom ?\["),
):
    append_set(_matches, _pattern, columns_all)

In [35]:
# Every column name matched by the broad "top" pattern; it can also pick up
# unrelated names that merely contain those letters.
top_all

{'Calcidiscus leptoporus', 'Top Depth [m]', 'Top [cm]'}

In [36]:
# Column names matched by the "top offset|top ?\[" pattern.
top

{'Top [cm]'}

In [37]:
# Column names matched by the "top depth" pattern.
top_depth

{'Top Depth [m]'}

In [38]:
# Every column name matched by the broad "bottom" pattern.
bottom_all

{'Bottom Depth [m]', 'Bottom [cm]'}

In [39]:
# Column names matched by the "bottom offset|bottom ?\[" pattern.
bottom

{'Bottom [cm]'}

In [40]:
# Column names matched by the "bottom depth" pattern.
bottom_depth

{'Bottom Depth [m]'}

### Normalize top bottom columns

In [41]:
def normalize_top_bottom(file):
    """Rename one CSV's top/bottom columns to their canonical names.

    Each variant set (``top``, ``bottom``, ``top_depth``, ``bottom_depth``) is
    passed with its canonical name to ``normalize_columns``, feeding the
    result of one call into the next. The file is rewritten only when the
    resulting column list differs from the original.

    Args:
        file: CSV file name, relative to ``clean_data_path``.

    Returns:
        True if any column name changed and the file was overwritten,
        otherwise False.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path, dtype=str)
    columns = list(content.columns)

    renames = (
        (top, 'Top [cm]'),
        (bottom, 'Bottom [cm]'),
        (top_depth, 'Top Depth [m]'),
        (bottom_depth, 'Bottom Depth [m]'),
    )
    normalized_cols = columns
    for variants, canonical in renames:
        normalized_cols = normalize_columns(variants, canonical, normalized_cols)

    if columns == normalized_cols:
        return False

    content.columns = normalized_cols
    # csv_cleanup is a project helper; what it tidies is not visible here.
    content = csv_cleanup(content, path)
    content.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file listed in the metadata, in the same order.
change_columns = list(map(normalize_top_bottom, metadata['file']))

In [42]:
# Hand the per-file flags to update_metadata under the "update_top_bottom"
# key (update_metadata is a project helper; presumably it writes them into the
# metadata column of that name -- confirm in normalize_data).
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(metadata, {"update_top_bottom": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [43]:
# Write the updated metadata back to the same CSV, omitting the index column.
new_metadata.to_csv(metadata_file, index=False)

## Add missing columns

In [44]:
# Reload the metadata saved by the previous step.
metadata = pd.read_csv(metadata_file)

In [45]:
# Canonical column names handed to add_missing_columns for every file.
normalized_columns = [
    # interval offsets and depths
    'Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]',
    # sample label
    'Sample',
    # sample identifier parts
    'Exp', 'Site', 'Hole', 'Core', 'Type', 'Section', 'A/W',
    'Extra Sample ID Data',
]

In [46]:
# Call add_missing_columns once per file listed in the metadata and keep each
# result (presumably a changed/unchanged flag, given how it is stored next).
change_columns = [
    add_missing_columns(f"{clean_data_path}/{name}", normalized_columns)
    for name in metadata['file']
]

In [47]:
# Hand the per-file flags to update_metadata under the "add_missing_cols" key
# (update_metadata is a project helper; presumably it writes them into the
# metadata column of that name -- confirm in normalize_data).
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the builtin for the rest of the session.
new_metadata = update_metadata(metadata, {"add_missing_cols": change_columns})
new_metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-nannofossils_revised.csv,Micropal_CSV_revised/363-U1482A-nannofossils_r...,nannofossils,False,False,True,False,False,False


In [48]:
# Write the updated metadata back to the same CSV, omitting the index column.
new_metadata.to_csv(metadata_file, index=False)