# Normalize CSV columns

Normalize the columns for the Lithology CSVs.

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re

import pandas as pd
import numpy as np

from normalize_data import (
    normalize_sample_col, 
    normalize_expedition_section_cols, 
    csv_cleanup,
    update_metadata
)

In [2]:
# CSV that records, per Lithology file, which normalization steps changed it.
metadata_file = 'cleaned_data/metadata/Lithology_changes.csv'
# Directory containing the Lithology CSVs that get normalized.
clean_data_path = 'cleaned_data/Lithology_CSV'

## Normalize expedition..section columns 

Read each Lithology CSV to check if expedition..section columns exist. Overwrite existing Lithology CSV if columns need to be added.

### Read metadata file

In [3]:
# Load the change log; each row describes one Lithology CSV (see the `file` column).
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,expedition,filename_has_exp
0,361_macroscopic_U1474D.csv,361,True
1,323 Core Description Template_U1341A.csv,323,True
2,361_macroscopic_U1479C.csv,361,True
3,340_sediment_U1393A.csv,340,True
4,339_sediment_U1386A.csv,339,True


In [4]:
# Sanity check: (rows, columns) of the metadata table.
metadata.shape

(518, 3)

### Normalize columns

In [5]:
def process_filename(file):
    """Normalize the expedition..section columns of one Lithology CSV.

    Reads ``{clean_data_path}/{file}``, passes the frame through
    ``normalize_expedition_section_cols`` and compares the column labels
    before and after. Only when the labels differ is the frame run through
    ``csv_cleanup`` and written back over the original file.

    Args:
        file: CSV file name relative to ``clean_data_path``.

    Returns:
        True if the column labels changed (and the file was rewritten),
        False otherwise.
    """
    csv_path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(csv_path)

    cols_before = frame.columns
    frame = normalize_expedition_section_cols(frame)

    # Nothing to do when the labels (names and order) are unchanged.
    if list(cols_before) == list(frame.columns):
        return False

    # Overwrite the source CSV in place, without the row index.
    csv_cleanup(frame).to_csv(csv_path, index=False)
    return True
    
# One boolean per metadata row: True when that file's columns were rewritten.
change_columns = list(map(process_filename, metadata['file']))


### Update metadata

In [6]:
# Attach the per-file results as a new metadata column.
# The mapping is deliberately not named `dict`: that would shadow the builtin
# for every later cell in the notebook session.
new_columns = {"add_expedition_section_cols": change_columns}
new_metadata = update_metadata(metadata, new_columns)
new_metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols
0,361_macroscopic_U1474D.csv,361,True,False
1,323 Core Description Template_U1341A.csv,323,True,False
2,361_macroscopic_U1479C.csv,361,True,False
3,340_sediment_U1393A.csv,340,True,False
4,339_sediment_U1386A.csv,339,True,False


In [7]:
# Write the updated change log back over the metadata file (row index omitted).
new_metadata.to_csv(metadata_file, index=False)

## Normalize Sample column

Read each Lithology CSV to check whether the Sample column needs to be updated. Rename 'Label ID' to 'Sample'. If neither 'Sample' nor 'Label ID' exists, add a 'Sample' column based on the expedition..section columns. Overwrite the existing Lithology CSV if the Sample column is updated.

### Read metadata file

In [9]:
# Reload the change log from disk so this section starts from the saved state.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,expedition,filename_has_exp,add_expedition_section_cols
0,361_macroscopic_U1474D.csv,361,True,False
1,323 Core Description Template_U1341A.csv,323,True,False
2,361_macroscopic_U1479C.csv,361,True,False
3,340_sediment_U1393A.csv,340,True,False
4,339_sediment_U1386A.csv,339,True,False


In [10]:
# Sanity check: (rows, columns) of the reloaded metadata table.
metadata.shape

(518, 4)

### Normalize columns

In [11]:
def process_filename(file):
    """Normalize the Sample column of one Lithology CSV.

    Reads ``{clean_data_path}/{file}``, calls ``normalize_sample_col`` on the
    frame and compares the column labels before and after. The file is
    overwritten only when the labels differ.

    Args:
        file: CSV file name relative to ``clean_data_path``.

    Returns:
        True if the column labels changed (and the file was rewritten),
        False otherwise.
    """
    csv_path = f"{clean_data_path}/{file}"
    frame = pd.read_csv(csv_path)

    cols_before = frame.columns
    # The return value is discarded, so this relies on the helper modifying
    # `frame` in place.
    normalize_sample_col(frame)

    # Nothing to do when the labels (names and order) are unchanged.
    if list(cols_before) == list(frame.columns):
        return False

    # Overwrite the source CSV in place, without the row index.
    frame.to_csv(csv_path, index=False)
    return True
    
# One boolean per metadata row: True when that file's columns were rewritten.
change_columns = list(map(process_filename, metadata['file']))

### Update metadata

In [12]:
# Attach the per-file results as a new metadata column.
# The mapping is deliberately not named `dict`: that would shadow the builtin
# for every later cell in the notebook session.
new_columns = {"update_sample_col": change_columns}
new_metadata = update_metadata(metadata, new_columns)
new_metadata.head()

Unnamed: 0,update_sample_col
0,True
1,False
2,True
3,True
4,False


In [29]:
# Write the updated change log back over the metadata file (row index omitted).
new_metadata.to_csv(metadata_file, index=False)