# Normalize LIMS columns

Normalize the columns for the eODP CSVs.

In [14]:
import sys
sys.path.append('../../../')
from pathlib import Path

import pandas as pd
import shutil

from config import CLEAN_DATA_DIR, OUTPUT_DIR
from scripts.normalize_data import (
    normalize_sample_col, 
    normalize_expedition_section_cols, 
    csv_cleanup,
    update_metadata,
    fetch_unique_column_names,
    append_set,
    filter_existing_set,
    normalize_columns,
    add_missing_columns,
    extract_sample_parts,
    create_sample_name_for_row
)

In [2]:
base_dir = CLEAN_DATA_DIR

metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 



In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## check basic columns

check if files have basic columns

In [4]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False


In [5]:
bad_files = set()
for file in metadata['path']:     
    
    path = base_dir/file
    df = pd.read_csv(path, dtype=str, nrows=0)
    cols = df.columns
    if ('Sample' in cols or 'Label ID' in cols) or (
        'Exp' in cols
        and 'Hole' in cols 
        and 'Site' in cols 
        and 'Core' in cols 
        and 'Type' in cols 
        and 'Section' in cols
        ):
        pass
    else:
        bad_files.add(path)
        

        
len(bad_files)

0

In [6]:
bad_files

set()

## Normalize expedition..section columns 

Read each Lithology CSV to check if expedition..section columns exist. Overwrite existing Lithology CSV if columns need to be added.

In [4]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False


In [5]:
metadata.shape

(1253, 9)

In [6]:
def process_filename(file):
    path = base_dir/file
    content = pd.read_csv(path, dtype=str)
    
    original_cols = content.columns
    try:
        content = normalize_expedition_section_cols(content)
    except ValueError as err:
        print(file, err)
    changed = list(original_cols) != list(content.columns)

        
    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
change_columns = [process_filename(file) for file in metadata['path']] 


LIMS/Micropal_CSV_4/320_U1334A_106_T06_planktic forams.csv Sample name uses wrong format.
LIMS/Micropal_CSV_4/321_U1338_nannofossils.csv Sample name uses wrong format.


In [7]:
dict = {"add_expedition_section_cols": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False


In [8]:
new_metadata.to_csv(metadata_file, index=False)

## Normalize Sample column

Read each  CSV to check if Sample column needs to be updated. Change 'Label ID' to 'Sample'. Add 'Sample' if no 'Sample' or 'Label ID' based on expedition...section columns. Overwrite existing Lithology CSV if Sample column is updated.

In [9]:
metadata = pd.read_csv(metadata_file)
metadata.head(2)

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False


In [10]:
metadata.shape

(1253, 10)

In [15]:



path = base_dir/'LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv'
content = pd.read_csv(path, dtype=str)
for index, row in content.iterrows():
    print(index)
    create_sample_name_for_row(row, content.columns)

0


KeyError: 'A/W'

In [12]:
def process_filename(file):
    path = base_dir/file
    content = pd.read_csv(path, dtype=str)
    print(file)
    
    original_cols = content.columns
    normalize_sample_col(content)
    
    changed = list(original_cols) != list(content.columns)
    
    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
change_columns = [process_filename(file) for file in metadata['path']] 

LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv
LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv
LIMS/Micropal_CSV_1/375_U1518F_planktic_forams.csv
LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv
LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv
LIMS/Micropal_CSV_1/363-U1483A-benthic_forams.csv
LIMS/Micropal_CSV_1/320_U1335A_Nannofossils_1.csv
LIMS/Micropal_CSV_1/318_U1359D_Diatoms_2.csv
LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_1.csv
LIMS/Micropal_CSV_1/318_U1355A_Silicoflagellates.csv
LIMS/Micropal_CSV_1/318_U1356A_Benthic_Forams.csv
LIMS/Micropal_CSV_1/363-U1487A-nannofossils.csv
LIMS/Micropal_CSV_1/363-U1486B-benthic_forams.csv
LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_2.csv
LIMS/Micropal_CSV_1/320_U1335A_Nannofossils_2.csv
LIMS/Micropal_CSV_1/318_U1359D_Diatoms_1.csv
LIMS/Micropal_CSV_1/318_U1360A_Chrysophyte Cysts.csv
LIMS/Micropal_CSV_1/318_U1359A_Palynology.csv
LIMS/Micropal_CSV_1/375_U1518F_nannofossils.csv
LIMS/Micropal_CSV_1/363-U1483A-nannofossils.csv
LIMS/Micropal_CSV

ValueError: File does not have the expected columns.

In [None]:
dict = {"update_sample_col": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

In [None]:
new_metadata.to_csv(metadata_file, index=False)

## Normalize Top and Bottom columns

Normalize all the Top, Top Depth, Bottom, and Bottom Depth column to have the same names.

In [51]:
metadata = pd.read_csv(metadata_file)

### Get top bottom columns

In [52]:
columns_all = set()

res=[fetch_unique_column_names(base_dir/file, columns_all) for file in metadata['path']] 

In [60]:
top_all = set()
top = set()
top_depth = set()

bottom_all = set()
bottom = set()
bottom_depth = set()

depth = set()

append_set(depth, r".*?depth.*?", columns_all)

append_set(top_all, r".*?top.*?", columns_all)
append_set(top_depth, r"top depth", columns_all)
append_set(top, r"top offset|top ?\[", columns_all)

append_set(bottom_all, r".*?bottom.*?", columns_all)
append_set(bottom_depth, r"bottom depth", columns_all)
append_set(bottom, r"bottom offset|bottom ?\[", columns_all)

In [61]:
depth

{'Bottom Depth (m)',
 'Bottom Depth (m) CSF-A',
 'Bottom Depth CSF-A (m)',
 'Bottom Depth [CFS m]',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom depth CSF-B (m)',
 'Bottom depth CSF-B (m):',
 'Bottom depth [m]',
 'Depth (cm)',
 'Depth (csf)',
 'Depth (m) CSF-A',
 'Depth CSF (m)',
 'Depth CSF-A (m)',
 'Depth Method',
 'Depth bottom CSF-A (m)',
 'Depth m (m csf)',
 'Depth top CSF-A (m)',
 'Original Bottom Depth (m)',
 'Original Top Depth (m)',
 'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
 'Top Depth (CSF m)',
 'Top Depth (m)',
 'Top Depth (m) CSF-A',
 'Top Depth CFS (m)',
 'Top Depth CSF-A (m)',
 'Top Depth [CFS m]',
 'Top Depth [CSF m]',
 'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top depth CSF (m)',
 'Top depth CSF-B (m)',
 'Top depth CSF-B (m):',
 'Top depth [CSF m]',
 'Top depth [m]',
 'depth Bottom (m CSF-A)',
 'depth Bottom (m)',
 'depth Bottom CSF-A (m)',
 'depth CSF-A',
 'depth CSF-A (m)',
 'depth CSF-A Bottom (m)',
 'depth CSF-A Top (m)',
 'depth

In [54]:
top_all

{'Acarinina praetopilensis',
 'Acarinina praetopilensis Blow, 1979',
 'Acarinina pseudotopilensis',
 'Acarinina pseudotopilensis Subbotina, 1953',
 'Acarinina topilensis',
 'Acarinina topilensis (Cushman, 1925)',
 'Acrocubus octopylus',
 'Anogramma leptophylla',
 'Artophormis aff. dominasinensis',
 'Artophormis barbadensis',
 'Artophormis dominasinensis',
 'Artophormis gracilis',
 'C. leptoporus',
 'C. leptoporus (small)',
 'Calcidiscus leptoporus',
 'Calcidiscus leptoporus (10-11 µm)',
 'Calcidiscus leptoporus (5-8 microns)',
 'Calcidiscus leptoporus (<5 microns)',
 'Calcidiscus leptoporus (>10 μm)',
 'Calcidiscus leptoporus (>8 micron)',
 'Calcidiscus leptoporus (small 3-5 um)',
 'Cushmanina striatopunctata',
 'Cyrtopera laguncula',
 'Cyrtopera languncula',
 'Cystophormis brevispina',
 'Cystophormis ob',
 'Depth top CSF-A (m)',
 'Entopyla spp.',
 'Eprolithus octopetalus',
 'Grammatophora angulata',
 'Grammatophora arcuata',
 'Grammatophora arcunata',
 'Grammatophora sp.',
 'Grammatop

In [55]:
top

{'Top Offset (cm) on Parent Sample', 'Top [cm]', 'Top[cm] [cm]'}

In [56]:
top_depth

{'Top Depth (CSF m)',
 'Top Depth (m)',
 'Top Depth (m) CSF-A',
 'Top Depth CFS (m)',
 'Top Depth CSF-A (m)',
 'Top Depth [CFS m]',
 'Top Depth [CSF m]',
 'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top depth CSF (m)',
 'Top depth CSF-B (m)',
 'Top depth CSF-B (m):',
 'Top depth [CSF m]',
 'Top depth [m]'}

In [57]:
bottom_all

{'Bottom',
 'Bottom (cm)',
 'Bottom (m CSF-A)',
 'Bottom CSF-A (m)',
 'Bottom Depth (m)',
 'Bottom Depth (m) CSF-A',
 'Bottom Depth CSF-A (m)',
 'Bottom Depth [CFS m]',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom Offset (cm) on Parent Sample',
 'Bottom [cm]',
 'Bottom depth CSF-B (m)',
 'Bottom depth CSF-B (m):',
 'Bottom depth [m]',
 'Bottom interval (cm)',
 'Bottom[cm] [cm]',
 'Depth bottom CSF-A (m)',
 'Interval (bottom)',
 'Original Bottom Depth (m)',
 'bottom (cm)',
 'bottom interval (cm)',
 'depth Bottom (m CSF-A)',
 'depth Bottom (m)',
 'depth Bottom CSF-A (m)',
 'depth CSF-A Bottom (m)'}

In [58]:
bottom

{'Bottom Offset (cm) on Parent Sample', 'Bottom [cm]', 'Bottom[cm] [cm]'}

In [59]:
bottom_depth

{'Bottom Depth (m)',
 'Bottom Depth (m) CSF-A',
 'Bottom Depth CSF-A (m)',
 'Bottom Depth [CFS m]',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom depth CSF-B (m)',
 'Bottom depth CSF-B (m):',
 'Bottom depth [m]'}

### Normalize top bottom columns

In [24]:
def normalize_top_bottom(file):
    path = base_dir/file
    content = pd.read_csv(path, dtype=str)
    columns = list(content.columns)
    
    normalized_cols = normalize_columns(top, 'Top [cm]', columns)
    normalized_cols = normalize_columns(bottom, 'Bottom [cm]', normalized_cols)
    normalized_cols = normalize_columns(top_depth, 'Top Depth [m]', normalized_cols)
    normalized_cols = normalize_columns(bottom_depth, 'Bottom Depth [m]', normalized_cols)
    
    changed = columns != normalized_cols
    
    if changed:
        content.columns = normalized_cols
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed

change_columns = [normalize_top_bottom(file) for file in metadata['path']] 

In [25]:
dict = {"update_top_bottom": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False


In [26]:
new_metadata.to_csv(metadata_file, index=False)

## Add missing columns

In [27]:
metadata = pd.read_csv(metadata_file)

In [28]:
normalized_columns = [
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]', 
    'Sample',
    'Exp',
    'Site',
    'Hole',
    'Core',
    'Type',
    'Section',
    'A/W'
]

In [29]:
change_columns = [add_missing_columns(base_dir/file, normalized_columns) for file in metadata['path']] 

In [30]:
dict = {"add_missing_cols": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

Unnamed: 0,file,path,change_file_encoding,remove_empty_rows,delete_duplicate_colums_with_spaces,delete_duplicate_rows,delete_renamed_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,False,False,False,False,False,True,False,False,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,False,False,False,False,False,False,True,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,False,False,False,False,False,True,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,False,False,False,False,False,False,True,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,False,False,False,False,False,False,True,False,False


In [31]:
new_metadata.to_csv(metadata_file, index=False)