# fix micropal 4 files

In [1]:
import sys
sys.path.append('../../../')
from pathlib import Path

import pandas as pd
import shutil
import numpy as np

from config import CLEAN_DATA_DIR, OUTPUT_DIR
from scripts.normalize_data import (
    normalize_sample_col, 
    normalize_expedition_section_cols, 
    csv_cleanup,
    update_metadata,
    fetch_unique_column_names,
    append_set,
    filter_existing_set,
    normalize_columns,
    add_missing_columns,
)

In [2]:
base_dir = CLEAN_DATA_DIR

metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 



In [3]:
def log_df(df, row_count=5):
    print(df.shape)
    return df.head(row_count)


## check basic columns

check if files have basic columns

In [118]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [119]:
bad_files = set()
for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
        
    
    path = base_dir/file
    df = pd.read_csv(path, dtype=str, nrows=0)
    cols = df.columns
    if ('Sample' in cols or 'Label ID' in cols) or (
        'Exp' in cols
        and 'Hole' in cols 
        and 'Site' in cols 
        and 'Core' in cols 
        and 'Type' in cols 
        and 'Section' in cols
        ):
        pass
    else:
        bad_files.add(path)
        

        
len(bad_files)

124

In [120]:
# bad_files

# prep work

## get all files with intervals

create directory for files with intervals

In [121]:
columns_all = set()

for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
        
    fetch_unique_column_names(base_dir/file, columns_all)

In [122]:
interval_all=set()

append_set(interval_all, r".*?\binterval\b.*?", columns_all)

In [123]:
interval_all

{'Bottom interval (cm)',
 'Core, Section, Interval',
 'Core, section, interval',
 'Core, section, interval (cm)',
 'Expedition, site, hole, core, section, interval (cm):',
 'Interval (bottom)',
 'Interval (top)',
 'Interval Bot (cm) on SHLF',
 'Interval Top (cm) on SHLF',
 'Top interval (cm)',
 'bottom interval (cm)',
 'interval (cm)',
 'top interval (cm)'}

copy interval files

In [124]:
intervals = {
 'Core, Section, Interval',
 'Core, section, interval',
 'Core, section, interval (cm)',
 'Expedition, site, hole, core, section, interval (cm):',
 'interval (cm)'
}

for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
    path = Path(base_dir/file)

        
    df = pd.read_csv(path, dtype=str)
    for col in df.columns:
        if col in intervals:   
            new_path = Path('..', '..', '..', 'output', 'tmp', 'intervals_lims_4', path.name)
            # shutil.copy(path, new_path)


## create file for header depths

In [125]:
columns_all = set()

for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
        
    fetch_unique_column_names(base_dir/file, columns_all)

In [126]:
top_all = set()
bottom_all = set()
depth = set()
interval_all=set()

append_set(depth, r".*?\bdepth\b.*?", columns_all)
append_set(top_all, r".*?\btop\b.*?", columns_all)
append_set(bottom_all, r".*?\bbottom\b.*?", columns_all)
append_set(interval_all, r".*?\binterval\b.*?", columns_all)

In [127]:
cols = []
for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
        
    keep_cols = [file]
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)
    for col in df.columns:
        if col in top_all or col in bottom_all or col in depth or col in interval_all:
            keep_cols.append(col)
    cols.append(keep_cols)
            
    

In [128]:
df = pd.DataFrame(cols, columns=['file','top','bottom','top depth','bottom depth','depth','interval',7,8,9])
log_df(df)

(137, 10)


Unnamed: 0,file,top,bottom,top depth,bottom depth,depth,interval,7,8,9
0,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,"Core, section, interval (cm)",Depth (csf),,,,,,,
1,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,Top (cm),Bottom (cm),,,,,,,
2,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,"Expedition, site, hole, core, section, interva...",Bottom depth CSF-B (m):,Top depth CSF-B (m):,,,,,,
3,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,Depth (cm),Top Depth (CSF m),,,,,,,
4,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,Depth top CSF-A (m),Depth bottom CSF-A (m),,,,,,,


In [50]:
# df.to_csv('../../../output/normalized_data/lims_4_depth_headers.csv', index=False)

manually fix lims_4_depth_headers

# update files

## fix bad header row

In [152]:
file = '../../../output/cleaned_data/LIMS/Micropal_CSV_4/317_U1351_planktic_forams.csv'
df = pd.read_csv(file, header=1, dtype=str)

log_df(df)

(232, 49)


Unnamed: 0,Expedition,Hole,Core,Core Type,Section,Top Depth [CSF m],Foraminferal preservation,Foraminiferal abundance,Planktic foraminiferal %,Globigerina spp.,...,Zeaglobierina woodi,Echinoid plate fragments,Echinoid spines,Fish teeth,Ostracods,Shell fragments,Micro-mollusks,Baranacle fragments,Bryozoan fragments,Remarks
0,317,U1351A,1,H,CC,2.56,G,A,13,X,...,,,F,,F,A,,,,
1,317,U1351A,2,H,CC,11.44,G,A,17,X,...,,,,,,A,A,,,
2,317,U1351A,3,H,CC,18.15,G,A,26,X,...,,,,,,A,,,,
3,317,U1351A,4,H,CC,25.08,G,A,29,X,...,,,F,,,A,,,,
4,317,U1351A,5,H,CC,27.5,G,A,8,X,...,,,F,,F,A,,,,


In [153]:
df.to_csv(file, index=False)

## rename exp..type columns

In [154]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [155]:
temp_files = [
    '321-U1337C_109_T08_rads.csv', 
    '321_U1337B_109_T07_rads.csv', 
    '321-U1337D_109_T09_rads.csv',
    '320_U1337A_109_T06_rads.csv'
]

for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
 
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)
    
 
    if  path.name in ['317_U1351_benthic_forams.csv', '317_U1351_diatoms.csv']:
        if 'Hole.1' in df.columns:
            del df['Hole']
            del df['Exp']
            df.rename(columns={'Hole.1': 'Hole', 'Exp.1' : 'Exp'}, inplace=True)
            
    df.rename(columns={'Expedition' : 'Exp', 
                       'section': 'Section', 
                       'Core Type': 'Type',
                       'Core type': 'Type'}, inplace=True)

    if path.name in temp_files:
        df.rename(columns={'Core, section': 'Sample'}, inplace=True)
   
    
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)
    

## add missing expedition

In [156]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [157]:
for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
 
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)
    
    if path.name == '323_U1339_palynology.csv':
        df['Exp'] = '323'
    if path.name == '323_U1339_dinoflagellates.csv':
        df['Exp'] = '323'
    if path.name == '321_U1338_nannofossils.csv':
        df['Sample'] = '321-' + df['Sample']

    
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)

## split columns

In [158]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [159]:
def split_column(df, column, new_columns, delim='-'):
    df[new_columns] = df[column].str.split(delim, expand=True)


def split_core(df):    
    df['Core'] = df['Core temp'].str.extract(r'([0-9]+)', expand=True)
    df['Type'] = df['Core temp'].str.extract(r'([a-zA-Z]+)', expand=True )

    
def split_core_interval(df, col):
    split_column(df, col, ['Core, section', 'temp_interval'], ', ')
    split_column(df, 'Core, section', ['Core temp', 'Section'])
    split_core(df)
    split_interval(df)

    del df['Core, section'] 
    del df['Core temp'] 
    
def split_sample_interval(df, col):
    split_column(df, col, ['Sample', 'temp_interval'], ', ')
    split_interval(df)
      
    
def split_interval(df, delim='-'):
    df['temp_interval'] = df['temp_interval'].str.replace(' cm', '')
    df['temp_interval'] = df['temp_interval'].str.replace('cm', '')
    df.loc[df['temp_interval'].isna(), 'temp_interval'] = delim

    split_column(df, 'temp_interval', ['Top [cm]', 'Bottom [cm]'], delim)
    
    del df['temp_interval'] 
    

def log_files(cols, file):
    if 'Exp.1' in cols:
        print('Exp.1', file)    
    if 'Site.1' in cols:
        print('Site.1', file)   
    if 'Hole.1' in cols:
        print('Hole.1', file)
    if 'Core.1' in cols:
        print('Core.1', file) 
    if 'Section.1' in cols:
        print('Section.1', file)
        
        
    if ('Exp' in cols) and ('Hole' in cols) and ('Site' in cols) and ('Core' in cols) and ('Type' in cols) and ('Section' in cols):
        pass
    elif 'Sample' in cols:
        pass
    elif 'Core, Section, Interval' in cols:
        # TODO: have PIs fix column
        print('Core, Section, Interval', path)
    elif 'Core, section, interval' in cols:
        # TODO: have PIs fix column
        print('Core, section, interval', path)
    else:
        pass
        # print('invalid:', file)

### split intervals

In [160]:
for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
 
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)
    
    
    # split interval into separate columns 
    if 'Core, section, interval (cm)' in df.columns:
        interval_col = 'Core, section, interval (cm)' 
        if path.name == '320_U1331_benthic_forams.csv':
            df.loc[df[interval_col] == "10H7, 44-46 cm", interval_col] = "10H-7, 44-46 cm"
            df.loc[df[interval_col] == "10H7, 47-49 cm", interval_col] = "10H-7, 47-49 cm"
            
            split_core_interval(df, interval_col) 

        elif path.name == '320_U1336B_108_T10_planktic forams.csv':
            df[interval_col] = df[interval_col].str.replace('–', '-')
            split_sample_interval(df, interval_col)
        else:
            split_core_interval(df, interval_col) 
            
    elif 'Core, Section, Interval'  in df.columns:
        interval_col = 'Core, Section, Interval' 
        if path.name == '323_U1339A_diatoms.csv':
            df.loc[df[interval_col] == "4H-1-150-152", interval_col] = "4H-1, 150-152"
            df.loc[df[interval_col] == "4H-3-150-156", interval_col] = "4H-3, 150-156"
            df.loc[df[interval_col] == "4H-3-156-159", interval_col] = "4H-3, 156-159"
            
        elif path.name == '323_U1339B_diatoms.csv':
            df.loc[df[interval_col] == "9H-2-45cm", interval_col] = "9H-2, 45"
            df.loc[df[interval_col] == "9H-4-45cm", interval_col] = "9H-4, 45"

        elif path.name == '323_U1339C_diatoms.csv':
            df.loc[df[interval_col] == "10H-1-25cm", interval_col] = "10H-1, 25cm"
            df.loc[df[interval_col] == "10H-2-25cm", interval_col] = "10H-2, 25cm"
            
        split_core_interval(df, interval_col)
        
    elif 'Core, section, interval'  in df.columns:
        interval_col = 'Core, section, interval' 
        split_core_interval(df, interval_col)
    
    elif 'Expedition, site, hole, core, section, interval (cm):' in df.columns:
        interval_col = 'Expedition, site, hole, core, section, interval (cm):'
        split_sample_interval(df, interval_col)
    
        
    elif 'interval (cm)' in df.columns:
        interval_col = 'interval (cm)'
        df[interval_col] = df[interval_col].str.replace('–', '-')
        df[interval_col] = df[interval_col].str.replace('-', '-')
        df['temp_interval'] = df[interval_col]

        split_interval(df)
        
    elif path.name == '321_U1338_nannofossils.csv':
        interval_col = 'Sample'
        df.loc[df[interval_col] == "321-U1338A-2H-2,24", interval_col] = "321-U1338A-2H-2, 24"
        df.loc[df[interval_col] == "321-U1338A-5H-5,70", interval_col] = "321-U1338A-5H-5, 70"
        df.loc[df[interval_col] == "321-U1338A-24H-4-80", interval_col] = "321-U1338A-24H-4, 80"

        split_sample_interval(df, interval_col)

    cols = list(df.columns)
    log_files(cols, file)

            
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)
    


### split core

In [161]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [162]:
for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
 
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)
    

    # edit column so that we can split it into separate columns  
    if path.name == '323_U1339_radiolarians.csv':
        df['Hole, Core, Section temp'] = df['Hole, Core, Section'].str.replace(' ', '-') 
        
    elif path.name == '323_U1343_silicoflagellates.csv':
        df.rename(columns={'Unnamed: 3': 'Core, Type, Section'}, inplace=True)
        df['Core, Type, Section temp'] = df['Core, Type, Section'].str.replace('^(\d+)([A-Z-]+)$', r'\1-\2', regex=True)
    
    
    # split core into separate columns 
    temp_files = [
        '323_U1339C_nannofossils.csv',
        '323_U1339C_benthic_forams.csv',
        '323_U1339B_nannofossils.csv',
        '320_U1335_planktic_forams.csv',
        '323_U1341B_benthic_forams.csv',
        '320_U1333_planktic_forams.csv',
        '323_U1341A_benthic_forams.csv',
        '323_U1339A_nannofossils.csv',
        '323_U1339_silicoflagellates.csv',
        '323_U1339A_benthic_forams.csv',
        '323_U1339B_benthic_forams.csv',
        '323_U1341A_ostracods.csv',
        '323_U1341C_benthic_forams.csv',
        '320_U1336A_planktic_forams.csv',
        '323_U1339D_benthic_forams.csv',
        '323_U1339D_nannofossils.csv'
    ]
    for temp_file in temp_files:
        if path.name == temp_file:
            df['Core temp'] = df['Core']
            split_core(df)
            del df['Core temp']
            
    
    # split column into multiple columns    
    target_columns = ['core, section', 'Core, section', 'Core, Section', 'Core,    section']
    for target_column in target_columns:
        if target_column in df.columns:
            split_column(df, target_column, ['Core','Section'])
            df['Core temp'] = df['Core']
            split_core(df) 
            del df['Core temp']


    if 'Hole, Core, Section temp' in df.columns:
        split_column(df, 'Hole, Core, Section temp', ['Hole', 'Core', 'Section'])
        df['Core temp'] = df['Core']
        split_core(df)
        del df['Hole, Core, Section temp']
        del df['Core temp']

    elif 'Core Type - Section' in df.columns:
        split_column(df, 'Core Type - Section', ['Type', 'Section'])
        
    elif 'Core, Type, Section temp' in df.columns:
        split_column(df, 'Core, Type, Section temp', ['Core', 'Type', 'Section'])
        del df['Core, Type, Section temp']
    
        
    cols = list(df.columns)
    
    # log_files(cols, file)
        
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)
    


### split site hole

In [163]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [164]:
def split_site_hole(df, col):
    df[['Site', 'Hole']] = df[col].str.extract(r'(U[0-9]+)([a-zA-Z])', expand=True)
    

for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
 
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)

    if path.name == '323_U1344_Radiolarians.csv':
        df['Hole'] = 'U' + df['Hole']
        split_site_hole(df, 'Hole')
        
    elif path.name == '317-U1352_benthic_forams.csv':
        split_site_hole(df, 'Site')
    

    temp_files = [
        '321_U1337A_nannofossils.csv',
        '320_U1334_benthic_forams.csv',
        '317_U1352_planktic_forams.csv',
        '317_U1353_planktic_forams.csv',
        '317_U1351_nannofossils.csv',
        '317_U1351_planktic_forams.csv',
        '323_U1339_radiolarians.csv',
        '323_U1343_radiolarians.csv',
        '317_U1354_planktic_forams.csv',
        '317_U1354_benthic_forams.csv',
        '320_U1336A_benthic_forams.csv',
        '320_U1331_benthic_forams.csv',
        '317_U1353_diatoms.csv',
        '323_U1340A_dinoflagellates.csv',
        '320_U1332_benthic_forams.csv',
        '317_U1354_nannofossils.csv',
        '317_U1353_nannofossils.csv',
        '320_U1333_benthic_forams.csv',
        '317_U1354_diatoms.csv',
        '317_U1353_benthic_forams.csv',
        '323_U1340A_palynology.csv',
        '317_U1352_nannofossils.csv',
        '320_U1335_benthic_forams.csv',
    ]
    if path.name in temp_files:
        split_site_hole(df, 'Hole')
  
    
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)

## normalize depth headers

In [165]:
header_df = pd.read_csv('../../../output/normalized_data/lims_4_depth_headers.csv')
log_df(header_df)

(137, 12)


Unnamed: 0,file,top,bottom,top depth,bottom depth,depth,interval,7,8,9,Unnamed: 10,Unnamed: 11
0,LIMS/Micropal_CSV_4/320_U1334_benthic_forams.csv,Top [cm],Bottom [cm],Depth (csf),,,"Core, section, interval (cm)",,,,,
1,LIMS/Micropal_CSV_4/361_U1479B_nannofossils.csv,Top (cm),Bottom (cm),,,,,,,,,
2,LIMS/Micropal_CSV_4/372_U1517C_planktic_forams...,Top [cm],Bottom [cm],Top depth CSF-B (m):,Bottom depth CSF-B (m):,,"Expedition, site, hole, core, section, interva...",,,,,
3,LIMS/Micropal_CSV_4/317_U1352_planktic_forams.csv,,,Top Depth (CSF m),,,,,,,,
4,LIMS/Micropal_CSV_4/323_U1339_palynology.csv,,,Depth top CSF-A (m),Depth bottom CSF-A (m),,,,,,,


In [166]:
for index, row in header_df.iterrows():
    path = base_dir/row['file']
    df = pd.read_csv(path, dtype=str)
    df.rename(columns={row['top']: 'Top [cm]', 
                       row['bottom']: 'Bottom [cm]', 
                       row['top depth']: 'Top Depth [m]',
                       row['bottom depth']: 'Bottom Depth [m]'
    }, inplace=True)
    
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)

## add missing columns 

In [167]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [168]:
normalized_columns = [
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
]

exp_columns = [
    'Site',
    'Hole',
    'Core',
    'Type',
    'Section',
    'A/W'
]

for file in metadata['path']:
    if 'Micropal_CSV_4' not in file:
        continue
 
    path = Path(base_dir/file)
    df = pd.read_csv(path, dtype=str)
    
    for col in normalized_columns:
        if col not in df.columns:
            df[col] = np.nan
            
    if 'Exp' in df.columns:
        for col in exp_columns:
            if col not in df.columns:
                df[col] = np.nan
    
    
    df = csv_cleanup(df, path)
    df.to_csv(path, index=False)

## fix individual files

In [4]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


### 341_radiolarians_U1417A.csv

In [5]:
path = CLEAN_DATA_DIR/metadata[metadata['file'] == '341_radiolarians_U1417A.csv']['path'].tolist()[0]

df = pd.read_csv(path, dtype=str)
df.tail()

Unnamed: 0,Sample,Exp,Site,Hole,Core,Core-Sect,Type,Section,A/W,Top [cm],...,Thecosphaera japonica,Theocorys redondoensis,Theocorythium trachelium,Theocorythium vetulum,Tholospyris scaphipes,Comments,Ship File Links,Shore File Links,File Data,eodp_id
17,341-U1417A-17H-7-PAL,341,U1417,A,17,17-7,H,7,PAL,0,...,C,,,,,,,,,d3f033c35d37c99637ab4914c6753d3a
18,341-U1417A-18H-CC-PAL,341,U1417,A,18,18-CC,H,CC,PAL,0,...,R,,,,,,,,,4d42a2a2e51a7f8682afba7c3a5fbf5b
19,341-U1417A-19H-CC-PAL,341,U1417,A,19,19-CC,H,CC,PAL,0,...,C,,,,,,,,,0843d3ae7143cc84238b37647256c377
20,341-U1417A-20H-CC-PAL,342,U1418,A,20,20-CC,H,CC,PAL,0,...,,,,,,,,,,f3716aa45d26e71b197174d7aaff0203
21,341-U1417A-21H-CC-PAL,343,U1419,A,21,21-CC,H,CC,PAL,0,...,F,,,,,,,,,309eb1b372791156b64c0b6be8744eb2


In [6]:
df['Exp'] = 341
df.tail()

Unnamed: 0,Sample,Exp,Site,Hole,Core,Core-Sect,Type,Section,A/W,Top [cm],...,Thecosphaera japonica,Theocorys redondoensis,Theocorythium trachelium,Theocorythium vetulum,Tholospyris scaphipes,Comments,Ship File Links,Shore File Links,File Data,eodp_id
17,341-U1417A-17H-7-PAL,341,U1417,A,17,17-7,H,7,PAL,0,...,C,,,,,,,,,d3f033c35d37c99637ab4914c6753d3a
18,341-U1417A-18H-CC-PAL,341,U1417,A,18,18-CC,H,CC,PAL,0,...,R,,,,,,,,,4d42a2a2e51a7f8682afba7c3a5fbf5b
19,341-U1417A-19H-CC-PAL,341,U1417,A,19,19-CC,H,CC,PAL,0,...,C,,,,,,,,,0843d3ae7143cc84238b37647256c377
20,341-U1417A-20H-CC-PAL,341,U1418,A,20,20-CC,H,CC,PAL,0,...,,,,,,,,,,f3716aa45d26e71b197174d7aaff0203
21,341-U1417A-21H-CC-PAL,341,U1419,A,21,21-CC,H,CC,PAL,0,...,F,,,,,,,,,309eb1b372791156b64c0b6be8744eb2


In [7]:
df.to_csv(path, index=False)

 ### 323_U1340A_benthic_forams.csv


In [8]:
path = CLEAN_DATA_DIR/metadata[metadata['file'] == '323_U1340A_benthic_forams.csv']['path'].tolist()[0]

df = pd.read_csv(path, dtype=str)
df.head()

Unnamed: 0,Exp,Site,Hole,Core,Type,Section,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],...,Triloculina cf. trihedra,Uvigerina cf. peregrina,Uvigerina auberiana,Valvulineria sp.,IRD,Other observations,Unnamed: 62,A/W,Sample,eodp_id
0,323,U1340,A,1,H,1,0.0,0.0,0.0,0.0,...,,,,,,,,,323-U1340A-1H-1,f875fdfee1148cfc9d3db116a2f5e685
1,324,U1341,A,1,H,CC,,,3.8,3.9,...,P,R,,,D,,,,324-U1341A-1H-CC,143d41a1f9d35cb53f4813dba0f2a680
2,325,U1342,A,2,H,CC,,,13.82,13.92,...,,P,,,D,,,,325-U1342A-2H-CC,1bbf6fba771b8c5ac3d458b517a80cea
3,326,U1343,A,3,H,CC,,,23.39,23.49,...,,R,,,D,"Sponge spicules, quartz, wood fragments",,,326-U1343A-3H-CC,1d576c7f73c023d470c840d17dc32d15
4,327,U1344,A,4,H,CC,,,32.75,32.85,...,,,,P,D,"Sponge spicules, quartz, wood fragments",,,327-U1344A-4H-CC,0c91cff2679aaab162c23c4cc89a528c


In [9]:
df['Exp'] = 323
df.head()

Unnamed: 0,Exp,Site,Hole,Core,Type,Section,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],...,Triloculina cf. trihedra,Uvigerina cf. peregrina,Uvigerina auberiana,Valvulineria sp.,IRD,Other observations,Unnamed: 62,A/W,Sample,eodp_id
0,323,U1340,A,1,H,1,0.0,0.0,0.0,0.0,...,,,,,,,,,323-U1340A-1H-1,f875fdfee1148cfc9d3db116a2f5e685
1,323,U1341,A,1,H,CC,,,3.8,3.9,...,P,R,,,D,,,,324-U1341A-1H-CC,143d41a1f9d35cb53f4813dba0f2a680
2,323,U1342,A,2,H,CC,,,13.82,13.92,...,,P,,,D,,,,325-U1342A-2H-CC,1bbf6fba771b8c5ac3d458b517a80cea
3,323,U1343,A,3,H,CC,,,23.39,23.49,...,,R,,,D,"Sponge spicules, quartz, wood fragments",,,326-U1343A-3H-CC,1d576c7f73c023d470c840d17dc32d15
4,323,U1344,A,4,H,CC,,,32.75,32.85,...,,,,P,D,"Sponge spicules, quartz, wood fragments",,,327-U1344A-4H-CC,0c91cff2679aaab162c23c4cc89a528c


In [10]:
df.to_csv(path, index=False)