# Fix files

Fix files that were processed incorrectly

In [1]:
import sys
sys.path.append('../scripts/')
import glob

import pandas as pd
import numpy as np

from normalize_data import (
    check_duplicate_columns,
    normalize_expedition_section_cols,
    csv_cleanup,
    create_sample_cols,
    update_metadata
)

In [2]:
# Each cleaned-data directory is paired with the CSV that logs the changes
# made to its files. No change-log path is defined for the revised micropal
# directory in this cell.
lithology, lithology_meta = (
    'cleaned_data/Lithology_CSV',
    'cleaned_data/metadata/Lithology_changes.csv',
)
micropal_1, micropal_meta_1 = (
    'cleaned_data/Micropal_CSV_1',
    'cleaned_data/metadata/Micropal_1_changes.csv',
)
micropal_2, micropal_meta_2 = (
    'cleaned_data/Micropal_CSV_2',
    'cleaned_data/metadata/Micropal_2_changes.csv',
)
micropal_3, micropal_meta_3 = (
    'cleaned_data/Micropal_CSV_3',
    'cleaned_data/metadata/Micropal_3_changes.csv',
)
micropal_revised = 'cleaned_data/Micropal_CSV_revised'

# Every cleaned-data directory, in the order the bulk loops iterate over them.
all_LIMS = [lithology, micropal_1, micropal_2, micropal_3, micropal_revised]


In [3]:
# Select the directory, and its change log, that the following cells work on.
clean_data_path = micropal_3
metadata_file = micropal_meta_3

## Fix Exp...A/W columns

Refactor the regex matching to better extract the Exp...A/W columns from the sample name.

In [4]:
# Load the change log: one row per file, with boolean columns recording which
# fixes have been applied to it.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,fix_expedition_aw_cols,add_extra_sample_data,clean_up_taxa_metadata_values
0,339_benthic_forams_U1388B_5.csv,benthic_forams,False,False,True,False,False,False,False,False
1,324_U1348A_benthic_forams.csv,benthic_forams,False,True,True,False,True,False,False,True
2,339_planktic_forams_U1387C.csv,planktic_forams,False,False,True,False,False,False,False,False
3,339_benthic_forams_U1390A_6.csv,benthic_forams,False,False,True,False,False,False,False,False
4,341_radiolarians_U1419D.csv,radiolarians,True,False,False,False,True,True,False,True


In [5]:
# Number of files listed in the change log.
len(metadata)

416

In [6]:
# Names of the files flagged `add_expedition_section_cols` in the change log;
# only these are reprocessed by the A/W fix.
needs_section_cols = metadata['add_expedition_section_cols'] == True
filtered_metadata = metadata.loc[needs_section_cols, 'file'].to_list()
len(filtered_metadata)

85

In [7]:
def fix_expedition_section_cols(df):
    """Overwrite the Exp...A/W columns of ``df`` from its sample identifier.

    The identifier column is ``Sample`` when present, otherwise ``Label ID``.
    It is passed to ``create_sample_cols``, and the ``Exp``, ``Site``,
    ``Hole``, ``Core``, ``Type``, ``Section`` and ``A/W`` columns of the
    result are copied onto ``df``.

    Args:
        df: DataFrame to update. It is modified in place.

    Returns:
        The same ``df`` object, with the seven columns set.

    Raises:
        ValueError: If ``df`` has neither a ``Sample`` nor a ``Label ID``
            column.
    """
    for id_column in ("Sample", "Label ID"):
        if id_column in df.columns:
            parsed = create_sample_cols(df[id_column])
            break
    else:
        raise ValueError("File does not have the expected columns.")

    for name in ("Exp", "Site", "Hole", "Core", "Type", "Section", "A/W"):
        df[name] = parsed[name]

    return df

In [8]:
def process_filename(file):
    """Rebuild one file's Exp...A/W columns and report whether A/W changed.

    Only files listed in the global ``filtered_metadata`` are touched. The
    file is read from the global ``clean_data_path`` and is rewritten in
    place only when the recomputed ``A/W`` column differs from the one read
    from disk.

    Args:
        file: File name (not a full path) as listed in the metadata table.

    Returns:
        True if the ``A/W`` column changed and the file was rewritten, False
        otherwise (including when ``file`` is not in ``filtered_metadata``).
    """
    if file not in filtered_metadata:
        return False

    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path, dtype=str)

    # Keep the A/W values as read from disk so they can be compared with the
    # recomputed ones.
    aw_before = pd.DataFrame(content['A/W'])
    content = fix_expedition_section_cols(content)

    changed = not aw_before['A/W'].equals(content['A/W'])
    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed

# One flag per metadata row, in row order: True where the file was rewritten.
change_columns = [process_filename(file) for file in metadata['file']]

### Update metadata

In [9]:
# Record, per file, whether the A/W fix changed it. The mapping is named
# `metadata_updates` rather than `dict` so the built-in `dict` type is not
# shadowed for the rest of the notebook session.
metadata_updates = {"fix_expedition_aw_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,fix_expedition_aw_cols,add_extra_sample_data,clean_up_taxa_metadata_values
0,339_benthic_forams_U1388B_5.csv,benthic_forams,False,False,True,False,False,False,False,False
1,324_U1348A_benthic_forams.csv,benthic_forams,False,True,True,False,True,False,False,True
2,339_planktic_forams_U1387C.csv,planktic_forams,False,False,True,False,False,False,False,False
3,339_benthic_forams_U1390A_6.csv,benthic_forams,False,False,True,False,False,False,False,False
4,341_radiolarians_U1419D.csv,radiolarians,True,False,False,False,True,True,False,True


In [10]:
# Write the updated flags back to the change log, overwriting the file.
new_metadata.to_csv(metadata_file, index=False)

## Fix sample name

Add 'Extra Sample ID Data' when creating the sample name from Exp...A/W columns.

In [11]:
# Select the directory, and its change log, that the following cells work on.
clean_data_path = micropal_3
metadata_file = micropal_meta_3

In [12]:
# Reload the change log and collect the names of the files flagged
# `update_sample_col`.
metadata = pd.read_csv(metadata_file)
sample_updated = metadata['update_sample_col'] == True
filtered_files = metadata.loc[sample_updated, 'file'].to_list()

In [13]:
def add_extra_sample_data(row, columns):
    """Return the row's sample name with 'Extra Sample ID Data' appended.

    The sample name is returned unchanged when ``Sample`` is one of the first
    two columns, when the file has no ``Extra Sample ID Data`` column, or
    when the row's extra data is missing. Otherwise the extra data is
    appended, joined with ``-`` when ``A/W`` is ``PAL`` and with a space for
    any other value.

    Args:
        row: One row of the file, indexable by column name (a Series when
            called through ``DataFrame.apply(axis=1)``).
        columns: The file's column names, in file order.

    Returns:
        The (possibly extended) sample name for the row.
    """
    sample = row['Sample']

    # NOTE(review): a leading 'Sample' column presumably means the name came
    # from the source file rather than being rebuilt from Exp...A/W -- confirm.
    # Slicing, rather than indexing [0] and [1], also copes with files that
    # have fewer than two columns.
    if 'Sample' in list(columns[:2]):
        return sample
    if 'Extra Sample ID Data' not in columns:
        return sample

    extra = row['Extra Sample ID Data']
    # pd.isna recognises None, float NaN and pd.NA by value. An identity test
    # against np.NaN misses NaN objects that are not that exact singleton,
    # and the np.NaN alias was removed in NumPy 2.0.
    if pd.isna(extra):
        return sample

    separator = '-' if row['A/W'] == 'PAL' else ' '
    return sample + separator + extra
    
def process_filename(file, filtered_files):
    """Append extra sample data to one file's ``Sample`` column if flagged.

    Files not listed in ``filtered_files`` are skipped. A listed file is read
    from the global ``clean_data_path``, its ``Sample`` column is recomputed
    row by row with ``add_extra_sample_data``, and the file is rewritten in
    place only when that column actually changed.

    Args:
        file: File name (not a full path) as listed in the metadata table.
        filtered_files: Collection of file names eligible for the fix.

    Returns:
        True if the ``Sample`` column changed and the file was rewritten,
        False otherwise (including when ``file`` is not in ``filtered_files``).
    """
    if file not in filtered_files:
        return False

    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path, dtype=str)

    # Keep the sample names as read from disk for the comparison below.
    sample_before = pd.DataFrame(content['Sample'])
    content['Sample'] = content.apply(
        add_extra_sample_data, axis=1, args=(content.columns,)
    )

    changed = not sample_before['Sample'].equals(content['Sample'])
    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed

# One flag per metadata row, in row order: True where the file was rewritten.
change_columns = [process_filename(file, filtered_files) for file in metadata['file']]

In [14]:
# Record, per file, whether extra sample data was appended. The mapping is
# named `metadata_updates` rather than `dict` so the built-in `dict` type is
# not shadowed for the rest of the notebook session.
metadata_updates = {"add_extra_sample_data": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,fix_expedition_aw_cols,add_extra_sample_data,clean_up_taxa_metadata_values
0,339_benthic_forams_U1388B_5.csv,benthic_forams,False,False,True,False,False,False,False,False
1,324_U1348A_benthic_forams.csv,benthic_forams,False,True,True,False,True,False,False,True
2,339_planktic_forams_U1387C.csv,planktic_forams,False,False,True,False,False,False,False,False
3,339_benthic_forams_U1390A_6.csv,benthic_forams,False,False,True,False,False,False,False,False
4,341_radiolarians_U1419D.csv,radiolarians,True,False,False,False,True,True,False,True


In [15]:
# Write the updated flags back to the change log, overwriting the file.
new_metadata.to_csv(metadata_file, index=False)

## Remove empty rows and columns

Remove empty rows, and columns that have neither a header nor data.

In [4]:
# Paths of the files rewritten because they held fully empty rows or columns.
empty_rows_cols = []

for directory in all_LIMS:
    for path in glob.glob(f"{directory}/*.csv"):
        # header=None keeps the header line as an ordinary row, so a column
        # is dropped only when its header and every value are empty.
        raw = pd.read_csv(path, dtype=str, header=None)
        trimmed = (
            raw.dropna(axis="index", how="all")
            .dropna(axis="columns", how="all")
        )

        # Rewrite the file only if something was actually removed.
        if trimmed.shape != raw.shape:
            trimmed.to_csv(path, index=False, header=False)
            empty_rows_cols.append(path)

In [5]:
# Number of files that had empty rows or columns removed.
len(empty_rows_cols)

0

In [6]:
# Save the list of rewritten files as a one-column CSV report.
df = pd.DataFrame({'files with empty rows or columns': empty_rows_cols})
df.to_csv('cleaned_data/metadata/lims_empty_rows_cols.csv', index=False)

## Remove duplicate identical rows

In [7]:
# Paths of the files rewritten because they contained duplicate rows.
files = []

# The loop variable is deliberately not named `clean_data_path`: that global
# is read by process_filename, and rebinding it here would leave it pointing
# at the last LIMS directory for any cell run afterwards.
for directory in all_LIMS:
    raw_csvs = glob.glob(f"{directory}/*.csv")

    for path in raw_csvs:
        content = pd.read_csv(path, dtype=str)
        # Drop fully empty rows first so they are not counted as duplicates
        # of one another.
        content.dropna(inplace=True, axis='index', how='all')
        original_rows = len(content)
        content.drop_duplicates(inplace=True)
        new_rows = len(content)

        # Rewrite the file only when duplicate rows were actually removed.
        if original_rows != new_rows:
            content = csv_cleanup(content, path)
            content.to_csv(path, index=False)
            files.append(path)
        
        

In [8]:
# Number of files that had duplicate rows removed.
len(files)

0

In [9]:
# Save the list of rewritten files as a one-column CSV report.
df = pd.DataFrame({'files with identical rows': files})
df.to_csv('cleaned_data/metadata/lims_identical_rows.csv', index=False)

## Resave each file
Open and re-save each file to get rid of the weird starting and ending characters.

In [None]:
# Round-trip every CSV in every LIMS directory through pandas and csv_cleanup.
# NOTE(review): csv_cleanup is imported from normalize_data and not shown here;
# presumably it is what strips the stray leading/trailing characters -- confirm.
for directory in all_LIMS:
    for path in glob.glob(f"{directory}/*.csv"):
        frame = pd.read_csv(path, dtype=str)
        csv_cleanup(frame, path).to_csv(path, index=False)