# Normalize CSV columns

Normalize the columns for the eODP CSVs.

In [1]:
import sys
import os
import glob
sys.path.append('../scripts/')

import pandas as pd

from normalize_data import (
    add_sample_col, 
    add_expedition_aw_cols, 
    csv_cleanup,
    update_metadata,
    append_set,
    filter_existing_set,
    normalize_columns,
    add_missing_columns,
    create_sample_name_for_row,
    check_duplicate_columns,
    get_columns_from_file_or_disk,
)

In [2]:
# Root folders. Files under base_directory are the ones the steps below
# read and rewrite; raw_data_directory is only passed to the column lookup.
base_directory = 'cleaned_data'
raw_data_directory = 'raw_data'

# Labels for the two kinds of dataset this notebook can be pointed at.
lithology_type = 'lithology'
taxa_type = 'taxa'

# All bookkeeping CSVs live in the same metadata folder.
lims_metadata_directory = os.path.join(base_directory, 'metadata', 'LIMS')

lithology_meta = os.path.join(lims_metadata_directory, 'Lithology_changes.csv')
micropal_meta = os.path.join(lims_metadata_directory, 'Micropal_changes.csv')
columns_file = os.path.join(lims_metadata_directory, 'columns_list.csv')

In [3]:
# Choose the dataset for this run: the micropal tracking file and the
# 'taxa' label. (Presumably swapped to the lithology pair to process
# lithology files -- confirm with the notebook owner.)
data_type = taxa_type
metadata_file = micropal_meta

# Where step 3 writes the duplicate-column conflicts it could not resolve.
# NOTE(review): "reivew" is misspelled in the filename; it is left as-is
# because changing it would change the path other code may read.
duplicate_columns_file = os.path.join(
    base_directory,
    'metadata',
    'LIMS',
    f'duplicate_columns_needs_reivew_{data_type}.csv',
)

## 1. Remove empty rows and columns

Remove rows that are entirely empty, and remove columns that have neither a header nor any data.

In [4]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [5]:
def process_filename(file_path):
    """Drop completely empty rows and columns from one CSV.

    The file is read with ``header=None`` so the header line counts as data:
    a column is removed only when its header and every value are blank.
    The file is rewritten in place (without index or generated header) only
    when its shape actually changed.

    file_path: path of the CSV relative to ``base_directory``.
    Returns True if the file was rewritten, otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    raw = pd.read_csv(path, dtype=str, header=None)

    trimmed = raw.dropna(axis="index", how="all").dropna(axis="columns", how="all")

    if trimmed.shape == raw.shape:
        return False

    trimmed.to_csv(path, index=False, header=False)
    return True


# One changed/unchanged flag per file, in the same order as metadata['path'].
change_columns = list(map(process_filename, metadata['path']))

In [6]:
# Hand this step's per-file changed flags to update_metadata under the
# "empty_rows_columns" key. The mapping is named metadata_updates rather
# than `dict` so the builtin is not shadowed for the rest of the session.
metadata_updates = {"empty_rows_columns": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [7]:
new_metadata.to_csv(metadata_file, index=False)

## 2. Remove duplicate identical rows
Remove rows that are exact duplicates of an earlier row, keeping the first occurrence.

In [8]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [9]:
def process_filename(file_path):
    """Remove exact duplicate rows from one CSV.

    Rows are compared across all columns and the first occurrence is kept.
    When at least one row was removed, the frame is passed through
    ``csv_cleanup`` and written back over the original file.

    file_path: path of the CSV relative to ``base_directory``.
    Returns True if the file was rewritten, otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    table = pd.read_csv(path, dtype=str)

    row_count_before = len(table)
    table.drop_duplicates(inplace=True)

    if len(table) == row_count_before:
        return False

    table = csv_cleanup(table, path)
    table.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file, in the same order as metadata['path'].
change_columns = list(map(process_filename, metadata['path']))

In [10]:
# Hand this step's per-file changed flags to update_metadata under the
# "remove_identical_rows" key. The mapping is named metadata_updates rather
# than `dict` so the builtin is not shadowed for the rest of the session.
metadata_updates = {"remove_identical_rows": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [11]:
new_metadata.to_csv(metadata_file, index=False)

## 3. Duplicate column names

Check whether each CSV has duplicate column names. If two columns share a header and have the same values, delete the duplicate column. If the headers are the same but the values differ, log the conflict for review.

In [12]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [13]:
needs_review = []

def process_filename(file_path):
    """Resolve duplicate column names in one CSV.

    The work is delegated to ``check_duplicate_columns``, which receives the
    frame, the relative file path and the module-level ``needs_review`` list.
    This function only detects whether the number of columns changed and, if
    so, runs ``csv_cleanup`` and rewrites the file.
    NOTE(review): this relies on the helper dropping columns from the frame
    in place and appending conflicts to ``needs_review`` -- confirm in
    normalize_data.

    file_path: path of the CSV relative to ``base_directory``.
    Returns True if the file was rewritten, otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    table = pd.read_csv(path, dtype=str)

    column_count_before = len(table.columns)
    check_duplicate_columns(table, file_path, needs_review)

    if len(table.columns) == column_count_before:
        return False

    table = csv_cleanup(table, path)
    table.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file, in the same order as metadata['path'].
change_columns = list(map(process_filename, metadata['path']))

Micropal_CSV_3/342_nannofossils_U1406B_1.csv, Umbilicosphaera jafari: duplicate columns
                        have different values
Micropal_CSV_3/342_planktic_forams_U1406A_2.csv, Zone name (short): duplicate columns
                        have different values
Micropal_CSV_3/341_benthic_forams_U1417B.csv, Type: duplicate columns
                        have different values
Micropal_CSV_3/342_planktic_forams_U1408A_2.csv, Zone name (short): duplicate columns
                        have different values
Micropal_CSV_3/342_planktic_forams_U1407A_2.csv, Zone name (short): duplicate columns
                        have different values
Micropal_CSV_3/342_planktic_forams_U1407A_2.csv, Zone name: duplicate columns
                        have different values


In [14]:
# Hand this step's per-file changed flags to update_metadata under the
# "remove_identical_columns" key. The mapping is named metadata_updates
# rather than `dict` so the builtin is not shadowed for the rest of the
# session.
metadata_updates = {"remove_identical_columns": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [15]:
new_metadata.to_csv(metadata_file, index=False)

### Create a file listing the columns that need to be reviewed

In [16]:
# Persist the duplicate-column conflicts collected in needs_review so they
# can be inspected outside the notebook.
df = pd.DataFrame(data=needs_review)
df.to_csv(path_or_buf=duplicate_columns_file, index=False)

## 4. Standardize headers

Find all variants for specified headers, and convert all the variants to the same term.

In [17]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


### Find all variants for specified headers

In [18]:
# Gather every header name to search for variants. The helper is given both
# a columns list file and the raw data directory; see normalize_data for
# which source it prefers.
columns_all = get_columns_from_file_or_disk(
    columns_file=columns_file,
    metadata=metadata,
    data_directory=raw_data_directory,
    column_type=data_type,
)

len(columns_all)

4707

In [19]:
# Result sets, filled in by append_set below. The "*_all" sets use loose
# patterns for manual inspection (they also pick up taxon names, as the
# outputs show); the narrow sets -- sample, top, bottom, top_depth,
# bottom_depth -- are the ones used to rename headers later on.
sample_all = set()
sample = set()
exp_all = set()
site_all = set()
hole_all = set()
core_all = set()
type_all = set()
section_all = set()
aw_all = set()
extra_all = set()
top_all = set()
top_depth = set()
top = set()
bottom_all = set()
bottom_depth = set()
bottom = set()

# (target set, regex) pairs, applied in order against columns_all.
header_patterns = [
    (sample_all, r".*?(sample|label|labl).*?"),
    (sample, r"^(sample|labe?l id)$"),
    (exp_all, r".*?exp.*?"),
    (site_all, r".*?site.*?"),
    (hole_all, r".*?hole.*?"),
    (core_all, r".*?core.*?"),
    (type_all, r".*?type.*?"),
    (section_all, r".*?sect.*?"),
    (aw_all, r".*?a/?w.*?"),
    (extra_all, r".*?extra.*?"),
    (top_all, r".*?top.*?"),
    (top_depth, r"top depth"),
    (top, r"top offset|top ?\["),
    (bottom_all, r".*?bottom.*?"),
    (bottom_depth, r"bottom depth"),
    (bottom, r"bottom offset|bottom ?\["),
]

for matched_headers, header_regex in header_patterns:
    append_set(matched_headers, header_regex, columns_all)

In [20]:
sample_all

{'% Planktic Foraminifera within whole sample',
 'Asteromphalus flabellatus',
 'Extra Sample ID Data',
 'Gladiolithus flabellatus',
 'Heterohelix labellosa',
 'Label ID',
 'Labl ID',
 'Neoflabellina semireticulata',
 'Neoflabellina sp.',
 'Pavonina flabelliformis',
 'Sample',
 'Sample comment',
 'Sample preparation comment'}

In [21]:
sample

{'Label ID', 'Labl ID', 'Sample'}

In [22]:
exp_all

{'Broinsonia parca ssp expansa',
 'Chiasmolithus expansus',
 'Exp',
 'Shionodiscus gracilis var. expectus',
 'Shionodiscus sp. Exp 318'}

In [23]:
site_all

{'Chrysalogonium crassitestum', 'Hayesites albiensis', 'Site'}

In [24]:
hole_all

{'% Planktic Foraminifera within whole sample', 'Hole'}

In [25]:
core_all

{'Core',
 'Core-Sect',
 'Corethron criophilum',
 'Corethron pennatum',
 'Spongocore puella',
 'Spongocore sp.'}

In [26]:
type_all

{'Anthemis type',
 'Aster type',
 'Botrychium type',
 'Centaurea nigra type',
 'Datum type',
 'Ephedra distachya type',
 'Ephedra fragilis type',
 'Fraxinus excelsior type',
 'Galium type',
 'Polypodium vulgare type',
 'Quercus deciduous type',
 'Quercus evergreen type',
 'Quercus suber type',
 'Type',
 'Type (lower zone)',
 'Type (upper zone)',
 'count_type'}

In [27]:
section_all

{'Coccolithus abisectus',
 'Core-Sect',
 'Cyclicargolithus abisectus',
 'Cyclicargolithus abisectus (11 µm)',
 'Cyclicargolithus abisectus (>11 microns)',
 'Dictyococcites bisectus',
 'Dictyococcites bisectus (>10 microns)',
 'Dictyococcites bisectus (>10 µm)',
 'Dictyococcites bisectus (>10µm)',
 'Reticulofenestra bisecta',
 'Reticulofenestra bisecta (5-10 microns)',
 'Reticulofenestra bisecta (>10 microns)',
 'Section',
 'Siphonodosaria insecta'}

In [28]:
aw_all

{'A/W',
 'Awhea spp.',
 'Cycladophora funakawai',
 'Denticulopsis ichikawae',
 'Globigerinoides parawoodi',
 'Hanzawaia ammophila',
 'Hanzawaia bertheloti',
 'Hanzawaia complanata',
 'Hanzawaia concentrica',
 'Hanzawaia mantaensis',
 'Hanzawaia rhodiensis',
 'Hanzawaia sp.',
 'Hanzawaia spp.',
 'Hanzawaia turgida',
 'Hanzawaia? sp.'}

In [29]:
extra_all

{'Dextral:Sinistral _N. acostaensis_',
 'Dextral:Sinistral _P. finalis_',
 'Dextral:Sinistral _P. obliquiloculata_',
 'Dextral:Sinistral _P. praecursor_',
 'Dextral:Sinistral _P. praespectabilis_',
 'Dextral:Sinistral _P. primalis_',
 'Dextral:Sinistral _P. spectabilis_',
 'Extra Sample ID Data',
 'Neogloboquadrina acostaensis (dextral)',
 'Neogloboquadrina incompta (dextral)',
 'Neogloboquadrina pachyderma (dextral)',
 'Neogloboquadrina pachyderma A (dextral, inflated form)',
 'Pulleniatina coiling (dextral)'}

In [30]:
top_all

{'Acarinina praetopilensis',
 'Acarinina pseudotopilensis',
 'Acarinina topilensis',
 'Acrocubus octopylus',
 'Artophormis barbadensis',
 'Artophormis gracilis',
 'Calcidiscus leptoporus',
 'Calcidiscus leptoporus (5-8 microns)',
 'Calcidiscus leptoporus (<5 microns)',
 'Calcidiscus leptoporus (>8 micron)',
 'Cushmanina striatopunctata',
 'Cyrtopera languncula',
 'Cystophormis brevispina',
 'Entopyla spp.',
 'Eprolithus octopetalus',
 'Grammatophora spp.',
 'Notoplanulina rakauraoana',
 'Octopyle stenozona',
 'Octopyle/Tetrapyle Group',
 'Protoperidinium sp. 2',
 'Protoperidinium sp. 3',
 'Protoperidinium sp.1',
 'Pylospira octopyle',
 'Thalassiosira leptopus',
 'Thalssiosira leptopus',
 'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top [cm]',
 'Top depth [m]',
 'Top[cm] [cm]',
 'Zygodiscus plectopons'}

In [31]:
top

{'Top [cm]', 'Top[cm] [cm]'}

In [32]:
top_depth

{'Top Depth [m]', 'Top Depth[m] [m]', 'Top depth [m]'}

In [33]:
bottom_all

{'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom depth [m]',
 'Bottom[cm] [cm]'}

In [34]:
bottom

{'Bottom [cm]', 'Bottom[cm] [cm]'}

In [35]:
bottom_depth

{'Bottom Depth [m]', 'Bottom Depth[m] [m]', 'Bottom depth [m]'}

### Standardize header names

In [36]:
def process_filename(file_path):
    """Rename header variants in one CSV to their canonical names.

    Each variant set found earlier (``sample``, ``top``, ``bottom``,
    ``top_depth``, ``bottom_depth``) is paired with the header it should
    become, and ``normalize_columns`` is applied once per pair, feeding the
    result of one pass into the next. If the resulting header list differs
    from the original, the new headers are applied, the frame goes through
    ``csv_cleanup`` and the file is rewritten.

    file_path: path of the CSV relative to ``base_directory``.
    Returns the result of comparing the original and normalized headers
    (truthy when the file was rewritten).
    """
    path = f"{base_directory}/{file_path}"
    content = pd.read_csv(path, dtype=str)
    columns = list(content.columns)

    # Order matters only in that it matches the original sequence of passes.
    canonical_headers = (
        (sample, 'Sample'),
        (top, 'Top [cm]'),
        (bottom, 'Bottom [cm]'),
        (top_depth, 'Top Depth [m]'),
        (bottom_depth, 'Bottom Depth [m]'),
    )

    normalized_cols = columns
    for variants, canonical in canonical_headers:
        normalized_cols = normalize_columns(variants, canonical, normalized_cols)

    changed = columns != normalized_cols

    if changed:
        content.columns = normalized_cols
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed


# One changed/unchanged flag per file, in the same order as metadata['path'].
change_columns = list(map(process_filename, metadata['path']))

In [37]:
# Hand this step's per-file changed flags to update_metadata under the
# "standardize_headers" key. The mapping is named metadata_updates rather
# than `dict` so the builtin is not shadowed for the rest of the session.
metadata_updates = {"standardize_headers": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [38]:
new_metadata.to_csv(metadata_file, index=False)

## 5. Convert Sample to missing Exp...A/W columns

If the Exp through A/W columns (Exp, Site, Hole, Core, Type, Section, A/W) do not exist, parse the Sample string to create them.

In [39]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [40]:
def process_filename(file_path):
    """Add the Exp...A/W columns to one CSV when the helper creates them.

    ``add_expedition_aw_cols`` is called on the frame and its return value
    replaces the frame. The header list before and after is compared; if it
    differs, the frame goes through ``csv_cleanup`` and the file is
    rewritten.

    file_path: path of the CSV relative to ``base_directory``.
    Returns True if the file was rewritten, otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    content = pd.read_csv(path, dtype=str)

    headers_before = content.columns
    content = add_expedition_aw_cols(content)

    if list(headers_before) == list(content.columns):
        return False

    content = csv_cleanup(content, path)
    content.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file, in the same order as metadata['path'].
change_columns = list(map(process_filename, metadata['path']))



In [41]:
# Hand this step's per-file changed flags to update_metadata under the
# "add_expedition_aw_cols" key. The mapping is named metadata_updates rather
# than `dict` so the builtin is not shadowed for the rest of the session.
metadata_updates = {"add_expedition_aw_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [42]:
new_metadata.to_csv(metadata_file, index=False)

## 6. Add Sample column

If a file has no 'Sample' column, add one built from its Exp through A/W columns.

In [43]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [44]:
def process_filename(file_path):
    """Add a Sample column to one CSV when the helper creates one.

    ``add_sample_col`` is called for its effect on the frame; its return
    value is not used. The header list before and after is compared; if it
    differs, the frame goes through ``csv_cleanup`` and the file is
    rewritten.
    NOTE(review): this only detects a change if add_sample_col modifies the
    frame in place -- confirm in normalize_data.

    file_path: path of the CSV relative to ``base_directory``.
    Returns True if the file was rewritten, otherwise False.
    """
    path = f"{base_directory}/{file_path}"
    content = pd.read_csv(path, dtype=str)

    headers_before = content.columns
    add_sample_col(content)

    if list(headers_before) == list(content.columns):
        return False

    content = csv_cleanup(content, path)
    content.to_csv(path, index=False)
    return True


# One changed/unchanged flag per file, in the same order as metadata['path'].
change_columns = list(map(process_filename, metadata['path']))

In [45]:
# Hand this step's per-file changed flags to update_metadata under the
# "add_sample_column" key. The mapping is named metadata_updates rather
# than `dict` so the builtin is not shadowed for the rest of the session.
metadata_updates = {"add_sample_column": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [46]:
new_metadata.to_csv(metadata_file, index=False)

## 7. Add missing columns

In [47]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,path,taxon_group,empty_rows_columns,remove_identical_rows,remove_identical_columns,standardize_headers,add_expedition_aw_cols,add_sample_column,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-Benthic_Forams.csv,Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_foraminfera,False,False,False,False,True,False,False,False,True,False,True
1,320_U1336A_Nannofossils_2.csv,Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,True,False,True
2,375_U1518F_planktic_forams.csv,Micropal_CSV_1/375_U1518F_planktic_forams.csv,planktic_foraminfera,False,False,False,False,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,True,True,True
4,318_U1358B_Palynology.csv,Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,True,False,False,False,True,False,False,True


In [48]:
# Headers passed to add_missing_columns for every file in the next cell.
normalized_columns = [
    # interval offsets and depths
    'Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]',
    # sample label and its components
    'Sample', 'Exp', 'Site', 'Hole', 'Core', 'Type', 'Section', 'A/W',
    'Extra Sample ID Data',
]

In [49]:
# Run add_missing_columns on every file listed in the metadata table and
# keep its per-file result, in the same order as metadata['path'].
change_columns = [
    add_missing_columns(f"{base_directory}/{relative_path}", normalized_columns)
    for relative_path in metadata['path']
]

In [50]:
# Hand this step's per-file changed flags to update_metadata under the
# "add_missing_cols" key. The mapping is named metadata_updates rather than
# `dict` so the builtin is not shadowed for the rest of the session.
metadata_updates = {"add_missing_cols": change_columns}
new_metadata = update_metadata(metadata, metadata_updates)

In [51]:
new_metadata.to_csv(metadata_file, index=False)