# Normalize non-taxa fields

Normalize the final list of non-taxa fields from eODP researchers.

In [1]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR

from scripts.normalize_data import (
    csv_cleanup,
    get_non_taxa_fields,
    update_metadata
)
from scripts.shared_utils import (
    log_df
)

In [2]:
date = '2022-08-08'
crosswalk_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_crosswalk_{date}.csv"
taxa_list_file = OUTPUT_DIR/'taxa'/'LIMS'/f"taxa_list_{date}.csv"
taxa_list_path = OUTPUT_DIR /'taxa'/'LIMS'/f'PI_normalized_taxa_list_with_pbdb_{date}.csv'
metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 

date = '2022-08-11'
normalized_fields_path =  OUTPUT_DIR/'normalized_data'/'LIMS'/f'unified_data_structure_{date}.csv'
input_normalized_fields_path = RAW_DATA_DIR/'PI_processed_files'/f'eODP unified data structure_{date}.csv'
nontaxa_fields_path =  OUTPUT_DIR/'normalized_data'/'LIMS'/f'nontaxa_fields_{date}.csv'


## create nontaxa file

In [4]:

nontaxa_df = pd.read_csv(input_normalized_fields_path, dtype=str, header=5, nrows=131)
nontaxa_df.tail()

Unnamed: 0,category,normalized,lithology_1-96,lithology_101-126,lithology_317-present,taxa_1-96,taxa_101-210,taxa_317-present-b,taxa_317-present,age_models_1-96,age_models_101-190,hard_rocks_2-94
126,,Sample Zone radiolarian,,,,,,Radiolarian zone | Radiolarian zone/subzone,Radiolarian zone | Radiolarian zone/subzone,,,
127,,Sample Zone Silicoflagellate,,,,,,Silicoflagellate Zone in Ling (1992) | Zone in...,Silicoflagellate Zone in Ling (1992) | Zone in...,,,
128,,,,,,,,,,,,
129,,,,,,,,,,,,
130,,,,,,,,,,,,


In [5]:
# nontaxa_df.to_csv(normalized_fields_path, index=False)

## create nontaxa file that will be imported  into database

In [16]:
cols = ['normalized', 'taxa_317-present-b', 'taxa_317-present']
df = pd.read_csv(normalized_fields_path, dtype=str, usecols = cols)
log_df(df)

(131, 3)


Unnamed: 0,normalized,taxa_317-present-b,taxa_317-present
0,Sample,,Label ID | Sample
1,Expedition,Exp | Expedition,Exp
2,Site,Site,Site
3,Hole,Hole,Hole
4,Core,Core,Core


In [17]:
ignore_fields = [
    'Sample',
    'Expedition',
    'Site',
    'Hole',
    'Core',
    'Core Type',
    'Section',
    'A/W',
    'Extra Sample ID Data',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
]

In [18]:
data = []
old_new_fields = set()

def process_column(column):
    if pd.notna(row[column]):
        for field in row[column].split(' | '):
            name = field + "|" + row['normalized']
            if name not in old_new_fields:
                data.append({"original_name": field, "name": row['normalized'], 
                             "dataset": "LIMS", "dataset_type": "taxa"})
                old_new_fields.add(name)

for index, row in df.iterrows():
    if row['normalized'] not in ignore_fields:
        process_column('taxa_317-present')
        process_column('taxa_317-present-b')


In [19]:
nontaxa_df = pd.DataFrame(data)
log_df(nontaxa_df)

(89, 4)


Unnamed: 0,original_name,name,dataset,dataset_type
0,(taxon names are in the header),Taxon Name,LIMS,taxa
1,Abundance,Group Abundance,LIMS,taxa
2,Group Abundance,Group Abundance,LIMS,taxa
3,Group abundance,Group Abundance,LIMS,taxa
4,Group abundance (%),Group Abundance,LIMS,taxa


In [20]:
nontaxa_df.to_csv(nontaxa_fields_path, index=False)

## QA normlized fields

look for files where multiple columns map to the same normalized column name

In [5]:
metadata = pd.read_csv(metadata_file)
log_df(metadata)

(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [6]:
def process_columns(columns):
    old_new_dict = {}
    for column in columns:
        old_new_dict = {**old_new_dict, **get_non_taxa_fields(df, column)}
        
    return old_new_dict

columns = ['taxa_317-present', 'taxa_317-present-b']
old_new_dict = process_columns(columns)
old_fields = list(old_new_dict.keys())
len(old_fields)

115

In [7]:
for index, row in metadata.iterrows():
    path = f"{CLEAN_DATA_DIR}/{row['path']}"
    df = pd.read_csv(path, dtype=str)
    df = df.dropna(axis=1, how="all")
    df = df.dropna(axis=0, how="all")
    
    nontaxa_cols = set(df.columns).intersection(old_fields)
    
    new_fields = set()
    for col in nontaxa_cols:
        new_field = old_new_dict[col]
        if new_field in new_fields:
            print(row['file'], ',', col, ',', new_field)
            print('old: ', nontaxa_cols)
            print('new:', new_fields)
            print('----')
        else:
            new_fields.add(new_field)

    

318_U1359D_Diatoms_1.csv , Diatom preservation fragmentation , diatom preservation
old:  {'Diatom preservation dissolution', 'Sample', 'Core', 'A/W', 'Top Depth [m]', 'Bottom Depth [m]', 'Diatoms group abundance', 'Extra Sample ID Data', 'Section', 'Top [cm]', 'Bottom [cm]', 'Hole', 'Diatom preservation fragmentation', 'Type', 'Exp', 'Site'}
new: {'A/W', 'Sample', 'diatom preservation', 'Top Depth [m]', 'Core', 'Bottom Depth [m]', 'Extra Sample ID Data', 'Section', 'Top [cm]', 'Bottom [cm]', 'diatom group abundance', 'Hole'}
----
318_U1359C_Diatoms_1.csv , Diatom preservation fragmentation , diatom preservation
old:  {'Diatom preservation dissolution', 'Sample', 'Core', 'A/W', 'Top Depth [m]', 'Bottom Depth [m]', 'Diatoms group abundance', 'Extra Sample ID Data', 'Section', 'Top [cm]', 'Bottom [cm]', 'Hole', 'Diatom preservation fragmentation', 'Type', 'Exp', 'Site'}
new: {'A/W', 'Sample', 'diatom preservation', 'Top Depth [m]', 'Core', 'Bottom Depth [m]', 'Extra Sample ID Data', 'Sect

## Normalize non-taxa fields in data files
Use the normalized non taxa fields from the PIs to update the non-taxa fields in the data files.

In [53]:
metadata = pd.read_csv(metadata_file)
log_df(metadata)

(1253, 16)


Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False


In [54]:
cols = ['normalized', 'taxa_317-present-b', 'taxa_317-present']
non_taxa_df = pd.read_csv(normalized_fields_path, dtype=str, usecols = cols)
log_df(non_taxa_df)

(131, 3)


Unnamed: 0,normalized,taxa_317-present-b,taxa_317-present
0,Sample,,Label ID | Sample
1,Expedition,Exp | Expedition,Exp
2,Site,Site,Site
3,Hole,Hole,Hole
4,Core,Core,Core


In [55]:

non_taxa_dict = get_non_taxa_fields(non_taxa_df, 'taxa_317-present')
non_taxa_list = set(non_taxa_dict.keys())

In [56]:
def normalize_non_taxa(file):
    path = f"{CLEAN_DATA_DIR}/{file}"
    content = pd.read_csv(path)
    
    nontaxa_cols = set(content.columns).intersection(non_taxa_list)
    
    rename_dict = {}
    old_columns = list(content.columns)
    for col in nontaxa_cols:
        rename_dict[col] = non_taxa_dict[col]
        
    content.rename(columns = rename_dict, inplace=True)
    
    changed = list(content.columns) != list(old_columns)

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)
    return changed
    
# change_columns = [normalize_non_taxa(file) for file in metadata['path']] 

change_columns = []

In [57]:
dict = {"normalize_non_taxa": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

Unnamed: 0,file,path,taxon_groups,change_file_encoding,remove_bad_characters,remove_empty_rows,remove_spaces,delete_duplicate_rows,delete_duplicate_columns,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,split_dextral_sinistral,normalize_non_taxa
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,benthic_forams,False,False,False,True,False,False,True,False,False,False,False,True,False,
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,planktic_forams,False,False,False,True,False,False,True,False,False,False,False,False,False,
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,nannofossils,False,False,False,True,False,False,False,True,False,False,False,False,False,
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,palynology,False,False,False,False,False,False,False,True,False,False,False,False,False,


In [58]:
# new_metadata.to_csv(metadata_file, index=False)