# Normalize non-taxa fields

Normalize the final list of non-taxa fields from eODP researchers.

In [33]:
import sys
sys.path.append('../../../')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from config import CLEAN_DATA_DIR, OUTPUT_DIR

from scripts.normalize_data import (
    csv_cleanup,

    get_non_taxa_fields,
    update_metadata
)

In [18]:
# Locations of the inputs and outputs referenced by the cells below.
taxa_list_path = OUTPUT_DIR / 'taxa' / 'LIMS' / 'PI_normalized_taxa_list_with_pbdb_2021-07-28.csv'
normalized_fields_path = OUTPUT_DIR / 'normalized_data' / 'eODP_unified_data_structure_2022_02_21.csv'
metadata_file = OUTPUT_DIR / 'metadata' / 'LIMS' / 'Micropal_changes.csv'
# Root directory that the relative `path` entries of the metadata file are joined onto.
clean_data_path = CLEAN_DATA_DIR

## Create grouped non-taxa file

In [19]:
# Directory holding the draft non-taxa field files read and written in this section.
dir_path = OUTPUT_DIR / 'taxa' / 'draft' / 'LIMS'


In [20]:
# Load the draft mapping: one row per (final name, original variant) pair.
df = pd.read_csv(dir_path / 'non_taxa_fields_normalized.csv')
df.head()

Unnamed: 0,final,original
0,comment,Comments
1,comment,COMMENTS
2,comment,Comment
3,comment,General comment
4,comment,Sample comment


Group all variants of a non-taxa field into one row.

In [21]:
# Replace each row's `original` value with the '|'-joined list of every variant
# that shares its `final` name; all rows of a group therefore end up identical.
df['original'] = df.groupby('final')['original'].transform('|'.join)
df.size

148

In [22]:
# Rows of the same group are now identical, so keep only the first of each.
df = df.drop_duplicates(keep='first')
df.size

106

In [23]:
# Preview the grouped result.
df.head(5)

Unnamed: 0,final,original
0,comment,Comments|COMMENTS|Comment|General comment|Samp...
8,planktic foraminifera group abundance,PF Group Abundance|PF group abundance|% Plankt...
13,benthic foraminifera group abundance,Percentage of benthic forams in total foram as...
15,diatom group abundance,Diatom abundance|Diatoms group abundance
17,Diatoms and siliceous plankton comment,Diatoms and siliceous plankton comment


In [16]:
# Save the grouped mapping next to the draft it was derived from (row index omitted).
df.to_csv(dir_path / 'non_taxa_fields_normalized_grouped.csv', index=False)

## Normalize non-taxa fields
Use the normalized non-taxa fields from the PIs to update the non-taxa fields in the data files.

In [30]:
# Per-file change log; its `path` column lists the data files processed below.
metadata = pd.read_csv(metadata_file)
metadata.head(5)

Unnamed: 0,file,path,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,normalize_non_taxa
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Benthic foraminifera,True,False,False,False,False,True,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,Calc nannos,False,True,False,False,False,False,True
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,Planktic foraminifera,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,Calc nannos,False,True,False,False,False,False,True
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,Palynology,False,True,False,False,False,False,True


In [25]:
# Read only the two columns this step uses from the unified data structure file.
cols = ['normalized', 'taxa_317-present']
non_taxa_df = pd.read_csv(
    normalized_fields_path,
    usecols=cols,
    header=5,  # the column names sit on row 5 (0-based); earlier rows are skipped
    dtype=str,
)


In [26]:

# Build the lookup used for renaming: it is indexed below by a column name found
# in a data file and yields that column's replacement name.
# NOTE(review): get_non_taxa_fields is a project helper; its exact output is not visible here.
non_taxa_dict = get_non_taxa_fields(non_taxa_df, 'taxa_317-present')
# Keys as a set, for fast intersection with each file's header.
non_taxa_list = {*non_taxa_dict.keys()}

In [29]:
def normalize_non_taxa(file):
    """Rename the known non-taxa columns of one data file to their normalized names.

    The file is read from ``clean_data_path``; every column whose name appears in
    ``non_taxa_list`` is renamed to ``non_taxa_dict[name]``. The file is rewritten
    in place only when the header actually changed.

    Parameters
    ----------
    file : str
        Path of the CSV file, relative to ``clean_data_path``.

    Returns
    -------
    bool
        True if at least one column name changed and the file was rewritten,
        False if the file was left untouched.
    """
    csv_path = f"{clean_data_path}/{file}"
    table = pd.read_csv(csv_path)

    original_header = list(table.columns)
    # Map only the columns that are both present in this file and known variants.
    column_map = {
        name: non_taxa_dict[name]
        for name in set(original_header).intersection(non_taxa_list)
    }
    table = table.rename(columns=column_map)

    # A variant may map to itself, so compare headers instead of checking the map's size.
    if list(table.columns) == original_header:
        return False

    # csv_cleanup is a project helper applied before the file is written back.
    table = csv_cleanup(table, csv_path)
    table.to_csv(csv_path, index=False)
    return True
    
# Normalize every file listed in the metadata, keeping one changed/unchanged flag
# per file in the same order as the `path` column.
change_columns = list(map(normalize_non_taxa, metadata['path']))

In [34]:
# Record, per file, whether this step changed its columns.
# The mapping gets a descriptive name instead of `dict`: rebinding `dict` would
# shadow the builtin for every later cell run in the same kernel.
metadata_changes = {"normalize_non_taxa": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,path,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values,clean_up_taxa_metadata_values,normalize_non_taxa
0,363-U1482A-Benthic_Forams.csv,LIMS/Micropal_CSV_1/363-U1482A-Benthic_Forams.csv,Benthic foraminifera,True,False,False,False,False,True,True
1,320_U1336A_Nannofossils_2.csv,LIMS/Micropal_CSV_1/320_U1336A_Nannofossils_2.csv,Calc nannos,False,True,False,False,False,False,True
2,375_U1518F_planktic_forams.csv,LIMS/Micropal_CSV_1/375_U1518F_planktic_forams...,Planktic foraminifera,True,False,False,False,False,False,True
3,320_U1334A_Nannofossils_1.csv,LIMS/Micropal_CSV_1/320_U1334A_Nannofossils_1.csv,Calc nannos,False,True,False,False,False,False,True
4,318_U1358B_Palynology.csv,LIMS/Micropal_CSV_1/318_U1358B_Palynology.csv,Palynology,False,True,False,False,False,False,True


In [None]:
# Overwrite the metadata file with the updated change log (row index omitted).
new_metadata.to_csv(metadata_file, index=False)