# Normalize Micropal CSVs

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
    clean_taxon_name,
)

In [2]:
# Taxa list CSV: read further down if it already exists, then rewritten at
# the end of this notebook.
taxa_list_path = 'cleaned_data/taxa_list.csv'
# CSV whose 'field' column lists the column names that are not taxa.
non_taxa_fields_path = 'cleaned_data/non_taxa_fields.csv'

In [3]:
# Settings for the Micropal_CSV_1 dataset.
# NOTE: the next cell assigns the same two names to the Micropal_CSV_2 paths,
# so on a top-to-bottom run these values are overwritten before they are used.
metadata_file = 'cleaned_data/metadata/Micropal_1_changes.csv'
clean_data_path = 'cleaned_data/Micropal_CSV_1'

In [15]:
# Settings for the Micropal_CSV_2 dataset. These assignments replace the
# Micropal_CSV_1 values from the previous cell, so once this cell has run the
# cells below operate on Micropal_CSV_2.
metadata_file = 'cleaned_data/metadata/Micropal_2_changes.csv'
clean_data_path = 'cleaned_data/Micropal_CSV_2'

In [16]:
# Load the metadata CSV selected above; later cells read its 'file' and
# 'taxon_group' columns.
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,356-U1464C_benthic_forams.csv,benthic_forams,True,False,False,False,True
1,374_U1523B_benthic_forams.csv,benthic_forams,True,False,False,False,False
2,353_U1443A_nannofossils.csv,nannofossils,True,False,False,False,False
3,369_U1513D_planktic_forams.csv,planktic_forams,True,False,False,False,False
4,371_U1510B_planktic_forams.csv,planktic_forams,True,False,False,False,False


In [17]:
# The 'field' column of the non-taxa fields CSV holds every column name that
# should not be treated as a taxon; keep those names as a set for lookups.
content = pd.read_csv(non_taxa_fields_path).loc[:, 'field']
non_taxa_columns = {field for field in content}

## Clean up taxa values

Look for values in the taxa columns of the form "code [extra text]", and remove the "[extra text]" part.

In [18]:
def clean_up_taxa_values(file):
    """Strip bracketed text from the taxa columns of one cleaned CSV file.

    Reads ``{clean_data_path}/{file}``, removes every ``[...]`` span (together
    with the spaces directly around it) from the values of its taxa columns,
    and rewrites the file only when at least one value changed.

    Parameters
    ----------
    file : str
        File name relative to the module-level ``clean_data_path``.

    Returns
    -------
    bool
        True if any taxa value was modified (and the file was rewritten),
        False if the file was left untouched.
    """
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)

    taxa_cols = get_taxonomy_columns(content.columns, non_taxa_columns)
    # Snapshot of the taxa columns before cleaning, used for the comparison below.
    taxa_df = content[taxa_cols]

    # Raw string: '\[' and '\]' are not valid escape sequences in a regular
    # string literal and raise a SyntaxWarning on recent Python versions.
    # The '.*' is greedy, so everything from the first '[' to the last ']'
    # in a value is removed.
    content[taxa_cols] = taxa_df.replace(to_replace=r' *\[.*\] *', value='', regex=True)

    # Fill NAs on both sides before comparing the original and cleaned frames.
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        # csv_cleanup is a project helper; its exact effect is not visible here.
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
# One boolean per metadata row: True when that file's taxa values were cleaned.
change_columns = list(map(clean_up_taxa_values, metadata['file']))

### Update metadata

In [19]:
# Named metadata_changes rather than `dict` so the built-in dict type is not
# shadowed for the rest of the kernel session.
metadata_changes = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, metadata_changes)
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,356-U1464C_benthic_forams.csv,benthic_forams,True,False,False,False,True
1,374_U1523B_benthic_forams.csv,benthic_forams,True,False,False,False,False
2,353_U1443A_nannofossils.csv,nannofossils,True,False,False,False,False
3,369_U1513D_planktic_forams.csv,planktic_forams,True,False,False,False,False
4,371_U1510B_planktic_forams.csv,planktic_forams,True,False,False,False,False


In [20]:
# Save new_metadata over the file the metadata was originally read from.
new_metadata.to_csv(metadata_file, index=False)

## Create a csv of all taxa

Create a taxa list csv that contains all the taxon names and the associated taxon group.

In [21]:
# Accumulates (verbatim_name, name, taxon_group) tuples; being a set, repeated
# tuples are stored only once.
taxa_data = set()

Load the previously saved entries from the taxa list csv, if the file exists.

In [22]:
def fetch_existing_taxa_data(row):
    """Add one row of the existing taxa list to the module-level ``taxa_data`` set.

    ``row`` must support lookup by 'verbatim_name', 'name' and 'taxon_group'.
    Returns nothing; the set is updated in place.
    """
    entry = (row['verbatim_name'], row['name'], row['taxon_group'])
    taxa_data.add(entry)

# Seed taxa_data with the rows already stored in the taxa list CSV, if any.
if os.path.isfile(taxa_list_path):
    existing_taxa_df = pd.read_csv(taxa_list_path)
    for row_label, existing_row in existing_taxa_df.iterrows():
        fetch_existing_taxa_data(existing_row)

len(taxa_data)

3540

Collect taxa data from the column headers of each cleaned-up csv listed in the metadata.

In [23]:
def fetch_taxa_data(row):
    """Add the taxa found in one cleaned CSV file to the ``taxa_data`` set.

    ``row`` supplies 'file' (name of the CSV under ``clean_data_path``) and
    'taxon_group'. Columns that are entirely empty are ignored; every remaining
    column name, stripped of surrounding whitespace and not listed in
    ``non_taxa_columns``, is recorded as a
    (verbatim name, cleaned name, taxon group) tuple.
    """
    csv_path = f"{clean_data_path}/{row['file']}"
    populated = pd.read_csv(csv_path).dropna(axis="columns", how="all")

    stripped_names = {name.strip() for name in populated.columns}
    for verbatim in stripped_names - non_taxa_columns:
        taxa_data.add((verbatim, clean_taxon_name(verbatim), row['taxon_group']))

# apply() is used for its side effect only: fetch_taxa_data adds entries to
# taxa_data for each metadata row and has no return statement, so `res` holds
# no useful data.
res = metadata[['file', 'taxon_group']].apply(fetch_taxa_data, axis=1)

In [24]:
# Number of distinct taxa tuples after adding those from the cleaned CSV files.
len(taxa_data)

3540

### Create csv

In [25]:
# taxa_data is a set, so its rows arrive in arbitrary order. Sorting on every
# column (not only 'name') gives rows that share a name a reproducible order,
# which keeps the saved taxa list stable from run to run.
taxa_df = (
    pd.DataFrame(list(taxa_data), columns=['verbatim_name', 'name', 'taxon_group'])
    .sort_values(['name', 'verbatim_name', 'taxon_group'])
)
taxa_df.head()

Unnamed: 0,verbatim_name,name,taxon_group
2778,"""Globigerina"" angulisuturalis _T","""Globigerina"" angulisuturalis _T",planktic_forams
1464,"""Globigerina"" angulisuturalis _T_","""Globigerina"" angulisuturalis _T_",planktic_forams
2432,"""Globigerina"" ciperoensis _T","""Globigerina"" ciperoensis _T",planktic_forams
803,"""Globigerina"" ciperoensis _T_","""Globigerina"" ciperoensis _T_",planktic_forams
213,"""Skeletonema"" utriculosa","""Skeletonema"" utriculosa",diatoms


In [26]:
# Overwrite the taxa list CSV with the combined, sorted list (row index omitted).
taxa_df.to_csv(taxa_list_path, index=False)