# Get taxon group

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
)

In [2]:
metadata_file = 'cleaned_data/metadata/Micropal_1_changes.csv'
clean_data_path = 'cleaned_data/Micropal_CSV_1'
taxa_list_path = 'cleaned_data/taxa_list.csv'

In [3]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,363-U1482A-Benthic_Forams.csv,benthic_forams,True,False,False,False,False
1,320_U1336A_Nannofossils_2.csv,nannofossils,False,True,False,False,True
2,363-U1482A-nannofossils.csv,nannofossils,True,False,False,False,True
3,375_U1518F_planktic_forams.csv,planktic_forams,True,False,False,False,False
4,320_U1334A_Nannofossils_1.csv,nannofossils,False,True,False,False,True


## Get non-taxa columns

The Micropal CSVs have non-taxa columns, then taxa columns, then non-taxa columns. We want to form a set of non-taxa columns.

Get the column names from the beginning and end of the header row. Manually inspect the columns to select the non-taxa columns.

In [4]:
normalized_columns = [
    'Sample',
    'Exp',
    'Site',
    'Hole',
    'Core',
    'Type',
    'Section',
    'A/W',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]', 
]

In [5]:
non_taxa_columns = set()


non_taxa_columns.update([
'% Planktic Foraminifera within whole sample',
'A/W',
'Abundance',
'Additional zone name',
'Additional zone name (short)',
'Age',
'BF comment',
'Bottom Depth [m]',
'Bottom [cm]',
'Chrysophyte cyst group abundance',
'Chrysophyte cyst preservation dissolution',
'Chrysophyte cyst preservation fragmentation',
'Comment',
'Comments',
'Core',
'Core-Sect',
'Datum type',
'Diatom preservation - pyritization2',
'Diatom preservation dissolution',
'Diatom preservation fragmentation',
'Diatom preservation pyritization',
'Diatoms group abundance',
'Ebridian group abundance',
'Ebridian preservation dissolution',
'Ebridian preservation fragmentation',
'Exp',
'Extra Sample ID Data',
'File Data',
'File data',
'Foraminifer test linings',
'General comment',
'Group Abundance',
'Group abundance',
'Hole',
'Mixing',
'Nannofossil abundance',
'PF Preservation',
'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
'Preservation',
'Sample',
'Sample comment',
'Section',
'Ship File Links',
'Shore File Links',
'Silicoflagellate preservation dissolution',
'Silicoflagellate preservation fragmentation',
'Silicoflagellates group abundance',
'Site',
'Top Depth [m]',
'Top [cm]',
'Type',
'Zone',
'Zone name',
'Zone name (short)',
'Zone name [short]',
'dupes and comments' 
])


Get all the columns from the beginning of the file.

In [6]:
all_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    
    all_columns.update(set(content.columns[0:15]))

In [7]:
all_columns  - non_taxa_columns

{'(Duplicate) Dentoglobigerina pseudovenezuelana',
 'Acarinina africana',
 'Acarinina alticonica',
 'Acarinina angulosa',
 'Acarinina aspensis',
 'Acarinina boudreauxi',
 'Acarinina bullbrooki',
 'Alabamina creta',
 'Allomorphina conica',
 'Ammobaculites agglutinans',
 'Ammonia beccarii',
 'Amphicoryna scalaris',
 'Amphisorus hemprichii',
 'Amphistegina aucklandica',
 'Anomalina aotea',
 'Anomalina spherica',
 'Anomalina visenda',
 'Anomalinoides fasciatus',
 'Anomalinoides globulosus',
 'Anomalinoides orbiculus',
 'Anomalinoides vitrinodus',
 'Archaeoglobigerina sp.',
 'Arkhangelskiella cymbiformis',
 'Astalocus bradyi',
 'Beella digitata',
 'Beella praedigitata',
 'Bigenerina nodosaria',
 'Bigenerina pliocenica',
 'Bigenerina spp.',
 'Bolivina affliata',
 'Bolivina albatrossi',
 'Bolivina barnwelli',
 'Bolivina spp.',
 'Braarudosphaera bigelowii',
 'Calcidiscus leptoporus',
 'Calcidiscus macintyrei',
 'Calcidiscus premacintyrei',
 'Calcidiscus tropicus',
 'Calciosolenia brasiliensis'

Get all the columns from the end of the file.

In [8]:
all_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
 
    original_cols = [col for col in content.columns if not col.startswith('Unnamed:')]
    for col in normalized_columns:
        original_cols.remove(col)
        
    all_columns.update(set(original_cols[-5:]))

In [9]:
all_columns - non_taxa_columns

{'Acrocubus octopylus ',
 'Algae sp. A',
 'Anthocyrtoma spp.',
 'Archaeosphaeridium australensis',
 'Archaeosphaeridium tasmaniae',
 'Artophormis gracilis',
 'Calocycloma castum',
 'Cyrtocapsella tetrapera',
 'Dicty mongolferi',
 'Dorcadospyris alata',
 'Globoconella miozea',
 'Globorotalia sp.',
 'Goniothecium decoratum',
 'Goniothecium odontella',
 'Goniothecium rogersii',
 'Grammatophora spp.',
 'Lithopera renzae',
 'Lychnocanoma apodora',
 'Micromarsupium anceps',
 'Micromarsupium curticannum',
 'Naviculopsis constricta',
 'Naviculopsis foliacea',
 'Neogloboquadrina acostaensis',
 'Neogloboquadrina cf. pachyderma',
 'Neogloboquadrina nympha',
 'Nitzschia denticuloides var. M',
 'Nonionella iridea',
 'Nothofagus pollen',
 'Paradictyocha sp',
 'Paradictyocha sp.',
 'Phormocyrtis turgida',
 'Pseudammodochium dictyoides',
 'Pseudammodochium lingii',
 'Pseudammodochium sphericum',
 'Pterocodon ampla',
 'Pullenia bulloides',
 'Pullenia coryelli',
 'Pyrgo murrhina',
 'Radiolarian fragment

## Clean up taxa values

Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [10]:
def clean_up_taxa_values(file):
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    
    taxa_cols = list(set(content.columns) - non_taxa_columns)
    taxa_df = content[taxa_cols]
    content[taxa_cols] = taxa_df.replace(to_replace =' *\[.*\] *', value = '', regex = True) 
    
    # get rid of NAs in order to compare two columns   
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
change_columns = [clean_up_taxa_values(file) for file in metadata['file']] 

### Update metadata

In [11]:
dict = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,363-U1482A-Benthic_Forams.csv,benthic_forams,True,False,False,False,False
1,320_U1336A_Nannofossils_2.csv,nannofossils,False,True,False,False,True
2,363-U1482A-nannofossils.csv,nannofossils,True,False,False,False,True
3,375_U1518F_planktic_forams.csv,planktic_forams,True,False,False,False,False
4,320_U1334A_Nannofossils_1.csv,nannofossils,False,True,False,False,True


In [12]:
new_metadata.to_csv(metadata_file, index=False)

## Create a csv of all taxa

In [13]:
taxa_data = set()

def fetch_taxa_data(row):
    path = f"{clean_data_path}/{row['file']}"
    content = pd.read_csv(path, nrows=0)
    
    taxa_columns = set(content.columns) - non_taxa_columns
    taxa_data.update([(col, row['taxon_group']) for col in taxa_columns])

    
metadata[['file', 'taxon_group']].apply(fetch_taxa_data, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
184    None
185    None
186    None
187    None
188    None
Length: 189, dtype: object

In [14]:
taxa_df = pd.DataFrame(taxa_data, columns=['taxon_name', 'taxon_group'])
taxa_df.head()

Unnamed: 0,taxon_name,taxon_group
0,Acarinina pentacamerata,planktic_forams
1,Planorotalites capdevilensis,planktic_forams
2,Globigerinelloides blowi (cf.),planktic_forams
3,Muricohedbergella hoelzli,planktic_forams
4,Bolivinella australis,benthic_forams


In [15]:
taxa_df.to_csv(taxa_list_path, index=False, mode='a')