# Get taxon group

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
)

In [2]:
metadata_file = 'cleaned_data/metadata/Micropal_1_changes.csv'
clean_data_path = 'cleaned_data/Micropal_CSV_1'

In [2]:
metadata_file = 'cleaned_data/metadata/Micropal_2_changes.csv'
clean_data_path = 'cleaned_data/Micropal_CSV_2'

In [3]:
taxa_list_path = 'cleaned_data/taxa_list.csv'

In [4]:
metadata = pd.read_csv(metadata_file)
metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols
0,356-U1464C_benthic_forams.csv,benthic_forams,True,False,False,False
1,374_U1523B_benthic_forams.csv,benthic_forams,True,False,False,False
2,353_U1443A_nannofossils.csv,nannofossils,True,False,False,False
3,369_U1513D_planktic_forams.csv,planktic_forams,True,False,False,False
4,371_U1510B_planktic_forams.csv,planktic_forams,True,False,False,False


## Get non-taxa columns

The Micropal CSVs have non-taxa columns, then taxa columns, then non-taxa columns. We want to form a set of non-taxa columns.

Get the column names from the beginning and end of the header row. Manually inspect the columns to select the non-taxa columns.

In [5]:
normalized_columns = [
    'Sample',
    'Exp',
    'Site',
    'Hole',
    'Core',
    'Type',
    'Section',
    'A/W',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]', 
]

In [16]:
non_taxa_columns = set()


non_taxa_columns.update([
'% Planktic Foraminifera within whole sample',
'A/W',
'Abundance',
'ADDITIONAL SPECIES',
'Additional zone name',
'Additional zone name (short)',
'Age',
'Aspect comment (etching)',
'BF comment',
'Bottom Depth [m]',
'Bottom [cm]',
'Chrysophyte cyst group abundance',
'Chrysophyte cyst preservation dissolution',
'Chrysophyte cyst preservation fragmentation',
'Comment',
'Comment (general)',
'Comments',
'COMMENTS',
'Core',
'Core-Sect',
'Cytherella sp.',
'Datum age average [Ma]',
'Datum author year',
'Datum comment',
'Datum group',
'Datum group code',
'Datum name',
'Datum name generic',
'Datum occurrence',
'Datum region',
'Datum status',
'Datum type',
'Datum validation comment',
'Diatom preservation - pyritization2',
'Diatom preservation dissolution',
'Diatom preservation fragmentation',
'Diatom preservation pyritization',
'Diatoms group abundance',
'Ebridian group abundance',
'Ebridian preservation dissolution',
'Ebridian preservation fragmentation',
'Exotic',
'Exp',
'Extra Sample ID Data',
'File Data',
'File data',
'Foram abundance',
'Foraminifer test linings',
'Foraminiferal linings',
'Foraminiferal test linings',
'General comment',
'Genus/species (upper zone)',
'Genus/species lower zone)',
'Group Abundance',
'Group Preservation',
'Group abundance',
'Hole',
'Large Benthic Forams [%]',
'Lower boundary age av. [Ma]',
'Mixing',
'NEOGENE',
'Nannofossil abundance',
'Nannofossil comment',
'Nannofossil fragments',
'Neogene',
'No. specimens/tray',
'Oligocene to Holocene',
'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
'PF Preservation',
'Percentage of benthic forams in total foram assemblage [%]',
'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
'Percentage of planktic forams in total foram assemblage [%]',
'Planktonic Benthic ratio (P:B)',
'Preservation',
'Preservation palynofacies',
'Reworked species',

'Reworking comment (1= <1%, 2= light 1-10%, 3= >10%)',
'Reworking comment (1= <1%, 2=light 1-10%, 3= >10%)',
'Sample',
'Sample comment',
'Sample preparation comment',
'Section',
'Ship File Links',
'Shore File Links',
'Silicoflagellate preservation dissolution',
'Silicoflagellate preservation fragmentation',
'Silicoflagellates group abundance',
'Site',
'Temperature Range',
'Top Depth [m]',
'Top [cm]',
'Type',
'Unknown taxa',
'Upper boundary age av. [Ma]',
'Upper boundary age max [Ma]',
'Upper boundary age min [Ma]',
'Zone',
'Zone comment',
'Zone name',
'Zone name (short)',
'Zone name [short]',
'dupes and comments' ,
'other Centric or Pennate comment',
'preservation',
])
# non_taxa_columns

Get all the columns from the beginning of the file.

In [14]:
all_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    
    all_columns.update(set(content.columns[0:15]))

In [17]:
all_columns  - non_taxa_columns

{'(Duplicate) Dentoglobigerina pseudovenezuelana',
 'Abas wittii',
 'Abyssamina incisa',
 'Abyssamina poagi',
 'Abyssammina quadrata',
 'Abyssammina spp.',
 'Acarinina africana',
 'Acarinina alticonica',
 'Acarinina angulosa',
 'Acarinina aspensis',
 'Acarinina boudreauxi',
 'Acarinina bullbrooki',
 'Acervulina mabahethi',
 'Achnanthes brevipes',
 'Achnanthes diapar',
 'Achnanthes groenlandica',
 'Achnanthes lanceolata',
 'Aciculammina (?) sp.',
 'Acritarchs',
 'Acrobotrys tritubus',
 'Acrosphaera cyrtodon',
 'Acrosphaera inflata',
 'Acrosphaera lappacea',
 'Acrosphaera murrayana',
 'Acrosphaera muse',
 'Acrosphaera spinosa',
 'Acrosphaera spinosa fasciculopora',
 'Actinocyclus actinochilus',
 'Actinocyclus barronii',
 'Actinocyclus curvatulus',
 'Actinocyclus dimorphus',
 'Actinocyclus ellipticus',
 'Actinocyclus ellipticus f. lanceolata',
 'Actinocyclus ellipticus var. javanica',
 'Actinocyclus ellipticus var. spiralis',
 'Actinocyclus fasciculatus',
 'Actinocyclus hajosiae',
 'Actin

Get all the columns from the end of the file.

In [20]:
all_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
 
    original_cols = [col for col in content.columns if not col.startswith('Unnamed:')]
    for col in normalized_columns:
        original_cols.remove(col)
        
    all_columns.update(set(original_cols[-5:]))

In [21]:
all_columns - non_taxa_columns

{"Actiniscus 'four-branch'",
 'Actiniscus pentasterias',
 'Benthic Foraminifera group',
 'Bolboforma praeintermedia',
 'Bolboforma subfragoris s.l.',
 'Cymatiosphaera sp.',
 'Globigerinoides trilobus',
 'Gyroidinoides primitivus',
 'Neobulimina canadaensis',
 'Organic matter',
 'Other spores',
 'Paragloborotalia kugleri',
 'Pyrite',
 'Reinhardtites levis',
 'Reticulofenestra daviesii ',
 'Reticulofenestra dictyoda  (10-14 microns)',
 'Rugoglobigerina pennyi',
 'Sighted trachyleberidids',
 'Siphocampe daseia',
 'Sphenolithus pseudoradians ',
 'Tenuitella anfracta',
 'Tenuitellinata juvenilis',
 'Theocapsomma teren',
 'Theocorythium vetulum',
 'Thyrsocyrtis triacantha ',
 'Trachyneis aspera',
 'Tranolithus orionatus',
 'Triceratium spp.',
 'Triquetrorhabdulus rugosus',
 'Trisolenia megalactis',
 'Tristylospyris triceros',
 'Tritaxilina zealandica >1500 m',
 'Trochosira radiata',
 'Trochosira trochlea',
 'Truncorotalia truncatulinoides',
 'Tryblioptichus spp.',
 'Turborotalia pomeroli ',


## Clean up taxa values

Look for taxa columns that have "code [extra text]", and remove "[extra text]".

In [22]:
def clean_up_taxa_values(file):
    path = f"{clean_data_path}/{file}"
    content = pd.read_csv(path)
    
    taxa_cols = list(set(content.columns) - non_taxa_columns)
    taxa_df = content[taxa_cols]
    content[taxa_cols] = taxa_df.replace(to_replace =' *\[.*\] *', value = '', regex = True) 
    
    # get rid of NAs in order to compare two columns   
    changed = not taxa_df.fillna('').equals(content[taxa_cols].fillna(''))

    if changed:
        content = csv_cleanup(content, path)
        content.to_csv(path, index=False)

    return changed
    
change_columns = [clean_up_taxa_values(file) for file in metadata['file']] 

### Update metadata

In [23]:
dict = {"clean_up_taxa_values": change_columns}
new_metadata = update_metadata(metadata, dict)
new_metadata.head()

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,356-U1464C_benthic_forams.csv,benthic_forams,True,False,False,False,True
1,374_U1523B_benthic_forams.csv,benthic_forams,True,False,False,False,False
2,353_U1443A_nannofossils.csv,nannofossils,True,False,False,False,False
3,369_U1513D_planktic_forams.csv,planktic_forams,True,False,False,False,False
4,371_U1510B_planktic_forams.csv,planktic_forams,True,False,False,False,False


In [24]:
new_metadata.to_csv(metadata_file, index=False)

## Create a csv of all taxa

In [13]:
taxa_data = set()

def fetch_taxa_data(row):
    path = f"{clean_data_path}/{row['file']}"
    content = pd.read_csv(path, nrows=0)
    
    taxa_columns = set(content.columns) - non_taxa_columns
    taxa_data.update([(col, row['taxon_group']) for col in taxa_columns])

    
metadata[['file', 'taxon_group']].apply(fetch_taxa_data, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
184    None
185    None
186    None
187    None
188    None
Length: 189, dtype: object

In [14]:
taxa_df = pd.DataFrame(taxa_data, columns=['taxon_name', 'taxon_group'])
taxa_df.head()

Unnamed: 0,taxon_name,taxon_group
0,Acarinina pentacamerata,planktic_forams
1,Planorotalites capdevilensis,planktic_forams
2,Globigerinelloides blowi (cf.),planktic_forams
3,Muricohedbergella hoelzli,planktic_forams
4,Bolivinella australis,benthic_forams


In [15]:
taxa_df.to_csv(taxa_list_path, index=False, mode='a')