# Create non-taxa csv

Create a CSV that contains the non-taxa columns.

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

In [2]:
# Files under cleaned_data/. non_taxa_fields_path is the CSV written by the
# to_csv cell of this notebook; taxa_list_path is declared alongside it.
non_taxa_fields_path = 'cleaned_data/non_taxa_fields.csv'
taxa_list_path = 'cleaned_data/taxa_list.csv'

In [3]:
# Inputs for the Micropal_CSV_1 batch: the folder holding the cleaned CSVs and
# the metadata CSV whose `file` column names the files to scan in that folder.
clean_data_path = 'cleaned_data/Micropal_CSV_1'
metadata_file = 'cleaned_data/metadata/Micropal_1_changes.csv'

In [21]:
# Inputs for the Micropal_CSV_2 batch.
# NOTE(review): these assignments rebind the same two names as the
# Micropal_CSV_1 cell, so whichever of the two cells ran last decides which
# batch the cells below scan. On a top-to-bottom run that is this one.
clean_data_path = 'cleaned_data/Micropal_CSV_2'
metadata_file = 'cleaned_data/metadata/Micropal_2_changes.csv'

In [4]:
# Load the per-file metadata table for the selected batch and preview its
# first five rows.
metadata = pd.read_csv(filepath_or_buffer=metadata_file)
metadata.head(5)

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_values
0,363-U1482A-Benthic_Forams.csv,benthic_forams,True,False,False,False,False
1,320_U1336A_Nannofossils_2.csv,nannofossils,False,True,False,False,True
2,363-U1482A-nannofossils.csv,nannofossils,True,False,False,False,True
3,375_U1518F_planktic_forams.csv,planktic_forams,True,False,False,False,False
4,320_U1334A_Nannofossils_1.csv,nannofossils,False,True,False,False,True


## Get non-taxa columns

The Micropal CSVs have non-taxa columns, then taxa columns, then non-taxa columns. We want to form a set of non-taxa columns.

Get the column names from the beginning and end of the header row. Manually inspect the columns to select the non-taxa columns.

In [5]:
# Column names classified as sample fields; they are written to the output
# CSV with type 'sample'.
sample_columns = {
    'Sample',
    'Exp',
    'Site',
    'Hole',
    'Core',
    'Core-Sect',
    'Extra Sample ID Data',
    'Type',
    'Section',
    'A/W',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
}
len(sample_columns)

14

In [6]:
# Column names classified as taxonomy-metadata fields; they are written to the
# output CSV with type 'taxonomy metadata'. Near-duplicates that differ only in
# case, spacing or bracket style are listed separately on purpose: membership
# is an exact string match.
# NOTE(review): the two entries that contain a comma
# ('Aspect comment (etching),Comment (general)' and
# 'Zone group,Type (upper zone)') look like two headers fused together, and
# 'Genus/species lower zone)' has no opening parenthesis. Confirm they match
# real headers in the source CSVs.
taxa_metadata_columns = {
    '% Planktic Foraminifera within whole sample',
    'Abundance',
    'Additional zone name',
    'Additional zone name (short)',
    'Age',
    'Aspect comment (etching)',
    'Aspect comment (etching),Comment (general)',
    'BF comment',
    'Chrysophyte cyst group abundance',
    'COMMENTS',
    'Comment',
    'Comment (general)',
    'Comments',
    'Datum age average [Ma]',
    'Datum age maximum [Ma]',
    'Datum age minimum [Ma]',
    'Datum author year',
    'Datum comment',
    'Datum group',
    'Datum group code',
    'Datum name',
    'Datum name generic',
    'Datum occurrence',
    'Datum region',
    'Datum status',
    'Datum type',
    'Datum validation comment',
    'Diatoms and siliceous plankton comment',
    'Diatom preservation - pyritization2',
    'Diatom preservation dissolution',
    'Diatom preservation fragmentation',
    'Diatoms group abundance',
    'Ebridian group abundance',
    'File Data',
    'Foram abundance',
    'Fragmentation',
    'Fragmentation rank [auto-pop]',
    'General comment',
    'Genus/species (upper zone)',
    'Genus/species lower zone)',
    'Group Abundance',
    'Group abundance',
    'Large Benthic Forams [%]',
    'Lower boundary age av. [Ma]',
    'Lower boundary age max [Ma]',
    'Lower boundary age min [Ma]',
    'Mixing',
    'Nannofossil abundance',
    'Nannofossil comment',
    'No. specimens/tray',
    'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
    'Percentage of benthic forams in total foram assemblage [%]',
    'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
    'Percentage of planktic forams in total foram assemblage [%]',
    'PF Preservation',
    'Planktonic Benthic ratio (P:B)',
    'Preservation',
    'Reworking comment (1= <1%, 2= light 1-10%, 3= >10%)',
    'Reworking comment (1= <1%, 2=light 1-10%, 3= >10%)',
    'Sample comment',
    'Sample preparation comment',
    'Ship File Links',
    'Shore File Links',
    'Silicoflagellates group abundance',
    'Temperature Range',
    'Total in situ dinocysts',
    'Type (lower zone)',
    'Type (upper zone)',
    'Upper boundary age av. [Ma]',
    'Upper boundary age max [Ma]',
    'Upper boundary age min [Ma]',
    'Zone',
    'Zone author (year)',
    'Zone comment',
    'Zone group',
    'Zone group,Type (upper zone)',
    'Zone name',
    'Zone name (short)',
    'Zone name [short]',
    'Zone status',
}

len(taxa_metadata_columns)

80

In [7]:
# Every column name classified so far as non-taxa: the sample fields together
# with the taxonomy-metadata fields (a new set; neither input is modified).
non_taxa_columns = sample_columns | taxa_metadata_columns

For each file, collect the first five columns returned by `get_taxonomy_columns` (the beginning of the file) so they can be inspected manually.

In [8]:
# Walk every file named in the metadata table and pool the first five entries
# that get_taxonomy_columns returns for it, for manual inspection.
filtered_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    # Columns that contain no values at all are dropped before filtering.
    content = pd.read_csv(path).dropna(axis="columns", how="all")
    columns = get_taxonomy_columns(content.columns, non_taxa_columns)
    filtered_columns.update(columns[:5])

filtered_columns

{'Acarinina bullbrooki (Bolli, 1957 p.167, pl.38, figs.5a-b as Globorotalia)',
 'Acarinina pseudosubsphaerica Pearson and Berggren, 2006',
 'Achnanthes sp.',
 'Acritarchs',
 'Acrocubus octopylus',
 'Actinocyclus actinochilus',
 'Actinocyclus curvatulus',
 'Actinocyclus dimorphus',
 'Actinocyclus fasciculatus',
 'Actinocyclus ingens',
 'Actinocyclus ingens  var. nodus',
 'Actinocyclus ingens var. ingens',
 'Actinocyclus ingens var. ovalis',
 'Actinocyclus karstenii',
 'Actinocyclus octonarius',
 'Actinocyclus sp. cf. A. actinochilus early form',
 'Actinocyclus spp.',
 'Actinoptychus senarius',
 'Ammobaculites agglutinans',
 'Ammodochium rectangulare',
 'Amphicoryna scalaris',
 'Amphisorus hemprichii',
 'Anomalinoides globulosus',
 'Anthocyrtidium angulare',
 'Anthocyrtoma spp.',
 'Arachnoidiscus spp.',
 'Archaeosphaeridium australensis',
 'Arkhangelskiella cymbiformis',
 'Astalocus bradyi',
 'Asteromphalus parvulus',
 'Asteromphalus spp.',
 'Azpeitia tabularis',
 'Beella digitata',
 'Bi

For each file, collect the last five columns returned by `get_taxonomy_columns` (the end of the file) so they can be inspected manually.

In [9]:
# Walk every file named in the metadata table and pool the last five entries
# that get_taxonomy_columns returns for it, for manual inspection.
filtered_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    # Columns that contain no values at all are dropped before filtering.
    content = pd.read_csv(path).dropna(axis="columns", how="all")
    columns = get_taxonomy_columns(content.columns, non_taxa_columns)
    filtered_columns.update(columns[-5:])

filtered_columns

{'"Globigerina" ciperoensis _T',
 'Acrocubus octopylus ',
 'Ammodochium rectangulare',
 'Anthocyrtoma spp.',
 'Archaeosphaeridium australensis',
 'Artophormis gracilis',
 'Beella digitata',
 'Bolivinita quadrilatera',
 'Bolivinita quadrilatera ',
 'Botryostrobus miralestensis',
 'Braarudosphaera bigelowii',
 'Bulimina striata',
 'Bulimina truncanella',
 'Calcidiscus pataecus',
 'Calcidiscus tropicus',
 'Calciosolenia murrayi',
 'Calocyclas turris',
 'Calocycloma castum',
 'Candeina nitida',
 'Candeina praenitida',
 'Carpocanopsis cristata',
 'Cassigerinella chipolensis _T',
 'Catapsydrax dissimilis',
 'Ceratolithus atlanticus',
 'Ceratolithus cristatus',
 'Coccolithus pelagicus',
 'Corbisema flexuosa',
 'Cryptocarpium azyx',
 'Cyrtocapsella cornuta',
 'Cyrtocapsella japonica',
 'Cyrtocapsella tetrapera',
 'Dentoglobigerina pseudovenezuelana (Blow and Banner, 1962)',
 'Dicty mongolferi',
 'Dictyocha deflandrei',
 'Dictyocha frenguellii',
 'Dictyoprora mongolfieri',
 'Didymocyrtis bassan

## Create non-taxa csv

In [10]:
# One row per taxonomy-metadata column name; the scalar 'type' value is
# broadcast to every row.
df = pd.DataFrame({
    'field': list(taxa_metadata_columns),
    'type': 'taxonomy metadata',
})

In [11]:
# One row per sample column name; the scalar 'type' value is broadcast to
# every row.
df2 = pd.DataFrame({
    'field': list(sample_columns),
    'type': 'sample',
})

In [12]:
# Stack the taxonomy-metadata rows and the sample rows into one table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent, and ignore_index=True renumbers the combined
# rows 0..n-1 exactly as append(..., ignore_index=True) did.
# Re-running this cell on its own stacks the sample rows again, so re-run the
# cell that builds `df` first.
df = pd.concat([df, df2], ignore_index=True)

In [13]:
# (rows, columns) of the combined table, displayed as a quick sanity check.
df.shape

(94, 2)

In [14]:
# Order the rows by type, then by field name (original row labels are kept),
# and preview the first five rows.
df = df.sort_values(by=['type', 'field'])
df.head(5)

Unnamed: 0,field,type
91,A/W,sample
89,Bottom Depth [m],sample
88,Bottom [cm],sample
85,Core,sample
93,Core-Sect,sample


In [15]:
# Write the field/type table to disk; the row index is left out of the file.
df.to_csv(path_or_buf=non_taxa_fields_path, index=False)