# Create non-taxa csv

Create a CSV that contains the non-taxa columns.

In [1]:
import sys
sys.path.append('../scripts/')
import glob
import re
import os.path

import pandas as pd
import numpy as np

from normalize_data import (
    csv_cleanup,
    update_metadata,
    get_taxonomy_columns,
)

In [2]:
# Root of the cleaned data tree; every path below is relative to it.
base_directory = 'cleaned_data'

# Output file that this notebook writes in its final cell.
non_taxa_fields_path = os.path.join(
    base_directory, 'taxa', 'draft', 'LIMS', 'non_taxa_fields.csv'
)

# Directories holding the cleaned Micropal CSV batches.
micropal_1 = os.path.join(base_directory, 'Micropal_CSV_1')
micropal_2 = os.path.join(base_directory, 'Micropal_CSV_2')
micropal_3 = os.path.join(base_directory, 'Micropal_CSV_3')
micropal_4 = os.path.join(base_directory, 'Micropal_CSV_revised')

# Change-log CSVs, one per batch, all stored under metadata/LIMS.
_lims_metadata_directory = os.path.join(base_directory, 'metadata', 'LIMS')
micropal_meta_1 = os.path.join(_lims_metadata_directory, 'Micropal_1_changes.csv')
micropal_meta_2 = os.path.join(_lims_metadata_directory, 'Micropal_2_changes.csv')
micropal_meta_3 = os.path.join(_lims_metadata_directory, 'Micropal_3_changes.csv')
micropal_meta_4 = os.path.join(_lims_metadata_directory, 'Micropal_revised_changes.csv')

In [3]:
# Pick the batch to process: the data directory and its matching change log
# must always be switched together.
clean_data_path, metadata_file = micropal_4, micropal_meta_4

In [4]:
# Load the change log for the selected batch; its 'file' column lists the
# CSVs that the later cells iterate over.
metadata = pd.read_csv(metadata_file)
metadata.head(n=5)

Unnamed: 0,file,taxon_group,add_expedition_section_cols,update_sample_col,update_top_bottom,add_missing_cols,clean_up_taxa_metadata_values,update_zones,add_missing_zone
0,363-U1482A-nannofossils_revised.csv,nannofossils,True,False,False,False,True,False,True


## Get non-taxa columns

The Micropal CSVs have non-taxa columns, then taxa columns, then non-taxa columns. We want to form a set of non-taxa columns.

Get the column names from the beginning and end of the header row. Manually inspect the columns to select the non-taxa columns.

In [5]:
# Column names that identify a sample (written out with type 'sample').
sample_columns = {
    'Sample',
    'Exp',
    'Site',
    'Hole',
    'Core',
    'Core-Sect',
    'Extra Sample ID Data',
    'Type',
    'Section',
    'A/W',
    'Top [cm]',
    'Bottom [cm]',
    'Top Depth [m]',
    'Bottom Depth [m]',
}
len(sample_columns)

14

In [6]:
# Column names that describe the taxa data (abundance, preservation, zones,
# datums, comments, ...) rather than naming a taxon; written out with type
# 'taxonomy metadata'.
# NOTE(review): near-duplicate spellings (e.g. 'Comments' / 'COMMENTS' /
# 'comments') are kept as separate entries, presumably because they are
# matched verbatim against the CSV headers -- do not normalize them here.
taxa_metadata_columns = {
    '% Planktic Foraminifera within whole sample',
    'Abundance',
    'Additional zone name',
    'Additional zone name (short)',
    'Age',
    'Aspect comment (etching)',
    'Aspect comment (etching),Comment (general)',
    'BF Group abundance',
    'BF Preservation',
    'BF comment',
    'BF preservation',
    'COMMENTS',
    'Chrysophyte cyst group abundance',
    'Comment',
    'Comment (general)',
    'Comments',
    'Datum age average [Ma]',
    'Datum age maximum [Ma]',
    'Datum age minimum [Ma]',
    'Datum author year',
    'Datum comment',
    'Datum group',
    'Datum group code',
    'Datum name',
    'Datum name generic',
    'Datum occurrence',
    'Datum region',
    'Datum status',
    'Datum type',
    'Datum validation comment',
    'Diatom abundance',
    'Diatom preservation - pyritization2',
    'Diatom preservation dissolution',
    'Diatom preservation fragmentation',
    'Diatoms and siliceous plankton comment',
    'Diatoms group abundance',
    'Ebridian group abundance',
    'File Data',
    'Foram abundance',
    'Fragmentation',
    'Fragmentation rank [auto-pop]',
    'General comment',
    'Genus/species (upper zone)',
    'Genus/species lower zone)',
    'Group Abundance',
    'Group abundance',
    'Group abundance (%)',
    'Group preservation',
    'Large Benthic Forams [%]',
    'Lower boundary age av. [Ma]',
    'Lower boundary age max [Ma]',
    'Lower boundary age min [Ma]',
    'Mixing',
    'Nannofossil abundance',
    'Nannofossil comment',
    'No. specimens/tray',
    'PALEO WATER DEPTH (IS=inner shelf, MS=middle shelf, OS=outer shelf)',
    'PF Group Abundance',
    'PF Preservation',
    'PF Zone',
    'PF group abundance',
    'PF preservation',
    'Percentage of benthic forams in total foram assemblage [%]',
    'Percentage of non-calcareous agglutinated forams in total foram assemblage [%]',
    'Percentage of planktic forams in total foram assemblage [%]',
    'Piece',
    'Planktonic Benthic ratio (P:B)',
    'Pleurostomellids comment',
    'Preservation',
    'Pteropod group abundance',
    'Reworking comment (1= <1%, 2= light 1-10%, 3= >10%)',
    'Reworking comment (1= <1%, 2=light 1-10%, 3= >10%)',
    'Sample comment',
    'Sample preparation comment',
    'Ship File Links',
    'Shore File Links',
    'Sillicoflagellate abundance',
    'Silicoflagellates group abundance',
    'Temperature Range',
    'Total in situ dinocysts',
    'Type (lower zone)',
    'Type (upper zone)',
    'Upper boundary age av. [Ma]',
    'Upper boundary age max [Ma]',
    'Upper boundary age min [Ma]',
    'XBroken',
    'XCorroded',
    'XCrumpled',
    'Zone',
    'Zone author (year)',
    'Zone comment',
    'Zone group',
    'Zone group,Type (upper zone)',
    'Zone name',
    'Zone name (short)',
    'Zone name [short]',
    'Zone status',
    'comments',
    'constituent',
    'count',
    'count_type',
    'dupes and comments',
    'pc_abundance_name_mode',
    'pc_fossil_group',
    'pc_fossil_name',
    'pc_preservation_name_average',
    'physical_constituent_name',
}

len(taxa_metadata_columns)

107

In [7]:
# Every known non-taxa column: sample identifiers plus taxonomy metadata.
non_taxa_columns = sample_columns.union(taxa_metadata_columns)

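Get the first five columns of each file that remain after removing the known non-taxa columns. Any non-taxa column that shows up here is missing from the lists above.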

In [8]:
# Gather, across all files in the metadata, the first five columns that are
# left once the known non-taxa columns are filtered out.
filtered_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    # Ignore columns that are entirely empty in this file.
    content = pd.read_csv(path).dropna(axis="columns", how="all")
    # NOTE(review): get_taxonomy_columns presumably returns the headers not
    # in non_taxa_columns, in file order -- confirm in normalize_data.
    columns = get_taxonomy_columns(content.columns, non_taxa_columns)
    filtered_columns |= set(columns[:5])

filtered_columns

{'Emiliania huxleyi',
 'Florisphaera profunda',
 'Gephyrocapsa caribbeanica',
 'Gephyrocapsa ericsonii',
 'Gephyrocapsa oceanica'}

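Get the last five columns of each file that remain after removing the known non-taxa columns. Any non-taxa column that shows up here is missing from the lists above.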

In [9]:
# Gather, across all files in the metadata, the last five columns that are
# left once the known non-taxa columns are filtered out.
filtered_columns = set()

for file in metadata['file']:
    path = f"{clean_data_path}/{file}"
    # Ignore columns that are entirely empty in this file.
    content = pd.read_csv(path).dropna(axis="columns", how="all")
    # NOTE(review): get_taxonomy_columns presumably returns the headers not
    # in non_taxa_columns, in file order -- confirm in normalize_data.
    columns = get_taxonomy_columns(content.columns, non_taxa_columns)
    filtered_columns |= set(columns[-5:])

filtered_columns

{'Reworked species',
 'Rhabdosphaera spp.',
 'Scyphosphaera intermedia',
 'Tetralithoides symeonidesii',
 'Umbilicosphaera hulburtiana'}

## Create non-taxa csv

In [10]:
# One row per taxonomy-metadata column; the scalar 'type' value is broadcast
# to every row.
df = pd.DataFrame({
    'field': list(taxa_metadata_columns),
    'type': 'taxonomy metadata',
})

In [11]:
# One row per sample column; the scalar 'type' value is broadcast to every row.
df2 = pd.DataFrame({
    'field': list(sample_columns),
    'type': 'sample',
})

In [12]:
# Stack the sample rows under the taxonomy-metadata rows with a fresh
# 0..n-1 index. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0, so pd.concat is used; it yields the same frame.
# NOTE: re-running this cell without re-running the two cells above appends
# the sample rows again.
df = pd.concat([df, df2], ignore_index=True)

In [13]:
# Sanity check: expect 14 sample + 107 taxonomy-metadata rows and 2 columns
# ('field', 'type').
df.shape

(121, 2)

In [14]:
# Order rows by type, then by field name, so the written CSV is stable no
# matter what order the sets were iterated in. The original index labels
# are kept.
df = df.sort_values(by=['type', 'field'])
df.head(n=5)

Unnamed: 0,field,type
119,A/W,sample
108,Bottom Depth [m],sample
109,Bottom [cm],sample
116,Core,sample
113,Core-Sect,sample


In [15]:
# Write the field/type table; the index is dropped because it only reflects
# the pre-sort row order.
df.to_csv(path_or_buf=non_taxa_fields_path, index=False)