# Create NOAA Janus IODP metadata
## 101-210 taxa, 101-190 age

Get basic metadata (file names, column names) about the NOAA Janus IODP dataset. Create a CSV that lists all the files.

NOAA/JanusIODP_paleo_agemodel  
expedition 101-210 
taxa 101-210, age models 101-190

In [2]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os
import re 
import pandas as pd
import numpy as np
import shutil

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set,
    qa_files_for_paths,
    column_counts_for_paths
)
import scripts.space_delim as sd
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR


In [3]:
# All paths stored in the metadata listing are made relative to base_dir.
base_dir = CLEAN_DATA_DIR
# Folder holding the cleaned NOAA Janus IODP csv files.
data_dir = base_dir.joinpath('NOAA', 'JanusIODP_paleo_agemodel')
# Destination of the csv that lists every file in the dataset.
metadata_path = OUTPUT_DIR.joinpath('metadata', 'NOAA', 'noaa_janus_iodp_files.csv')

In [4]:
# Every csv below data_dir, skipping the copies Jupyter keeps in
# .ipynb_checkpoints folders.
csv_paths = [
    csv_path
    for csv_path in data_dir.glob("**/*.csv")
    if '.ipynb_checkpoints' not in str(csv_path)
]

print('files', len(csv_paths))

files 2481


In [5]:
# Taxa range-table csv files only.
# Jupyter checkpoint copies are excluded with the same test used for
# csv_paths, so the per-file loops over this list never read a stale copy.
taxa_csv_paths = [
    csv_path
    for csv_path in data_dir.glob("paleontology/range_tables/**/*.csv")
    if '.ipynb_checkpoints' not in str(csv_path)
]
print('files', len(taxa_csv_paths))

files 2045


In [6]:
# Age-model csv files only.
# Jupyter checkpoint copies are excluded with the same test used for
# csv_paths, so this list stays a strict subset of csv_paths.
age_csv_paths = [
    csv_path
    for csv_path in data_dir.glob("paleontology/age_models/**/*.csv")
    if '.ipynb_checkpoints' not in str(csv_path)
]
print('files', len(age_csv_paths))

files 436


## unique file names

Get all the file names.

In [6]:
# Collect the distinct file names used by the taxa range tables.
# NOTE(review): unique_filenames_for_paths is a project helper; the recorded
# output shows a set of base names, including spelling/whitespace variants.
files = unique_filenames_for_paths(taxa_csv_paths)
files

{'Benthic Foraminifers.csv',
 'Benthic_Foraminifers.csv',
 'Bolboforms.csv',
 'Diatoms.csv',
 'Dinoflagellates_Acritarch_Prasinophytes.csv',
 'Dinoflagellates_Acritarchs_Prasinophytes.csv',
 'Macrofossils.csv',
 'Miscellaneous.csv',
 'Nannofossils .csv',
 'Nannofossils.csv',
 'Ostracodes.csv',
 'Planktonic Foraminifers.csv',
 'Planktonic_Foraminifers .csv',
 'Planktonic_Foraminifers.csv',
 'Pollen_Spores.csv',
 'Pteropods.csv',
 'Radiolarians.csv',
 'Silicoflagellates_Ebridians_Actiniscidians.csv',
 'Sponge_Spicules.csv',
 'Trace_Fossils.csv'}

In [7]:
# Number of distinct taxa file names (variants are counted separately).
len(files)

20

In [8]:
# Same file-name survey for the age-model files.
unique_filenames_for_paths(age_csv_paths)

{'Age_Model_Initial_Report.csv',
 'Age_Model_Initial_Reports.csv',
 'Age_Model_Post_Moratorium.csv',
 'Age_Model_Shipboard.csv',
 'Age_Model_Shipboard_Report.csv'}

## column names

In [9]:
# Distinct column names across all taxa files.
# NOTE(review): unique_columns_for_paths is a project helper; the large count
# in the recorded output suggests taxon names are stored as column headers.
taxa_columns = unique_columns_for_paths(taxa_csv_paths)
len(taxa_columns)

12979

Manually collect the common (non-taxon) columns by looking at the first 15 non-empty columns of every taxa file.

In [133]:
# Union of the leading column names of every taxa file. Header whitespace is
# left untouched here, so padded and unpadded spellings are kept apart.
all_cols = set()

for csv_path in taxa_csv_paths:
    table = (
        pd.read_csv(csv_path)
        .dropna(axis=1, how='all')   # discard columns with no values
        .dropna(axis=0, how='all')   # discard rows with no values
    )
    # Only the first 15 remaining headers are taken from each file.
    leading_columns = list(table.columns)[:15]
    all_cols |= set(leading_columns)
    

In [134]:
# Number of distinct headers found among the leading columns.
len(all_cols)

62

In [138]:
# Hand-picked headers that are shared metadata columns rather than taxon names.
# 'Fossil Group' is listed both plain and with trailing spaces because the
# headers collected into all_cols are not stripped.
common_cols = {
    "Age From (oldest)",
    "Age To (youngest)",
    "Comment",
    "Cor",
    "Data",
    "Depth (mbsf)",
    "Fossil Group",
    "Fossil Group                                 ",
    "Group Abundance",
    "Group Preservation",
    "H",
    "Leg",
    "Sc",
    "Scientist",
    "Site",
    "T",
    "Top(cm)",
    "Zone From (bottom)",
    "Zone To  (top)",
}
len(common_cols)

19

In [139]:
# Sanity check: every hand-picked common column actually occurs in the files.
common_cols - all_cols 

set()

In [140]:
# Leading-column headers that were not classified as common columns.
all_cols - common_cols

{'Antarctissa sp.',
 'Biddulphia tuomeyi',
 'Carpocanistrum sp.',
 'Cibicidoides spp.',
 'Globigerina quinqueloba',
 'Globorotalia truncatulinoides',
 'Helicosphaera carteri',
 'Hemiaulus sp.',
 'Larcopyle sp.',
 'Monaxons taxa',
 'Neogloboquadrina pachyderma',
 'Nonionellina flemingi',
 'Orbulina universa',
 'Paralia sulcata',
 'Patellina corrugata',
 'Planktonics miscellaneous',
 'Pseudoemiliania lacunosa',
 'Rectuvigerina ongleyi',
 'Reticulofenestra medium-sized',
 'Reticulofenestra minutula',
 'Reticulofenestra small-sized',
 'Sigmoilopsis schlumbergeri',
 'Sphaeroidinellopsis seminulina',
 'Sphenolithus abies',
 'Sphenolithus moriformis',
 'Sponge spicules',
 'Spongodiscus sp.',
 'Stephanopyxis turris turris',
 'Stylodictya sp.',
 'Thalassiosira inura',
 'Thalassiosira lentiginosa',
 'Thalassiosira oestrupii',
 'Thalassiosira spp.',
 'Thalassiothrix longissima',
 'Thecosphaera sp.',
 'Thecosphaera sp. (small)',
 'Triaxon taxa',
 'Triceraspyris antarctica',
 'Triceraspyris coronat

## check whether the taxon group in each file matches the taxon group in its file name

In [85]:
# Print every taxa file that has no 'Fossil Group' column once surrounding
# whitespace is removed from the headers.
for csv_path in taxa_csv_paths:
    table = (
        pd.read_csv(csv_path)
        .dropna(axis=1, how='all')
        .dropna(axis=0, how='all')
    )

    stripped_headers = [header.strip() for header in table.columns]
    has_group_column = 'Fossil Group' in stripped_headers
    if not has_group_column:
        print(csv_path)

In [104]:
# Compare the taxon group implied by each file name with the values found in
# that file's 'Fossil Group' column and print every disagreement.

# Group names containing "/" cannot be recovered from the file name by
# swapping underscores for spaces, so their expected value is listed here
# (one file-name variant also drops the plural "s" of "Acritarchs").
slash_group_for_filename = {
    'Dinoflagellates_Acritarch_Prasinophytes.csv': 'Dinoflagellates/Acritarchs/Prasinophytes',
    'Dinoflagellates_Acritarchs_Prasinophytes.csv': 'Dinoflagellates/Acritarchs/Prasinophytes',
    'Silicoflagellates_Ebridians_Actiniscidians.csv': 'Silicoflagellates/Ebridians/Actiniscidians',
    'Pollen_Spores.csv': 'Pollen/Spores',
}

for path in taxa_csv_paths:
    df = pd.read_csv(path)
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='all')

    df.columns = [col.strip() for col in df.columns]

    file_name_group = path.name.replace('.csv', '').replace('_', ' ').strip()
    # dropna(): an empty cell is read as float NaN, which has no .strip().
    # sorted(): makes the group that gets compared and printed deterministic
    # when a file holds several groups (set order varies between runs).
    file_data_groups = sorted({group.strip() for group in df['Fossil Group'].dropna()})

    if not file_data_groups:
        # Nothing to compare against; report instead of failing on [0].
        print(path, 'NO FOSSIL GROUP VALUES')
        continue

    if len(file_data_groups) > 1:
        print(path, 'MULTIPLE', file_data_groups)

    # With several groups the alphabetically first one is compared.
    data_group = file_data_groups[0]
    is_slash_group = path.name in slash_group_for_filename
    expected_group = slash_group_for_filename.get(path.name, file_name_group)

    if data_group == expected_group:
        continue

    print(path)
    if is_slash_group:
        print(file_name_group, '-', data_group, '\n')
    else:
        print(file_name_group, '-', data_group)
        # Sibling csv files help spot a table saved under the wrong name.
        print('files >>', [foo.name for foo in path.parent.glob('**/*.csv')], '\n')


../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/159/959/HOLE_A/Silicoflagellates_Ebridians_Actiniscidians.csv
Silicoflagellates Ebridians Actiniscidians - Planktonic Foraminifers 

../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1020/HOLE_A/Radiolarians.csv
Radiolarians - Nannofossils
files >> ['Nannofossils.csv', 'Radiolarians.csv'] 

../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/210/1276/HOLE_A/Planktonic_Foraminifers.csv
Planktonic Foraminifers - Benthic Foraminifers
files >> ['Planktonic_Foraminifers.csv'] 

../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/189/1170/HOLE_D/Nannofossils.csv
Nannofossils - Dinoflagellates/Acritarchs/Prasinophytes
files >> ['Nannofossils.csv', 'Benthic_Foraminifers.csv', 'Radiolarians.csv', 'Planktonic_Foraminifers.csv'] 

../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/126/787/HO

## create metadata file list

Create csv that lists all the files for this dataset.

In [114]:
# Gather every distinct 'Fossil Group' value found in the taxa files.
groups = set()

for csv_path in csv_paths:
    # Files whose name starts with 'Age_' are age models and are skipped.
    if csv_path.name.startswith('Age_'):
        continue

    table = (
        pd.read_csv(csv_path)
        .dropna(axis=1, how='all')
        .dropna(axis=0, how='all')
    )
    table.columns = [header.strip() for header in table.columns]

    stripped_groups = table['Fossil Group'].str.strip()
    groups.update(stripped_groups)

groups

{'Benthic Foraminifers',
 'Bolboforms',
 'Diatoms',
 'Dinoflagellates/Acritarchs/Prasinophytes',
 'Macrofossils',
 'Miscellaneous',
 'Nannofossils',
 'Ostracodes',
 'Planktonic Foraminifers',
 'Pollen/Spores',
 'Pteropods',
 'Radiolarians',
 'Silicoflagellates/Ebridians/Actiniscidians',
 'Sponge Spicules',
 'Trace Fossils'}

In [116]:
# Maps each 'Fossil Group' value found in the data to the taxon group name
# written to the metadata csv. The three slash-separated groups are mapped to
# themselves, i.e. they are passed through unchanged.
taxon_group_normalize = {
    "Benthic Foraminifers": "benthic_forams",
    "Bolboforms": "bolboformids",
    "Diatoms": "diatoms",
    "Dinoflagellates/Acritarchs/Prasinophytes": "Dinoflagellates/Acritarchs/Prasinophytes",
    "Macrofossils": "macrofossils",
    "Miscellaneous": "miscellaneous",
    "Nannofossils": "nannofossils",
    "Ostracodes": "ostracods",
    "Planktonic Foraminifers": "planktic_forams",
    "Pollen/Spores": "Pollen/Spores",
    "Pteropods": "pteropods",
    "Radiolarians": "radiolarians",
    "Silicoflagellates/Ebridians/Actiniscidians": "Silicoflagellates/Ebridians/Actiniscidians",
    "Sponge Spicules": "sponge_spicules",
    "Trace Fossils": "trace_fossils",
}



In [125]:
# Build one record per csv file: relative path, file type, taxon group (taxa
# files only), expedition and site.
file_list = []

# 'Fossil Group' spelling for file-name stems that cannot be recovered by
# swapping underscores for spaces (group names containing "/").
slash_group_for_stem = {
    'Dinoflagellates Acritarch Prasinophytes': 'Dinoflagellates/Acritarchs/Prasinophytes',
    'Dinoflagellates Acritarchs Prasinophytes': 'Dinoflagellates/Acritarchs/Prasinophytes',
    'Silicoflagellates Ebridians Actiniscidians': 'Silicoflagellates/Ebridians/Actiniscidians',
    'Pollen Spores': 'Pollen/Spores',
}

for path in csv_paths:
    relative_path = path.relative_to(base_dir)
    path_parts = relative_path.parts
    filename = relative_path.name

    # Files named 'Age_*' are age models; everything else is a taxa table.
    # (Named file_type so the builtin `type` is not shadowed.)
    file_type = 'age' if filename.startswith('Age_') else 'taxa'

    file_data = {'path': relative_path, 'type': file_type}

    if file_type == 'taxa':
        df = pd.read_csv(path)
        df = df.dropna(axis=1, how='all')
        df = df.dropna(axis=0, how='all')

        df.columns = [col.strip() for col in df.columns]

        # get all fossil groups; dropna() because an empty cell is read as
        # float NaN, which has no .strip()
        file_data_groups = {group.strip() for group in df['Fossil Group'].dropna()}

        if len(file_data_groups) == 1:
            group_name = next(iter(file_data_groups))
        else:
            # Zero or several groups in the data: fall back to the file name.
            # The stem has to be converted to the 'Fossil Group' spelling
            # first, otherwise it is never a key of taxon_group_normalize.
            stem_group = relative_path.stem.replace('_', ' ').strip()
            group_name = slash_group_for_stem.get(stem_group, stem_group)

        # An unknown group still raises KeyError so it cannot go unnoticed.
        file_data['taxon_group'] = taxon_group_normalize[group_name]

    # relative_path looks like
    # NOAA/JanusIODP_paleo_agemodel/paleontology/<table kind>/<expedition>/<site>/...
    file_data['expedition'] = path_parts[4]
    file_data['site'] = path_parts[5]

    file_list.append(file_data)
 

In [126]:
# Turn the per-file records into a table ordered by expedition, site, file
# type and taxon group.
# NOTE(review): expedition and site come from path components, so they are
# strings and sort lexicographically - confirm this ordering is intended.
sort_keys = ['expedition', 'site', 'type', 'taxon_group']
df = pd.DataFrame(file_list).sort_values(by=sort_keys)
df.head()

Unnamed: 0,path,type,expedition,site,taxon_group
1868,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
1871,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_forams
1867,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
1870,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
1869,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,planktic_forams


In [132]:
# Distinct taxon groups assigned to the taxa files (sanity check).
set(df.loc[df['type'] == 'taxa', 'taxon_group'])

{'Dinoflagellates/Acritarchs/Prasinophytes',
 'Pollen/Spores',
 'Silicoflagellates/Ebridians/Actiniscidians',
 'benthic_forams',
 'bolboformids',
 'diatoms',
 'macrofossils',
 'miscellaneous',
 'nannofossils',
 'ostracods',
 'planktic_forams',
 'pteropods',
 'radiolarians',
 'sponge_spicules',
 'trace_fossils'}

In [130]:
# Write the file listing without the DataFrame index.
# to_csv does not create missing folders, so make sure the destination exists.
metadata_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(metadata_path, index=False)