# QA NOAA Janus IODP metadata
## 101-210 taxa, 101-190 age

Get basic metadata (file names, column names) about the NOAA Janus IODP dataset. Create a CSV file that lists all the files.

NOAA/JanusIODP_paleo_agemodel  
expedition 101-210 
taxa 101-210, age models 101-190

In [1]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os
import re 
import pandas as pd
import numpy as np
import shutil

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set,
    qa_files_for_paths,
    column_counts_for_paths
)
import scripts.space_delim as sd
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR


In [2]:
# Root of the cleaned data, and the NOAA Janus IODP dataset directory under it.
base_dir = CLEAN_DATA_DIR
data_dir = base_dir.joinpath('NOAA', 'JanusIODP_paleo_agemodel')
# Destination of the file listing written at the end of this notebook.
metadata_path = OUTPUT_DIR.joinpath('metadata', 'NOAA', 'noaa_janus_iodp_files.csv')

In [3]:
# Every CSV file found at any depth under the dataset directory.
csv_paths = [*data_dir.glob("**/*.csv")]
print('files', len(csv_paths))

files 2481


In [4]:
# CSV files found under the range_tables directory (the taxa files).
taxa_csv_paths = [*data_dir.glob("paleontology/range_tables/**/*.csv")]
print('files', len(taxa_csv_paths))

files 2045


In [5]:
# CSV files found under the age_models directory.
age_csv_paths = [*data_dir.glob("paleontology/age_models/**/*.csv")]
print('files', len(age_csv_paths))

files 436


## unique file names

Get all the file names.

In [6]:
# Collect the file names that occur among the taxa CSV paths; the bare
# `files` expression displays the resulting set as the cell output.
files = unique_filenames_for_paths(taxa_csv_paths)
files

{'Benthic Foraminifers.csv',
 'Benthic_Foraminifers.csv',
 'Bolboforms.csv',
 'Diatoms.csv',
 'Dinoflagellates_Acritarch_Prasinophytes.csv',
 'Dinoflagellates_Acritarchs_Prasinophytes.csv',
 'Macrofossils.csv',
 'Miscellaneous.csv',
 'Nannofossils .csv',
 'Nannofossils.csv',
 'Ostracodes.csv',
 'Planktonic Foraminifers.csv',
 'Planktonic_Foraminifers .csv',
 'Planktonic_Foraminifers.csv',
 'Pollen_Spores.csv',
 'Pteropods.csv',
 'Radiolarians.csv',
 'Silicoflagellates_Ebridians_Actiniscidians.csv',
 'Sponge_Spicules.csv',
 'Trace_Fossils.csv'}

In [7]:
# Number of different taxa file names collected into `files`.
len(files)

20

In [8]:
# Display the file names that occur among the age model CSV paths.
unique_filenames_for_paths(age_csv_paths)

{'Age_Model_Initial_Report.csv',
 'Age_Model_Initial_Reports.csv',
 'Age_Model_Post_Moratorium.csv',
 'Age_Model_Shipboard.csv',
 'Age_Model_Shipboard_Report.csv'}

## column names

In [9]:
# Gather the column names used by the taxa CSVs and display how many there are.
# NOTE(review): presumably these are the distinct header names across all
# files -- confirm against scripts.normalize_noaa_files.
taxa_columns = unique_columns_for_paths(taxa_csv_paths)
len(taxa_columns)

12979

## file list

Create a CSV file that lists all the files in this dataset.

In [10]:
# Build one record per CSV file: its path relative to base_dir, whether it is
# an age model or a taxa range table, the taxon group (taxa files only), and
# the expedition and site taken from the directory structure.
file_list = []

# Maps each raw taxa file name -- including the variants with spaces, trailing
# spaces, or singular/plural differences seen in the dataset -- to the taxon
# group label stored in the metadata.
switch = {
    'Benthic Foraminifers.csv': 'benthic_foraminfera',
    'Benthic_Foraminifers.csv': 'benthic_foraminfera',
    'Bolboforms.csv': 'bolboformids',
    'Diatoms.csv': 'diatoms',
    'Dinoflagellates_Acritarch_Prasinophytes.csv': 'dinoflagellates/acritarchs/prasinophytes',
    'Dinoflagellates_Acritarchs_Prasinophytes.csv': 'dinoflagellates/acritarchs/prasinophytes',
    'Macrofossils.csv': 'macrofossils',
    'Miscellaneous.csv': 'miscellaneous',
    'Nannofossils .csv': 'nannofossils',
    'Nannofossils.csv': 'nannofossils',
    'Ostracodes.csv': 'ostracods',
    'Planktonic Foraminifers.csv': 'planktic_foraminfera',
    'Planktonic_Foraminifers .csv': 'planktic_foraminfera',
    'Planktonic_Foraminifers.csv': 'planktic_foraminfera',
    'Pollen_Spores.csv': 'pollen',
    'Pteropods.csv': 'pteropods',
    'Radiolarians.csv': 'radiolarians',
    'Silicoflagellates_Ebridians_Actiniscidians.csv': 'silicoflagellates/ebridians/actiniscidians',
    'Sponge_Spicules.csv': 'sponge_spicules',
    'Trace_Fossils.csv': 'trace_fossils'
}
# NOTE(review): `index` is not used in this cell; it is kept in case other
# cells rely on it -- confirm and remove if it is dead.
index = filename_index(csv_paths[0])

for path in csv_paths:
    relative_path = path.relative_to(base_dir)
    path_parts = relative_path.parts
    filename = relative_path.name

    # Age model files are the ones whose name starts with "Age_"; every other
    # CSV is treated as a taxa range table. The variable is named `file_type`
    # so the builtin `type` is not shadowed for the rest of the notebook.
    file_type = 'age' if filename.startswith('Age_') else 'taxa'

    file_data = {'path': relative_path, 'type': file_type}

    if file_type == 'taxa':
        # File names missing from `switch` get NaN as their taxon group.
        file_data['taxon_group'] = switch.get(filename, np.nan)

    # Relative to base_dir the parts are
    # NOAA / JanusIODP_paleo_agemodel / paleontology / <table dir> / <expedition> / <site> / ...
    file_data['expedition'] = path_parts[4]
    file_data['site'] = path_parts[5]

    file_list.append(file_data)
 

In [11]:
# Turn the per-file records into a table ordered by expedition, site, file
# type and taxon group, then preview the first rows.
df = pd.DataFrame(file_list).sort_values(
    by=['expedition', 'site', 'type', 'taxon_group']
)
df.head()

Unnamed: 0,path,type,expedition,site,taxon_group
1868,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_foraminfera
1871,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_foraminfera
1867,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
1870,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
1869,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,planktic_foraminfera


In [12]:
# Write the file listing to metadata_path; index=False leaves the DataFrame
# index out of the CSV.
# NOTE(review): assumes the parent directory of metadata_path already exists.
df.to_csv(metadata_path, index=False)