# QA NOAA Janus files

In [1]:
import sys
sys.path.append('../scripts/')
import glob
from pathlib import Path
import os

import pandas as pd

from normalize_noaa_files import (
    unique_filenames,
    unique_columns,
    filename_index,
    format_filepaths_set
)

## JanusIODP_paleo_agemodel

In [2]:
# Glob pattern for the paleo/age-model CSVs: exactly five directory levels below
# the dataset folder. glob.glob is called without recursive=True, so each '**'
# matches a single path component, just like '*'.
path = os.path.join(
    'raw_data', 'NOAA_csv', 'JanusIODP_paleo_agemodel',
    *['**'] * 5,
    '*.csv',
)

csv_paths = glob.glob(path)
print(f'files {len(csv_paths)}')

files 2481


### files grouped by expedition and file type

In [3]:
# Group the CSV file names of every expedition into two sets:
# age-model tables and taxa tables.
contents = {}

# Position of the file name within a path's parts, as computed by the project
# helper from the glob pattern.
index = filename_index(path)

# File names that identify age-model tables; any other file name is counted as
# a taxa table. Built once, outside the loop.
age_model_filenames = {
    'Age_Model_Shipboard.csv',
    'Age_Model_Initial_Reports.csv',
    'Age_Model_Post_Moratorium.csv',
    'Age_Model_Shipboard_Report.csv',
    'Age_Model_Initial_Report.csv',
}

# Use a dedicated loop variable: iterating with `path` would overwrite the glob
# pattern, so re-running this cell would hand a file path to filename_index.
for csv_path in csv_paths:
    parts = Path(csv_path).parts
    # NOTE(review): hard-coded position of the expedition directory for this
    # dataset's layout — confirm if the folder structure changes.
    exp = parts[5]
    filename = parts[index]

    groups = contents.setdefault(exp, {'taxa': set(), 'age_model': set()})
    kind = 'age_model' if filename in age_model_filenames else 'taxa'
    groups[kind].add(filename)
        

In [4]:
# Flatten the per-expedition grouping into three parallel lists, one entry per
# expedition; the file-name sets are formatted by the project helper
# format_filepaths_set.
exps, taxa, ages = [], [], []

for expedition, groups in contents.items():
    exps.append(expedition)
    taxa.append(format_filepaths_set(groups, 'taxa'))
    ages.append(format_filepaths_set(groups, 'age_model'))

In [5]:
# Build the summary table: one row per expedition with its taxa files and its
# age-model files. The column mapping is passed inline rather than stored in a
# variable named `dict`, which would shadow the built-in for the rest of the
# notebook session.
df = pd.DataFrame({
    "expedition": exps,
    "taxa": taxa,
    "age_model": ages,
})
df.head()

Unnamed: 0,expedition,taxa,age_model
0,135,"Nannofossils.csv,Planktonic_Foraminifers.csv,B...",Age_Model_Initial_Report.csv
1,104,"Pollen_Spores.csv,Benthic_Foraminifers.csv,Pla...",Age_Model_Initial_Reports.csv
2,168,Nannofossils.csv,Age_Model_Initial_Report.csv
3,157,"Nannofossils.csv,Planktonic_Foraminifers.csv","Age_Model_Initial_Report.csv,Age_Model_Shipboa..."
4,150,"Diatoms.csv,Nannofossils.csv,Dinoflagellates_A...",Age_Model_Initial_Report.csv


In [6]:
# Save the grouped-files table without the index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('tmp', exist_ok=True)
df.to_csv(os.path.join('tmp', 'noaa_JanusIODP_grouped_files.csv'), index=False)

### unique file names

In [7]:
# Show the distinct CSV file names found in csv_paths (project helper
# unique_filenames). Near-duplicate names in the result, e.g. ones that differ
# only by a space or an underscore, point to inconsistent naming in the raw data.
unique_filenames(csv_paths)

{'Age_Model_Initial_Report.csv',
 'Age_Model_Initial_Reports.csv',
 'Age_Model_Post_Moratorium.csv',
 'Age_Model_Shipboard.csv',
 'Age_Model_Shipboard_Report.csv',
 'Benthic Foraminifers.csv',
 'Benthic_Foraminifers.csv',
 'Bolboforms.csv',
 'Diatoms.csv',
 'Dinoflagellates_Acritarch_Prasinophytes.csv',
 'Dinoflagellates_Acritarchs_Prasinophytes.csv',
 'Macrofossils.csv',
 'Miscellaneous.csv',
 'Nannofossils .csv',
 'Nannofossils.csv',
 'Ostracodes.csv',
 'Planktonic Foraminifers.csv',
 'Planktonic_Foraminifers .csv',
 'Planktonic_Foraminifers.csv',
 'Pollen_Spores.csv',
 'Pteropods.csv',
 'Radiolarians.csv',
 'Silicoflagellates_Ebridians_Actiniscidians.csv',
 'Sponge_Spicules.csv',
 'Trace_Fossils.csv'}

### column names

In [8]:
# Gather the distinct column names for the files in csv_paths (project helper
# unique_columns) and show how many there are.
columns = unique_columns(csv_paths)
len(columns)

12991

## Janus_core_data

In [9]:
# Glob pattern for the Janus core data CSVs: exactly two directory levels below
# the dataset folder ('**' matches one path component here because glob.glob is
# called without recursive=True).
pattern_parts = ['raw_data', 'NOAA_csv', 'Janus_core_data', '**', '**', '*.csv']
path = os.path.join(*pattern_parts)
csv_paths = glob.glob(path)
print(f'files {len(csv_paths)}')

files 325


### files grouped by expedition and file type

In [10]:
# Collect the distinct values found at position 5 of each CSV path, in
# first-seen order, and pair every one with the same placeholder file name.
# NOTE(review): the recorded output of this notebook shows that position 5 is
# the CSV file name for this dataset (e.g.
# 'sed_lith_104_642b_delimited_pre-janus.csv'), not an expedition — confirm
# which path component was intended.
exps = []

for path in csv_paths:
    exp = Path(path).parts[5]
    if exp in exps:
        continue
    exps.append(exp)

# One placeholder entry per collected value.
files = ['xxx-delimited_pre-janus.csv'] * len(exps)

In [11]:
# Build the summary table for the core data: the collected path component next
# to its lithology file-name placeholder. The column mapping is passed inline
# rather than stored in a variable named `dict`, which would shadow the
# built-in for the rest of the notebook session.
df = pd.DataFrame({
    "expedition": exps,
    "lithology": files,
})
df.head()

Unnamed: 0,expedition,lithology
0,sed_lith_104_642b_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
1,sed_lith_104_642e_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
2,sed_lith_104_644a_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
3,sed_lith_104_642d_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
4,sed_lith_104_642c_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv


In [12]:
# Save the core-data table without the index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('tmp', exist_ok=True)
df.to_csv(os.path.join('tmp', 'noaa_Janus_grouped_files.csv'), index=False)

### unique file names

In [13]:
# unique_filenames(csv_paths)

### column names

In [14]:
# Show the distinct column names for the core-data files in csv_paths (project
# helper unique_columns). 'Unnamed: N' entries in the result are the names
# pandas gives to columns with an empty header — presumably trailing empty
# columns in some files; verify against the raw CSVs.
unique_columns(csv_paths)

{'Unnamed: 30',
 'Unnamed: 31',
 'Unnamed: 32',
 'Unnamed: 33',
 'Unnamed: 34',
 'bottom interval',
 'bottom interval depth below sea floor',
 'code',
 'color',
 'color number',
 'core',
 'coretype',
 'depth',
 'drilling deformities',
 'hole',
 'induration',
 'latitude',
 'leg',
 'lithology',
 'longitude',
 'minerals',
 'more data available',
 'observer',
 'other',
 'paleontology',
 'piece number bottom',
 'piece number top',
 'section',
 'site',
 'structures',
 'sub piece bottom',
 'sub piece top',
 'top interval',
 'top interval depth below sea floor',
 'unusual occurrences'}