# QA NOAA Janus files

In [1]:
import sys
sys.path.append('../scripts/')
import glob
from pathlib import Path
import os

import pandas as pd

from normalize_noaa_files import (
    unique_filenames,
    unique_columns,
    filename_index,
    format_filepaths_set
)

## JanusIODP_paleo_agemodel

In [2]:
path = os.path.join('raw_data', 'NOAA_csv', 'JanusIODP_paleo_agemodel', '**', '**', 
                    '**', '**', '**', '*.csv')

csv_paths = glob.glob(path)
print('files', len(csv_paths))

files 2481


### files grouped by expedition and file type

In [3]:
contents = {}
index = filename_index(path)

for path in csv_paths:
    parts = Path(path).parts
    exp = parts[5]
    filename = parts[index]
        
    if exp not in contents:
        contents[exp] = {'taxa': set(),'age_model': set()}
        
    ages = ['Age_Model_Shipboard.csv', 'Age_Model_Initial_Reports.csv', 
            'Age_Model_Post_Moratorium.csv', 'Age_Model_Shipboard_Report.csv', 
            'Age_Model_Initial_Report.csv'
           ]
        
    if filename in ages:
        contents[exp]['age_model'].add(filename)
    else:
        contents[exp]['taxa'].add(filename)
        

In [4]:
exps = []
taxa = []
ages = []

for exp in contents.items():
    exps.append(exp[0])
    taxa.append(format_filepaths_set(exp[1], 'taxa'))
    ages.append(format_filepaths_set(exp[1], 'age_model'))

In [5]:
dict = {
    "expedition": exps,
    "taxa": taxa,
    "age_model": ages
}
df = pd.DataFrame(dict)
df.head()

Unnamed: 0,expedition,taxa,age_model
0,135,"Benthic_Foraminifers.csv,Nannofossils.csv,Plan...",Age_Model_Initial_Report.csv
1,104,"Macrofossils.csv,Dinoflagellates_Acritarch_Pra...",Age_Model_Initial_Reports.csv
2,168,Nannofossils.csv,Age_Model_Initial_Report.csv
3,157,"Planktonic_Foraminifers.csv,Nannofossils.csv","Age_Model_Shipboard.csv,Age_Model_Initial_Repo..."
4,150,"Dinoflagellates_Acritarch_Prasinophytes.csv,Na...",Age_Model_Initial_Report.csv


In [6]:
df.to_csv(os.path.join('tmp','noaa_JanusIODP_grouped_files.csv'), index=False)

### unique file names

In [7]:
unique_filenames(csv_paths)

{'Age_Model_Initial_Report.csv',
 'Age_Model_Initial_Reports.csv',
 'Age_Model_Post_Moratorium.csv',
 'Age_Model_Shipboard.csv',
 'Age_Model_Shipboard_Report.csv',
 'Benthic Foraminifers.csv',
 'Benthic_Foraminifers.csv',
 'Bolboforms.csv',
 'Diatoms.csv',
 'Dinoflagellates_Acritarch_Prasinophytes.csv',
 'Dinoflagellates_Acritarchs_Prasinophytes.csv',
 'Macrofossils.csv',
 'Miscellaneous.csv',
 'Nannofossils .csv',
 'Nannofossils.csv',
 'Ostracodes.csv',
 'Planktonic Foraminifers.csv',
 'Planktonic_Foraminifers .csv',
 'Planktonic_Foraminifers.csv',
 'Pollen_Spores.csv',
 'Pteropods.csv',
 'Radiolarians.csv',
 'Silicoflagellates_Ebridians_Actiniscidians.csv',
 'Sponge_Spicules.csv',
 'Trace_Fossils.csv'}

### column names

In [8]:
columns = unique_columns(csv_paths)
len(columns)

12991

## Miscellaneous.csv

In [9]:
for path in csv_paths:
    if 'Miscellaneous.csv' in path:
        link = 'https://github.com/eODP/data-processing/tree/master/notebooks/' + '/'.join(path.split('/'))
        print(link)

https://github.com/eODP/data-processing/tree/master/notebooks/raw_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/raw_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/644/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/raw_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/raw_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/747/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/raw_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/raw_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_C/Miscellaneous.csv
https://github.c

## Janus_core_data

In [10]:
path = os.path.join('raw_data', 'NOAA_csv', 'Janus_core_data', '**', '**', '*.csv')
csv_paths = glob.glob(path)
print('files', len(csv_paths))

files 325


### files grouped by expedition and file type

In [11]:
exps = []
files = []

for path in csv_paths:
    parts = Path(path).parts
    exp = parts[5]
    
    if exp not in exps:
        exps.append(exp)
        files.append('xxx-delimited_pre-janus.csv')

In [12]:
dict = { "expedition": exps,
        "lithology": files
       }
df = pd.DataFrame(dict)
df.head()

Unnamed: 0,expedition,lithology
0,sed_lith_104_642b_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
1,sed_lith_104_642e_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
2,sed_lith_104_644a_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
3,sed_lith_104_642d_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv
4,sed_lith_104_642c_delimited_pre-janus.csv,xxx-delimited_pre-janus.csv


In [13]:
df.to_csv(os.path.join('tmp', 'noaa_Janus_grouped_files.csv'), index=False)

### unique file names

In [14]:
# unique_filenames(csv_paths)

### column names

In [15]:
unique_columns(csv_paths)

{'Unnamed: 30',
 'Unnamed: 31',
 'Unnamed: 32',
 'Unnamed: 33',
 'Unnamed: 34',
 'bottom interval',
 'bottom interval depth below sea floor',
 'code',
 'color',
 'color number',
 'core',
 'coretype',
 'depth',
 'drilling deformities',
 'hole',
 'induration',
 'latitude',
 'leg',
 'lithology',
 'longitude',
 'minerals',
 'more data available',
 'observer',
 'other',
 'paleontology',
 'piece number bottom',
 'piece number top',
 'section',
 'site',
 'structures',
 'sub piece bottom',
 'sub piece top',
 'top interval',
 'top interval depth below sea floor',
 'unusual occurrences'}

## check minerals, paleontology, structures, unusual occurrences values

In [16]:
minerals = []
paleontology = []
structures = []
unusual_occurrences = []

for path in csv_paths:
    df = pd.read_csv(path, nrows=1)
    columns = df.columns
    if 'minerals' in columns:
        minerals.append(path)
    if 'paleontology' in columns:
        paleontology.append(path)
    if 'structures' in columns:
        structures.append(path)
    if 'unusual occurrences' in columns:
        unusual_occurrences.append(path)

In [17]:
print(len(csv_paths))
print(len(minerals))
print(len(paleontology))
print(len(structures))
print(len(unusual_occurrences))


325
325
325
325
325


In [18]:
def unique_values(column):
    return list(column.dropna().unique())

path = csv_paths[0]
print(path)

df = pd.read_csv(path)

unique_values(df['minerals'])

raw_data/NOAA_csv/Janus_core_data/104/642b/sed_lith_104_642b_delimited_pre-janus.csv


['fe/mn laminae',
 'fe dots (~57 cm); pyrite in burrows(8-16 cm)',
 'crystalline fragment dropstones',
 'pyrite in burrows',
 'pyrite (?) in mottling',
 'pyrite in mottling',
 'pyrite pockets',
 'pyrite in sandy zone(~134 cm);pyrite & volcanic ash(132 cm)',
 'volcanic ash? pocket (17 cm)',
 'pyrite infilled pores (?)',
 'pyrite in burrows (96 cm)',
 'pyrite in sandy patches (11, 15-17 cm)',
 'pyrite sandy patches (110-113, 127-131 cm)',
 'pyrite in burrows (123-126 cm)',
 'pyrite in burrow (~86 cm)',
 'pyrite in burrows (127 cm)',
 'pyrite in burrows (124-132, 138 cm)',
 'pyrite in burrows; volcanic ash (96-98,103 cm)',
 'pyrite in burrows, pyrite in color bands',
 'nannos',
 'volcanic ash (120-122 cm)',
 'pyrite concretion (50 cm)',
 'pyrite in burrows (0-89 cm)',
 'pyrite concretions (90, 110 cm)',
 'pyrite impregnation along burrows(0-76 cm)',
 'pyrite in burrows (50-150 cm)',
 'pyrite impregnations (11-150 cm)',
 'pyrite impregnations and concretion (111 cm)',
 'pyrite concretion (

In [19]:
unique_values(df['paleontology'])

['forams',
 'nanno',
 'nannos',
 'forams (100-120 cm)',
 'forams (0-122 cm)',
 'forams in sandy patches (135-145 cm)',
 'forams (130-150 cm)',
 'zoophycos trace fossil',
 'nanno, mollusc fragments (11 cm)',
 'nannos; forams',
 'forams visible',
 'forams (88-102 cm)',
 'visible forams (0-120 cm)',
 'diatoms',
 'forams (50 cm ?), diatoms']

In [20]:
unique_values(df['structures'])

['grayish color mottling (~27-30 cm);mottling(38-41 cm); minor bioturbation (38-41 cm)',
 'parallel laminae',
 'fine brownish laminae (~142-145 cm), dropstones',
 'moderate bioturbation/pyritized burrows (~114-117 cm), dropstone (~115cm)',
 'pyritized burrows/moderate bioturbation (8-16 cm)',
 'heavy bioturbation(144-150cm);2 mm burrows (144-150 cm); cross-stratification(138-144cm), dropstones (131, 139 cm)',
 'moderate bioturbation',
 'moderate (80-93 cm) to heavy (96-100 cm) bioturbation; large granite dropstones (~116cm)',
 'parallel laminae (119-127 cm), dropstones, small (119-127 cm), large (132 cm)',
 'laminae',
 'dropstones (~67,93,122cm), minor bioturbation (45-100 cm)',
 'dropstone? (~136 cm)',
 'minor bioturbation (14-55 cm)',
 'scattered dropstones (105-110 cm, 136-141 cm)',
 'black mottling/minor bioturbation (8-37 cm); faint laminae below 30 cm',
 'dropstones, 5y 3/2 mottling(47-64 cm); mottling(64-80, 80-110 cm)',
 'faint black mottling; minor bioturbation, dropstones',
 

In [21]:
unique_values(df['unusual occurrences'])

['small dropstones (8-100 cm)',
 'pebbles (17-19 cm)',
 'volcanic ash layer (~132 cm)',
 'mud pebbles (149 cm)',
 'soft mud pebbles',
 'reworked very dark gray mud pebbles (especially 110, 120 cm)',
 'large mud pebbles (70-75cm)',
 'pyrite concretion (50 cm) along large burrows',
 'pyrite concretions (90, 110 cm)',
 'pyrite concretion (111 cm)',
 'pyrite concretion (64 cm)',
 'pyrite concretions (1-4 cm)',
 'pyrite concretion (132 cm)',
 'pyrite concretion(102-104 cm)',
 'pebbles/crs sand (100-115 cm) (dropstones ?)']