# QA NOAA Janus files

Get basic metadata (file names, column names) about NOAA Janus dataset. Create csv that lists all the files.

NOAA_csv/Janus_core_data  
expedition 101-126  
lithology

In [1]:
import sys
sys.path.append('../scripts/')
import glob
from pathlib import Path
import os

import pandas as pd

from normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set
)

In [2]:
base_directory = 'cleaned_data'
base_data_path = os.path.join(base_directory, 'NOAA_csv', 'Janus_core_data')
metadata_path = os.path.join(base_directory, 'metadata', 'noaa_janus_files.csv')

In [3]:
csv_paths = glob.glob(os.path.join(base_data_path, '**', '**', '*.csv'))
print('files', len(csv_paths))

files 325


## unique file names

Get all the file names.

In [20]:
files = unique_filenames_for_paths(csv_paths)
list(files)[0:15]

['sed_lith_108_666a_delimited_pre-janus.csv',
 'sed_lith_123_765c_delimited_pre-janus.csv',
 'sed_lith_124e_772a_delimited_pre-janus.csv',
 'sed_lith_108_667a_delimited_pre-janus.csv',
 'sed_lith_112_679b_delimited_pre-janus.csv',
 'sed_lith_125_780a_delimited_pre-janus.csv',
 'sed_lith_126_788c_delimited_pre-janus.csv',
 'sed_lith_117_725a_delimited_pre-janus.csv',
 'sed_lith_121_755a_delimited_pre-janus.csv',
 'sed_lith_124_770c_delimited_pre-janus.csv',
 'sed_lith_108_660a_delimited_pre-janus.csv',
 'sed_lith_110_671b_delimited_pre-janus.csv',
 'sed_lith_126_793a_delimited_pre-janus.csv',
 'sed_lith_108_664a_delimited_pre-janus.csv',
 'sed_lith_124e_773b_delimited_pre-janus.csv']

## file list

Create csv that lists all the files for this dataset.

In [5]:
file_list = []

index = filename_index(csv_paths[0])

for path in csv_paths:
    file_data = {}
    path_parts = Path(path).parts
    filename = path_parts[index]   
    
    file_data['path'] = path
    file_data['type'] = 'lithology'
    file_data['expedition'] =  path_parts[3]
    file_data['site'] =  path_parts[4]
    
    
    file_list.append(file_data)

In [6]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,path,type,expedition,site
0,cleaned_data/NOAA_csv/Janus_core_data/104/642b...,lithology,104,642b
1,cleaned_data/NOAA_csv/Janus_core_data/104/642e...,lithology,104,642e
2,cleaned_data/NOAA_csv/Janus_core_data/104/644a...,lithology,104,644a
3,cleaned_data/NOAA_csv/Janus_core_data/104/642d...,lithology,104,642d
4,cleaned_data/NOAA_csv/Janus_core_data/104/642c...,lithology,104,642c


In [7]:
df.to_csv(metadata_path, index=False)

## column names

Get all the column names.

In [8]:
def column_counts_for_paths(paths):
    counts = set()
    for path in paths:
        df = pd.read_csv(path, nrows=0)
        counts.add(len(df.columns))
    
    return counts

In [9]:
unique_columns_for_paths(csv_paths)

{'Unnamed: 30',
 'Unnamed: 31',
 'Unnamed: 32',
 'Unnamed: 33',
 'Unnamed: 34',
 'bottom interval',
 'bottom interval depth below sea floor',
 'code',
 'color',
 'color number',
 'core',
 'coretype',
 'depth',
 'drilling deformities',
 'hole',
 'induration',
 'latitude',
 'leg',
 'lithology',
 'longitude',
 'minerals',
 'more data available',
 'observer',
 'other',
 'paleontology',
 'piece number bottom',
 'piece number top',
 'section',
 'site',
 'structures',
 'sub piece bottom',
 'sub piece top',
 'top interval',
 'top interval depth below sea floor',
 'unusual occurrences'}

In [10]:
column_counts_for_paths(csv_paths)

{30, 35}

print out files that have too many columns

In [11]:
for path in csv_paths:
    df = pd.read_csv(path, nrows=0)
    if len(df.columns) == 35:
        print(path)

cleaned_data/NOAA_csv/Janus_core_data/110/674a/sed_lith_110_674a_delimited_pre-janus.csv


## check minerals, paleontology, structures, unusual occurrences values

In [12]:
def unique_values(series):
    return list(series.dropna().unique())
    

In [13]:
path = csv_paths[0]
print(path)
df = pd.read_csv(path)

cleaned_data/NOAA_csv/Janus_core_data/104/642b/sed_lith_104_642b_delimited_pre-janus.csv


In [14]:
unique_values(df['minerals'])

['fe/mn laminae',
 'fe dots (~57 cm); pyrite in burrows(8-16 cm)',
 'crystalline fragment dropstones',
 'pyrite in burrows',
 'pyrite (?) in mottling',
 'pyrite in mottling',
 'pyrite pockets',
 'pyrite in sandy zone(~134 cm);pyrite & volcanic ash(132 cm)',
 'volcanic ash? pocket (17 cm)',
 'pyrite infilled pores (?)',
 'pyrite in burrows (96 cm)',
 'pyrite in sandy patches (11, 15-17 cm)',
 'pyrite sandy patches (110-113, 127-131 cm)',
 'pyrite in burrows (123-126 cm)',
 'pyrite in burrow (~86 cm)',
 'pyrite in burrows (127 cm)',
 'pyrite in burrows (124-132, 138 cm)',
 'pyrite in burrows; volcanic ash (96-98,103 cm)',
 'pyrite in burrows, pyrite in color bands',
 'nannos',
 'volcanic ash (120-122 cm)',
 'pyrite concretion (50 cm)',
 'pyrite in burrows (0-89 cm)',
 'pyrite concretions (90, 110 cm)',
 'pyrite impregnation along burrows(0-76 cm)',
 'pyrite in burrows (50-150 cm)',
 'pyrite impregnations (11-150 cm)',
 'pyrite impregnations and concretion (111 cm)',
 'pyrite concretion (

In [15]:
unique_values(df['paleontology'])

['forams',
 'nanno',
 'nannos',
 'forams (100-120 cm)',
 'forams (0-122 cm)',
 'forams in sandy patches (135-145 cm)',
 'forams (130-150 cm)',
 'zoophycos trace fossil',
 'nanno, mollusc fragments (11 cm)',
 'nannos; forams',
 'forams visible',
 'forams (88-102 cm)',
 'visible forams (0-120 cm)',
 'diatoms',
 'forams (50 cm ?), diatoms']

In [22]:
unique_values(df['structures'])[0:30]

['grayish color mottling (~27-30 cm);mottling(38-41 cm); minor bioturbation (38-41 cm)',
 'parallel laminae',
 'fine brownish laminae (~142-145 cm), dropstones',
 'moderate bioturbation/pyritized burrows (~114-117 cm), dropstone (~115cm)',
 'pyritized burrows/moderate bioturbation (8-16 cm)',
 'heavy bioturbation(144-150cm);2 mm burrows (144-150 cm); cross-stratification(138-144cm), dropstones (131, 139 cm)',
 'moderate bioturbation',
 'moderate (80-93 cm) to heavy (96-100 cm) bioturbation; large granite dropstones (~116cm)',
 'parallel laminae (119-127 cm), dropstones, small (119-127 cm), large (132 cm)',
 'laminae',
 'dropstones (~67,93,122cm), minor bioturbation (45-100 cm)',
 'dropstone? (~136 cm)',
 'minor bioturbation (14-55 cm)',
 'scattered dropstones (105-110 cm, 136-141 cm)',
 'black mottling/minor bioturbation (8-37 cm); faint laminae below 30 cm',
 'dropstones, 5y 3/2 mottling(47-64 cm); mottling(64-80, 80-110 cm)',
 'faint black mottling; minor bioturbation, dropstones',
 

In [17]:
unique_values(df['unusual occurrences'])

['small dropstones (8-100 cm)',
 'pebbles (17-19 cm)',
 'volcanic ash layer (~132 cm)',
 'mud pebbles (149 cm)',
 'soft mud pebbles',
 'reworked very dark gray mud pebbles (especially 110, 120 cm)',
 'large mud pebbles (70-75cm)',
 'pyrite concretion (50 cm) along large burrows',
 'pyrite concretions (90, 110 cm)',
 'pyrite concretion (111 cm)',
 'pyrite concretion (64 cm)',
 'pyrite concretions (1-4 cm)',
 'pyrite concretion (132 cm)',
 'pyrite concretion(102-104 cm)',
 'pebbles/crs sand (100-115 cm) (dropstones ?)']