# Create metadata for NOAA Janus files
## 101-126 lithology

Get basic metadata (file names, column names) about the NOAA Janus dataset. Create a CSV that lists all the files.

NOAA_csv/Janus_core_data  
expedition 101-126  
lithology

In [2]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os

import pandas as pd

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set
)
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR
from scripts.shared_utils import (
    log_df
)

In [3]:
# Resolve input and output locations from the project configuration.
# NOTE(review): assumes CLEAN_DATA_DIR and OUTPUT_DIR are pathlib.Path objects.
base_dir = CLEAN_DATA_DIR
# Directory that is searched for the Janus core-data CSV files.
data_dir = base_dir.joinpath('NOAA', 'Janus_core_data')
# Destination of the CSV file listing produced by this notebook.
metadata_path = OUTPUT_DIR.joinpath('metadata', 'NOAA', 'noaa_janus_files.csv')

In [4]:
# Recursively collect every .csv file underneath data_dir.
csv_paths = list(data_dir.rglob("*.csv"))
# Report how many files were found.
print(f"files {len(csv_paths)}")

files 327


## unique file names

Get all the file names.

In [4]:
# Collect the file names for all discovered CSV paths
# (presumably de-duplicated by the helper -- see scripts.normalize_noaa_files).
files = unique_filenames_for_paths(csv_paths)
# Preview the first 15 names; the bare expression is displayed as the cell output.
list(files)[:15]

['sed_lith_112_687b_delimited_pre-janus.csv',
 'sed_lith_107_651a_delimited_pre-janus.csv',
 'sed_lith_112_684c_delimited_pre-janus.csv',
 'sed_lith_101_632b_delimited_pre-janus.csv',
 'sed_lith_114_701b_delimited_pre-janus.csv',
 'sed_lith_101_628a_delimited_pre-janus.csv',
 'sed_lith_101_627a_delimited_pre-janus.csv',
 'sed_lith_125_780a_delimited_pre-janus.csv',
 'sed_lith_108_658a_delimited_pre-janus.csv',
 'sed_lith_104_643a_delimited_pre-janus.csv',
 'sed_lith_114_704a_delimited_pre-janus.csv',
 'sed_lith_126_791b_delimited_pre-janus.csv',
 'sed_lith_113_692b_delimited_pre-janus.csv',
 'sed_lith_115_707b_delimited_pre-janus.csv',
 'sed_lith_114_701a_delimited_pre-janus.csv']

## column names

Get all the column names.


In [5]:
# Collect the column names found across the discovered CSV files
# (presumably the union of all headers -- see scripts.normalize_noaa_files).
# The bare expression is displayed as the cell output.
unique_columns_for_paths(csv_paths)

{'Unnamed: 30',
 'Unnamed: 31',
 'Unnamed: 32',
 'Unnamed: 33',
 'Unnamed: 34',
 'bottom interval',
 'bottom interval depth below sea floor',
 'code',
 'color',
 'color number',
 'core',
 'coretype',
 'depth',
 'drilling deformities',
 'hole',
 'induration',
 'latitude',
 'leg',
 'lithology',
 'longitude',
 'minerals',
 'more data available',
 'observer',
 'other',
 'paleontology',
 'piece number bottom',
 'piece number top',
 'section',
 'site',
 'structures',
 'sub piece bottom',
 'sub piece top',
 'top interval',
 'top interval depth below sea floor',
 'unusual occurrences'}

## file list

Create a CSV that lists all the files for this dataset.

In [5]:
# Build one metadata record (path, type, expedition, site) per lithology CSV.
file_list = []

for path in csv_paths:
    # Skip Jupyter autosave copies: match the checkpoint directory as a path
    # component rather than as a substring of the whole path string.
    if '.ipynb_checkpoints' in path.parts:
        continue

    # Paths are stored relative to base_dir, i.e.
    # NOAA/Janus_core_data/<expedition>/<site>/<file>.csv
    relative_path = path.relative_to(base_dir)
    path_parts = relative_path.parts

    # Key order determines the column order of the resulting DataFrame.
    file_list.append({
        'path': relative_path,
        'type': 'lithology',
        'expedition': path_parts[2],  # third component: expedition directory
        'site': path_parts[3],        # fourth component: site/hole directory
    })

In [6]:
# Turn the per-file records into a table ordered by expedition, then site,
# then type, and log a summary of the result.
df = pd.DataFrame(file_list).sort_values(by=['expedition', 'site', 'type'])
log_df(df)

(325, 4)


Unnamed: 0,path,type,expedition,site
165,NOAA/Janus_core_data/101/626a/sed_lith_101_626...,lithology,101,626a
173,NOAA/Janus_core_data/101/626b/sed_lith_101_626...,lithology,101,626b
176,NOAA/Janus_core_data/101/626c/sed_lith_101_626...,lithology,101,626c
175,NOAA/Janus_core_data/101/626d/sed_lith_101_626...,lithology,101,626d
166,NOAA/Janus_core_data/101/627a/sed_lith_101_627...,lithology,101,627a


In [7]:
# Make sure the output directory exists so a fresh checkout does not fail on
# the write. NOTE(review): assumes metadata_path is a pathlib.Path.
metadata_path.parent.mkdir(parents=True, exist_ok=True)
# Write the file listing; the DataFrame index is not part of the output.
df.to_csv(metadata_path, index=False)