# QA NOAA DSDP files

In [1]:
import sys
sys.path.append('../scripts/')
import glob
from pathlib import Path
import os

import pandas as pd
import numpy as np

from normalize_noaa_files import (
    unique_filenames,
    unique_columns,
    filename_index,
    format_filepaths_set
)


## DSDP_core_data

In [14]:
# Glob pattern for every CSV sitting exactly two directory levels below
# DSDP_core_data. Without recursive=True, '**' matches a single path
# component, just like '*'.
path = os.path.join('raw_data', 'NOAA_csv', 'DSDP_core_data', '**', '**', '*.csv')
csv_paths = list(glob.iglob(path))
print('files', len(csv_paths))

files 4477


### grouped files

create list of files grouped by expedition and file type

In [3]:
# Group the file names found in each expedition by file type.
# contents maps expedition -> {'taxa': set, 'lithology': set, 'age_model': set}.
contents = {}

# Take the file-name position from a real file path (as the file-list cell
# does) instead of from whatever `path` currently holds, so the cell can be
# re-run in any order. With no files the loop never runs and index is unused.
index = filename_index(csv_paths[0]) if csv_paths else None

lithology_files = {'vistxt.csv', 'hr_desc.csv'}

# A dedicated loop name keeps the notebook-level `path` variable intact.
for csv_path in csv_paths:
    parts = Path(csv_path).parts
    # raw_data/NOAA_csv/DSDP_core_data/<expedition>/... -> the fourth component.
    exp = parts[3]
    filename = parts[index]

    if exp not in contents:
        contents[exp] = {'taxa': set(), 'lithology': set(), 'age_model': set()}

    if filename in lithology_files:
        file_type = 'lithology'
    elif filename == 'ageprof.csv':
        file_type = 'age_model'
    else:
        # Everything that is not lithology or an age profile is treated as taxa.
        file_type = 'taxa'
    contents[exp][file_type].add(filename)

In [4]:
# Flatten `contents` into four parallel lists, one entry per expedition.
exps, taxa, ages, liths = [], [], [], []

for exp in contents.items():
    expedition, grouped = exp
    exps.append(expedition)
    # format_filepaths_set is a project helper; presumably it renders the set
    # stored under the given key as a single string -- TODO confirm.
    taxa.append(format_filepaths_set(grouped, 'taxa'))
    ages.append(format_filepaths_set(grouped, 'age_model'))
    liths.append(format_filepaths_set(grouped, 'lithology'))

In [5]:
# One row per expedition: the four lists were filled in the same loop, so
# position i in each list refers to the same expedition.
# Named `grouped_files` rather than `dict` so the builtin is not shadowed for
# the rest of the notebook session.
grouped_files = {
    "expedition": exps,
    "taxa": taxa,
    "age_model": ages,
    "lithology": liths,
}
df = pd.DataFrame(grouped_files)
df.head()

Unnamed: 0,expedition,taxa,age_model,lithology
0,61,"b_forams.csv,p_forams.csv,nannos.csv,radiolar.csv",ageprof.csv,"vistxt.csv,hr_desc.csv"
1,95,"diatoms.csv,radiolar.csv,b_forams.csv,nannos.c...",ageprof.csv,vistxt.csv
2,59,"diatoms.csv,radiolar.csv,p_forams.csv,b_forams...",ageprof.csv,"vistxt.csv,hr_desc.csv"
3,92,"p_forams.csv,nannos.csv",ageprof.csv,"vistxt.csv,hr_desc.csv"
4,66,"nannos.csv,radiolar.csv",ageprof.csv,"vistxt.csv,hr_desc.csv"


In [6]:
# Write the grouped-files table under tmp/, leaving out the row index column.
path = os.path.join('tmp', 'noaa_dsdp_grouped_files.csv')
df.to_csv(path_or_buf=path, index=False)

### unique file names

In [7]:
# Display the result of the project helper for all globbed CSV paths
# (rendered below as a set of distinct file names).
unique_filenames(csv_paths)

{'ageprof.csv',
 'b_forams.csv',
 'diatoms.csv',
 'dinoflag.csv',
 'ebri_act.csv',
 'hr_desc.csv',
 'nannos.csv',
 'ostracod.csv',
 'p_forams.csv',
 'phyliths.csv',
 'pollen.csv',
 'radiolar.csv',
 'siliflag.csv',
 'vistxt.csv'}

### column names

In [8]:
# Display the result of the project helper for all globbed CSV paths
# (rendered below as a set of distinct column names).
unique_columns(csv_paths)

{' describer',
 'age',
 'age bottom of section(million years)',
 'age mnemonic',
 'age top of section(million years)',
 'alteration data',
 'auxiliary age',
 'auxiliary age mnemonic',
 'average age(million years)t',
 'averaged age',
 'bottom interval depth (cm)',
 'bottom of layer depth (m)',
 'bottom of section depth(m)',
 'chemical dissolution',
 'chemical overgrowth',
 'color',
 'comments',
 'core',
 'coredepth(m)',
 'data source',
 'deformations due to drilling',
 'describer',
 'dsdp initial report volume number',
 'fossil',
 'fossil abundance',
 'fossil code',
 'fossil group',
 'fossil preservation',
 'groundmass mineral data',
 'group abundance',
 'hardness or induration',
 'hole',
 'investigators name',
 'leg',
 'lithology',
 'mechanical preservations',
 'minerals',
 'other information data',
 'other observations',
 'page number reference',
 'paleontology',
 'phenocryst data',
 'piece numbers',
 'publication date (month/year)',
 'record join code',
 'replacement mineral data',
 

### hr_desc.csv

create GitHub links for each hr_desc.csv file

In [9]:
# Build a GitHub URL for every hr_desc.csv file and keep the URLs in a list.
# The original computed each link and then discarded it.
github_base = 'https://github.com/eODP/data-processing/tree/master/notebooks/'

# Path.as_posix() renders the path with forward slashes on every platform,
# which is what a URL needs. The previous '/'.join(path.split('/')) returned
# the string unchanged.
hr_desc_links = [
    github_base + Path(csv_path).as_posix()
    for csv_path in csv_paths
    if 'hr_desc.csv' in csv_path
]

get unique columns

In [22]:
# Collect the distinct header names and the distinct header widths across all
# hr_desc.csv files.
columns = set()
columns_count = set()

for csv_path in csv_paths:
    if 'hr_desc.csv' not in csv_path:
        continue
    # Only the header matters, so a single data row is read. The result goes
    # into its own name so the notebook-level `df` table is not overwritten.
    header = pd.read_csv(csv_path, nrows=1).columns
    columns.update(header)
    columns_count.add(len(header))
columns

{'alteration data',
 'comments',
 'core',
 'describer',
 'groundmass mineral data',
 'hole',
 'leg',
 'other information data',
 'phenocryst data',
 'piece numbers',
 'replacement mineral data',
 'rock name',
 'sample midpoint depth(m)',
 'section',
 'site',
 'structure data',
 'texture data',
 'top interval depth(cm)',
 'top of core depth(cm)',
 'top of section(m)',
 'unknown code',
 'vesicle data'}

In [23]:
# Display the distinct header widths seen across the hr_desc.csv files; a
# single value means every file has the same number of columns.
columns_count

{22}

### file list
create csv that lists all the files

In [11]:
# Build three parallel lists describing every CSV file: its path, its broad
# type (lithology / age / taxa) and, for taxa files, the taxon group.

# File name -> taxon group label. Defined once here instead of being rebuilt
# on every loop iteration.
# NOTE(review): 'planktic foraminfera' looks like a misspelling of
# 'foraminifera'. It is kept as-is because the label is written to the output
# CSV -- confirm nothing matches on it before correcting.
taxon_group_names = {
    'b_forams.csv': 'benthic foraminifera',
    'diatoms.csv': 'diatoms',
    'dinoflag.csv': 'dinoflagellates',
    'ebri_act.csv': 'ebridians',
    'nannos.csv': 'nannofossils',
    'ostracod.csv': 'ostracod',
    'p_forams.csv': 'planktic foraminfera',
    'phyliths.csv': 'phytoliths',
    'pollen.csv': 'pollen',
    'radiolar.csv': 'radiolarians',
    'siliflag.csv': 'silicoflagellates',
}
lithology_files = {'vistxt.csv', 'hr_desc.csv'}

file_names = []
file_types = []
taxon_groups = []

index = filename_index(csv_paths[0])

# A dedicated loop name keeps the notebook-level `path` variable intact.
for csv_path in csv_paths:
    filename = Path(csv_path).parts[index]
    file_names.append(csv_path)

    if filename in lithology_files:
        file_types.append('lithology')
    elif filename == 'ageprof.csv':
        file_types.append('age')
    else:
        file_types.append('taxa')

    # Lithology and age files have no taxon group, so they get NaN.
    taxon_groups.append(taxon_group_names.get(filename, np.nan))


In [12]:
# One row per CSV file: the three lists were filled in the same loop, so
# position i in each list refers to the same file.
# Named `file_listing` rather than `dict` so the builtin is not shadowed for
# the rest of the notebook session.
file_listing = {
    "file": file_names,
    "type": file_types,
    "taxon_group": taxon_groups,
}
df = pd.DataFrame(file_listing)
df.head()

Unnamed: 0,file,type,taxon_group
0,raw_data/NOAA_csv/DSDP_core_data/61/462/radiol...,taxa,radiolarians
1,raw_data/NOAA_csv/DSDP_core_data/61/462/agepro...,age,
2,raw_data/NOAA_csv/DSDP_core_data/61/462/b_fora...,taxa,benthic foraminifera
3,raw_data/NOAA_csv/DSDP_core_data/61/462/p_fora...,taxa,planktic foraminfera
4,raw_data/NOAA_csv/DSDP_core_data/61/462/hr_des...,lithology,


In [13]:
# Write the file listing to the cleaned metadata folder, leaving out the row
# index column.
path = os.path.join('cleaned_data', 'metadata', 'noaa_dsdp_files.csv')
df.to_csv(path_or_buf=path, index=False)