# QA NOAA DSDP files
## 1-96 taxa, lithology, age, hard rocks

Get basic metadata (file names, column names) about NOAA DSDP dataset. Create csv that lists all the files.

NOAA_csv/DSDP_core_data  
expedition 1-96  
taxa, lithology, age models, hard rocks

In [4]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os

import pandas as pd
import numpy as np

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set
)
from config import OUTPUT_DIR, CLEAN_DATA_DIR

In [5]:
base_dir = CLEAN_DATA_DIR
data_dir = base_dir/'NOAA'/'DSDP_core_data'
metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'

In [21]:
csv_paths = list(data_dir.glob("**/*.csv"))
print('files', len(csv_paths))

files 4477


## unique file names

Get all the file names.

In [15]:
unique_filenames_for_paths(csv_paths)

{'ageprof.csv',
 'b_forams.csv',
 'diatoms.csv',
 'dinoflag.csv',
 'ebri_act.csv',
 'hr_desc.csv',
 'nannos.csv',
 'ostracod.csv',
 'p_forams.csv',
 'phyliths.csv',
 'pollen.csv',
 'radiolar.csv',
 'siliflag.csv',
 'vistxt.csv'}

## file list
Create csv that lists all the files for this dataset.

In [22]:
file_list = []
switch = {
    'b_forams.csv': 'benthic_foraminfera',
    'diatoms.csv': 'diatoms',
    'dinoflag.csv': 'dinoflagellates',
    'ebri_act.csv': 'ebridians',
    'nannos.csv': 'nannofossils',
    'ostracod.csv': 'ostracods',
    'phyliths.csv': 'phytoliths',
    'p_forams.csv': 'planktic_foraminfera',
    'pollen.csv': 'pollen',
    'radiolar.csv': 'radiolarians',
    'siliflag.csv': 'silicoflagellates',
}

for path in csv_paths:
    file_data = {}
    relative_path = path.relative_to(base_dir)
    path_parts = relative_path.parts
    filename = relative_path.name
    
    file_data['path'] = relative_path
    
    if filename == 'vistxt.csv':
        type = 'lithology'
    elif filename == 'ageprof.csv':
        type = 'age'
    elif filename == 'hr_desc.csv':
        type = 'hard_rock'
    else:
        type = 'taxa'
    file_data['type'] = type
        
    if type == 'taxa':
        file_data['taxon_group'] = switch.get(filename, np.nan)
    
    file_data['expedition'] =  path_parts[2]
    file_data['site'] =  path_parts[3]

    file_list.append(file_data)

In [23]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


In [24]:
df.to_csv(metadata_path, index=False)

## column names

Get all the column names.

In [25]:
def column_counts_for_paths(paths):
    counts = set()
    for path in paths:
        df = pd.read_csv(path, nrows=0)
        counts.add(len(df.columns))
    
    return counts

In [26]:
metadata_df = pd.read_csv(metadata_path)
metadata_df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


### taxa

In [29]:
type_df = metadata_df[metadata_df['type'] == 'taxa']
taxa_paths = [base_dir/path for path in type_df['path']]
taxa_paths[0:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/radiolar.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/b_forams.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/p_forams.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/nannos.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/radiolar.csv')]

In [30]:
unique_columns_for_paths(taxa_paths)

{'age',
 'bottom interval depth (cm)',
 'chemical dissolution',
 'chemical overgrowth',
 'core',
 'coredepth(m)',
 'dsdp initial report volume number',
 'fossil',
 'fossil abundance',
 'fossil code',
 'fossil group',
 'fossil preservation',
 'group abundance',
 'hole',
 'investigators name',
 'leg',
 'mechanical preservations',
 'page number reference',
 'publication date (month/year)',
 'record join code',
 'sample depth(m)',
 'section',
 'site',
 'top interval depth(cm)',
 'total number of observed fossils'}

In [31]:
column_counts_for_paths(taxa_paths)

{25}

### age

In [32]:
type_df = metadata_df[metadata_df['type'] == 'age']
age_paths = [base_dir/path for path in type_df['path']]
age_paths[0:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/603F/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/613/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/612/ageprof.csv')]

In [33]:
unique_columns_for_paths(age_paths)

{'age',
 'age bottom of section(million years)',
 'age mnemonic',
 'age top of section(million years)',
 'auxiliary age',
 'auxiliary age mnemonic',
 'average age(million years)t',
 'averaged age',
 'bottom of section depth(m)',
 'data source',
 'hole',
 'leg',
 'site',
 'special condition',
 'top of section depth(m)'}

In [34]:
column_counts_for_paths(age_paths)

{15}

### hard rock

In [35]:
type_df = metadata_df[metadata_df['type'] == 'hard_rock']
hard_rocks_paths = [base_dir/path for path in type_df['path']]
hard_rocks_paths[0:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/59/449/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/59/448/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/59/448A/hr_desc.csv')]

In [36]:
unique_columns_for_paths(hard_rocks_paths)

{'alteration data',
 'comments',
 'core',
 'describer',
 'groundmass mineral data',
 'hole',
 'leg',
 'other information data',
 'phenocryst data',
 'piece numbers',
 'replacement mineral data',
 'rock name',
 'sample midpoint depth(m)',
 'section',
 'site',
 'structure data',
 'texture data',
 'top interval depth(cm)',
 'top of core depth(cm)',
 'top of section(m)',
 'unknown code',
 'vesicle data'}

In [37]:
column_counts_for_paths(hard_rocks_paths)

{22}

### lithology

In [38]:
type_df = metadata_df[metadata_df['type'] == 'lithology']
lith_paths = [base_dir/path for path in type_df['path']]
lith_paths[0:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/603F/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/613/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/612/vistxt.csv')]

In [39]:
unique_columns_for_paths(lith_paths)

{' describer',
 'bottom interval depth (cm)',
 'bottom of layer depth (m)',
 'color',
 'core',
 'deformations due to drilling',
 'hardness or induration',
 'hole',
 'leg',
 'lithology',
 'minerals',
 'other observations',
 'paleontology',
 'section',
 'site',
 'structures',
 'top interval depth (cm)',
 'top of core depth(m)',
 'top of layer depth (m)',
 'unusual occurrences',
 'z-coding'}

In [40]:
column_counts_for_paths(lith_paths)

{21}

## hr_desc.csv

Create github links for each hr_desc.csv file

In [49]:
type_df = metadata_df[metadata_df['type'] == 'hard_rock']
hard_rocks_paths = [base_dir/path for path in type_df['path']]

for path in hard_rocks_paths:
    if 'hr_desc.csv' == path.name:
        link = 'https://github.com/eODP/data-processing/tree/master/' + str(path).replace('../../', '')
#         print(link)

## grouped files

create list of files grouped by expedition and file type

In [60]:
contents = {}

for path in csv_paths:
    relative_path = path.relative_to(base_dir)
    parts = relative_path.parts
    filename = relative_path.name
    exp = parts[2]
     
    if exp not in contents:
        contents[exp] = {'taxa': set(), 'lithology': set(), 'age_model': set(), 'hard_rock': set()}
        
    if filename == 'vistxt.csv':
        contents[exp]['lithology'].add(filename)
    elif filename == 'ageprof.csv':
        contents[exp]['age_model'].add(filename)
    elif filename == 'hr_desc.csv':
        contents[exp]['hard_rock'].add(filename)
    else:
        contents[exp]['taxa'].add(filename)


In [62]:
file_list = []

for exp in contents.items():
    file_data = {}
    file_data['expedition'] = exp[0]
    file_data['taxa'] = format_filepaths_set(exp[1], 'taxa')
    file_data['age_model'] = format_filepaths_set(exp[1], 'age_model')
    file_data['lithology'] = format_filepaths_set(exp[1], 'lithology')
    file_data['hard_rock'] = format_filepaths_set(exp[1], 'hard_rock')

    file_list.append(file_data)

In [63]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,expedition,taxa,age_model,lithology,hard_rock
0,61,"nannos.csv,p_forams.csv,radiolar.csv,b_forams.csv",ageprof.csv,vistxt.csv,hr_desc.csv
1,95,"diatoms.csv,ostracod.csv,siliflag.csv,nannos.c...",ageprof.csv,vistxt.csv,
2,59,"diatoms.csv,p_forams.csv,siliflag.csv,nannos.c...",ageprof.csv,vistxt.csv,hr_desc.csv
3,92,"p_forams.csv,nannos.csv",ageprof.csv,vistxt.csv,hr_desc.csv
4,66,"nannos.csv,radiolar.csv",ageprof.csv,vistxt.csv,hr_desc.csv


In [65]:
path = OUTPUT_DIR/'tmp'/'noaa_dsdp_grouped_files.csv'
df.to_csv(path, index=False)

## taxa with (q)

In [66]:
metadata_df = pd.read_csv(metadata_path)
metadata_df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


In [67]:
type_df = metadata_df[metadata_df['type'] == 'taxa']
type_df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
5,NOAA/DSDP_core_data/61/462/nannos.csv,taxa,nannofossils,61,462
7,NOAA/DSDP_core_data/61/462A/radiolar.csv,taxa,radiolarians,61,462A


In [70]:
files = {}
taxa = set()

for path in type_df['path']:
    
    df = pd.read_csv(base_dir/path, usecols=['fossil'])
    for taxon in df['fossil'].values:
        if taxon != taxon:
            continue
        if '(q)' in taxon:  
            if path not in files:
                files[path] = set()
            
            files[path].add(taxon)
            taxa.add(taxon)
        

In [71]:
len(files)

1059

In [72]:
len(taxa)

1656