# QA NOAA DSDP files
## 1-96 taxa, lithology, age, hard rocks

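Quality-assurance (QA) checks on the cleaned NOAA DSDP dataset: the column names and column counts for each file type, and the taxa whose names contain "(q)".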

NOAA_csv/DSDP_core_data  
expedition 1-96  
taxa, lithology, age models, hard rocks

In [1]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os

import pandas as pd
import numpy as np

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set
)
from config import OUTPUT_DIR, CLEAN_DATA_DIR

In [2]:
# Root of the cleaned data, the folder holding the NOAA DSDP core-data CSVs
# under it, and the metadata CSV that lists those files.
base_dir = CLEAN_DATA_DIR
data_dir = base_dir.joinpath('NOAA', 'DSDP_core_data')
metadata_path = OUTPUT_DIR.joinpath('metadata', 'NOAA', 'noaa_dsdp_files.csv')

In [3]:
# Gather every CSV below data_dir in a single pass, leaving out any file
# whose path mentions a Jupyter '.ipynb_checkpoints' folder.
csv_paths = [
    csv_path
    for csv_path in data_dir.glob("**/*.csv")
    if '.ipynb_checkpoints' not in str(csv_path)
]

print('files', len(csv_paths))

files 4477


## column names

Get all the unique column names, and the number of columns per file, for each file type.

In [4]:
def column_counts_for_paths(paths):
    """Return the set of distinct column counts found across the given CSV files.

    Each file is read with ``nrows=0``, so only its header is parsed.
    """
    return {pd.read_csv(csv_file, nrows=0).shape[1] for csv_file in paths}

In [5]:
metadata_df = pd.read_csv(metadata_path)
metadata_df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


### taxa

In [6]:
# Keep the metadata rows typed 'taxa', resolve their relative paths against
# base_dir, and preview the first five.
type_df = metadata_df[metadata_df['type'].eq('taxa')]
taxa_paths = [base_dir / rel_path for rel_path in type_df['path']]
taxa_paths[:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/radiolar.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/b_forams.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/p_forams.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/nannos.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/radiolar.csv')]

In [7]:
unique_columns_for_paths(taxa_paths)

{' describer',
 'age',
 'bottom interval depth (cm)',
 'bottom of layer depth (m)',
 'chemical dissolution',
 'chemical overgrowth',
 'color',
 'core',
 'coredepth(m)',
 'deformations due to drilling',
 'dsdp initial report volume number',
 'fossil',
 'fossil abundance',
 'fossil code',
 'fossil group',
 'fossil preservation',
 'group abundance',
 'hardness or induration',
 'hole',
 'investigators name',
 'leg',
 'lithology',
 'mechanical preservations',
 'minerals',
 'other observations',
 'page number reference',
 'paleontology',
 'publication date (month/year)',
 'record join code',
 'sample depth(m)',
 'section',
 'site',
 'structures',
 'top interval depth (cm)',
 'top interval depth(cm)',
 'top of core depth(m)',
 'top of layer depth (m)',
 'total number of observed fossils',
 'unusual occurrences',
 'z-coding'}

In [8]:
column_counts_for_paths(taxa_paths)

{21, 25}

### age

In [9]:
# Keep the metadata rows typed 'age', resolve their relative paths against
# base_dir, and preview the first five.
type_df = metadata_df[metadata_df['type'].eq('age')]
age_paths = [base_dir / rel_path for rel_path in type_df['path']]
age_paths[:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/603F/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/613/ageprof.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/612/ageprof.csv')]

In [10]:
unique_columns_for_paths(age_paths)

{'age',
 'age bottom of section(million years)',
 'age mnemonic',
 'age top of section(million years)',
 'auxiliary age',
 'auxiliary age mnemonic',
 'average age(million years)t',
 'averaged age',
 'bottom of section depth(m)',
 'data source',
 'hole',
 'leg',
 'site',
 'special condition',
 'top of section depth(m)'}

In [11]:
column_counts_for_paths(age_paths)

{15}

### hard rock

In [12]:
# Keep the metadata rows typed 'hard_rock', resolve their relative paths
# against base_dir, and preview the first five.
type_df = metadata_df[metadata_df['type'].eq('hard_rock')]
hard_rocks_paths = [base_dir / rel_path for rel_path in type_df['path']]
hard_rocks_paths[:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/59/449/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/59/448/hr_desc.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/59/448A/hr_desc.csv')]

In [13]:
unique_columns_for_paths(hard_rocks_paths)

{'alteration data',
 'comments',
 'core',
 'describer',
 'groundmass mineral data',
 'hole',
 'leg',
 'other information data',
 'phenocryst data',
 'piece numbers',
 'replacement mineral data',
 'rock name',
 'sample midpoint depth(m)',
 'section',
 'site',
 'structure data',
 'texture data',
 'top interval depth(cm)',
 'top of core depth(cm)',
 'top of section(m)',
 'unknown code',
 'vesicle data'}

In [14]:
column_counts_for_paths(hard_rocks_paths)

{22}

### lithology

In [15]:
# Keep the metadata rows typed 'lithology', resolve their relative paths
# against base_dir, and preview the first five.
type_df = metadata_df[metadata_df['type'].eq('lithology')]
lith_paths = [base_dir / rel_path for rel_path in type_df['path']]
lith_paths[:5]

[PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/61/462A/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/603F/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/613/vistxt.csv'),
 PosixPath('../../output/cleaned_data/NOAA/DSDP_core_data/95/612/vistxt.csv')]

In [16]:
unique_columns_for_paths(lith_paths)

{' describer',
 'bottom interval depth (cm)',
 'bottom of layer depth (m)',
 'color',
 'core',
 'deformations due to drilling',
 'hardness or induration',
 'hole',
 'leg',
 'lithology',
 'minerals',
 'other observations',
 'paleontology',
 'section',
 'site',
 'structures',
 'top interval depth (cm)',
 'top of core depth(m)',
 'top of layer depth (m)',
 'unusual occurrences',
 'z-coding'}

In [17]:
column_counts_for_paths(lith_paths)

{21}

## hr_desc.csv

Create GitHub links for each hr_desc.csv file.

In [18]:
# Build a GitHub URL for every hard-rock hr_desc.csv file and keep them all in
# hr_desc_links, so the links are available after the loop instead of being
# overwritten on each iteration.
type_df = metadata_df[metadata_df['type'] == 'hard_rock']
hard_rocks_paths = [base_dir/path for path in type_df['path']]

hr_desc_links = []
for path in hard_rocks_paths:
    if 'hr_desc.csv' == path.name:
        # The local paths start with '../../'; that prefix is removed before
        # the rest is appended to the repository URL.
        # NOTE(review): assumes '../../' points at the repository root - confirm.
        link = 'https://github.com/eODP/data-processing/tree/master/' + str(path).replace('../../', '')
        hr_desc_links.append(link)

## taxa with (q)

In [19]:
metadata_df = pd.read_csv(metadata_path)
metadata_df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


In [20]:
type_df = metadata_df[metadata_df['type'] == 'taxa']
type_df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_foraminfera,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_foraminfera,61,462
5,NOAA/DSDP_core_data/61/462/nannos.csv,taxa,nannofossils,61,462
7,NOAA/DSDP_core_data/61/462A/radiolar.csv,taxa,radiolarians,61,462A


In [24]:
df = pd.read_csv(base_dir/'NOAA/DSDP_core_data/61/462/radiolar.csv')
df.head()

Unnamed: 0,leg,site,hole,core,section,top interval depth(cm),bottom interval depth (cm),coredepth(m),sample depth(m),total number of observed fossils,...,chemical overgrowth,chemical dissolution,mechanical preservations,age,page number reference,fossil code,fossil,fossil abundance,fossil preservation,record join code
0,61,462,,1,1,112.0,114.0,0.5,1.63,1,...,,,MODERATE,NO AGE GIVEN,498,RSTIO0070,Stichocorys peregrina,RARE,,1
1,61,462,,1,3,86.0,88.0,0.5,4.37,6,...,,,MODERATE,NO AGE GIVEN,498,RARTP0030,Artophormis gracilis,PRESENT,,1
2,61,462,,1,3,86.0,88.0,0.5,4.37,6,...,,,MODERATE,NO AGE GIVEN,498,RARTS0020,Artostrobium doliolum,PRESENT,,1
3,61,462,,1,3,86.0,88.0,0.5,4.37,6,...,,,MODERATE,NO AGE GIVEN,498,RLITX0040,Lithopera renzae,PRESENT,,1
4,61,462,,1,3,86.0,88.0,0.5,4.37,6,...,,,MODERATE,NO AGE GIVEN,498,RSOLE0015,Solenospheara omnitubus,RARE,,1


In [26]:
# Find every taxon name that contains the '(q)' marker.
#   files: metadata path -> set of '(q)' names found in that file
#   taxa:  every '(q)' name found across all files
files = {}
taxa = set()

for path in type_df['path']:
    df = pd.read_csv(base_dir/path)

    # Files listed as taxa but lacking a 'fossil' column are printed so they
    # can be checked by hand.
    if 'fossil' not in df.columns:
        print(path)
        continue

    # unique() limits the scan to one test per distinct name. The isinstance
    # check skips NaN (missing names) and any other non-string value, which
    # would raise TypeError on the substring test.
    flagged = {
        taxon
        for taxon in df['fossil'].unique()
        if isinstance(taxon, str) and '(q)' in taxon
    }
    if flagged:
        files.setdefault(path, set()).update(flagged)
        taxa.update(flagged)
        

NOAA/DSDP_core_data/50/415A/.ipynb_checkpoints/vistxt-checkpoint.csv
NOAA/DSDP_core_data/2/8A/.ipynb_checkpoints/vistxt-checkpoint.csv


In [None]:
len(files)

In [None]:
len(taxa)