# create metadata NOAA DSDP files
## 1-96 taxa, lithology, age, hard rocks

Get basic metadata (file names, column names) about NOAA DSDP dataset. Create csv that lists all the files.

NOAA_csv/DSDP_core_data  
expedition 1-96  
taxa, lithology, age models, hard rocks

In [1]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os

import pandas as pd
import numpy as np

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set
)
from config import OUTPUT_DIR, CLEAN_DATA_DIR

In [2]:
base_dir = CLEAN_DATA_DIR
data_dir = base_dir/'NOAA'/'DSDP_core_data'
metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'

In [18]:
csv_paths = list(data_dir.glob("**/*.csv"))
csv_paths = [p for p in csv_paths if '.ipynb_checkpoints' not in str(p)]
print('files', len(csv_paths))

files 4477


In [22]:
taxa_csv_paths = [p for p in csv_paths if p.name not in ['ageprof.csv', 'hr_desc.csv','vistxt.csv']]
print('files', len(taxa_csv_paths))

files 2093


## unique file names

Get all the file names.

In [11]:
unique_filenames_for_paths(csv_paths)

{'ageprof.csv',
 'b_forams.csv',
 'diatoms.csv',
 'dinoflag.csv',
 'ebri_act.csv',
 'hr_desc.csv',
 'nannos.csv',
 'ostracod.csv',
 'p_forams.csv',
 'phyliths.csv',
 'pollen.csv',
 'radiolar.csv',
 'siliflag.csv',
 'vistxt.csv'}

## check if taxon groups in file matches taxon group in file name 

In [30]:
lims_metadata_file = OUTPUT_DIR/'metadata'/'LIMS'/'Micropal_changes.csv' 
metadata = pd.read_csv(lims_metadata_file)
set(metadata['taxon_groups'])


{'benthic_forams',
 'bolboformids',
 'chrysophyte_cysts',
 'diatoms',
 'dinoflagellates',
 'ebridians',
 'nannofossils',
 'ostracods',
 'other',
 'palynology',
 'planktic_forams',
 'radiolarians',
 'silicoflagellates'}

In [26]:
groups = set()

for path in taxa_csv_paths:
    df = pd.read_csv(path)
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='all')
    
    cols = [col.strip() for col in df.columns]
    if 'fossil group' not in cols:
        print(path)
    else:
        groups.update(df['fossil group' ])
        
groups

{'BENTHIC FORAMINIFERA',
 'DIATOMS',
 'DINOFLAGELLATES',
 'EBRIDIANS & ACTINICIDIANS',
 'NANNOFOSSILS',
 'OSTRACODES',
 'PHYTOLITHARIA',
 'PLANKTONIC FORAMINIFERA',
 'POLLEN AND SPORES',
 'RADIOLARIA',
 'SILICOFLAGELLATES'}

In [35]:
taxon_groups_file_data = {
 'b_forams.csv': 'BENTHIC FORAMINIFERA',
 'diatoms.csv': 'DIATOMS',
 'dinoflag.csv': 'DINOFLAGELLATES',
 'ebri_act.csv': 'EBRIDIANS & ACTINICIDIANS',
 'nannos.csv': 'NANNOFOSSILS',
 'ostracod.csv': 'OSTRACODES',
 'p_forams.csv': 'PLANKTONIC FORAMINIFERA',
 'phyliths.csv': 'PHYTOLITHARIA',
 'pollen.csv': 'POLLEN AND SPORES',
 'radiolar.csv': 'RADIOLARIA',
 'siliflag.csv': 'SILICOFLAGELLATES',
}


In [39]:
for path in taxa_csv_paths:
    df = pd.read_csv(path)
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='all')
    
    df.columns = [col.strip() for col in df.columns]
    
    file_name_group = taxon_groups_file_data[path.name]
    file_data_groups = set([group.strip() for group in df['fossil group']])
    
    if len(file_data_groups) > 1:
        print(path, 'MULTIPLE', file_data_groups)

    elif file_name_group != list(file_data_groups)[0]:
        print(path)
        print(file_name_group, '-', list(file_data_groups)[0])



##  create file list metadata
Create csv that lists all the files for this dataset.

In [None]:
taxon_groups_convert = {
 'b_forams.csv': 'benthic_forams',
 'diatoms.csv': 'diatoms',
 'dinoflag.csv': 'dinoflagellates',
 'ebri_act.csv': 'Ebridians and Actinicidians',
 'nannos.csv': 'nannofossils',
 'ostracod.csv': 'ostracods',
 'p_forams.csv': 'planktic_forams',
 'phyliths.csv': 'phytolitharia',
 'pollen.csv': 'Pollen and Spores',
 'radiolar.csv': 'radiolarians',
 'siliflag.csv': 'silicoflagellates',
}



In [40]:
file_list = []


for path in csv_paths:
    if '.ipynb_checkpoints' in str(path):
        continue
        
    file_data = {}
    relative_path = path.relative_to(base_dir)
    path_parts = relative_path.parts
    filename = relative_path.name
    
    file_data['path'] = relative_path
    
    if filename == 'vistxt.csv':
        type = 'lithology'
    elif filename == 'ageprof.csv':
        type = 'age'
    elif filename == 'hr_desc.csv':
        type = 'hard_rock'
    else:
        type = 'taxa'
    file_data['type'] = type
        
    if type == 'taxa':
        file_data['taxon_group'] = taxon_groups_convert[filename]
    
    file_data['expedition'] =  path_parts[2]
    file_data['site'] =  path_parts[3]

    file_list.append(file_data)

In [41]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,path,type,taxon_group,expedition,site
0,NOAA/DSDP_core_data/61/462/radiolar.csv,taxa,radiolarians,61,462
1,NOAA/DSDP_core_data/61/462/ageprof.csv,age,,61,462
2,NOAA/DSDP_core_data/61/462/b_forams.csv,taxa,benthic_forams,61,462
3,NOAA/DSDP_core_data/61/462/p_forams.csv,taxa,planktic_forams,61,462
4,NOAA/DSDP_core_data/61/462/hr_desc.csv,hard_rock,,61,462


In [42]:
df.to_csv(metadata_path, index=False)

## create grouped file list metadata

create list of files grouped by expedition and file type

In [8]:
contents = {}

for path in csv_paths:
    relative_path = path.relative_to(base_dir)
    parts = relative_path.parts
    filename = relative_path.name
    exp = parts[2]
     
    if exp not in contents:
        contents[exp] = {'taxa': set(), 'lithology': set(), 'age_model': set(), 'hard_rock': set()}
        
    if filename == 'vistxt.csv':
        contents[exp]['lithology'].add(filename)
    elif filename == 'ageprof.csv':
        contents[exp]['age_model'].add(filename)
    elif filename == 'hr_desc.csv':
        contents[exp]['hard_rock'].add(filename)
    else:
        contents[exp]['taxa'].add(filename)


In [9]:
file_list = []

for exp in contents.items():
    file_data = {}
    file_data['expedition'] = exp[0]
    file_data['taxa'] = format_filepaths_set(exp[1], 'taxa')
    file_data['age_model'] = format_filepaths_set(exp[1], 'age_model')
    file_data['lithology'] = format_filepaths_set(exp[1], 'lithology')
    file_data['hard_rock'] = format_filepaths_set(exp[1], 'hard_rock')

    file_list.append(file_data)

In [10]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,expedition,taxa,age_model,lithology,hard_rock
0,61,"nannos.csv,p_forams.csv,radiolar.csv,b_forams.csv",ageprof.csv,vistxt.csv,hr_desc.csv
1,95,"diatoms.csv,nannos.csv,radiolar.csv,ostracod.c...",ageprof.csv,vistxt.csv,
2,59,"p_forams.csv,diatoms.csv,nannos.csv,radiolar.c...",ageprof.csv,vistxt.csv,hr_desc.csv
3,92,"nannos.csv,p_forams.csv",ageprof.csv,vistxt.csv,hr_desc.csv
4,66,"nannos.csv,radiolar.csv",ageprof.csv,vistxt.csv,hr_desc.csv


In [11]:
path = OUTPUT_DIR/'tmp'/'noaa_dsdp_grouped_files.csv'
df.to_csv(path, index=False)