# QA NOAA Janus IODP metadata

Get basic metadata (file names, column names) about NOAA Janus IODP dataset. Create csv that lists all the files.

NOAA_csv/JanusIODP_paleo_agemodel  
expedition 101-210 
taxa 101-210, age models 101-190

In [23]:
import sys
sys.path.append('../scripts/')
import glob
from pathlib import Path
import os
import re 
import pandas as pd
import numpy as np

from normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set,
    qa_files_for_paths,
    column_counts_for_paths
)
import space_delim as sd


In [24]:
base_directory = 'cleaned_data'
base_data_path = os.path.join(base_directory, 'NOAA_csv', 'JanusIODP_paleo_agemodel')
metadata_path = os.path.join(base_directory, 'metadata', 'noaa_janus_iodp_files.csv')

In [25]:
csv_paths = glob.glob(os.path.join(base_data_path, 'paleontology', '**', '**', '**', '**', '*.csv'))
print('files', len(csv_paths))

files 2481


In [26]:
taxa_csv_paths = [path for path in csv_paths if 'range_tables' in path] 
print('files', len(taxa_csv_paths))

files 2045


In [27]:
age_csv_paths = [path for path in csv_paths if 'age_models' in path]
print('files', len(age_csv_paths))

files 436


## unique file names

Get all the file names.

In [28]:
unique_filenames_for_paths(taxa_csv_paths)

{'Benthic Foraminifers.csv',
 'Benthic_Foraminifers.csv',
 'Bolboforms.csv',
 'Diatoms.csv',
 'Dinoflagellates_Acritarch_Prasinophytes.csv',
 'Dinoflagellates_Acritarchs_Prasinophytes.csv',
 'Macrofossils.csv',
 'Miscellaneous.csv',
 'Nannofossils .csv',
 'Nannofossils.csv',
 'Ostracodes.csv',
 'Planktonic Foraminifers.csv',
 'Planktonic_Foraminifers .csv',
 'Planktonic_Foraminifers.csv',
 'Pollen_Spores.csv',
 'Pteropods.csv',
 'Radiolarians.csv',
 'Silicoflagellates_Ebridians_Actiniscidians.csv',
 'Sponge_Spicules.csv',
 'Trace_Fossils.csv'}

In [29]:
unique_filenames_for_paths(age_csv_paths)

{'Age_Model_Initial_Report.csv',
 'Age_Model_Initial_Reports.csv',
 'Age_Model_Post_Moratorium.csv',
 'Age_Model_Shipboard.csv',
 'Age_Model_Shipboard_Report.csv'}

## QA ODP paleo files

Count the number of good files vs bad files that need to be fixed

In [30]:
expected_fields = {
    'Data',
    'Age From (oldest)',
    'Age To (youngest)',
    'Zone From (bottom)',
    'Zone To  (top)',
    'Leg',
    'Site',
    'H',
    'Cor',
    'T',
    'Sc',
    'Top(cm)',
    'Depth (mbsf)',
    'Scientist',
#     'Fossil Group',
    'Comment', 
    'Group Abundance',
    'Group Preservation'
}

results = qa_files_for_paths(taxa_csv_paths, expected_fields, sep=',')

In [31]:
print('bad_tabs', len(results['bad_tabs']))
print('bad_encoding', len(results['bad_encoding']))
print('space_delim', len(results['space_delim']))
print('missing_fields', len(results['missing_fields']))
print('good_files', len(results['good_files']))

bad_tabs 0
bad_encoding 0
space_delim 0
missing_fields 0
good_files 2045


In [32]:
results['missing_fields']

[]

### process latin_encoding
handle files with encoding that isn't utf-8

In [33]:
for file in results['bad_encoding']:
    print(file)

convert file to utf-8 encoding
https://codereview.stackexchange.com/a/202985

In [34]:
for file in results['bad_encoding']:
    with open(file, 'rb') as f:
        content_bytes = f.read()
    detected = chardet.detect(content_bytes)
    encoding = detected['encoding']
    content_text = content_bytes.decode(encoding)
    
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content_text)
        print(file)

### process bad_tabs
handle files where the hearers and rows have different number of columns

In [35]:
for file in results['bad_tabs']:
    print(file)

### process space_delim

handle files that use random number of spaces to separate the columns

In [36]:
for file in results['space_delim']:
    file_size = os.path.getsize(file)
    print(f'"{file}",')

### process missing_fields

handle files don't have the expected columns

In [37]:
for file in results['missing_fields']:
    print(file)
    df = pd.read_csv(file, nrows=1)
    print(expected_fields - set(df.columns))

## check space_delim files were correctly fixed

After converting space delimited files, check the  files for errors. Errors
include values that have spaces or columns that have no values.

In [38]:
fixed_space_delim_files = (
    sd.space_delim_files_janus_iodp_1
    + sd.space_delim_files_janus_iodp_2
    + sd.space_delim_files_janus_iodp_3
)

skip_fields = {
    'Data', 'Age From (oldest)', 'Age To (youngest)', 'Zone From (bottom)', 
    'Zone To  (top)', 'Leg', 'Site','H', 'Cor', 'T', 'Sc', 'Top(cm)', 
    'Depth (mbsf)', 'Scientist', 'Comment', 'Fossil Group'
}

def valid_values(x):
    return isinstance(x, str) and ' ' in x

for file in fixed_space_delim_files:
    filename = os.path.join("cleaned_data", file)

    df = pd.read_csv(filename, dtype=str)
    df.dropna(axis="columns", how="all")
    
    taxa_columns = set(df.columns) - skip_fields
    for col in taxa_columns:
        # check if there are values with spaces         
        if sum(df[col].apply(valid_values)) > 0:
            print(f'{col}: has space')
            print(filename)
            print('---')
            
        # check if column is blank       
        if df[col].isnull().values.all():
            print(f'{col}: has no values')
            print(filename)
            print('---')

           

Group Preservation: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_B/Diatoms.csv
---
Group Preservation: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_C/Diatoms.csv
---
Group Preservation: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_A/Diatoms.csv
---
Group Preservation: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_B/Diatoms.csv
---
Group Abundance: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_B/Diatoms.csv
---
Group Preservation: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Diatoms.csv
---
Group Abundance: has no values
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Diatoms.csv
---
Group Preservation: has no values

## file list

Create csv that lists all the files for this dataset.

In [39]:
file_list = []
switch = {
    'Benthic Foraminifers.csv': 'benthic_foraminfera',
    'Benthic_Foraminifers.csv': 'benthic_foraminfera',
    'Bolboforms.csv': 'bolboformids',
    'Diatoms.csv': 'diatoms',
    'Dinoflagellates_Acritarch_Prasinophytes.csv': 'dinoflagellates/acritarchs/prasinophytes',
    'Dinoflagellates_Acritarchs_Prasinophytes.csv': 'dinoflagellates/acritarchs/prasinophytes',
    'Nannofossils .csv': 'nannofossils',
    'Nannofossils.csv': 'nannofossils',
    'Ostracodes.csv': 'ostracods',
    'Planktonic Foraminifers.csv': 'planktic_foraminfera',
    'Planktonic_Foraminifers .csv': 'planktic_foraminfera',
    'Planktonic_Foraminifers.csv': 'planktic_foraminfera',
    'Pollen_Spores.csv': 'pollen',
    'Pteropods.csv': 'pteropods',
    'Radiolarians.csv': 'radiolarians',
    'Silicoflagellates_Ebridians_Actiniscidians.csv': 'silicoflagellates/ebridians/actiniscidians',
    'Sponge_Spicules.csv': 'sponge_spicules',
    'Trace_Fossils.csv': 'trace_fossils'
}
index = filename_index(csv_paths[0])

for path in csv_paths:
    file_data = {}
    parts = Path(path).parts
    filename = parts[index]    
    
    file_data['path'] = path
    
    if filename.startswith('Age_'):
        type = 'age'
    else:
        type = 'taxa'
    file_data['type'] = type
    
    if type == 'taxa':
        file_data['taxon_group'] = switch.get(filename, np.nan)
    
    file_data['expedition'] = parts[5]
    file_data['site'] = parts[6]

    
    file_list.append(file_data)
 

In [40]:
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,path,type,expedition,site,taxon_group
0,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,age,135,835,
1,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,age,135,834,
2,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,age,135,834,
3,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,age,135,841,
4,cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...,age,135,841,


In [41]:
df.to_csv(metadata_path, index=False)

## column names

Get all the column names.

### age models

In [42]:
age_columns = unique_columns_for_paths(age_csv_paths)
age_columns

{'    Age (Ma)',
 'Age Model Type           ',
 'Control Point Comment',
 'Depth (mbsf)',
 'H',
 'Leg',
 'Site',
 'Unnamed: 6'}

In [43]:
column_counts_for_paths(age_csv_paths)

{7, 8}

print out files that have too many columns

In [44]:
for path in age_csv_paths:
    df = pd.read_csv(path, nrows=0)
    if len(df.columns) == 8:
        print(path)

cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/age_models/150/906/HOLE_A/Age_Model_Initial_Report.csv
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/age_models/154/925/HOLE_A/Age_Model_Initial_Report.csv


### taxa

In [45]:
taxa_columns = unique_columns_for_paths(taxa_csv_paths)
len(taxa_columns)

12976

## files grouped by expedition and file type

In [46]:
contents = {}
index = filename_index(path)

for path in csv_paths:
    parts = Path(path).parts
    exp = parts[5]
    filename = parts[index]
        
    if exp not in contents:
        contents[exp] = {'taxa': set(),'age_model': set()}
        
    if filename.startswith('Age_'):
        contents[exp]['age_model'].add(filename)
    else:
        contents[exp]['taxa'].add(filename)
    

In [47]:
file_list = []

for exp in contents.items():
    file_data = {}
    file_data['expedition'] = exp[0]
    file_data['taxa'] = format_filepaths_set(exp[1], 'taxa')
    file_data['age_model'] = format_filepaths_set(exp[1], 'age_model')

    file_list.append(file_data)

In [48]:

df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,expedition,taxa,age_model
0,135,"Nannofossils.csv,Planktonic_Foraminifers.csv,B...",Age_Model_Initial_Report.csv
1,104,"Ostracodes.csv,Macrofossils.csv,Planktonic_For...",Age_Model_Initial_Reports.csv
2,168,Nannofossils.csv,Age_Model_Initial_Report.csv
3,157,"Nannofossils.csv,Planktonic_Foraminifers.csv","Age_Model_Initial_Report.csv,Age_Model_Shipboa..."
4,150,"Dinoflagellates_Acritarch_Prasinophytes.csv,Na...",Age_Model_Initial_Report.csv


In [49]:
path = os.path.join('tmp', 'noaa_janus_iodp_grouped_files.csv')
df.to_csv(path, index=False)

## Miscellaneous.csv

create github link for each Miscellaneous.csv.

In [50]:
for path in csv_paths:
    if 'Miscellaneous.csv' in path:
        link = 'https://github.com/eODP/data-processing/tree/master/notebooks/' + '/'.join(path.split('/'))
        print(link)

https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/644/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/747/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_C/Miscellane