# QA NOAA Janus IODP metadata
## 101-210 taxa, 101-190 age

Get basic metadata (file names, column names) about the NOAA Janus IODP dataset. Create a CSV that lists all the files.

NOAA/JanusIODP_paleo_agemodel  
expedition 101-210 
taxa 101-210, age models 101-190

In [48]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os
import re 
import pandas as pd
import numpy as np
import shutil

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set,
    qa_files_for_paths,
    column_counts_for_paths
)
import scripts.space_delim as sd
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR


In [83]:
# Root of the cleaned data tree, taken from the project config.
base_dir = CLEAN_DATA_DIR
# Directory that holds the NOAA Janus IODP paleontology / age model CSVs.
data_dir = base_dir.joinpath('NOAA', 'JanusIODP_paleo_agemodel')
# Destination of the per-file metadata CSV written by this notebook.
metadata_path = OUTPUT_DIR.joinpath('metadata', 'NOAA', 'noaa_janus_iodp_files.csv')

In [84]:
# Every CSV file anywhere below the dataset directory.
csv_paths = [*data_dir.glob("**/*.csv")]
print('files', len(csv_paths))

files 2481


In [85]:
# CSV files below paleontology/range_tables (the taxa range tables).
taxa_csv_paths = [*data_dir.glob("paleontology/range_tables/**/*.csv")]
print('files', len(taxa_csv_paths))

files 2045


In [69]:
# CSV files below paleontology/age_models (the age model tables).
age_csv_paths = [*data_dir.glob("paleontology/age_models/**/*.csv")]
print('files', len(age_csv_paths))

files 436


## unique file names

Get all the file names.

In [17]:
# Distinct file names found among the taxa range tables; the cell output
# shows the helper returning a set of names.
files = unique_filenames_for_paths(taxa_csv_paths)
files

{'Benthic Foraminifers.csv',
 'Benthic_Foraminifers.csv',
 'Bolboforms.csv',
 'Diatoms.csv',
 'Dinoflagellates_Acritarch_Prasinophytes.csv',
 'Dinoflagellates_Acritarchs_Prasinophytes.csv',
 'Macrofossils.csv',
 'Miscellaneous.csv',
 'Nannofossils .csv',
 'Nannofossils.csv',
 'Ostracodes.csv',
 'Planktonic Foraminifers.csv',
 'Planktonic_Foraminifers .csv',
 'Planktonic_Foraminifers.csv',
 'Pollen_Spores.csv',
 'Pteropods.csv',
 'Radiolarians.csv',
 'Silicoflagellates_Ebridians_Actiniscidians.csv',
 'Sponge_Spicules.csv',
 'Trace_Fossils.csv'}

In [18]:
# Number of distinct taxa file names.
len(files)

20

In [19]:
# Distinct file names found among the age model tables.
unique_filenames_for_paths(age_csv_paths)

{'Age_Model_Initial_Report.csv',
 'Age_Model_Initial_Reports.csv',
 'Age_Model_Post_Moratorium.csv',
 'Age_Model_Shipboard.csv',
 'Age_Model_Shipboard_Report.csv'}

## column names

In [20]:
# Collect the column names used across all taxa range tables and report
# how many distinct ones there are.
taxa_columns = unique_columns_for_paths(taxa_csv_paths)
len(taxa_columns)

12980

## file list

Create csv that lists all the files for this dataset.

In [48]:
# Build one metadata record per CSV file: path relative to `base_dir`,
# file type ('age' or 'taxa'), taxon group (taxa files only), expedition
# and site.
file_list = []
# Maps every taxa file name seen in the dataset (including variants with
# spaces or a missing "s") to a normalized taxon group name.
# NOTE(review): 'foraminfera' looks like a misspelling of 'foraminifera';
# it is left unchanged because it ends up in the generated CSV.
switch = {
    'Benthic Foraminifers.csv': 'benthic_foraminfera',
    'Benthic_Foraminifers.csv': 'benthic_foraminfera',
    'Bolboforms.csv': 'bolboformids',
    'Diatoms.csv': 'diatoms',
    'Dinoflagellates_Acritarch_Prasinophytes.csv': 'dinoflagellates/acritarchs/prasinophytes',
    'Dinoflagellates_Acritarchs_Prasinophytes.csv': 'dinoflagellates/acritarchs/prasinophytes',
    'Macrofossils.csv': 'macrofossils',
    'Miscellaneous.csv': 'miscellaneous',
    'Nannofossils .csv': 'nannofossils',
    'Nannofossils.csv': 'nannofossils',
    'Ostracodes.csv': 'ostracods',
    'Planktonic Foraminifers.csv': 'planktic_foraminfera',
    'Planktonic_Foraminifers .csv': 'planktic_foraminfera',
    'Planktonic_Foraminifers.csv': 'planktic_foraminfera',
    'Pollen_Spores.csv': 'pollen',
    'Pteropods.csv': 'pteropods',
    'Radiolarians.csv': 'radiolarians',
    'Silicoflagellates_Ebridians_Actiniscidians.csv': 'silicoflagellates/ebridians/actiniscidians',
    'Sponge_Spicules.csv': 'sponge_spicules',
    'Trace_Fossils.csv': 'trace_fossils'
}

for path in csv_paths:
    file_data = {}
    relative_path = path.relative_to(base_dir)
    path_parts = relative_path.parts
    filename = relative_path.name

    file_data['path'] = relative_path

    # Age model files are the ones whose name starts with 'Age_'; anything
    # else is treated as a taxa range table. The local is called
    # `file_type` so the builtin `type` is not shadowed for later cells.
    file_type = 'age' if filename.startswith('Age_') else 'taxa'
    file_data['type'] = file_type

    if file_type == 'taxa':
        # File names missing from `switch` get NaN as their taxon group.
        file_data['taxon_group'] = switch.get(filename, np.nan)

    # relative_path has the form
    # NOAA/JanusIODP_paleo_agemodel/paleontology/<table kind>/<expedition>/<site>/...
    file_data['expedition'] = path_parts[4]
    file_data['site'] = path_parts[5]

    file_list.append(file_data)
 

In [55]:
# One row per file, ordered by expedition, site, file type and taxon group.
sort_keys = ['expedition', 'site', 'type', 'taxon_group']
df = pd.DataFrame(file_list).sort_values(by=sort_keys)
df.head()

Unnamed: 0,path,type,expedition,site,taxon_group
1868,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_foraminfera
1871,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,benthic_foraminfera
1867,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
1870,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,nannofossils
1869,NOAA/JanusIODP_paleo_agemodel/paleontology/ran...,taxa,101,626,planktic_foraminfera


In [56]:
# Write the file list; create the destination directory first so a fresh
# checkout without output/metadata/NOAA does not make to_csv fail.
metadata_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(metadata_path, index=False)

## QA ODP paleo files

Count the number of good files vs bad files that need to be fixed

In [70]:
# Column names that every taxa range table is expected to contain.
# 'Fossil Group' is deliberately left out (commented out below).
expected_fields = {
    'Data',
    'Age From (oldest)',
    'Age To (youngest)',
    'Zone From (bottom)',
    'Zone To  (top)',
    'Leg',
    'Site',
    'H',
    'Cor',
    'T',
    'Sc',
    'Top(cm)',
    'Depth (mbsf)',
    'Scientist',
#     'Fossil Group',
    'Comment', 
    'Group Abundance',
    'Group Preservation'
}

# Run the QA helper over all taxa files. The following cells index
# `results` by category name ('bad_tabs', 'bad_encoding', 'space_delim',
# 'missing_fields', 'good_files', 'unnamed_column').
results = qa_files_for_paths(taxa_csv_paths, expected_fields, sep=',')

In [71]:
# Report how many files landed in each QA category.
for category in (
    'bad_tabs',
    'bad_encoding',
    'space_delim',
    'missing_fields',
    'good_files',
    'unnamed_column',
):
    print(category, len(results[category]))

bad_tabs 0
bad_encoding 0
space_delim 61
missing_fields 0
good_files 1943
unnamed_column 41


### process latin_encoding
handle files with encoding that isn't utf-8

In [14]:
# Placeholder loop: nothing is done with the badly encoded files here.
for file in results['bad_encoding']:
    pass

convert file to utf-8 encoding
https://codereview.stackexchange.com/a/202985

In [15]:
# Re-encode, in place, every file flagged as not being UTF-8.
# chardet is imported here because the notebook's import cell does not
# import it; without this the loop raises NameError on the first file.
import chardet

for file in results['bad_encoding']:
    with open(file, 'rb') as f:
        content_bytes = f.read()
    detected = chardet.detect(content_bytes)
    encoding = detected['encoding']
    if encoding is None:
        # chardet could not guess an encoding (e.g. empty file); leave the
        # file untouched instead of crashing in bytes.decode(None).
        print('encoding not detected, skipped:', file)
        continue
    content_text = content_bytes.decode(encoding)

    # newline='' writes the decoded text as-is, so the file's original
    # line endings are not translated on platforms where '\n' != os.linesep.
    with open(file, 'w', encoding='utf-8', newline='') as f:
        f.write(content_text)
    print(file)

### process bad_tabs
handle files where the headers and rows have different numbers of columns

In [15]:
# Placeholder loop: nothing is done with the 'bad_tabs' files here.
for file in results['bad_tabs']:
    pass

### process missing_fields

handle files that don't have the expected columns

In [23]:
# For each file with missing columns, print its path followed by the
# expected column names it lacks.
# Note: this rebinds the notebook-level name `df`.
for file in results['missing_fields']:
    print(file)
    # A single data row is enough; only the header is inspected.
    df = pd.read_csv(file, nrows=1)
    print(expected_fields.difference(set(df.columns)))

../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/174/1071/HOLE_B/Planktonic_Foraminifers.csv
{'Zone To  (top)', 'Group Preservation'}


### process unnamed_column
handle files with unnamed columns

In [55]:
# Preview the first two files flagged as having an unnamed column.
results['unnamed_column'][0:2]

[PosixPath('../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/192/1183/HOLE_A/Nannofossils.csv'),
 PosixPath('../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1010/HOLE_E/Radiolarians.csv')]

remove blank lines

In [41]:
# Locations of the raw and cleaned range tables, relative to this notebook.
# NOTE(review): neither name is used by the cells visible in this notebook
# export - confirm whether they are still needed.
raw_url = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/'
clean_url = '../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/'

In [64]:
# For each file with an unnamed column, print the distinct values found in
# the second-to-last column (the last named taxon column in the outputs
# shown below). Nothing is written back to disk.
for file in results['unnamed_column']:
    print(file)
    # Read everything as text so padded cells such as 'C      ' stay as-is.
    df = pd.read_csv(file, dtype=str)
    # Drop rows in which every cell is empty.
    df.dropna(how='all', axis='index', inplace=True)

    last_taxa = df.columns[-2]
    print(df[last_taxa].unique())
 


../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/192/1183/HOLE_A/Nannofossils.csv
['C                 ' 'R                 ' '                  ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1010/HOLE_E/Radiolarians.csv
['                   ' '+                  ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1010/HOLE_C/Radiolarians.csv
['T                      ' '                       ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1011/HOLE_B/Radiolarians.csv
['+                     ' '                      ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/151/907/HOLE_A/Radiolarians.csv
['                 ' '+                ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/189/1169/HOLE_A/Radiolarians.csv
['R                      ' '                       '

### process space_delim

handle files that use a varying number of spaces to separate the columns

In [72]:
# Preview the first two files flagged as space delimited.
results['space_delim'][0:2]

[PosixPath('../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/174/1071/HOLE_B/Benthic_Foraminifers.csv'),
 PosixPath('../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/174/1071/HOLE_B/Planktonic_Foraminifers.csv')]

In [75]:
def spaces(int):
    """Return a string consisting of ``int`` space characters.

    A count of zero or less yields the empty string. The parameter keeps
    its original name (which shadows the builtin inside this function) so
    existing callers are unaffected.
    """
    return int * ' '

def replace_path(file, old_path, new_path):
    """Return ``file`` with its first ``old_path`` component swapped out.

    Args:
        file: a ``pathlib`` path whose ``parts`` are searched.
        old_path: the path component to look for (exact match).
        new_path: the component that takes its place.

    Raises:
        ValueError: if no component of ``file`` equals ``old_path``.
    """
    components = list(file.parts)
    position = components.index(old_path)
    components[position] = new_path
    return Path(*components)
    
    

automatically process raw files with spaces

In [76]:
# Convert every space-delimited raw file and save the result as a CSV at
# the matching location under ../../output/cleaned_data.
for file in results['space_delim']:
    df = sd.convert_space_delim_file(file)

    # Keep only the path components that follow 'raw_data'.
    index = file.parts.index('raw_data')
    path = Path('../../output/cleaned_data').joinpath(*file.parts[index + 1:])
    # to_csv does not create directories; make sure the target one exists.
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)

manual cleanup; file needs to be fixed by PIs: Z Actiniscus pentasterias,X Phytoliths diff. shapes

In [77]:
# Manual cleanup of a single Diatoms range table: fix quoting, rename two
# taxa, strip padding around commas, and write the result to the cleaned
# data tree.
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1077/HOLE_A/Diatoms.csv'
output = file.replace('raw_data', 'output/cleaned_data')
# This name contains a comma, so it is wrapped in double quotes below
# (presumably so it is read back as a single CSV field).
text = 'other Freshwater (Achnanthes; Fragilaria, etc.)'

with open(file) as reader:

    all_lines = []
    for line in reader:
        # Skip lines that contain nothing but an empty quoted string.
        if line == '""\n':
            continue

        new_line = line
        new_line = new_line.replace('""Rhizosolenia """"Imbricatae""""""', 'Rhizosolenia "Imbricatae"')
        new_line = new_line.replace(text, f'"{text}"')
        new_line = new_line.replace('Z Actiniscus pentasterias', 'Actiniscus pentasterias')
        new_line = new_line.replace('X Phytoliths diff. shapes', 'Phytoliths diff. shapes')
        new_line = re.sub('\n', '', new_line)
        # Drop a trailing quote (and the comma before it, if any).
        new_line = re.sub(",?\"$", '', new_line)

        # Remove the space padding on either side of each comma.
        new_line = re.sub(', {1,}', ',', new_line)
        new_line = re.sub(' {1,},', ',', new_line)
        new_line = re.sub('  IR,', 'IR,', new_line)
        # Drop a leading quote.
        new_line = re.sub('^"', '', new_line)
        all_lines.append(new_line)

# write() rather than writelines(): the joined text is one string, and
# writelines() would iterate over it character by character.
with open(output, 'w') as writer:
    writer.write('\n'.join(all_lines))

manual fix messed up spacing, convert to csv


In [78]:
# Manual repair of one space-delimited Nannofossils file whose columns are
# misaligned. Each substitution re-pads one region of the line to a fixed
# width; the substitutions depend on the character offsets produced by the
# ones before them, so their order must not change.
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/172/1056/HOLE_C/Nannofossils.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:

    all_lines = []
    for line in reader:
        # Skip lines that contain nothing but an empty quoted string.
        if line == '""\n':
            continue

        new_line = line
        new_line = re.sub('\n', '', new_line)
        new_line = re.sub('IR +172 +1056 +C', 
                          'IR' + spaces(81) + '172       1056       C', new_line)
        new_line = re.sub('C +([0-9]) +H +CC', 
                          r'C      \1        H      CC', new_line)
        new_line = re.sub('CC {9}([0-9.]{4}) {10}([0-9])', r'CC          \1         \2', new_line)
        new_line = re.sub('CC {9}([0-9.]{5}) {10}([0-9])', r'CC          \1        \2', new_line)
        new_line = re.sub('(.{171}) +(Raffi) +(Nannofossils)', 
                          r'\1' + spaces(0) + 'Raffi' + spaces(12) +'Nannofossils', new_line)
        new_line = re.sub('Nannofossils +A +G {18}', 
                          'Nannofossils'+ spaces(8)+'A' + spaces(21)+ 'G', new_line)
        new_line = re.sub('(.{284}) +F +([CA])', r'\1' +  'F' + spaces(25) + r'\2', new_line)
        new_line = re.sub('(.{310}[CA]) {42,43}([AF ])',
                          r'\1' + spaces(29) + r'\2',new_line)
        new_line = re.sub('(.{343}) +A', 
                          r'\1' + spaces(25) + 'A', new_line)
        new_line = re.sub('(.{368}A) {29,32}([CF ])', 
                          r'\1' + spaces(25) + r'\2', new_line)
        new_line = re.sub('(.{420}) {11,12}([R ]) {35,39}([C F])', 
                          r'\1' +  r'\2' +  spaces(26) + r'\3', new_line)
        new_line = re.sub('(.{474}) {4,8}([ARC ]) {38,39}', 
                          r'\1' + r'\2' + spaces(21), new_line)
        new_line = re.sub('(.{525}) {11}([1 ]) {15}([+ ])', 
                          r'\1' + r'\2' + r'\3', new_line)

        new_line = re.sub('(.{577}) {0,4}([<>])', 
                          r'\1' + spaces(5) + r'\2', new_line)
        all_lines.append(new_line)


# write() rather than writelines(): the joined text is one string, and
# writelines() would iterate over it character by character.
with open(output, 'w') as writer:
    writer.write('\n'.join(all_lines))

# Convert the re-aligned text to a real CSV, overwriting the same file.
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

manual fix messed up spacing, convert to csv


In [79]:
# Manual repair of one space-delimited Planktonic Foraminifers file whose
# columns are misaligned. The substitutions rely on character offsets, so
# their order must not change.
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/174/1071/HOLE_B/Planktonic_Foraminifers.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:

    all_lines = []
    for line in reader:
        # Skip lines that contain nothing but an empty quoted string.
        if line == '""\n':
            continue

        new_line = line
        new_line = re.sub('\n', '', new_line)
        new_line = re.sub('IR +174', 
                          'IR' + spaces(92) + '174', new_line)
        new_line = re.sub('(.{117}) +([0-9]) +X +CC', 
                          r'\1' + r'\2' + spaces(7) + 'X'  + spaces(5) + 'CC', new_line)
        new_line = re.sub('(.{138}) {1}([0-9. ]{5}) +([0-9.]+)', 
                          r'\1' + r'\2' + spaces(7) + r'\3' + spaces(2) , new_line)   
        new_line = re.sub('Olson {6}Planktonic Foraminifers {10}T {17}G?', 
                          'Olson         Planktonic Foraminifers   T' +spaces(19) + 'G', new_line)
        new_line = re.sub('(.{250}) +([0-9])', 
                          r'\1' + r'\2' +spaces(15) , new_line)  
        new_line = re.sub('planktonics miscellaneous', 'Planktonics miscellaneous', new_line)
        all_lines.append(new_line)

# write() rather than writelines(): the joined text is one string, and
# writelines() would iterate over it character by character.
with open(output, 'w') as writer:
    writer.write('\n'.join(all_lines))

# Convert the re-aligned text to a real CSV, overwriting the same file.
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

manually fix spacing

In [80]:
# Manual repair of one space-delimited Planktonic Foraminifers file: drop
# the "Abbreviated View" line and shift two misplaced values/headers so
# the columns line up, then convert to CSV.
file = "../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1081/HOLE_A/Planktonic_Foraminifers.csv"
output = file.replace('raw_data', 'output/cleaned_data')


with open(file) as reader:

    all_lines = []
    for line in reader:
        # Skip lines that contain nothing but an empty quoted string.
        if line == '""\n':
            continue
        if line.startswith("Abbreviated View --ALL"):
            continue

        new_line = line
        new_line = re.sub('\n', '', new_line)

        # Move the scientist name one character to the right.
        new_line = re.sub('Christensen ', ' Christensen', new_line)
        # Move the 'Fossil Group' header seven characters to the left.
        new_line = re.sub(spaces(7)  + 'Fossil Group', 'Fossil Group' + spaces(7), new_line)

        all_lines.append(new_line)

# write() rather than writelines(): the joined text is one string, and
# writelines() would iterate over it character by character.
with open(output, 'w') as writer:
    writer.write('\n'.join(all_lines))

# Convert the re-aligned text to a real CSV, overwriting the same file.
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

fix quotes

In [81]:
# Fix quoting in one Nannofossils file, then convert it to CSV.
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_B/Nannofossils.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    all_lines = []
    for line in reader:
        # Skip lines that contain nothing but an empty quoted string.
        if line == '""\n':
            continue

        # Drop the line terminator: the lines are re-joined with '\n'
        # below, and keeping it would put a blank line after every line
        # of the output.
        line = line.rstrip('\n')

        # Data rows that start with '" ... IR' are wrapped in one pair of
        # quotes; remove that pair.
        if re.match('^" +IR.*?$', line):
            line = re.sub('^"(.*?)"$', r'\1', line)

        line = re.sub('""Small""', '"Small"', line)

        all_lines.append(line)

with open(output, 'w') as writer:
    writer.write('\n'.join(all_lines))

# Convert the cleaned text to a real CSV, overwriting the same file.
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

Needs review: the taxon name 'Reticulofenestra Z' is replaced with 'Reticulofenestra'.

In [82]:
# Rename one taxon in one Nannofossils file, then convert it to CSV.
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Nannofossils.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    all_lines = []
    for line in reader:
        # Skip lines that contain nothing but an empty quoted string.
        if line == '""\n':
            continue

        # Drop the line terminator: the lines are re-joined with '\n'
        # below, and keeping it would put a blank line after every line
        # of the output.
        line = line.rstrip('\n')

        line = line.replace('Reticulofenestra Z', 'Reticulofenestra')

        all_lines.append(line)

with open(output, 'w') as writer:
    writer.write('\n'.join(all_lines))

# Convert the cleaned text to a real CSV, overwriting the same file.
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

## copy files to folder for PIs to review

In [41]:
def copy_file(path, current_directory, output_directory):
    """Copy ``path`` to the matching location under another directory.

    The destination is ``path`` with every occurrence of the substring
    ``current_directory`` replaced by ``output_directory``. Directories
    missing on the destination side are created.

    Args:
        path: source file, given as a ``str`` or a ``pathlib`` path.
        current_directory: substring of ``path`` to replace.
        output_directory: text that replaces ``current_directory``.

    Raises:
        shutil.SameFileError: if ``current_directory`` does not occur in
            ``path``, so source and destination are the same file.
    """
    # Path objects have no str-style replace(); work on the string form.
    source = os.fspath(path)
    output_path = source.replace(current_directory, output_directory)
    # dirname works for any file name, not only those matching a pattern.
    directories = os.path.dirname(output_path)
    if directories:
        os.makedirs(directories, exist_ok=True)
    shutil.copy(source, output_path)

# Copy the flagged files into tmp/NOAA_review for the PIs.
# The entries of `results` are pathlib paths, which have no str-style
# two-argument replace(); convert them to strings first.
# NOTE(review): the first two loops copy the same source files and differ
# only in destination folder, so 'unnamed_column/cleaned_data' receives the
# same files as 'unnamed_column/raw_data' - confirm this is intended.
for file in results['unnamed_column']:
    copy_file(str(file), 'raw_data', 'tmp/NOAA_review/unnamed_column/cleaned_data')

for file in results['unnamed_column']:
    copy_file(str(file), 'raw_data', 'tmp/NOAA_review/unnamed_column/raw_data')

for file in results['space_delim']:
    copy_file(str(file), 'raw_data', 'tmp/NOAA_review/space_delimited/raw_data')

for file in results['space_delim']:
    file = str(file).replace('raw_data', 'cleaned_data')
    copy_file(file, 'cleaned_data', 'tmp/NOAA_review/space_delimited/cleaned_data')

## column names

Get all the column names.

### age models

In [57]:
# Distinct column names used across the age model files.
age_columns = unique_columns_for_paths(age_csv_paths)
age_columns

{'    Age (Ma)',
 'Age Model Type           ',
 'Control Point Comment',
 'Depth (mbsf)',
 'H',
 'Leg',
 'Site',
 'Unnamed: 6'}

In [58]:
# Distinct column counts found among the age model files.
column_counts_for_paths(age_csv_paths)

{7, 8}

print out files that have too many columns

In [22]:
# List the age model files whose header has 8 columns.
# Note: this rebinds the notebook-level names `path` and `df`.
for path in age_csv_paths:
    # nrows=0 parses the header only.
    df = pd.read_csv(path, nrows=0)
    if df.columns.size != 8:
        continue
    print(path)

cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/age_models/150/906/HOLE_A/Age_Model_Initial_Report.csv
cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/age_models/154/925/HOLE_A/Age_Model_Initial_Report.csv


### taxa

In [23]:
# Recount the distinct column names used across the taxa range tables.
taxa_columns = unique_columns_for_paths(taxa_csv_paths)
len(taxa_columns)

12976

## files grouped by expedition and file type

In [24]:
# Group the file names by expedition:
#   contents[expedition] = {'taxa': {file names}, 'age_model': {file names}}
contents = {}

for path in csv_paths:
    # Use the path relative to `base_dir`, the same layout the file-list
    # cell relies on, so the expedition is always component 4 no matter
    # where the data tree lives on disk. This also removes the lookup of
    # `path` before the loop had defined it.
    parts = Path(path).relative_to(base_dir).parts
    exp = parts[4]
    filename = Path(path).name

    group = contents.setdefault(exp, {'taxa': set(), 'age_model': set()})

    # Age model files are the ones whose name starts with 'Age_'.
    if filename.startswith('Age_'):
        group['age_model'].add(filename)
    else:
        group['taxa'].add(filename)
    

In [25]:
# One record per expedition, with its taxa and age model file names
# formatted by format_filepaths_set.
file_list = [
    {
        'expedition': expedition,
        'taxa': format_filepaths_set(groups, 'taxa'),
        'age_model': format_filepaths_set(groups, 'age_model'),
    }
    for expedition, groups in contents.items()
]

In [26]:

# Tabulate the per-expedition records and preview the first rows.
df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,expedition,taxa,age_model
0,135,"Planktonic_Foraminifers.csv,Benthic_Foraminife...",Age_Model_Initial_Report.csv
1,104,"Nannofossils.csv,Planktonic_Foraminifers.csv,R...",Age_Model_Initial_Reports.csv
2,168,Nannofossils.csv,Age_Model_Initial_Report.csv
3,157,"Planktonic_Foraminifers.csv,Nannofossils.csv","Age_Model_Shipboard.csv,Age_Model_Initial_Repo..."
4,150,"Dinoflagellates_Acritarch_Prasinophytes.csv,Na...",Age_Model_Initial_Report.csv


In [27]:
# Save the per-expedition table under tmp/, creating the directory first
# because to_csv does not create missing directories.
os.makedirs('tmp', exist_ok=True)
path = os.path.join('tmp', 'noaa_janus_iodp_grouped_files.csv')
df.to_csv(path, index=False)

## Miscellaneous.csv

create github link for each Miscellaneous.csv.

In [28]:
# Print a GitHub link for every Miscellaneous.csv file.
# `csv_paths` holds pathlib paths, which support neither `in` nor
# `.split()`; compare the file name and build the link from the
# forward-slash form of the path instead.
for path in csv_paths:
    path = Path(path)
    if path.name == 'Miscellaneous.csv':
        link = 'https://github.com/eODP/data-processing/tree/master/notebooks/' + path.as_posix()
        print(link)

https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/644/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/747/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_C/Miscellane

In [50]:
# Print every taxa file that has a column labelled exactly '.1'.
# NOTE(review): pandas appends '.1' to repeated column names, so this may
# have been meant to find duplicated columns; as written it only matches a
# column whose whole label is '.1' - confirm the intent.
for path in taxa_csv_paths:
    # nrows=0 parses the header only.
    df = pd.read_csv(path, nrows=0)
    if '.1' not in df.columns:
        continue
    print(path)