# QA NOAA Janus IODP metadata
## 101-210 taxa, 101-190 age

QA NOAA Janus IODP dataset. Fix bad files.

NOAA/JanusIODP_paleo_agemodel  
expedition 101-210 
taxa 101-210, age models 101-190

In [48]:
import sys
sys.path.append('../../')
import glob
from pathlib import Path
import os
import re 
import pandas as pd
import numpy as np
import shutil

from scripts.normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    filename_index,
    format_filepaths_set,
    qa_files_for_paths,
    column_counts_for_paths
)
import scripts.space_delim as sd
from config import OUTPUT_DIR, CLEAN_DATA_DIR, RAW_DATA_DIR
from scripts.shared_utils import (
    log_df
)

In [3]:
base_dir = CLEAN_DATA_DIR
data_dir = base_dir/'NOAA'/'JanusIODP_paleo_agemodel'
metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_janus_iodp_files.csv'

In [4]:
csv_paths = list(data_dir.glob("**/*.csv"))
csv_paths = [p for p in csv_paths if '.ipynb_checkpoints' not in str(p)]


print('files', len(csv_paths))

files 2481


In [5]:
taxa_csv_paths = list(data_dir.glob("paleontology/range_tables/**/*.csv"))
print('files', len(taxa_csv_paths))

files 2045


In [6]:
age_csv_paths = list(data_dir.glob("paleontology/age_models/**/*.csv"))
print('files', len(age_csv_paths))

files 436


## QA and Fix ODP paleo files

Count the number of good files vs bad files that need to be fixed

In [10]:
expected_fields = {
    'Data',
    'Age From (oldest)',
    'Age To (youngest)',
    'Zone From (bottom)',
    'Zone To  (top)',
    'Leg',
    'Site',
    'H',
    'Cor',
    'T',
    'Sc',
    'Top(cm)',
    'Depth (mbsf)',
    'Scientist',
#     'Fossil Group',
    'Comment', 
    'Group Abundance',
    'Group Preservation'
}

results = qa_files_for_paths(taxa_csv_paths, expected_fields, sep=',')

In [11]:
print('bad_tabs', len(results['bad_tabs']))
print('bad_encoding', len(results['bad_encoding']))
print('space_delim', len(results['space_delim']))
print('missing_fields', len(results['missing_fields']))
print('good_files', len(results['good_files']))
print('unnamed_column', len(results['unnamed_column']))

bad_tabs 0
bad_encoding 0
space_delim 0
missing_fields 0
good_files 2007
unnamed_column 38


### process latin_encoding
handle files with encoding that isn't utf-8

In [12]:
for file in results['bad_encoding']:
    pass

convert file to utf-8 encoding
https://codereview.stackexchange.com/a/202985

In [13]:
for file in results['bad_encoding']:
    with open(file, 'rb') as f:
        content_bytes = f.read()
    detected = chardet.detect(content_bytes)
    encoding = detected['encoding']
    content_text = content_bytes.decode(encoding)
    
    with open(file, 'w', encoding='utf-8') as f:
        f.write(content_text)
        print(file)

### process bad_tabs
handle files where the hearers and rows have different number of columns

In [14]:
for file in results['bad_tabs']:
    pass

### process missing_fields

handle files don't have the expected columns

In [15]:
for file in results['missing_fields']:
    print(file)
    df = pd.read_csv(file, nrows=1)
    print(expected_fields - set(df.columns))

### process unnamed_column
handle files with unnamed columns

In [16]:
results['unnamed_column'][0:2]

[PosixPath('../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/192/1183/HOLE_A/Nannofossils.csv'),
 PosixPath('../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1010/HOLE_E/Radiolarians.csv')]

remove blank lines

In [17]:
raw_url = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/'
clean_url = '../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/'

In [18]:
for file in results['unnamed_column']:
    print(file)
    df = pd.read_csv(file, dtype=str)
    df.dropna(how='all', axis='index', inplace=True)

    last_columns = df.columns[(len(df.columns)-2):len(df.columns)]
    
    last_taxa = df.columns[(len(df.columns)-2)]
    print(df[last_taxa].unique() )
#     print(df[last_columns].head(2))
 


../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/192/1183/HOLE_A/Nannofossils.csv
['C                 ' 'R                 ' '                  ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1010/HOLE_E/Radiolarians.csv
['                   ' '+                  ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1010/HOLE_C/Radiolarians.csv
['T                      ' '                       ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/167/1011/HOLE_B/Radiolarians.csv
['+                     ' '                      ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/151/907/HOLE_A/Radiolarians.csv
['                 ' '+                ']
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/189/1169/HOLE_A/Radiolarians.csv
['R                      ' '                       '

### process space_delim

handle files that use random number of spaces to separate the columns

In [9]:
results['space_delim'][0:2]

NameError: name 'results' is not defined

In [10]:
def spaces(int):
    return ' ' * int

def replace_path(file, old_path, new_path):
    index = file.parts.index(old_path)
    return Path().joinpath(*file.parts[0:index], new_path, *file.parts[index + 1:])
    
    

automatically process raw files with spaces

In [11]:
for file in results['space_delim']:
    df = sd.convert_space_delim_file(file)
    
    index = file.parts.index('raw_data')
    path = Path('../../output/cleaned_data').joinpath(*file.parts[index + 1:])
    df.to_csv(path, index=False)

NameError: name 'results' is not defined

In [None]:
bad_files = [
    # needs fixing - Z, X; manually remove quotes
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1077/HOLE_A/Diatoms.csv', 
    # needs fixing - Reticulofenestra Z 
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Nannofossils.csv',
    # needs fixing - Form A, Form B    
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_B/Radiolarians.csv',
    # needs fixing - Form A      
    'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_B/Radiolarians.csv'
]

manual cleanup; file needs to be fixed by PIs: Z Actiniscus pentasterias,X Phytoliths diff. shapes

In [12]:
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1077/HOLE_A/Diatoms.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    
    all_lines = []
    for line in reader:
        if line == '""\n':
                continue

        new_line = line
        new_line = new_line.replace('""Rhizosolenia """"Imbricatae""""""', 'Rhizosolenia "Imbricatae"')
        text = 'other Freshwater (Achnanthes; Fragilaria, etc.)'
        new_line = new_line.replace(text, f'"{text}"')
        new_line = new_line.replace('Z Actiniscus pentasterias', 'Actiniscus pentasterias')
        new_line = new_line.replace('X Phytoliths diff. shapes', 'Phytoliths diff. shapes')
        new_line = re.sub('\n', '', new_line)
        new_line = re.sub(",?\"$", '', new_line)

        new_line = re.sub(', {1,}', ',', new_line)
        new_line = re.sub(' {1,},', ',', new_line)
        new_line = re.sub('  IR,', 'IR,', new_line)
        new_line = re.sub('^"', '', new_line)
        all_lines.append(new_line)
    
with open(output, 'w') as writer:
    writer.writelines('\n'.join(all_lines))

manual fix messed up spacing, convert to csv


In [13]:
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/172/1056/HOLE_C/Nannofossils.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    
    all_lines = []
    for line in reader:
        if line == '""\n':
                continue

        new_line = line
        new_line = re.sub('\n', '', new_line)
        new_line = re.sub('IR +172 +1056 +C', 
                          'IR' + spaces(81) + '172       1056       C', new_line)
        new_line = re.sub('C +([0-9]) +H +CC', 
                          r'C      \1        H      CC', new_line)
        new_line = re.sub('CC {9}([0-9.]{4}) {10}([0-9])', r'CC          \1         \2', new_line)
        new_line = re.sub('CC {9}([0-9.]{5}) {10}([0-9])', r'CC          \1        \2', new_line)
        new_line = re.sub('(.{171}) +(Raffi) +(Nannofossils)', 
                          r'\1' + spaces(0) + 'Raffi' + spaces(12) +'Nannofossils', new_line)
        new_line = re.sub('Nannofossils +A +G {18}', 
                          'Nannofossils'+ spaces(8)+'A' + spaces(21)+ 'G', new_line)
        new_line = re.sub('(.{284}) +F +([CA])', r'\1' +  'F' + spaces(25) + r'\2', new_line)
        new_line = re.sub('(.{310}[CA]) {42,43}([AF ])',
                          r'\1' + spaces(29) + r'\2',new_line)
        new_line = re.sub('(.{343}) +A', 
                          r'\1' + spaces(25) + 'A', new_line)
        new_line = re.sub('(.{368}A) {29,32}([CF ])', 
                          r'\1' + spaces(25) + r'\2', new_line)
        new_line = re.sub('(.{420}) {11,12}([R ]) {35,39}([C F])', 
                          r'\1' +  r'\2' +  spaces(26) + r'\3', new_line)
        new_line = re.sub('(.{474}) {4,8}([ARC ]) {38,39}', 
                          r'\1' + r'\2' + spaces(21), new_line)
        new_line = re.sub('(.{525}) {11}([1 ]) {15}([+ ])', 
                          r'\1' + r'\2' + r'\3', new_line)
   
        new_line = re.sub('(.{577}) {0,4}([<>])', 
                          r'\1' + spaces(5) + r'\2', new_line)
        all_lines.append(new_line)
    

with open(output, 'w') as writer:
    writer.writelines('\n'.join(all_lines))
 
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

manual fix messed up spacing, convert to csv


In [17]:
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/174/1071/HOLE_B/Planktonic_Foraminifers.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    
    all_lines = []
    for line in reader:
        if line == '""\n':
                continue
                
        new_line = line
        new_line = re.sub('\n', '', new_line)
        new_line = re.sub('IR +174', 
                          'IR' + spaces(92) + '174', new_line)
        new_line = re.sub('(.{117}) +([0-9]) +X +CC', 
                          r'\1' + r'\2' + spaces(7) + 'X'  + spaces(5) + 'CC', new_line)
        new_line = re.sub('(.{138}) {1}([0-9. ]{5}) +([0-9.]+)', 
                          r'\1' + r'\2' + spaces(7) + r'\3' + spaces(2) , new_line)   
        new_line = re.sub('Olson {6}Planktonic Foraminifers {10}T {17}(G?)', 
                          'Olson         Planktonic Foraminifers   T' +spaces(19) + r'\1', new_line)
        new_line = re.sub('(.{250}) +([0-9])', 
                          r'\1' + r'\2' +spaces(15) , new_line)  
        new_line = re.sub('planktonics miscellaneous', 'Planktonics miscellaneous', new_line)
        all_lines.append(new_line)
        
with open(output, 'w') as writer:
    writer.writelines('\n'.join(all_lines))
    
df = sd.convert_space_delim_file(output)
df
df.to_csv(output, index=False)

manually fix spacing

In [25]:
file = "../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1081/HOLE_A/Planktonic_Foraminifers.csv"
output = file.replace('raw_data', 'output/cleaned_data')


with open(file) as reader:
    
    all_lines = []
    for line in reader:
        if line == '""\n':
                continue
        if line.startswith("Abbreviated View --ALL"):
            continue

                
        new_line = line
        new_line = re.sub('\n', '', new_line)
        
        new_line = re.sub('Christensen ', ' Christensen', new_line)
        new_line = re.sub(spaces(7)  + 'Fossil Group', 'Fossil Group' + spaces(7), new_line)
        
        all_lines.append(new_line)
        
with open(output, 'w') as writer:
    writer.writelines('\n'.join(all_lines))
   
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

fix quotes

In [26]:
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_B/Nannofossils.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    all_lines = []
    for line in reader:
        if line == '""\n':
                continue
       
        if re.match('^" +IR.*?$', line):
            line = re.sub('^"(.*?)"$', r'\1', line)
            
        line = re.sub('""Small""', '"Small"', line)
            
            
        all_lines.append(line)

with open(output, 'w') as writer:
    writer.writelines('\n'.join(all_lines))
    
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)

 needs review - Reticulofenestra Z ;  

In [27]:
file = '../../raw_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Nannofossils.csv'
output = file.replace('raw_data', 'output/cleaned_data')

with open(file) as reader:
    all_lines = []
    for line in reader:
        if line == '""\n':
                continue
                
        line = line.replace('Reticulofenestra Z', 'Reticulofenestra')
                
        all_lines.append(line)

with open(output, 'w') as writer:
    writer.writelines('\n'.join(all_lines))
    
df = sd.convert_space_delim_file(output)
df.to_csv(output, index=False)   

### fix files manually

In [85]:
fix_base = '../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/'

fix 197/1204/HOLE_B/Nannofossils.csv

In [78]:
file = fix_base + '197/1204/HOLE_B/Nannofossils.csv'

df = pd.read_csv(file, dtype=str)
log_df(df)



(19, 38)


Unnamed: 0,Data,Age From (oldest),Age To (youngest),Zone From (bottom),Zone To (top),Unnamed: 5,Leg,Site,H,Cor,...,Cribrosphaerella ehrenbergii,Cretarhabdus crenulatus,Ceratolithoides aculeus,Aspidolithus parcus parcus,Tranolithus orionatus,Micula murus,Cretarhabdus conicus,Tegumentum stradneri,Micula swastica,Comment
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,IR,,Maestrichtian,CC26,CC25,,197.0,1204.0,B,1.0,...,F,F,C,,,R,R,R,R,
4,,,,,,,,,,,...,,,,,,,,,,


In [79]:
tmp_df = df[df['Fossil Group                                 '] == 'F              '].copy()
log_df(tmp_df)

(3, 38)


Unnamed: 0,Data,Age From (oldest),Age To (youngest),Zone From (bottom),Zone To (top),Unnamed: 5,Leg,Site,H,Cor,...,Cribrosphaerella ehrenbergii,Cretarhabdus crenulatus,Ceratolithoides aculeus,Aspidolithus parcus parcus,Tranolithus orionatus,Micula murus,Cretarhabdus conicus,Tegumentum stradneri,Micula swastica,Comment
13,IR,,Campanian,CC23,CC22(Perch-Nielsen),197,1204,B,1,R,...,R,,R,,,,,,,
15,IR,,Campanian,CC23,CC22(Perch-Nielsen),197,1204,B,1,R,...,R,R,,R,,,,,,
17,IR,,Campanian,CC23,CC22(Perch-Nielsen),197,1204,B,17,R,...,R,R,R,,,,,,,


In [80]:
tmp_df.columns

Index(['Data', 'Age From (oldest)', 'Age To (youngest)', 'Zone From (bottom)',
       'Zone To  (top)', 'Unnamed: 5', 'Leg', 'Site', 'H', 'Cor', 'T', 'Sc',
       'Top(cm)', 'Depth (mbsf)', 'Scientist',
       'Fossil Group                                 ', 'Group Abundance',
       'Group Preservation', 'Watznaueria barnesae', 'Quadrum trifidum',
       'Quadrum sissinghii', 'Quadrum gothicum', 'Prediscosphaera cretacea',
       'Pervilithus varius', 'Micula decussata', 'Micula concava',
       'Marthasterites inconspicuus', 'Eiffellithus turriseiffelii',
       'Cribrosphaerella ehrenbergii', 'Cretarhabdus crenulatus',
       'Ceratolithoides aculeus', 'Aspidolithus parcus parcus',
       'Tranolithus orionatus', 'Micula murus', 'Cretarhabdus conicus',
       'Tegumentum stradneri', 'Micula swastica', 'Comment'],
      dtype='object')

In [90]:
replace_cols = list(tmp_df.columns)[5:]

replace_cols[0:5]

['Leg', 'Site', 'H', 'Cor', 'T']

In [81]:

for index, row in tmp_df.iterrows():
    for index2, col in enumerate(replace_cols):
        if index2 + 1 == len(replace_cols):
            continue
        df.at[index, replace_cols[index2+1]] = row[col]


In [82]:
df.tail()

Unnamed: 0,Data,Age From (oldest),Age To (youngest),Zone From (bottom),Zone To (top),Unnamed: 5,Leg,Site,H,Cor,...,Cribrosphaerella ehrenbergii,Cretarhabdus crenulatus,Ceratolithoides aculeus,Aspidolithus parcus parcus,Tranolithus orionatus,Micula murus,Cretarhabdus conicus,Tegumentum stradneri,Micula swastica,Comment
14,,,,,,,,,,,...,,,,,,,,,,
15,IR,,Campanian,CC23,CC22(Perch-Nielsen),197.0,197.0,1204.0,B,1.0,...,R,R,R,,R,,,,,
16,,,,,,,,,,,...,,,,,,,,,,
17,IR,,Campanian,CC23,CC22(Perch-Nielsen),197.0,197.0,1204.0,B,17.0,...,R,R,R,R,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,


In [84]:
del df['Unnamed: 5']
df.to_csv(file, index=False)

fix file '122/763/HOLE_B/Nannofossils.csv'

In [96]:
file = fix_base + '122/763/HOLE_B/Nannofossils.csv'

df = pd.read_csv(file, dtype=str)
log_df(df)


(365, 134)


Unnamed: 0,Data,Age From (oldest),Age To (youngest),Zone From (bottom),Zone To (top),Leg,Site,H,Cor,T,...,Broinsonia parca parca,Broinsonia parca constricta,Markalius inversus,Quadrum gothicum,Corollithion exiguum,Microrhabdulus elongatus,Zygodiscus spiralis,Manivitella granulata,Quadrum trifidum,Comment
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,SR,Campanian,,CC22,,122.0,763.0,B,8.0,X,...,R,R,,R,,,,,,from ODP LEG 122 SR Chapter 32 Table 3 back po...
4,,,,,,,,,,,...,,,,,,,,,,


In [97]:
tmp_df = df[df['Fossil Group                                 '] == 'A              '].copy()
log_df(tmp_df)


(2, 134)


Unnamed: 0,Data,Age From (oldest),Age To (youngest),Zone From (bottom),Zone To (top),Leg,Site,H,Cor,T,...,Broinsonia parca parca,Broinsonia parca constricta,Markalius inversus,Quadrum gothicum,Corollithion exiguum,Microrhabdulus elongatus,Zygodiscus spiralis,Manivitella granulata,Quadrum trifidum,Comment
31,SR,Campanian,,CC18,CC20Perch-Nielsen 122,763,B,11,X,3,...,F,,,,,,,,from ODP LEG 122 SR Chapter 32 Table 3 back po...,
33,SR,Campanian,,CC18,CC20Perch-Nielsen 122,763,B,11,X,5,...,F,,,,,,,,from ODP LEG 122 SR Chapter 32 Table 3 back po...,


In [103]:
replace_cols = list(tmp_df.columns)[5:]

replace_cols[0:5]

['Leg', 'Site', 'H', 'Cor', 'T']

In [105]:
for index, row in tmp_df.iterrows():
    df.at[index, 'Zone To  (top)'] = 'CC20Perch-Nielsen'
    df.at[index, 'Leg'] = '122'

    for index2, col in enumerate(replace_cols):
        if index2 + 1 == len(replace_cols):
            continue
        # print(replace_cols[index2 + 1], row[col])
            
        df.at[index, replace_cols[index2+1]] = row[col]




In [106]:
df.to_csv(file, index=False)

## copy files to folder for PIs to review

In [61]:
def copy_file(mypath, current_directory, output_directory):
    output_path = str(mypath).replace(current_directory, output_directory)
    directories = re.sub('/[A-Za-z0-9\-_ ]+\.csv$', '', output_path)
    if not os.path.isdir(directories):
        os.makedirs(directories)  
        print("lil")
    shutil.copy(mypath, output_path)

for file in results['unnamed_column']:
    if file.match('raw_data'):
        file = file.replace('raw_data', 'cleaned_data')
        copy_file(file, 'raw_data', 'tmp/NOAA_review/unnamed_column/cleaned_data')
    
for file in results['unnamed_column']:
    if file.match('raw_data'):
        copy_file(file, 'raw_data', 'tmp/NOAA_review/unnamed_column/raw_data')
    
for file in results['space_delim']:
    if file.match('raw_data'):
        copy_file(file, 'raw_data', 'tmp/NOAA_review/space_delimited/raw_data')
    
for file in results['space_delim']:
    if file.match('raw_data'):
        file = file.replace('raw_data', 'cleaned_data')
        copy_file(file, 'cleaned_data', 'tmp/174-1071B/space_delimited/cleaned_data')

## column names

Get all the column names.

### age models

In [29]:
age_columns = unique_columns_for_paths(age_csv_paths)
age_columns

{'    Age (Ma)',
 'Age Model Type           ',
 'Control Point Comment',
 'Depth (mbsf)',
 'H',
 'Leg',
 'Site',
 'Unnamed: 6'}

In [30]:
column_counts_for_paths(age_csv_paths)

{7, 8}

print out files that have too many columns

In [31]:
for path in age_csv_paths:
    df = pd.read_csv(path, nrows=0)
    if len(df.columns) == 8:
        print(path)

../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/age_models/150/906/HOLE_A/Age_Model_Initial_Report.csv
../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/age_models/154/925/HOLE_A/Age_Model_Initial_Report.csv


### taxa

In [32]:
taxa_columns = unique_columns_for_paths(taxa_csv_paths)
len(taxa_columns)

12979

## files grouped by expedition and file type

In [33]:
contents = {}
index = filename_index(path)

for path in csv_paths:
    parts = Path(path).parts
    exp = parts[5]
    filename = parts[index]
        
    if exp not in contents:
        contents[exp] = {'taxa': set(),'age_model': set()}
        
    if filename.startswith('Age_'):
        contents[exp]['age_model'].add(filename)
    else:
        contents[exp]['taxa'].add(filename)
    

In [34]:
file_list = []

for exp in contents.items():
    file_data = {}
    file_data['expedition'] = exp[0]
    file_data['taxa'] = format_filepaths_set(exp[1], 'taxa')
    file_data['age_model'] = format_filepaths_set(exp[1], 'age_model')

    file_list.append(file_data)

In [35]:

df = pd.DataFrame(file_list)
df.head()

Unnamed: 0,expedition,taxa,age_model
0,JanusIODP_paleo_agemodel,"Dinoflagellates_Acritarch_Prasinophytes.csv,Di...","Age_Model_Initial_Report.csv,Age_Model_Shipboa..."


In [67]:
path = os.path.join('tmp', 'noaa_janus_iodp_grouped_files.csv')
df.to_csv(path, index=False)

## Miscellaneous.csv

create github link for each Miscellaneous.csv.

In [71]:
for path in csv_paths:
    if 'Miscellaneous.csv' == path.name:
        link = 'https://github.com/eODP/data-processing/tree/master/notebooks/' + str(path)
        print(link)

https://github.com/eODP/data-processing/tree/master/notebooks/../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/644/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/104/642/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/120/747/HOLE_A/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/../../output/cleaned_data/NOAA/JanusIODP_paleo_agemodel/paleontology/range_tables/120/749/HOLE_B/Miscellaneous.csv
https://github.com/eODP/data-processing/tree/master/notebooks/../../output/cleaned_data/NOAA/JanusIODP_paleo_agemod

In [72]:
for path in taxa_csv_paths:
    df = pd.read_csv(path, nrows=0)
    if '.1' in df.columns:
        print(path)