# QA ODP files

expeditions 101 - 210; range_tables for paleo fossils, age_models; https://www.ngdc.noaa.gov/mgg/geology/data/joides_resolution/odp_all_paleontology.tar.gz

In [1]:
import sys
import os 

sys.path.append('../scripts/')
import glob


import pandas as pd
import chardet

## QA ODP paleo files

count the number of good files vs files that need to be fixed

In [2]:
# Root directory of the ODP range-table text files.
data_path = 'cleaned_data/odp_all_paleontology/range_tables'

# Every .txt file below data_path, at any directory depth.
raw_csvs = glob.glob(
    pathname=f"{data_path}/**/*.txt",
    recursive=True,
)

In [3]:
# Number of .txt files matched by the recursive glob under data_path.
len(raw_csvs)

2045

In [4]:
# Column headers that every well-formed range table is expected to contain.
fields = {
    'Age From (oldest)',
    'Age To (youngest)',
    'Zone From (bottom)',
    'Zone To  (top)',
    'Leg',
    'Site',
    'H',
    'Cor',
    'T',
    'Sc',
    'Top(cm)',
    'Depth (mbsf)',
    'Group Abundance',
    'Group Preservation',
}

# One bucket per outcome; each file in raw_csvs ends up in exactly one.
good_files = []      # parsed as tab-delimited and has all expected fields
bad_encoding = []    # could not be decoded with the default (utf-8) encoding
bad_tabs = []        # parser error: rows and header disagree on column count
space_delim = []     # parsed into a single column, i.e. not tab-delimited
missing_fields = []  # tab-delimited but lacks some of the expected fields

for file in raw_csvs:
    try:
        df = pd.read_csv(file, sep="\t")
    except UnicodeDecodeError:
        bad_encoding.append(file)
    except pd.errors.ParserError:
        bad_tabs.append(file)
    else:
        # The file parsed; decide which bucket it belongs to from its columns.
        if fields.issubset(df.columns):
            good_files.append(file)
        elif len(df.columns) == 1:
            space_delim.append(file)
        else:
            missing_fields.append(file)

In [5]:
# Report how many files landed in each bucket, one "name count" line per bucket.
for bucket_name, bucket_files in (
    ('bad_tabs', bad_tabs),
    ('bad_encoding', bad_encoding),
    ('space_delim', space_delim),
    ('missing_fields', missing_fields),
    ('good_files', good_files),
):
    print(bucket_name, len(bucket_files))

bad_tabs 9
bad_encoding 6
space_delim 61
missing_fields 0
good_files 1969


## process latin_encoding
Handle files whose encoding isn't UTF-8.

In [6]:
# List the paths of the files that raised UnicodeDecodeError when read.
for file in bad_encoding:
    print(file)

cleaned_data/odp_all_paleontology/range_tables/114/700/HOLE_B/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/702/HOLE_B/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/702/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/699/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/698/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/149/898/HOLE_A/Dinoflagellates_Acritarch_Prasinophytes.txt


convert file to utf-8 encoding
https://codereview.stackexchange.com/a/202985

In [7]:
# Rewrite each file in bad_encoding as UTF-8, in place.
for file in bad_encoding:
    # Read the raw bytes so the original encoding can be detected.
    with open(file, 'rb') as f:
        content_bytes = f.read()

    detected = chardet.detect(content_bytes)
    encoding = detected['encoding']
    if encoding is None:
        # chardet could not identify the encoding; bytes.decode(None) would
        # raise TypeError and abort the loop, so leave this file untouched.
        print(f'skipped (encoding not detected): {file}')
        continue
    content_text = content_bytes.decode(encoding)

    # newline='' disables newline translation on write, so the file's
    # existing line endings are preserved on every platform.
    with open(file, 'w', encoding='utf-8', newline='') as f:
        f.write(content_text)
    print(file)

cleaned_data/odp_all_paleontology/range_tables/114/700/HOLE_B/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/702/HOLE_B/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/702/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/699/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/114/698/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/149/898/HOLE_A/Dinoflagellates_Acritarch_Prasinophytes.txt


## process bad_tabs
Handle files where the headers and rows have a different number of columns.

In [8]:
# List the paths of the files that raised a pandas ParserError when read.
for file in bad_tabs:
    print(file)

cleaned_data/odp_all_paleontology/range_tables/172/1056/HOLE_B/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/172/1060/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/172/1061/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/172/1059/HOLE_A/Nannofossils.txt
cleaned_data/odp_all_paleontology/range_tables/172/1057/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/172/1063/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/172/1062/HOLE_E/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/172/1062/HOLE_A/Planktonic_Foraminifers.txt
cleaned_data/odp_all_paleontology/range_tables/171/1050/HOLE_B/Planktonic_Foraminifers.txt


## process space_delim

Handle files that use a variable number of spaces to separate the columns.

In [9]:
# Show each single-column file together with its size in bytes.
for file in space_delim:
    file_size = os.stat(file).st_size
    print(f'{file}: {file_size}')

cleaned_data/odp_all_paleontology/range_tables/174/1071/HOLE_B/Planktonic_Foraminifers.txt: 919
cleaned_data/odp_all_paleontology/range_tables/174/1071/HOLE_B/Benthic_Foraminifers.txt: 3229
cleaned_data/odp_all_paleontology/range_tables/180/1114/HOLE_A/Planktonic_Foraminifers.txt: 23208
cleaned_data/odp_all_paleontology/range_tables/180/1114/HOLE_A/Nannofossils.txt: 30183
cleaned_data/odp_all_paleontology/range_tables/180/1112/HOLE_A/Planktonic_Foraminifers.txt: 2114
cleaned_data/odp_all_paleontology/range_tables/180/1112/HOLE_A/Nannofossils.txt: 1679
cleaned_data/odp_all_paleontology/range_tables/180/1115/HOLE_B/Planktonic_Foraminifers.txt: 38292
cleaned_data/odp_all_paleontology/range_tables/180/1115/HOLE_B/Nannofossils.txt: 27813
cleaned_data/odp_all_paleontology/range_tables/180/1115/HOLE_C/Planktonic_Foraminifers.txt: 61112
cleaned_data/odp_all_paleontology/range_tables/180/1115/HOLE_C/Nannofossils.txt: 46294
cleaned_data/odp_all_paleontology/range_tables/180/1115/HOLE_A/Planktoni

https://stackoverflow.com/questions/19759423/convert-a-space-delimited-file-to-comma-separated-values-file-in-python
