# QA ODP files

expeditions 101 - 210; range_tables for paleo fossils, age_models; https://www.ngdc.noaa.gov/mgg/geology/data/joides_resolution/odp_all_paleontology.tar.gz

In [1]:
import sys
import os 

sys.path.append('../scripts/')
import glob


import pandas as pd
import chardet
import space_delim as sd
import normalize_noaa_files as nf

from normalize_noaa_files import (
    unique_filenames_for_paths,
    unique_columns_for_paths,
    qa_files_for_paths,
)

In [2]:
# Root data folder and the ODP paleontology range-table folder beneath it.
base_directory = 'cleaned_data'
base_data_path = os.path.join(
    base_directory,
    'odp_all_paleontology',
    'range_tables',
)

In [3]:
# Recursively collect every .txt file below the range-table folder.
txt_pattern = os.path.join(base_data_path, '**', '*.txt')
csv_paths = glob.glob(txt_pattern, recursive=True)
print('files', len(csv_paths))

files 2045


## unique file names

In [4]:
# Display the distinct file names found among the collected paths, to spot
# naming variants (spaces vs. underscores, trailing spaces, ...).
unique_filenames_for_paths(csv_paths)

{'Benthic Foraminifers.txt',
 'Benthic_Foraminifers.txt',
 'Bolboforms.txt',
 'Diatoms.txt',
 'Dinoflagellates_Acritarch_Prasinophytes.txt',
 'Dinoflagellates_Acritarchs_Prasinophytes.txt',
 'Macrofossils.txt',
 'Miscellaneous.txt',
 'Nannofossils .txt',
 'Nannofossils.txt',
 'Ostracodes.txt',
 'Planktonic Foraminifers.txt',
 'Planktonic_Foraminifers .txt',
 'Planktonic_Foraminifers.txt',
 'Pollen_Spores.txt',
 'Pteropods.txt',
 'Radiolarians.txt',
 'Silicoflagellates_Ebridians_Actiniscidians.txt',
 'Sponge_Spicules.txt',
 'Trace_Fossils.txt'}

## column names

In [5]:
# Collect the column names reported by unique_columns_for_paths for all
# files (read as tab-separated), then display how many there are.
taxa_columns = unique_columns_for_paths(csv_paths, sep="\t")
len(taxa_columns)

12975

## QA ODP paleo files

count the number of good files vs files that need to be fixed

In [6]:
# Column names handed to the QA helper as the fields each file should have.
expected_fields = {
    'Data', 'Age From (oldest)', 'Age To (youngest)',
    'Zone From (bottom)', 'Zone To  (top)',
    'Leg', 'Site', 'H', 'Cor', 'T', 'Sc', 'Top(cm)', 'Depth (mbsf)',
    'Scientist',
    # 'Fossil Group',
    'Comment',
    'Group Abundance', 'Group Preservation',
}

# Run the QA pass over every file, reading them as tab-separated. The result
# is indexed below by 'bad_tabs', 'bad_encoding', 'space_delim',
# 'missing_fields' and 'good_files'.
results = nf.qa_files_for_paths(
    csv_paths,
    expected_fields,
    sep='\t',
)

In [7]:
# Report how many files landed in each QA category.
for category in (
    'bad_tabs',
    'bad_encoding',
    'space_delim',
    'missing_fields',
    'good_files',
):
    print(category, len(results[category]))

bad_tabs 0
bad_encoding 0
space_delim 0
missing_fields 0
good_files 2045


In [8]:
# Display the entries the QA pass filed under 'missing_fields'.
results['missing_fields']

[]

## process latin_encoding
handle files with encoding that isn't utf-8

In [9]:
# One line per file that the QA pass flagged under 'bad_encoding'.
for bad_encoding_path in results['bad_encoding']:
    print(bad_encoding_path)

convert file to utf-8 encoding
https://codereview.stackexchange.com/a/202985

In [10]:
# Rewrite each file flagged under 'bad_encoding' in place as UTF-8, using
# chardet to guess the encoding it is currently stored in.
for file in results['bad_encoding']:
    with open(file, 'rb') as f:
        content_bytes = f.read()

    # chardet reports encoding=None when it cannot make a guess; decoding
    # with None would raise TypeError, so leave such files untouched and
    # flag them instead.
    encoding = chardet.detect(content_bytes)['encoding']
    if encoding is None:
        print(f'could not detect encoding, skipped: {file}')
        continue

    content_text = content_bytes.decode(encoding)

    # newline='' writes the decoded text as-is, so the file's existing line
    # endings are not translated on platforms where os.linesep is not '\n'.
    with open(file, 'w', encoding='utf-8', newline='') as f:
        f.write(content_text)
    print(file)

## process bad_tabs
handle files where the headers and rows have different numbers of columns

In [11]:
# One line per file that the QA pass flagged under 'bad_tabs'.
for bad_tabs_path in results['bad_tabs']:
    print(bad_tabs_path)

## process space_delim

handle files that use a variable number of spaces to separate the columns

In [12]:
# Print each file flagged under 'space_delim' as a quoted string followed by
# a comma (presumably so the output can be pasted into a list literal).
# The unused os.path.getsize() lookup was dropped: its result was never read.
for file in results['space_delim']:
    print(f'"{file}",')

https://stackoverflow.com/questions/19759423/convert-a-space-delimited-file-to-comma-separated-values-file-in-python


## process missing_fields

handle files that don't have the expected fields

In [13]:
# For each file flagged under 'missing_fields', show which expected fields
# are absent from its header.
for file in results['missing_fields']:
    print(file)
    # The QA pass read these files with sep='\t'; use the same separator here,
    # otherwise the header collapses into a single column and every expected
    # field is reported as missing. nrows=0 reads the header only.
    header = pd.read_csv(file, sep='\t', nrows=0)
    print(expected_fields - set(header.columns))

## check space_delim files were correctly fixed

After converting the space-delimited files, check them for errors. Errors
include values that contain spaces and columns that have no values.

In [14]:
# Filled by the check loop with the paths of files that contain a value
# with a space in it.
files_with_spaces = []

# All files listed in the three janus_iodp groups of the space_delim module.
fixed_space_delim_files = (
    sd.space_delim_files_janus_iodp_1
    + sd.space_delim_files_janus_iodp_2
    + sd.space_delim_files_janus_iodp_3
)

# Columns left out of the per-column checks; every other column is checked.
skip_fields = {
    'Data',
    'Age From (oldest)',
    'Age To (youngest)',
    'Zone From (bottom)',
    'Zone To  (top)',
    'Leg',
    'Site',
    'H',
    'Cor',
    'T',
    'Sc',
    'Top(cm)',
    'Depth (mbsf)',
    'Scientist',
    'Comment',
    'Fossil Group',
}

def valid_values(x):
    """Return True when ``x`` is a string that contains a space character.

    Note that, despite the name, a True result marks a value the checks
    below report as a problem. Non-string values (e.g. NaN) yield False.
    """
    if not isinstance(x, str):
        return False
    return ' ' in x

# Check every converted file: report columns (other than skip_fields) that
# hold a value containing a space, and columns that hold no values at all.
for file in fixed_space_delim_files:
    filename = os.path.join(base_directory, file)

    # NOTE(review): this read uses pandas' default comma separator, while the
    # QA pass above uses sep='\t' — confirm which delimiter these files use.
    # The former `df.dropna(axis="columns", how="all")` call was removed: its
    # result was discarded, so it never changed `df`.
    df = pd.read_csv(filename, dtype=str)

    # A separate local name keeps the notebook-level `taxa_columns` intact;
    # sorting makes the report order the same on every run.
    file_taxa_columns = sorted(set(df.columns) - skip_fields)
    for col in file_taxa_columns:
        column = df[col]

        # check if there are values with spaces
        if column.apply(valid_values).any():
            # record each file once, even when several columns have spaces
            if filename not in files_with_spaces:
                files_with_spaces.append(filename)
            print(f'{col}: has space')
            print(filename)
            print('---')

        # check if column is blank
        if column.isnull().all():
            print(f'{col}: has no values')
            print(filename)
            print('---')

Strip the beginning and ending spaces for every column in a dataframe

In [26]:
# Strip leading/trailing whitespace from every column of each flagged file
# and write the file back in place. Spaces inside a value are not removed.
#
# files_with_spaces already holds paths joined with base_directory, so they
# are used as-is (joining again would double the prefix). dict.fromkeys()
# drops repeated paths while keeping their order.
for filename in dict.fromkeys(files_with_spaces):
    df = pd.read_csv(filename, dtype=str, sep="\t")

    # Rewrite the file only if at least one value contains a space.
    has_spaces = any(df[col].apply(valid_values).any() for col in df.columns)

    if has_spaces:
        # A single pass strips all columns; columns are read as str.
        df = df.apply(lambda column: column.str.strip())
        df.to_csv(filename, index=False, sep="\t")