# Cleaning Jefferson Letter Documents
Takes the Gutenberg Jefferson documents from which all Gutenberg text has already been removed (the `_cleanish` files) and further removes all letter headers (i.e. the 'TO:' line, the date line, and the salutation at the start of the body, if one exists). As a side benefit, the leading-space check for date lines also catches many (or maybe all) tables, which we don't want either.

# WARNING: This will overwrite _clean and _cuts files, but not _cleanish

In [69]:
import re
from __future__ import print_function

In [70]:
## Set file path to directory with cleanish files.
## Keep the trailing slash: file names are appended to this string directly
## (plain string concatenation) to build the read/write paths.
file_dir_path = '../train_data/thomas_jefferson/'

In [75]:
## Clean each volume: split the "cleanish" text into paragraphs, drop letter
## headers, and write the kept text and the removed text to separate files.
##
## For every volume number in range(2, 8) this reads
##   <file_dir_path>jefferson_writings_of_vol_<N>_gutenberg_cleanish.txt
## and overwrites the matching *_clean.txt (kept text) and *_cuts.txt
## (removed text) files in the same directory.

leading_spaces = ' ' * 15  # leading spaces indicating a paragraph we don't want
salutation_window = 50  # a salutation's '--' must appear within this many leading characters

for volume_num in range(2, 8):
    ## Sets file paths for reading and writing
    file_stem = 'jefferson_writings_of_vol_' + str(volume_num) + '_gutenberg_'
    cleanish_file_name = file_stem + 'cleanish.txt'
    clean_file_name = file_stem + 'clean.txt'
    cuts_file_name = file_stem + 'cuts.txt'

    cleanish_file_path = file_dir_path + cleanish_file_name
    clean_file_path = file_dir_path + clean_file_name
    cuts_file_path = file_dir_path + cuts_file_name

    # Reads cleanish file and breaks into list of paragraphs (blank-line separated)
    with open(cleanish_file_path, 'r') as f:
        doc = f.read()
    paragraphs = doc.split('\n\n')

    ## Testing each paragraph and maintaining new list of keeps and cuts to write to new files
    keeps = []  # paragraphs (or paragraph tails) to keep
    cuts = []  # paragraphs (or salutation heads) to cut
    for par in paragraphs:
        # No lowercase letters at all: treated as an all-caps TO line.
        # Note this also matches empty paragraphs, which therefore land in cuts.
        if par.upper() == par:
            cuts.append(par)
            continue

        # Big block of leading white space indicates a date line (or a table row)
        if par.startswith(leading_spaces):
            cuts.append(par)
            continue

        # Otherwise a body paragraph: check whether it opens with a salutation,
        # recognised by a double dash near the start of the paragraph.
        head = par[:salutation_window]
        double_dash_ind = head.find('--')
        if double_dash_ind == -1:
            # no double dash, so keep everything
            keeps.append(par)
            continue

        # Cut everything up to and including the '--'; keep the rest of the paragraph
        split_at = double_dash_ind + 2
        cuts.append(par[:split_at])
        keeps.append(par[split_at:])

    ## Reconstructing documents with paragraph breaks
    ## (from here on keeps/cuts are single strings, not lists)
    keeps = '\n\n'.join(keeps)
    cuts = '\n\n'.join(cuts)

    ## Writing new files; each document is written in one call rather than
    ## character by character, which is what iterating over the joined string did.
    with open(clean_file_path, 'w') as f:
        f.write(keeps)

    with open(cuts_file_path, 'w') as f:
        f.write(cuts)