In [1]:
import csv
import os
import regex as re # https://github.com/mrabarnett/mrab-regex
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from ipywidgets import IntProgress
from IPython.display import display

# Notebook-wide display settings.
# Draw matplotlib figures with the built-in dark theme.
plt.style.use(['dark_background'])
# "none": a bare expression at the end of a cell is not echoed automatically,
# so output only appears through explicit print()/display() calls.
InteractiveShell.ast_node_interactivity = "none"
# Let pandas render up to 500 rows before it truncates a frame.
pd.options.display.max_rows = 500


In [2]:
# helper functions
# progress bar
def p_bar(mx, desc):
    """Create an integer progress-bar widget and publish it as the global `p`.

    Parameters
    ----------
    mx : upper bound of the bar (passed to IntProgress as `max`).
    desc : label shown next to the bar (passed as `description`).

    Returns
    -------
    The new IntProgress widget; the same object is also bound to the
    module-level name `p`.
    """
    global p
    bar = IntProgress(description=desc, max=mx, min=0)
    p = bar  # keep the global in sync for cells that update `p.value`
    return bar


def find_amendment_end(text, amendment_start):
    """Return the offset just past the last sentence end that follows `amendment_start`.

    Two module-level detectors are consulted: `last_amendment_char` (a period,
    an optional double quote, then a newline) and `last_amendment_char_eof`
    (the same but anchored with `$`). The larger of their end offsets wins.

    Parameters
    ----------
    text : the document text being searched.
    amendment_start : offset where the amendment body begins. A detected end
        at or before this offset is ignored, because slicing
        `text[amendment_start:end]` with it would silently yield an empty
        string.

    Returns
    -------
    int end offset, or False when no usable sentence end exists (callers test
    the result for truthiness).
    """
    matches = (last_amendment_char.search(text), last_amendment_char_eof.search(text))

    # keep only real matches that end after the amendment actually starts
    candidate_ends = [
        match.end()
        for match in matches
        if match and match.end() > amendment_start
    ]

    return max(candidate_ends) if candidate_ends else False


def extract_amendment(text, jr_string):
    """Cut the amendment body out of a pre-processed document.

    Parameters
    ----------
    text : document text.
    jr_string : truthy to locate the body with `pre_para_jr`, falsy to use
        `pre_para_const`.

    Returns
    -------
    The text after the matched introductory paragraph, up to the offset given
    by `find_amendment_end` (or to the end of `text` when that returns a falsy
    value). The literal string 'EXTRACTION FAILED' is returned when the
    introductory paragraph is not found.
    """
    detector = pre_para_jr if jr_string else pre_para_const
    header = detector.search(text)

    if not header:
        return 'EXTRACTION FAILED'

    begin = header.end()
    finish = find_amendment_end(text, begin)

    if finish:
        return text[begin:finish]
    return text[begin:]


In [29]:
# paths
# production:
#raw_text_path = 'data/raw_text/'
#save_data_path = 'data/'
#clean_text_path = 'data/clean_txt/'

# testing:
raw_text_path = 'test_data/sample/'
save_data_path = 'test_data/'
clean_text_path = 'test_data/clean_sample/'
english_dict_path = 'data/nltk_english_words'  # path to dictionary of all English words

# load English dictionary (split on newlines); empty entries are dropped so a
# trailing newline in the file does not register '' as an English word
with open(english_dict_path, 'r') as file:
    english_words = {word for word in file.read().split('\n') if word}

# load document statistics; newline='' is what the csv module asks for so that
# newlines embedded in quoted fields are parsed correctly
with open(f'{save_data_path}text_file_statistics.csv', 'r', newline='') as file:
    doc_data = list(csv.DictReader(file))

# NOTE: all patterns below are raw strings. Regex escapes such as \d, \. or \(
# are not valid Python string escapes and trigger a SyntaxWarning on recent
# Python versions when written in ordinary string literals.

# filters
# everything that is NOT in the allowed character set (used to strip unwanted characters)
ascii_chars = re.compile(r"""[^!"#$%&'()*+,-.0-9:; =\?A-Z\[\]_`a-z\t\n]""")
# everything that is NOT a letter, digit, space, apostrophe or hyphen; the hyphen
# sits last in the class so it is a literal (" -'" would be the range space..apostrophe)
word_chars = re.compile(r"[^a-zA-Z0-9 '-]")

# detectors
# preceding paragraph - JOINT RESOLUTION
# NOTE(review): in the `regex` module a fuzzy constraint binds to the item right
# before it, so {e<=3} here appears to cover only the final "N"; if the whole
# phrase should tolerate errors it needs (?:JOINT RESOLUTION){e<=3} - confirm.
pre_para_jr = re.compile(r'JOINT RESOLUTION{e<=3}[\w ,\n\.\d\t\(\)-]+:\n')
# preceding paragraph - Constitution: (also matches the word broken by "-\n")
pre_para_const = re.compile(r'[constiu]+(-\n)?[onstiu]+ ?:\n', re.IGNORECASE|re.BESTMATCH)
# sentence end: period, optional double quote, newline (searched with re.REVERSE)
last_amendment_char = re.compile(r'\."?\n', re.REVERSE)
# sentence end anchored with $ instead of a consumed newline
last_amendment_char_eof = re.compile(r'\."?$', re.REVERSE|re.MULTILINE)
amendment_dividers = re.compile(r'.{,2}article|sec.{,2}\n', re.IGNORECASE)  # text separating sections

# cleaning
# each entry maps a task name to (compiled pattern, replacement string)
replaces = {
    'empty_lines': (
        re.compile(r'^\n', re.MULTILINE),
        ''
    ),
    # lines holding nothing but a 1-2 character marker (digits or the listed
    # letters - presumably OCR misreadings of line numbers; verify)
    'empty_numbered_lines': (
        re.compile(r'^[.*]?[gIfloOzZBJ\d]{1,2}[ .]{0,2}$', re.MULTILINE),
        ''
    ),
    # the same marker at the start of a line that continues with text.
    # NOTE(review): "\)-:" inside the class is a range from ")" to ":" - it
    # happens to include "-", but also "*", "+" and "/"; confirm that is intended.
    'numbered_lines': (
        re.compile(r"""^[.*]?[gIfloOzZBJ\d]{1,2}\.?(?: |\t)(?=[\w ,\.\d\t\(\)-:;'"\”]+$)""", re.MULTILINE),
        ''
    ),
    # two apostrophes (plus an optional period) and typographic quotes -> "
    'quotation_marks': (
        re.compile(r"""''\.?|«|»|“|”|„|‟|❝|❞|〝|〞|〟|＂"""),
        '"'
    ),
    'tabs_as_space': (
        re.compile(r'\t'),
        ' '
    ),
    'whitespace_at_linestart': (
        re.compile(r'^\s+', re.MULTILINE),
        ''
    ),
    'whitespace_at_lineend': (
        re.compile(r'\s+$', re.MULTILINE),
        ''
    ),
    # re-join words split across lines with a hyphen
    'hyphenated_words': (
        re.compile(r'-\n'),
        ''
    ),
    'multiple_spaces': (
        re.compile(r'[ ]+'),
        ' '
    ),
    # newlines that do not follow a period become spaces
    'carriage_returns': (
        re.compile(r'(?<!\.)\n'),
        ' '
    ),
    'spaces_before_punctuation': (
        re.compile(r'\s+(?=[.;,:])'),
        ''
    ),
    # drops further copies of a punctuation mark that directly follow the first one
    'repeated_punctuation': (
        re.compile(r'(?<=(?P<punct>,|\.|;|:))\g<punct>+'),
        ''
    ),
    # greedy: keeps everything up to the last double quote and drops that quote
    'orphaned_quotation_marks': (
        re.compile(r'(?<!")((?:.|\n)+)"'),
        r'\1'
    )
}


# sets
# 'empty_lines' is listed twice: the second pass runs after the line-number removal
pre_process = ['empty_lines', 'empty_numbered_lines', 'numbered_lines', 'quotation_marks', 'empty_lines']
clean_amendment = ['tabs_as_space', 'multiple_spaces', 'whitespace_at_linestart', 'whitespace_at_lineend',
                   'hyphenated_words', 'carriage_returns', 'spaces_before_punctuation', 'repeated_punctuation',
                   'orphaned_quotation_marks']


In [30]:
def _apply_replacements(text, tasks):
    """Apply the named `replaces` entries to `text` in the given order and return the result."""
    for task in tasks:
        pattern, fix = replaces[task]
        text = pattern.sub(fix, text)
    return text


total_docs = len(doc_data)
progress = p_bar(total_docs, 'Processing')
display(progress)  # show the progress bar

for i, doc in enumerate(doc_data):

    if float(doc['english_words']) > 50:  # threshold for non-rubbish text
        filename = doc['filename']

        with open(f'{raw_text_path}{filename}', 'r') as file:
            text = file.read()

        # general cleanup
        # NOTE(review): this filter runs before the 'quotation_marks' task, so the
        # typographic quotes that task lists are deleted here rather than
        # normalised to '"' - confirm the order is intended.
        text = ascii_chars.sub('', text)  # remove non-ascii characters
        text = _apply_replacements(text, pre_process)

        # extract amendment text ('jr_string' is stored as "0"/"1" in the statistics file)
        clean_text = extract_amendment(text, bool(int(doc['jr_string'])))

        # clean amendment text
        clean_text = _apply_replacements(clean_text, clean_amendment)

        # save results
        with open(f'{clean_text_path}{filename}', 'w') as file:
            file.write(clean_text)

    # update the bar every 100 documents and once more at the very end, so it
    # shows the number of documents actually handled
    done = i + 1
    if done % 100 == 0 or done == total_docs:
        progress.value = done


IntProgress(value=0, description='Processing', max=24)

In [21]:
# Small hand-made sample (hyphenated line break, stray punctuation, a
# "Constitution :" header) used to try out the patterns in the cells below.
text = """
That for the purpose Of choosing representa-
That, for the purpose of choosing Representatives in the.
to all intents and purposes,:;. as part of the said Constitution :
That. 1‘or the purpose of choosing Representatives in the"""


In [22]:
# Each `replaces` entry is a (compiled pattern, replacement) tuple; take the pattern.
pat = replaces['repeated_punctuation'][0]

# Prints the first match object, or None when nothing matches.
# NOTE(review): the recorded output is None - the sample only contains a run of
# *different* marks (",:;."), which this pattern presumably does not count as a
# repeat of the same mark; confirm that is the intended behaviour.
print(pat.search(text))


None


In [None]:
text = ascii_chars.sub('', text)

# run the same pre-processing tasks as the main loop
# (the names `general` and `pre_para` used here before are not defined anywhere in this notebook)
for task in pre_process:
    pattern, fix = replaces[task]
    text = pattern.sub(fix, text)

print(text)
print(pre_para_const.search(text))  # the sample has a "Constitution :" header
print(last_amendment_char.search(text))
print(last_amendment_char_eof.search(text))


In [None]:
# extract_amendment requires the jr_string flag; False selects the
# "Constitution:" detector, which is the header present in the sample text
clean_text = extract_amendment(text, False)
print(clean_text)


In [None]:
# locate the introductory paragraph with the "Constitution:" detector
# (`pre_para` is not defined in this notebook)
result = pre_para_const.search(text)

if result is not None:
    print(result)
    print(result.start())
    print(result.end())

In [None]:
# show the last sentence-end match and the offset just past it
last_sentence_end = last_amendment_char.search(text)
print(last_sentence_end)
print(last_sentence_end.end())


In [None]:
# Print a fixed slice of `text`.
# NOTE(review): 757 and 1470 are hard-coded offsets, presumably copied from match
# positions seen in an earlier session; the short sample string defined earlier
# in this notebook is well under 757 characters, so with it this prints an empty line.
print(text[757:1470])
