In [1]:
import csv
import os
import regex as re # https://pypi.org/project/regex/
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from ipywidgets import IntProgress
from IPython.display import display

# Notebook-wide display settings.
# Dark plot theme for every matplotlib figure created below.
plt.style.use('dark_background')
# "none": IPython does not auto-display the value of any expression in a cell,
# so output must be requested explicitly (e.g. with display()).
InteractiveShell.ast_node_interactivity = 'none'
# Allow up to 500 rows when a DataFrame is rendered.
pd.options.display.max_rows = 500


In [2]:
# helper functions
# progress bar
def p_bar(mx, desc):
    """Create an integer progress-bar widget and publish it as the global ``p``.

    Args:
        mx: Upper bound of the bar (passed to the widget as ``max``).
        desc: Label shown next to the bar (passed as ``description``).

    Returns:
        The newly created ``IntProgress`` widget. The same object is also
        bound to the module-level name ``p``.
    """
    global p
    bar = IntProgress(description=desc, min=0, max=mx)
    p = bar  # keep the module-level handle that other cells read
    return bar


In [3]:
# paths
# production:
#raw_text_path = 'data/raw_text/'
#save_data_path = 'data/'
#clean_text_path = 'data/clean_txt/'

# testing:
raw_text_path = 'test_data/sample/'
save_data_path = 'test_data/'
clean_text_path = 'test_data/clean_sample/'
english_dict_path = 'data/nltk_english_words'  # path to dictionary of all English words

# load English dictionary
# The file is split on newlines; blank entries are skipped so that the empty
# string (produced by a trailing newline or an empty line) is never treated
# as a valid English word.
with open(english_dict_path, 'r') as file:
    english_words = {word for word in file.read().split('\n') if word}

# load document statistics
# One dict per CSV row, keyed by the header line. newline='' lets the csv
# module do its own newline handling, as its documentation requires.
with open(f'{save_data_path}text_file_statistics.csv', 'r', newline='') as file:
    doc_data = list(csv.DictReader(file))
    
# patterns
# Matches every character that is NOT on the whitelist below, so that
# `.sub('', text)` keeps only whitelisted characters. Note that the whitelist
# is narrower than ASCII: characters such as '/', '<', '>', '@' and tab are
# removed as well. Written as a raw string so that \?, \[ and \] are not
# invalid string escapes; the regex itself is unchanged.
ascii_chars = re.compile(r"""[^!"#$%&'()*+,-.0-9:; =\?A-Z\[\]_`a-z\n]""")
# Matches every character that is not a letter, digit, space, apostrophe or
# hyphen. The hyphen is escaped on purpose: written as ` -'` inside the class
# it forms a range from space to apostrophe, which accepts !"#$%& and rejects
# the hyphen itself.
word_chars = re.compile(r"[^a-zA-Z0-9 '\-]")

# line detectors
# NOTE(review): the `([...]+ ?)+` shape nests two quantifiers around an
# optional separator and may backtrack heavily on long lines that do not
# match; confirm on real input before using these on large files.
# A line consisting only of word characters, double quotes and . ( ) , ; -
# with optional single spaces after each run, terminated by a newline.
gen_line = r'^([\"\w\.\(\),;-]+ ?)+\n'
# Literal heading text.
jt_title = r'JOINT RESOLUTION'
# Same shape as gen_line, but the line must end with ':' before the newline.
jt_para_ter = r'^([\"\w\.\(\),;-]+ ?)+:\n'
# Same shape without '"' inside the runs; the line must end with '"' before
# the newline.
amend_ter = r'^([\w\.\(\),;-]+ ?)+\"\n'


# substitutions
# Each entry is a (pattern, replacement, flags) triple.
# NOTE(review): every flags value here is 0. All four patterns start with
# `^`, which without MULTILINE anchors only at the very start of the text;
# confirm that whatever applies these enables multi-line matching.
general = [
    # Text ending in '-' at a line break, followed by a line starting with a
    # lowercase letter: join the two, dropping the hyphen and the newline.
    (r'^(.+)-\n([a-z]+.+\n)', r'\1\2', 0),
    # A 1-2 digit number plus a space in front of text starting with a
    # lowercase letter: drop the number (presumably a margin line number;
    # verify against the raw files).
    (r'^([0-9]{1,2} )([a-z]+.+\n)', r'\2', 0),
    # A 1-2 digit number alone on a line: remove it with its newline.
    (r'^[0-9]{1,2}\n', '', 0),
    # An empty line: remove it.
    (r'^\n', '', 0),
]


In [4]:
# Clean every sufficiently English document and write the result to
# `clean_text_path` under the same file name.

# Documents whose `english_words` statistic is not above this value are
# treated as rubbish text and skipped.
min_english_words = 50
# Refresh the progress bar every this many documents (and once at the end).
progress_step = 100

total_docs = len(doc_data)
progress = p_bar(total_docs, 'Processing')
display(progress)  # show the progress bar

# Make sure the output directory exists before the first write.
os.makedirs(clean_text_path, exist_ok=True)

for i, doc in enumerate(doc_data):

    if float(doc['english_words']) > min_english_words:  # threshold for non-rubbish text
        filename = doc['filename']
        with open(f'{raw_text_path}{filename}', 'r') as file:
            text = file.read()

        # general cleanup
        text = ascii_chars.sub('', text)  # remove characters outside the whitelist

        for pattern, fix, flags in general:
            # `flags` must be passed by keyword: the fourth positional
            # parameter of re.sub is `count`, not `flags`. MULTILINE is added
            # because every pattern is anchored with `^` and is meant to
            # apply to each line, not only to the start of the document.
            text = re.sub(pattern, fix, text, flags=flags | re.MULTILINE)

        # save results
        with open(f'{clean_text_path}{filename}', 'w') as file:
            file.write(text)

    # Update the bar with the real number of documents handled so far; the
    # final update guarantees it ends exactly at `total_docs`.
    done = i + 1
    if done % progress_step == 0 or done == total_docs:
        progress.value = done


IntProgress(value=0, description='Processing', max=24)