## Nicely Formatting Clauses and Contracts

**NOTE: The formatting logic in this notebook is ultimately built into the processing pipeline in the Task 3 deployment.**

In [None]:
import re
import os
from nltk.corpus import words

In [None]:
# Load the NLTK "words" corpus, downloading it on first use if it is not
# installed locally, then keep it as a set for fast membership tests.
try:
    _word_list = words.words()
except LookupError:
    import nltk

    nltk.download("words")
    _word_list = words.words()

english_words = set(_word_list)

In [None]:
def process_text_document(text: str) -> str:
    """Normalize spacing/punctuation of a clause or contract and split it into paragraphs.

    The steps run in this order:

    1. Every "[END]" marker is removed.
    2. Whitespace around ``; : [ ] ( ) “ ”`` is removed, then a single space
       is put back after each comma and semicolon.
    3. Runs of whitespace collapse to one space and the text is stripped.
    4. The text is made to end with ".", "!" or "?".
    5. A blank line is inserted after each ". " unless the period follows a
       digit, a single capital letter, "No", "i.e" or "e.g".
    6. Dashes at the start/end of a line are removed and runs of dashes
       inside the text collapse to a single dash.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text. An empty (or whitespace-only) input yields "".
    """
    # Drop "[END]" markers first and leave a space behind. Doing this after
    # the bracket clean-up below would glue the neighbouring words together,
    # because that step strips the whitespace around "[" and "]".
    text = re.sub(r"\s*\[END\]\s*", " ", text)

    # Remove spaces around specific punctuation and ensure spacing consistency
    text = re.sub(r"\s*([;:\[\]\(\)“”])\s*", r"\1", text)
    # Exactly one space after commas and semicolons (extra spaces collapse below)
    text = re.sub(r"\s*([,;])", r"\1 ", text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip()

    # Ensure the text ends with a proper punctuation mark. Only trailing
    # separators are replaced; a trailing letter, digit, bracket or quote is
    # kept and the period is appended after it.
    terminal_marks = (".", "!", "?")
    if text and not text.endswith(terminal_marks):
        text = text.rstrip(",;:- ")
        if text and not text.endswith(terminal_marks):
            text += "."

    # Add paragraph breaks after each period not following specific exceptions
    text = re.sub(
        r"(?<!\d)(?<!\b[A-Z])(?<!\bNo)(?<!\bi\.e)(?<!\be\.g)\. ", ".\n\n", text
    )

    # Remove leading/trailing dashes per line and collapse runs of dashes
    text = re.sub(r"^-+|-+$", "", text, flags=re.MULTILINE)
    text = re.sub(r"-{2,}", "-", text)

    return text

In [None]:
def open_clause(clause, encoding="utf-8"):
    """Read a text file and return its contents cleaned by process_text_document.

    Args:
        clause: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding
            (the cleaning step handles curly quotes, which are non-ASCII).

    Returns:
        The cleaned text of the file.
    """
    with open(clause, "r", encoding=encoding) as f:
        content = f.read()
    # The file handle is no longer needed once the text is in memory.
    return process_text_document(content)

In [None]:
# Path of a single contract file used to try the cleaning on one document.
example_file_path = "../tclp_content/contracts/british_contracts/2001/000000013.txt"

In [None]:
# open_clause() already returns the cleaned text, so the example goes through
# exactly one cleaning pass, the same as the batch cells that process folders.
example = open_clause(example_file_path)
cleaned_example = example

# Explicit UTF-8 so non-ASCII characters (e.g. curly quotes) are written the
# same way on every platform.
with open("cleaned_example.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_example)

In [None]:
# Create the output folder for cleaned clauses; exist_ok=True makes re-runs safe.
os.makedirs("../tclp_content/cleaned_clauses", exist_ok=True)

In [None]:
# Clean every .txt clause in the folder and save it under the same file name
# in the cleaned_clauses folder.
folder = "../tclp_content/England:Wales"
for filename in os.listdir(folder):
    # Skip anything that is not a plain-text clause file.
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder, filename)
    cleaned_clause = open_clause(file_path)
    # os.listdir() yields bare file names, so filename is already the basename.
    # Write as UTF-8 explicitly so the output does not depend on the platform
    # default encoding.
    with open(
        f"../tclp_content/cleaned_clauses/{filename}", "w", encoding="utf-8"
    ) as f:
        f.write(cleaned_clause)

In [None]:
# Create the output folder for cleaned contracts; exist_ok=True makes re-runs safe.
os.makedirs("../tclp_content/cleaned_contracts", exist_ok=True)

In [None]:
# Clean every .txt contract found anywhere under main_folder and write it to
# output_folder, keeping the same sub-folder layout.
main_folder = "../tclp_content/contracts/british_contracts"
output_folder = "../tclp_content/cleaned_contracts"
for root, _, files in os.walk(main_folder):
    for filename in files:
        # Skip anything that is not a plain-text contract file.
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(root, filename)
        cleaned = open_clause(file_path)
        # Rebuild the path relative to main_folder under output_folder. This
        # swaps only the leading folder, unlike str.replace, which would also
        # rewrite any later occurrence of the same substring.
        cleaned_path = os.path.join(
            output_folder, os.path.relpath(file_path, main_folder)
        )
        os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
        with open(cleaned_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

### Clauses with Noise

In [None]:
# Create the output folder for cleaned noisy clauses; exist_ok=True makes re-runs safe.
os.makedirs("../tclp_content/cleaned_noisy_clauses", exist_ok=True)

In [None]:
# Clean every .txt clause in the noisy-clauses folder and save it under the
# same file name in the cleaned_noisy_clauses folder.
folder = "../tclp_content/1800_clauses"
for filename in os.listdir(folder):
    # Skip anything that is not a plain-text clause file.
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder, filename)
    cleaned_clause = open_clause(file_path)
    # os.listdir() yields bare file names, so filename is already the basename.
    # Write as UTF-8 explicitly so the output does not depend on the platform
    # default encoding.
    with open(
        f"../tclp_content/cleaned_noisy_clauses/{filename}", "w", encoding="utf-8"
    ) as f:
        f.write(cleaned_clause)