### Objective:
Clean the body, text, and caption data for the tokenizer. The steps to be undertaken are listed below:
* Remove extra whitespace and normalize spacing.
* Configure case normalization.
* Handle special characters.


In [18]:
import re
from typing import List

In [19]:
def clean_text(text: str) -> str:
    """
    Clean text while preserving important punctuation and structure.

    Steps, in order:
      1. Collapse every run of whitespace (spaces, tabs, newlines) into a
         single space and drop leading/trailing whitespace.
      2. Convert curly double quotes to the ASCII double quote.
      3. Convert em and en dashes to the ASCII hyphen.
      4. Remove whitespace before ``. , ! ? ; :`` and just inside parentheses.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    ## str.split() with no separator splits on any whitespace run and
    ## discards empty pieces, so this single step collapses newlines, tabs
    ## and repeated spaces and also trims both ends.
    text = " ".join(text.split())

    ## Normalize quotes and dashes in one pass. The characters are written
    ## as escapes so they cannot be silently turned into ASCII look-alikes
    ## by an editor or an encoding round-trip.
    replacements = {
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2014": "-",  # em dash
        "\u2013": "-",  # en dash
    }
    text = text.translate(str.maketrans(replacements))

    ## Fix spacing around punctuation.
    text = re.sub(r"\s+([.,!?;:])", r"\1", text)
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)

    ## Remove leading/trailing whitespace
    return text.strip()

In [20]:
def text2sentence(text: str) -> List[str]:
    """
    Break text into sentences, keeping common abbreviations intact.

    The text is split on whitespace into tokens. A sentence ends at any
    token that finishes with ".", "!" or "?", unless that token
    (case-insensitively) is one of the known abbreviations. Tokens left
    over after the last sentence end are returned as a final sentence.
    Tokens within a sentence are re-joined with single spaces.
    """
    ## Tokens that end with a period but must not close a sentence.
    known_abbreviations = {"mr.", "mrs.", "dr.", "st.", "ave.", "prof."}
    terminators = (".", "!", "?")

    tokens = text.split()
    result = []
    start = 0  ## Index of the first token of the sentence being built.

    for position, token in enumerate(tokens):
        is_abbreviation = token.lower() in known_abbreviations
        if token.endswith(terminators) and not is_abbreviation:
            result.append(" ".join(tokens[start:position + 1]))
            start = position + 1

    ## Whatever follows the last sentence end forms a trailing sentence.
    if start < len(tokens):
        result.append(" ".join(tokens[start:]))

    return result