In [3]:
import re

In [19]:
def clean_corpus(chat_export_file):
    """
    Prepare a WhatsApp chat export for training with chatterbot.
    """
    message_corpus = remove_chat_metadata(chat_export_file)
    cleaned_corpus = remove_non_message_text(message_corpus)
    return cleaned_corpus

def remove_chat_metadata(chat_export_file):
    """
    Remove WhatsApp chat metadata.

    WhatsApp chat exports come with metadata about each message:

     date    time    username  message
    ---------------------------------------
    8/26/22, 17:47 - Jane Doe: Message text

    This function removes all the metadata up to the text of each message.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The text of each message in the conversation
    """
    date_time = r"(\d+\/\d+\/\d+,\s\d+:\d+)"  # e.g. "8/26/22, 17:47"
    dash_whitespace = r"\s-\s"  # " - "
    username = r"([\w\s]+)"  # e.g. "Jane Doe"
    metadata_end = r":\s"  # ": "
    # pattern = date_time + dash_whitespace + username + metadata_end
    # pattern = r"\[(.*?)\][^:]*:"
    pattern = r"\[(.*?)\]\s*([^:]+):"

    cleaned_corpus = ""
    with open(chat_export_file, "r") as corpus_file:
        for line in corpus_file.read():
            # content = line.read()
            cleaned_corpus += re.sub(pattern, "", line)
    return tuple(cleaned_corpus.split("\n"))

def remove_non_message_text(export_text_lines):
    """
    Remove conversation-irrelevant text from chat export.

    WhatsApp chat exports come with a standardized intro line,
    and an empty line at the end of the file.
    Text exports also replace media messages with text that isn't
    relevant for the conversation. This function removes all that.

    Args:
        export_text_lines (tuple): All lines from the export file

    Returns:
        tuple: Messages that are a relevant part of the conversation
    """
    messages = export_text_lines[1:-1]

    filter_out_msgs = ("<Media omitted>",)
    return tuple((msg for msg in messages if msg not in filter_out_msgs))

In [22]:
# remove_chat_metadata("chat.txt")
remove_chat_metadata("chat.txt")

('[2018-10-31 13:06:34] From Canada  🇨🇦 w love: \u200eLes messages et les appels sont chiffrés de bout en bout. Aucun tiers, pas même WhatsApp, ne peut les lire ou les écouter.',
 '[2018-10-31 13:06:34] Omi Maurin: \u200eOmi Maurin a créé ce groupe',
 '[2018-10-31 13:06:34] From Canada  🇨🇦 w love: \u200eVous avez été ajouté·e',
 '[2020-11-14 09:16:51] Mom Monica Maurin: we have power tools if you need any -- though blades on ours are rusty from disuse. Lemme know',
 '[2020-11-14 09:29:04] Rob Maurin: This is the piece we’re keeping',
 '[2020-11-14 09:36:18] Grandpa Maurin: Jesus have mercy on Rob !',
 '[2020-11-14 09:36:55] Omi Maurin: A bit uncomfortagle to sit slanted... :)',
 '[2020-11-14 09:37:06] Rob Maurin: That worked so much better than I feared!!',
 '[2020-11-14 09:37:53] Omi Maurin: GREAT!!!!!!  Congrats!!!',
 '[2020-11-14 09:38:04] Rob Maurin: The opposite end came off clean.  So I hope all I have to do is trim up the sawed end and pop it he end into place???',
 '[2020-11-14