In [1]:
import numpy as np
import pandas as pd

import datetime

def generate_chat_log_ten_lines(number_to_generate,
        start_time=None, end_time=None,

        room_names = ["Room1", "Room2", "Room3", "Room4", "Room5"],
        usernames  = ["User1", "User2", "User3", "User4", "User5"],
        message_1 = [
            "Echo1", "Echo2", "Echo3", "Echo4",
            "Bravo1", "Bravo2", "Bravo3", "Bravo4",
            "Tango1", "Tango2", "Tango3", "Tango4",
        ],
        message_2 = [
            "SMACK", "ESCORT", "TEST", "DANCE",
        ],
        message_3 = [
            "TEST1", "TEST2", "TEST3", "TEST4",
            "TEST5", "TEST6", "TEST7", "TEST8",
            "TEST9", "TEST10", "TEST11", "TEST12",
        ],
        message_4 = [
            "APPLE", "ORANGE", "BANANNA", "PEACH",
        ],
        message_5 = ["N/A"],
        message_6 = ["N/A"],
        message_7 = ["N/A"],
        message_8 = ["N/A"],
        message_9 = ["N/A"],

        roomname_column  = "Room Name",
        username_column  = "Username",
        message_column   = "Message",
        timestamp_column = "Timestamp",

        appendId        = False,
        incomplete_odds = 0.001,

        latency_odds    = 0.01,
        latency_millis  = 200,

        seed            = None,
    ):
    """Generate synthetic "ten-line" chat messages as a DataFrame.

    Every generated entry consists of up to nine numbered lines ("1. " .. "9. ")
    followed by a closing "10. Time to Completion" line that carries the entry's
    base timestamp.  All lines of one entry share the same room and username.
    This is only an approximation of a real chat log, not a perfect match.
    There is no message_10 parameter, since line 10 is the timestamp line.

    Args:
        number_to_generate: Number of ten-line entries to generate.  A value of
            zero or less yields an empty DataFrame.
        start_time: (Optional) datetime of the first entry.  If None, midnight
            of the current day (from datetime.datetime.now()) is used.
        end_time: (Optional) End of the time frame.  If None, one hour is added
            to start_time.  Entries are spaced evenly at
            (end_time - start_time) / number_to_generate.
        room_names: (Optional) Room names to pick from at random.
        usernames: (Optional) Usernames to pick from at random.
        message_1 .. message_9: (Optional) Candidate texts for the matching
            numbered line; one is picked at random per line.
        roomname_column, username_column, message_column, timestamp_column:
            (Optional) Column names used in the returned DataFrame.
        appendId: (Optional) If True, "_<entry number>" (1-based) is appended to
            every message text.
        incomplete_odds: (Optional) Probability that any one of lines 1-9 is
            dropped, producing an incomplete entry.
        latency_odds: (Optional) Probability that a line's timestamp is pushed
            later.  The offset accumulates, so subsequent lines of the same
            entry keep it.
        latency_millis: (Optional) Upper bound (inclusive) of one latency offset
            in milliseconds; the lower bound is 1.
        seed: (Optional) Seed forwarded to numpy.random.default_rng so a run can
            be reproduced.  None draws fresh entropy.

    Returns:
        A DataFrame with one row per generated line and the four configured
        columns.  Timestamps are strings formatted '%Y-%m-%d %H:%M:%S.%fZ'; the
        trailing 'Z' is a literal, no timezone conversion is performed.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S.%fZ'

    ## Default window: today at midnight, lasting one hour
    if start_time is None:
        start_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    if end_time is None:
        end_time = start_time + datetime.timedelta(hours=1)

    ## Column-oriented accumulator that becomes the returned DataFrame
    chat_log = {
        roomname_column  : [],
        username_column  : [],
        message_column   : [],
        timestamp_column : [],
    }

    ## Nothing to generate; returning early also avoids dividing the window by zero
    if number_to_generate <= 0:
        return pd.DataFrame(chat_log)

    ## Spacing between the base timestamps of consecutive entries
    time_difference = (end_time - start_time) / number_to_generate

    ## Candidate texts for lines 1-9, indexed by line number - 1
    messages = [message_1, message_2, message_3, message_4, message_5, message_6, message_7, message_8, message_9]

    ## A single generator drives every random draw so that `seed` controls them all
    rng = np.random.default_rng(seed)

    for idx in range(1, number_to_generate + 1):
        id_suffix = f"_{idx}" if appendId else ""
        room = rng.choice(room_names)
        user = rng.choice(usernames)
        line_time = start_time

        for line_number, candidates in enumerate(messages, start=1):
            ## Occasionally drop a line to simulate an incomplete ten-line
            if rng.random() <= incomplete_odds:
                continue

            ## Occasionally delay this line (and therefore every later line of the entry)
            if rng.random() <= latency_odds:
                ## int() because datetime.timedelta rejects numpy integer scalars
                line_time += datetime.timedelta(milliseconds=int(rng.integers(1, latency_millis + 1)))

            chat_log[roomname_column].append(room)
            chat_log[username_column].append(user)
            chat_log[message_column].append(f"{line_number}. {rng.choice(candidates)}{id_suffix}")
            chat_log[timestamp_column].append(line_time.strftime(timestamp_format))

        ## The closing line belongs to the same room/user and always carries the
        ##    entry's base timestamp (latency is not applied to it)
        base_stamp = start_time.strftime(timestamp_format)
        chat_log[roomname_column].append(room)
        chat_log[username_column].append(user)
        chat_log[message_column].append(f"10. Time to Completion{id_suffix} : {base_stamp}")
        chat_log[timestamp_column].append(base_stamp)

        start_time += time_difference

    return pd.DataFrame(chat_log)




In [None]:
def calculate_time_differences(row1, row2):
    """Placeholder: accepts two rows, performs no computation, returns None."""
    return

def convert_messages_to_ten_line(msg_df,
            prefixes=["1. ", "2. ", "3. ", "4. ", "5. ",
                      "6. ", "7. ", "8. ", "9. ", "10. "],
            new_columns=["msg_1", "msg_2", "msg_3", "msg_4", "msg_5",
                         "msg_6", "msg_7", "msg_8", "msg_9", "msg_10"],
            best_matches=False,
            msg_col="Message", timestmp_col="Timestamp",
            groupby_cnt_col="msg_cnt",
            match_group_col="match_group", time_diff_col="time_diff",
            groupby=["Room Name", "Username"]):
    """Pivot prefixed chat messages into a ten-line layout (work in progress).

    Steps currently implemented:
      1. Each prefix gets its own column holding the message text with the
         prefix stripped (missing where the message has a different prefix).
      2. Rows are grouped by `groupby` + `timestmp_col`; each new column keeps
         its first non-null value and `groupby_cnt_col` counts the rows merged.
      3. Groups with exactly len(prefixes) rows are "complete"; the rest are
         "partial".
      4. For every pair of partial rows sharing the same `groupby` values, the
         absolute timestamp gap in milliseconds is stored in `time_diff_col`
         (a dict keyed by the other row's index label) and `match_group_col`
         holds the number of such partners.
      5. Partial rows without any partner are moved to the complete set.

    Merging the matched partial rows is NOT implemented yet, so the function
    only prints the record count of each stage and returns None.

    Args:
        msg_df: DataFrame with `msg_col`, `timestmp_col` and the `groupby`
            columns.  It is copied, never modified.  `timestmp_col` must hold
            datetime values, since the gaps are computed by subtraction.
        prefixes: Message prefixes identifying each line.
        new_columns: Column name for each prefix, matched by position.
        best_matches: Currently unused.
        msg_col: Name of the message text column.
        timestmp_col: Name of the timestamp column.
        groupby_cnt_col: Name of the helper column counting rows per group.
        match_group_col: Name of the helper column counting partners per row.
        time_diff_col: Name of the helper column holding the gap dicts.
        groupby: Columns that identify rows belonging to the same sender.

    Returns:
        None.
    """
    ## First go ahead and create the new columns and strip off our prefixes
    new_df = msg_df.copy()
    message_text = new_df[msg_col]
    for i, prefix in enumerate(prefixes):
        ## na=False keeps missing messages from breaking the boolean mask
        has_prefix = message_text.str.startswith(prefix, na=False)
        new_df[new_columns[i]] = message_text.str.slice(len(prefix)).where(has_prefix)

    ## Drop the message column, since we don't need it anymore
    new_df = new_df.drop(columns=[msg_col])
    print(f"==================== new df {len(new_df)} records ====================")

    ## Group on the sender columns plus the timestamp
    group_cols = list(groupby)
    full_groupby = group_cols + [timestmp_col]

    ## Helper column; aggregated with "count" it yields the rows per group
    new_df[groupby_cnt_col] = 0
    agg_dict = {col: "first" for col in new_df.columns if col not in full_groupby}
    agg_dict[groupby_cnt_col] = "count"
    new_df = new_df.groupby(full_groupby).agg(agg_dict).reset_index()

    ## Add the new columns we need for building / calculating matches
    new_df[time_diff_col] = [{} for _ in range(len(new_df))]
    new_df[match_group_col] = 0

    ## Rows that already contain every expected line are set aside
    complete_df = new_df[new_df[groupby_cnt_col] == len(prefixes)]
    print(f"==================== complete df {len(complete_df)} records ====================")

    ## Copy so the helper columns can be rewritten without touching a view
    new_df = new_df[new_df[groupby_cnt_col] < len(prefixes)].copy()
    print(f"==================== partial df {len(new_df)} records ====================")

    ## ======== Calculate the time difference between matching rows ========
    ## Work by position and translate to index labels only when storing results:
    ##    after the filtering above the labels are no longer contiguous, so a
    ##    label must never be used as a positional slice bound.
    labels = list(new_df.index)
    stamps = list(new_df[timestmp_col])
    if group_cols:
        group_keys = list(new_df[group_cols].itertuples(index=False, name=None))
    else:
        ## No sender columns: every partial row belongs to one shared group
        group_keys = [()] * len(labels)

    ## Bucket row positions by sender so only same-group pairs are compared
    positions_by_group = {}
    for position, group_key in enumerate(group_keys):
        positions_by_group.setdefault(group_key, []).append(position)

    time_diffs = [{} for _ in labels]
    for positions in positions_by_group.values():
        for offset, first in enumerate(positions):
            for second in positions[offset + 1:]:
                gap_ms = abs((stamps[second] - stamps[first]).total_seconds() * 1000)
                time_diffs[first][labels[second]] = gap_ms
                time_diffs[second][labels[first]] = gap_ms

    new_df[time_diff_col] = time_diffs
    new_df[match_group_col] = [len(partners) for partners in time_diffs]

    ## Take any records that had no matches and move them to the complete dataframe
    tmp_df = new_df[new_df[match_group_col] == 0]
    print(f"==================== partial, with no matches df {len(tmp_df)} records ====================")
    complete_df = pd.concat([complete_df, tmp_df])

    ## Keep only the records that have at least one partner
    new_df = new_df[new_df[match_group_col] > 0]
    print(f"==================== partial, with matches df {len(new_df)} records ====================")

    ## ======== Merge the matched partial records ========
    ## TODO: unfinished.  The loop below only locates each row's closest partner;
    ##    nothing is written to `merged`, no row is ever marked as inspected, and
    ##    neither `complete_df` nor `merged` is returned yet.
    merged = {column: [] for column in new_df.columns}
    rows_inspected = []
    for index, row in new_df.iterrows():
        ## If we've already used this row, then move along
        if index in rows_inspected:
            continue
        time_differences = row[time_diff_col]
        ## First partner (in insertion order) with the smallest gap
        closest_label = min(time_differences, key=time_differences.get)
        match_row = new_df.loc[closest_label]

    return None


## Build a 1000-entry synthetic chat log, with the entry id appended to each message
df = generate_chat_log_ten_lines(number_to_generate=1000, appendId=True)
## The generator emits timestamp strings; the converter subtracts timestamps,
##    so parse them into datetimes first
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
display(df)

## Line prefixes "1. " .. "10. " and the column each one is extracted into
prefixes = [f"{line_no}. " for line_no in range(1, 11)]
new_columns = [f"msg_{line_no}" for line_no in range(1, 11)]

convert_messages_to_ten_line(df, new_columns=new_columns, prefixes=prefixes)

Unnamed: 0,Room Name,Username,Message,Timestamp
0,Room2,User1,1. Tango4_1,2025-01-10 00:00:00+00:00
1,Room2,User1,2. SMACK_1,2025-01-10 00:00:00+00:00
2,Room2,User1,3. TEST5_1,2025-01-10 00:00:00+00:00
3,Room2,User1,4. PEACH_1,2025-01-10 00:00:00+00:00
4,Room2,User1,5. N/A_1,2025-01-10 00:00:00+00:00
...,...,...,...,...
9985,Room1,User1,6. N/A_1000,2025-01-10 00:59:56.400000+00:00
9986,Room1,User1,7. N/A_1000,2025-01-10 00:59:56.400000+00:00
9987,Room1,User1,8. N/A_1000,2025-01-10 00:59:56.400000+00:00
9988,Room1,User1,9. N/A_1000,2025-01-10 00:59:56.400000+00:00


