In [1]:
import numpy as np
import pandas as pd

import datetime

class TenLineSampleGenerator:
    """Generate synthetic "ten-line" chat data for testing the multi-line parser.

    Every generated report is a burst of chat rows from one room/user pair: one
    row per entry of the message parts (each prefixed with the matching entry
    of ``prefixes``) followed by a closing "10. Time to Completion" row.

    You don't need a message body for the last prefix.  The closing row always
    carries the start timestamp of the report being generated.

    Attributes:
        NOT_AVAILABLE: placeholder body used for lines that carry no data.
        room_name_parts: vocabulary that random room names are assembled from.
        user_name_parts: vocabulary that random user names are assembled from.
        message_parts: default mapping of line position (0-based) to the list
            of candidate bodies for that line.
    """
    NOT_AVAILABLE = "N/A"
    room_name_parts = ["s", "t", "us", "fvey", "test", "experiment", "c2", "south", "north"]
    user_name_parts = ["joa", "c2", "falcon", "flame", "cannon", "bearcat", "test", "dragon", "wasp", "bearcat"]
    message_parts =   {
        0: [
            "Echo01", "Echo11", "Echo21", "Echo31", "Echo41",
            "Taco01", "Taco11", "Taco21", "Taco31",
            "Bravo01", "Bravo21", "Bravo41", "Bravo51",
            "Maple01", "Maple11", "Maple21",
            "Gravity01", "Gravity11", "Gravity21", "Gravity31", "Gravity41"
        ],
        1: ["SMACK", "ESCORT", "TEST", "DANCE"],
        2: ["TEST", "Test", "PRACTICE", "PERFORMANCE"],
        3: ["APPLE", "ORANGE", "BANANNA", "PEACH", "MANGO"],
        4: [NOT_AVAILABLE],
        5: [NOT_AVAILABLE],
        6: [NOT_AVAILABLE],
        7: [NOT_AVAILABLE],
        8: [NOT_AVAILABLE],
    }


    def __init__(self,
            sample_roomnames = None,
            num_of_rooms = 5,
            sample_usernames = None,
            num_of_users = 5,
            prefixes=["1. ", "2. ", "3. ", "4. ", "5. ",
                      "6. ", "7. ", "8. ", "9. ", "10. "],
            message_parts   = None,
            appendId        = False,
            incomplete_odds = 0.001,

            latency_odds    = 0.01,
            latency_millis  = 200,
            seed            = None,
        ):
        """Return a new TenLineSampleGenerator object.

        Args:
            sample_roomnames: room names to draw from; random names are
                generated when None.
            num_of_rooms: how many room names to generate when none are given.
            sample_usernames: user names to draw from; random names are
                generated when None.
            num_of_users: how many user names to generate when none are given.
            prefixes: text placed in front of each generated line, by position.
            message_parts: mapping (or sequence) of line position to candidate
                bodies; the class-level ``message_parts`` is used when None.
            appendId: when True, "_<report number>" is appended to every body.
            incomplete_odds: probability that any single line is dropped.
            latency_odds: probability that a line is delayed relative to the
                previous line of the same report.
            latency_millis: largest delay, in milliseconds, a delayed line gets.
            seed: optional seed for the random number generator; pass a value
                to get the same names and data on every run.
        """
        ## One generator for every random draw, so a seed reproduces the names AND the data
        self._rng = np.random.default_rng(seed)

        if sample_roomnames is None:
            self.sample_roomnames = self.generate_sample_roomnames(num_of_rooms)
        else:
            self.sample_roomnames = sample_roomnames

        if sample_usernames is None:
            self.sample_usernames = self.generate_sample_usernames(num_of_users)
        else:
            self.sample_usernames = sample_usernames

        ## Copy, so the shared default list can never be modified through an instance
        self.prefixes = list(prefixes)

        ## Honour caller supplied parts; fall back to the class defaults otherwise
        if message_parts is None:
            self.sample_message_parts = self.message_parts
        else:
            self.sample_message_parts = message_parts

        self.appendId = appendId
        self.incomplete_odds = incomplete_odds

        self.latency_odds = latency_odds
        self.latency_millis = latency_millis

        self.room_name_column = "Room Name"
        self.username_column = "Username"
        self.message_column = "Message"
        self.timestamp_column = "Timestamp"

    def _generate_unique_names(self, parts, count, max_parts_exclusive):
        """Build ``count`` distinct names, each made of 1..max_parts_exclusive-1 random parts joined by "_".

        Raises:
            ValueError: when more names are requested than the vocabulary can produce.
        """
        ## Upper bound on the distinct names available; without this check the loop below would never end
        distinct_parts = len(set(parts))
        capacity = sum(distinct_parts ** size for size in range(1, max_parts_exclusive))
        if count > capacity:
            raise ValueError(f"cannot generate {count} unique names, at most {capacity} are possible")

        names = []
        while len(names) < count:
            size = int(self._rng.integers(1, max_parts_exclusive))
            candidate = "_".join(self._rng.choice(parts, size))
            if candidate not in names:
                names.append(candidate)
        return names

    def generate_sample_roomnames(self, num_of_rooms):
        """Generate a list of unique sample room names"""
        ## Integer division: the (exclusive) upper bound on the part count has to be an int
        return self._generate_unique_names(self.room_name_parts, num_of_rooms, len(self.room_name_parts) // 2)

    def generate_sample_usernames(self, num_of_users):
        """Generate a list of unique sample user names (one or two parts each)"""
        return self._generate_unique_names(self.user_name_parts, num_of_users, 3)

    def generate_sample_data(self,
        number_to_generate,
        start_time=None,
        end_time=None
    ):
        """Generate some random data for testing, the reports will be distributed evenly between the start and end time.

        Args:
            number_to_generate: how many reports to generate?  Each report yields up to
                len(message parts) + 1 rows.
            start_time: (Optional) what is the start time, if none is provided it will be set to midnight today
            end_time: (Optional) what is the end time, if none is provided it will be set to start_time + 1 hour

        Returns:
            A DataFrame with the room name, username, message and timestamp columns
            (timestamps are formatted strings).  It is empty when number_to_generate is not positive.
        """
        ## If start_time is None
        if start_time is None:
            start_time = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        ## If end_time is None
        if end_time is None:
            end_time = start_time + datetime.timedelta(hours=1)

        ## Create our initial dictionary for returning
        chat_log = {
            self.room_name_column  : [],
            self.username_column   : [],
            self.message_column    : [],
            self.timestamp_column  : [],
        }

        ## Nothing to generate, also avoids dividing by zero below
        if number_to_generate <= 0:
            return pd.DataFrame(chat_log)

        ## Spacing between the start of two consecutive reports
        time_difference = (end_time - start_time) / number_to_generate

        timestamp_format = '%Y-%m-%d %H:%M:%S.%fZ'
        rng = self._rng
        parts = self.sample_message_parts
        report_start = start_time

        def add_row(room, user, message, timestamp):
            chat_log[self.room_name_column].append(room)
            chat_log[self.username_column].append(user)
            chat_log[self.message_column].append(message)
            chat_log[self.timestamp_column].append(timestamp.strftime(timestamp_format))

        ## Loop through the number of reports to generate
        for idx in range(1, number_to_generate + 1):
            room = rng.choice(self.sample_roomnames)
            user = rng.choice(self.sample_usernames)
            suffix = f"_{idx}" if self.appendId else ""
            line_time = report_start

            for j in range(len(parts)):
                ## Drop this line entirely when the draw falls within the incomplete odds
                if rng.random() > self.incomplete_odds:
                    ## Occasionally delay this line (and every later line of the same report)
                    if rng.random() <= self.latency_odds and self.latency_millis > 0:
                        ## int(): datetime.timedelta does not accept numpy integer types
                        delay = int(rng.integers(1, self.latency_millis + 1))
                        line_time += datetime.timedelta(milliseconds=delay)

                    add_row(room, user, f"{self.prefixes[j]}{rng.choice(parts[j])}{suffix}", line_time)

            ## The closing line is always written and reports the start time of the report
            add_row(room, user, f"10. Time to Completion{suffix} : {report_start.strftime(timestamp_format)}", line_time)

            report_start += time_difference
        ## ================== END LOOP ==================


        return pd.DataFrame(chat_log)




# Build a generator with default rooms/users; appendId=True tags every message
# body with its report number ("_<n>"), which makes it easy to see which rows
# belong to the same report.
tmp = TenLineSampleGenerator(appendId=True)
# Generate 100 reports (several chat rows each).
fake_ten_line = tmp.generate_sample_data(100)
# Display the generated chat log.
fake_ten_line

Unnamed: 0,Room Name,Username,Message,Timestamp
0,experiment_north,joa,1. Echo31_1,2025-01-15 00:00:00.000000Z
1,experiment_north,joa,2. ESCORT_1,2025-01-15 00:00:00.000000Z
2,experiment_north,joa,3. Test_1,2025-01-15 00:00:00.000000Z
3,experiment_north,joa,4. MANGO_1,2025-01-15 00:00:00.000000Z
4,experiment_north,joa,5. N/A_1,2025-01-15 00:00:00.000000Z
...,...,...,...,...
994,south,flame_cannon,6. N/A_100,2025-01-15 00:59:24.000000Z
995,south,flame_cannon,7. N/A_100,2025-01-15 00:59:24.000000Z
996,south,flame_cannon,8. N/A_100,2025-01-15 00:59:24.000000Z
997,south,flame_cannon,9. N/A_100,2025-01-15 00:59:24.000000Z


In [28]:
class TenLineParser:
    """A general parser for converting the ten-line messages from irc into standard record format.

    The chat rows are pivoted so that every prefix gets its own column, rows
    sharing the group-by columns and an identical timestamp are collapsed into
    one record, and records that are still incomplete are cross-referenced with
    the other incomplete records of the same group.

    Attributes:
        prefixes: line prefixes to look for at the start of each message.
        new_columns: column that receives the body of the line with the prefix
            at the same position.
        best_matches: stored for the merge step; parse() does not consult it yet.
        roomname_column: name of the room column.
        username_column: name of the user column.
        message_column: name of the column holding the raw message text.
        timestamp_column: name of the timestamp column.
        groupby_cnt_col: name of the column that receives the number of chat
            rows collapsed into each record.
        time_diff_col: name of the column that receives, per record, a dict of
            {other record index: comparison details}.
        groupby: columns that identify the sender of a report.
    """
    def __init__(self,
        prefixes=["1. ", "2. ", "3. ", "4. ", "5. ",
                  "6. ", "7. ", "8. ", "9. ", "10. "],
        new_columns=["msg_1", "msg_2", "msg_3", "msg_4", "msg_5",
                  "msg_6", "msg_7", "msg_8", "msg_9", "msg_10"],
        best_matches=True,
        roomname_column  = "Room Name",
        username_column  = "Username",
        message_column   = "Message",
        timestamp_column = "Timestamp",

        groupby_cnt_col="msg_cnt",
        #match_group_col="match_group", 
        time_diff_col="time_diff",
        groupby=["Room Name", "Username"]
    ):
        """Return a new TenLineParser object.

        Args:
            prefixes: what are the prefixes to look for?
            new_columns: what column does each prefix go to (same order and length as prefixes)?
            best_matches: kept for the merge step, not used by parse() yet
            roomname_column: what is the name of the room column?
            username_column: what is the name of the user column?
            message_column: what is the name of the raw message column?
            timestamp_column: what is the name of the timestamp column?
            groupby_cnt_col: what column receives the number of rows per record?
            time_diff_col: what column receives the comparison dictionaries?
            groupby: what columns identify the sender of a report?

        Raises:
            ValueError: if prefixes and new_columns differ in length.
        """
        ## Copy the lists, so the shared defaults can never be modified through an instance
        self.prefixes         = list(prefixes)
        self.new_columns      = list(new_columns)
        if len(self.prefixes) != len(self.new_columns):
            raise ValueError("prefixes and new_columns must have the same length")
        self.best_matches     = best_matches
        self.roomname_column  = roomname_column
        self.username_column  = username_column
        self.message_column   = message_column
        self.timestamp_column = timestamp_column

        self.groupby_cnt_col  = groupby_cnt_col
        #self.match_group_col  = match_group_col
        self.time_diff_col    = time_diff_col
        self.groupby          = list(groupby)

    def _compare_rows(self, row, match_row):
        """Describe how well two incomplete records complement each other.

        Returns:
            A dictionary with
              "time_diff":    absolute distance between the two timestamps in milliseconds
              "full_match":   True when every column is filled by exactly one of the two records
              "column_match": how many columns are filled by exactly one of the two records
              "bad_match":    True when at least one column is filled by both records
        """
        full_match = True
        column_match = len(self.new_columns)
        bad_match = False
        for col in self.new_columns:
            ## pd.isna treats both None and NaN as "missing"
            row_has_value = not pd.isna(row[col])
            match_has_value = not pd.isna(match_row[col])

            ## Filled by both (conflict) or by neither (gap): this column is not a clean match
            if row_has_value == match_has_value:
                full_match = False
                column_match -= 1
                if row_has_value:
                    bad_match = True

        return {
            "time_diff": abs((row[self.timestamp_column] - match_row[self.timestamp_column]).total_seconds() * 1000),
            "full_match": full_match,
            "column_match": column_match,
            "bad_match": bad_match
        }

    def _parseIncompleteRowTimeDiffs(self, dataset, index, row):
        """Compare the passed row with every other incomplete row of the same group and record the results

        The result for every pair is stored on both sides, in the dictionaries held in
        the time_diff column.  Those dictionaries are modified in place.

        Args:
            dataset: the full dataset, holding the count column, the group-by columns and the time_diff column
            index: index label of the row within dataset
            row: row to parse time differences for
        """
        ## Filter down to only those rows, that might match our current row
        potential_matches = dataset[dataset[self.groupby_cnt_col] != len(self.new_columns)]
        potential_matches = potential_matches[~potential_matches.index.isin([index])]
        for col in self.groupby:
            potential_matches = potential_matches[potential_matches[col] == row[col]]
        ## If there are no matches, then just leave
        if len(potential_matches) <= 0:
            return None

        row_diffs = row[self.time_diff_col]
        ## Loop through the rows of our matches
        for match_index, match_row in potential_matches.iterrows():
            match_diffs = match_row[self.time_diff_col]

            ## Reuse a comparison either side already holds; .get() because a row that
            ##    already has some matches does not necessarily have this one
            comparison = row_diffs.get(match_index)
            if comparison is None:
                comparison = match_diffs.get(index)
            if comparison is None:
                comparison = self._compare_rows(row, match_row)

            ## Store the result on both sides
            row_diffs[match_index] = comparison
            match_diffs[index] = comparison

        return None

    
    def parse(self, dataset):
        """Take a dataset of 10 line data and attempt to parse it, into a columnar format.

        Args:
            dataset: what is the dataset to parse?  It is not modified.

        Returns:
            A new DataFrame with one row per group-by columns + timestamp, one column per
            prefix, the row count column and the time_diff column.
        """
        ## ########################################## ADD THE NEW COLUMNS ##########################################
        ## First go ahead and create the new columns and strip off our prefixes
        new_dataset = dataset.copy()
        ## Make sure that our timestamp column is already in timestamp format
        new_dataset[self.timestamp_column] = pd.to_datetime(new_dataset[self.timestamp_column])
        ## Loop through the prefixes and create the new columns
        messages = new_dataset[self.message_column]
        for prefix, column in zip(self.prefixes, self.new_columns):
            ## na=False: a missing message simply matches no prefix
            has_prefix = messages.str.startswith(prefix, na=False)
            ## Rows without this prefix are left empty by the index alignment of the assignment
            new_dataset[column] = messages[has_prefix].str.slice(len(prefix))
        ## Drop the message column, since we don't need it anymore
        new_dataset = new_dataset.drop(columns=[self.message_column])

        ## ################################# MERGE ON IDENTICAL Room/User/Timestamp ################################
        ## Build our groupby (that includes the timestamp column)
        full_groupby = self.groupby + [self.timestamp_column]
        ## Add a column for our groupby count
        new_dataset[self.groupby_cnt_col] = 0
        ## Keep the first value of every column, and count the rows for our count column
        agg_dict = {col: "first" for col in new_dataset.columns if col not in full_groupby}
        agg_dict[self.groupby_cnt_col] = "count"
        new_dataset = new_dataset.groupby(full_groupby).agg(agg_dict).reset_index()

        ## ############################## Try to MERGE if cnt is less than len(prefix) #############################
        ## Every row gets its own dictionary, they are filled in place below
        new_dataset[self.time_diff_col] = [{} for _ in range(len(new_dataset))]

        ## The index is unique after reset_index, so every incomplete row is visited exactly once
        need_match = new_dataset[new_dataset[self.groupby_cnt_col] < len(self.new_columns)]
        for index, row in need_match.iterrows():
            self._parseIncompleteRowTimeDiffs(new_dataset, index, row)

        ## TODO: merge the incomplete rows using the comparisons collected above
        ##    (self.best_matches is meant to control that step).

        return new_dataset

# Parse the generated chat log with the default prefixes and column names.
parser = TenLineParser()
# `results` holds one row per room/user/timestamp, plus the msg_cnt and time_diff columns.
results = parser.parse(fake_ten_line)

In [29]:
# Show the records that were assembled from fewer than the ten expected lines.
results[results["msg_cnt"] < 10]

Unnamed: 0,Room Name,Username,Timestamp,msg_1,msg_2,msg_3,msg_4,msg_5,msg_6,msg_7,msg_8,msg_9,msg_10,msg_cnt,time_diff
1,experiment_north,bearcat,2025-01-15 00:51:36+00:00,Echo31_87,ESCORT_87,Test_87,APPLE_87,,,,,,,4,"{2: {'time_diff': 8.0, 'full_match': True, 'co..."
2,experiment_north,bearcat,2025-01-15 00:51:36.008000+00:00,,,,,N/A_87,N/A_87,N/A_87,N/A_87,N/A_87,Time to Completion_87 : 2025-01-15 00:51:36.00...,6,"{1: {'time_diff': 8.0, 'full_match': True, 'co..."
17,experiment_north,test,2025-01-15 00:04:12+00:00,Echo41_8,SMACK_8,PRACTICE_8,BANANNA_8,,N/A_8,N/A_8,N/A_8,N/A_8,Time to Completion_8 : 2025-01-15 00:04:12.000...,9,{}
37,s,joa,2025-01-15 00:33:36+00:00,Gravity01_57,DANCE_57,Test_57,,,,,,,,3,"{38: {'time_diff': 24.0, 'full_match': True, '..."
38,s,joa,2025-01-15 00:33:36.024000+00:00,,,,BANANNA_57,N/A_57,N/A_57,N/A_57,N/A_57,N/A_57,Time to Completion_57 : 2025-01-15 00:33:36.00...,7,"{37: {'time_diff': 24.0, 'full_match': True, '..."
43,s,test,2025-01-15 00:34:12+00:00,Gravity41_58,SMACK_58,,,,,,,,,2,"{44: {'time_diff': 45.0, 'full_match': True, '..."
44,s,test,2025-01-15 00:34:12.045000+00:00,,,TEST_58,MANGO_58,N/A_58,N/A_58,N/A_58,N/A_58,N/A_58,Time to Completion_58 : 2025-01-15 00:34:12.00...,8,"{43: {'time_diff': 45.0, 'full_match': True, '..."
62,t_north,bearcat_cannon,2025-01-15 00:35:24+00:00,Gravity01_60,ESCORT_60,,,,,,,,,2,"{63: {'time_diff': 194.0, 'full_match': True, ..."
63,t_north,bearcat_cannon,2025-01-15 00:35:24.194000+00:00,,,TEST_60,BANANNA_60,N/A_60,N/A_60,N/A_60,N/A_60,N/A_60,Time to Completion_60 : 2025-01-15 00:35:24.00...,8,"{62: {'time_diff': 194.0, 'full_match': True, ..."
64,t_north,flame_cannon,2025-01-15 00:11:24+00:00,Echo11_20,SMACK_20,,,,,,,,,2,"{65: {'time_diff': 111.0, 'full_match': True, ..."


In [27]:
# Inspect the comparison dictionary of a single record.
# NOTE(review): position 102 is hard-coded while the data is generated randomly, so
# the row found here (or whether it exists at all) will differ on a fresh run — confirm.
results.iloc[102]['time_diff']

{101: {'time_diff': 10.0,
  'full_match': True,
  'column_match': 10,
  'bad_match': False}}