## Necessary workflows

In [2]:
workflows = [
                   "Tool: Rectangle,End Event: Change Attributes",                   
                    "Tool: Line,End Event: Delete,Tool: Line",
                   "Tool: Line,Tool: Move by Points",
                   "Tool: Rectangle,Menu: Add Surface - ",
                   "Tool: Rectangle,Menu: Extrude and Edit - ",
                   "Tool: 2D Polygon,End Event: Change Attributes",
                   "Menu: Save As - ,Menu: Export PDF - ",
                   "Menu: Duplicate - ,End Event: Drag,End Event: Modify Text",
                   "Menu: Copy - ,End Event: Set Active Layer,Menu: Paste - ",
                    "Tool: Mirror,End Event: Drag,End Event: Resize",
                  ]

## Processing to get this workflows

In [3]:
from multiprocessing import Pool, cpu_count
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import logging
# Configure logging

In [None]:
merged_processed_unitied_logs = pd.read_parquet('data/merged_logs.parquet')

In [6]:
counts = merged_processed_unitied_logs['message_content'].value_counts().reset_index()
counts.columns = ['message_content', 'count']
counts

Unnamed: 0,message_content,count
0,End Event: Drag,48492994
1,End Event: Delete,32555193
2,End Event: Resize,23870789
3,End Event: Shape Pane Edit,20951431
4,End Event: Set Active Layer,19596011
...,...,...
4924,Tool: St Frame_elevation,10
4925,End Event: Define Center Line Marker,10
4926,End Event: Generate columns and intercolumns,10
4927,Tool: DBTools,10


In [None]:
grouped_ids = {session: group['message_content'].tolist() for session, group in merged_processed_unitied_logs.groupby('session_anonymized')}

## Merging workflows back to merged logs

In [8]:
def processing_merge (grouped_ids, messages, combined_messages):
    output_messages = []
    processed_ids = {}  # Store processed groups
    for group_key in tqdm(grouped_ids, desc="Processing merging"):
        group_messages = grouped_ids[group_key]  # Messages for the current group
        merged_messages = []  # Store merged messages 
        i = 0  # Index tracker
        while i < len(group_messages):  # Iterate through messages
            # Check for a matching sequence
            if i <= len(group_messages) - len(messages) and group_messages[i:i + len(messages)] == messages:
                merged_messages.append(combined_messages)
                i += len(messages)  # Skip the matched sequence
            else:
                merged_messages.append(group_messages[i])  # Keep the current message
                i += 1  # Move to the next message
        processed_ids[group_key] = merged_messages  # Update results for the group
    return processed_ids


In [9]:
processing_grouped_ids = grouped_ids.copy()

In [10]:
for item in workflows:
    test_messages = item.split(',')
    test_messages = [msg for msg in test_messages]
    print(test_messages)
    processing_grouped_ids = processing_merge(processing_grouped_ids , test_messages, item)

['Tool: Rectangle', 'End Event: Change Attributes']


Processing merging: 100%|██████████| 1458538/1458538 [03:04<00:00, 7919.00it/s]


['Tool: Line', 'End Event: Delete', 'Tool: Line']


Processing merging: 100%|██████████| 1458538/1458538 [03:04<00:00, 7901.12it/s]


['Tool: Line', 'Tool: Move by Points']


Processing merging: 100%|██████████| 1458538/1458538 [03:02<00:00, 7991.20it/s]


['Tool: Rectangle', 'Menu: Add Surface - ']


Processing merging: 100%|██████████| 1458538/1458538 [03:07<00:00, 7781.00it/s]


['Tool: Rectangle', 'Menu: Extrude and Edit - ']


Processing merging: 100%|██████████| 1458538/1458538 [03:07<00:00, 7781.62it/s]


['Tool: 2D Polygon', 'End Event: Change Attributes']


Processing merging: 100%|██████████| 1458538/1458538 [03:09<00:00, 7679.40it/s]


['Menu: Save As - ', 'Menu: Export PDF - ']


Processing merging: 100%|██████████| 1458538/1458538 [03:12<00:00, 7562.03it/s]


['Menu: Duplicate - ', 'End Event: Drag', 'End Event: Modify Text']


Processing merging: 100%|██████████| 1458538/1458538 [03:08<00:00, 7747.57it/s]


['Menu: Copy - ', 'End Event: Set Active Layer', 'Menu: Paste - ']


Processing merging: 100%|██████████| 1458538/1458538 [03:12<00:00, 7588.33it/s]


['Tool: Mirror', 'End Event: Drag', 'End Event: Resize']


Processing merging: 100%|██████████| 1458538/1458538 [03:07<00:00, 7765.16it/s]


In [11]:
# Flatten the dictionary into a list of key-value pairs
data_flattened = [(key, value) for key, values in processing_grouped_ids.items() for value in values]
# Create a DataFrame
df_processed = pd.DataFrame(data_flattened, columns=['session_anonymized', 'message_content'])

## Merging with other schema

In [13]:
items = []

# Iterate through each 'message_eng' entry, split by ', ', and extend the items list
for message in workflows:
    if isinstance(message, str):  # Ensure the message is a string
        items.extend(message.split(','))
unique_items = list(set(items))
unique_items

['Menu: Add Surface - ',
 'End Event: Drag',
 'Menu: Duplicate - ',
 'End Event: Modify Text',
 'End Event: Set Active Layer',
 'Menu: Save As - ',
 'End Event: Delete',
 'Menu: Copy - ',
 'End Event: Resize',
 'End Event: Change Attributes',
 'Tool: Rectangle',
 'Tool: Mirror',
 'Tool: 2D Polygon',
 'Menu: Export PDF - ',
 'Menu: Paste - ',
 'Tool: Line',
 'Tool: Move by Points',
 'Menu: Extrude and Edit - ']

In [None]:
import pandas as pd
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import os
import sys

def init_worker():
    """Suppress stdout and stderr in worker processes."""
    sys.stdout = open(os.devnull, 'w')
    sys.stderr = open(os.devnull, 'w')

def process_rows(df_original, df_merged):
    results = []  # List to store processed rows
    index_original = 0
    for index, row in df_merged.iterrows():

        message_engs = row['message_content'].split(',')
        
        # Check if message_engs has more than one component and matches unique_items
        if len(message_engs) > 1 and any(item in unique_items for item in message_engs):
            i = len(message_engs)  # Number of components
        else:
            i = 1

        # Calculate the new index for matching
        new_index = index_original + i - 1

        
        # Ensure new_index is within bounds
        if new_index >= len(df_original):
            new_index = len(df_original) - 1

        # Check for matching conditions
        matched = (
            df_original.at[new_index, 'session_anonymized'] == row['session_anonymized'] and
            df_original.at[new_index, 'message_content'] == row['message_content']
        ) if new_index in df_original.index else False

        if matched:  # If match found
            processed_row = {
                'session_anonymized': df_original.at[new_index, 'session_anonymized'],
                'ts': df_original.at[new_index, 'ts'],
                'cat': df_original.at[new_index, 'cat'],
                'message': df_original.at[new_index, 'message'],
                'message_eng': df_original.at[new_index, 'message_eng'],  # Keep the merged DataFrame's message_eng
                'message_content': row['message_content'], 
                'localization_id': df_original.at[new_index, 'localization_id'],  
                'merge_count': df_original.at[new_index, 'merge_count'],  # Increment merge_count
            }
        else:  # If no match found
            processed_row = {
                'session_anonymized': row['session_anonymized'],
                'ts': df_original.at[new_index, 'ts'] if new_index in df_original.index else None,
                'cat': 'workflow',  # Default category
                'message': ", ".join(df_original.at[i, 'message'] for i in range(index_original, new_index+1)),
                'message_eng': ", ".join(df_original.at[i, 'message_eng'] for i in range(index_original, new_index+1)),
                'message_content': ", ".join(df_original.at[i, 'message_content'] for i in range(index_original, new_index+1)),
                'localization_id': ", ".join(df_original.at[i, 'localization_id'] for i in range(index_original, new_index+1)),
                'merge_count': 1,  # Initialize merge_count
            }

        # Update index_original for the next iteration
        index_original += i

        # Append the processed row to results
        results.append(processed_row)

    # Convert results back into a DataFrame
    processed_df = pd.DataFrame(results)
    return processed_df

def process_group(args):
    """Helper function to process a single group."""
    df_original, df_merged = args
    # Reset the index for each group before processing
    df_original = df_original.reset_index(drop=True)
    df_merged = df_merged.reset_index(drop=True)
    return process_rows(df_original, df_merged)

def parallel_process(df_original, df_merged, num_cpus=20):
    # Group the DataFrames by 'session_anonymized' and reset the index for each group
    grouped_original = {
        key: group.reset_index(drop=True)
        for key, group in df_original.groupby('session_anonymized')
    }
    grouped_merged = {
        key: group.reset_index(drop=True)
        for key, group in df_merged.groupby('session_anonymized')
    }

    # Create arguments for each group
    args = [
        (grouped_original[key], grouped_merged[key])
        for key in grouped_original.keys() & grouped_merged.keys()
    ]

    # Set the number of CPUs to the available count if exceeding
    num_cpus = min(num_cpus, cpu_count())

    # Use multiprocessing to process groups in parallel
    with Pool(processes=num_cpus, initializer=init_worker) as pool:
        results = list(tqdm(pool.imap(process_group, args), total=len(args), desc="Processing Groups"))

    # Combine all processed groups into a single DataFrame
    processed_df = pd.concat(results, ignore_index=True)
    return processed_df

# Example usage
processed_df_flow = parallel_process(merged_processed_unitied_logs, df_processed, num_cpus=20)
processed_df_flow

In [15]:
processed_df_flow[processed_df_flow['cat'] == 'workflow']

Unnamed: 0,session_anonymized,ts,cat,message,message_eng,message_content,localization_id,merge_count
96,38A9880F,1679008519,workflow,"Tool: Rectangle (-203), End Event: Wijzig inst...","Tool: Rectangle (-203), End Event: Change Attr...","Tool: Rectangle, End Event: Change Attributes","(-203), (11)",1
852,37C48083,1692048655,workflow,"Tool: Line (-201), Tool: Move by Points (-352)","Tool: Line (-201), Tool: Move by Points (-352)","Tool: Line, Tool: Move by Points","(-201), (-352)",1
1393,4FD137B1,1678449091,workflow,"Menu: Copy - (-28) (0), End Event: 繧｢繧ｯ繝繧｣繝悶↑...","Menu: Copy - (-28) (0), End Event: Set Active...","Menu: Copy - , End Event: Set Active Layer, Me...","(-28) (0), (32), (-29) (0)",1
1456,C86E0BA0,1697188269,workflow,"Menu: Duplicate - (-33) (0), End Event: Dépla...","Menu: Duplicate - (-33) (0), End Event: Drag ...","Menu: Duplicate - , End Event: Drag, End Event...","(-33) (0), (75), (282)",1
1463,C86E0BA0,1697188332,workflow,"Menu: Duplicate - (-33) (0), End Event: Dépla...","Menu: Duplicate - (-33) (0), End Event: Drag ...","Menu: Duplicate - , End Event: Drag, End Event...","(-33) (0), (75), (282)",1
...,...,...,...,...,...,...,...,...
375304506,300AAFA5,1685594045,workflow,"Menu: Copy - (-28) (0), End Event: Set Active...","Menu: Copy - (-28) (0), End Event: Set Active...","Menu: Copy - , End Event: Set Active Layer, Me...","(-28) (0), (32), (-29) (0)",1
375304548,300AAFA5,1685599183,workflow,"Menu: Copy - (-28) (0), End Event: Set Active...","Menu: Copy - (-28) (0), End Event: Set Active...","Menu: Copy - , End Event: Set Active Layer, Me...","(-28) (0), (32), (-29) (0)",1
375304595,300AAFA5,1685604958,workflow,"Tool: Rectangle (-203), End Event: Change Attr...","Tool: Rectangle (-203), End Event: Change Attr...","Tool: Rectangle, End Event: Change Attributes","(-203), (11)",1
375304596,300AAFA5,1685604964,workflow,"Tool: Rectangle (-203), End Event: Change Attr...","Tool: Rectangle (-203), End Event: Change Attr...","Tool: Rectangle, End Event: Change Attributes","(-203), (11)",1


In [None]:
processed_df_flow.to_parquet('data/merged_logs_with_workflows.parquet', index=False)

## Unique items vocabulary

In [18]:
df_final = processed_df_flow.copy()

In [19]:
workflows_df = df_final[df_final['cat'] == 'workflow']

In [20]:
counts_workflows = workflows_df['message_content'].value_counts().reset_index()
counts_workflows.columns = ['message_content', 'count']

In [21]:
counts_workflows

Unnamed: 0,message_content,count
0,"Tool: Rectangle, End Event: Change Attributes",703314
1,"Menu: Copy - , End Event: Set Active Layer, Me...",639281
2,"Tool: Line, Tool: Move by Points",397279
3,"Tool: Line, End Event: Delete, Tool: Line",392573
4,"Tool: 2D Polygon, End Event: Change Attributes",344332
5,"Menu: Duplicate - , End Event: Drag, End Event...",278704
6,"Tool: Rectangle, Menu: Add Surface -",238289
7,"Tool: Rectangle, Menu: Extrude and Edit -",217330
8,"Tool: Mirror, End Event: Drag, End Event: Resize",67975
9,"Menu: Save As - , Menu: Export PDF -",19321


In [22]:
counts_workflows.to_csv('workflows.csv')

In [23]:
counts = merged_processed_unitied_logs['message_content'].value_counts().reset_index()
counts.columns = ['message_content', 'count']

In [24]:
counts

Unnamed: 0,message_content,count
0,End Event: Drag,48492994
1,End Event: Delete,32555193
2,End Event: Resize,23870789
3,End Event: Shape Pane Edit,20951431
4,End Event: Set Active Layer,19596011
...,...,...
4924,Tool: St Frame_elevation,10
4925,End Event: Define Center Line Marker,10
4926,End Event: Generate columns and intercolumns,10
4927,Tool: DBTools,10


In [25]:
import json
messages = counts['message_content'].to_list()
workflows = counts_workflows['message_content'].to_list()

In [26]:
voc = messages + workflows

In [27]:
message_dict = {item: index + 1 for index, item in enumerate(voc)}

# Write the dictionary to a JSON file
with open('./1226voc_10workflows.json', 'w') as json_file:
    json.dump(message_dict, json_file, indent=4)

print("Sorted mtoi dictionary saved as voc_10workflows.json")

Sorted mtoi dictionary saved as voc_10workflows.json
