In [None]:
import pandas as pd
# Load the log table from Parquet. Later cells group it by 'session_anonymized'
# and compare rows on 'message_content', so both columns must be present.
df_logs_result_test = pd.read_parquet('data/aligned_logs.parquet')

### Merge consecutive duplicate messages based on message content

In [5]:
import pandas as pd
import json
import numpy as np
from collections import defaultdict, Counter
from multiprocessing import Pool, cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
def process_group(group_df):
    """
    Collapse every run of consecutive rows that share the same 'message_content'.

    Only the last row of each run is kept, and a 'merge_count' column records how
    many original rows that run contained (1 for rows that were not merged).
    Rows are compared in their current order and on 'message_content' only; the
    other columns do not influence the result. Missing values never compare equal,
    so rows with a missing 'message_content' are always kept individually.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows of a single group, already in the desired order. Must contain a
        'message_content' column. The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh 0..n-1 index and an int64 'merge_count'
        column (an existing 'merge_count' column is overwritten).
    """
    content = group_df['message_content']

    # True where a row has the same content as the row directly after it, i.e.
    # the run continues. The last row compares against a missing value and is
    # therefore always the end of a run; fillna covers nullable dtypes where the
    # comparison yields <NA> instead of False.
    continues_run = content.eq(content.shift(-1)).fillna(False).to_numpy(dtype=bool)

    # Work with positions rather than index labels so the result stays correct
    # even when the incoming index contains repeated labels.
    run_end_positions = np.flatnonzero(~continues_run)

    # The distance between consecutive run ends is the run length; prepending -1
    # makes the first run count from the start of the frame.
    run_lengths = np.diff(run_end_positions, prepend=-1).astype('int64')

    # iloc + assign build a new frame, so the caller's DataFrame is never mutated.
    return (
        group_df.iloc[run_end_positions]
        .reset_index(drop=True)
        .assign(merge_count=run_lengths)
    )

def process_group_wrapper(args):
    """Adapter for pool workers: accept a (session_id, group_df) pair and process the frame."""
    _session_id, session_frame = args  # the grouping key is not needed, only the rows
    return process_group(session_frame)

def parallel_process(grouped_data, num_cpus, chunksize=None):
    """
    Run process_group_wrapper over every group in a pool of worker processes.

    Parameters
    ----------
    grouped_data : sequence of (key, DataFrame) tuples
        Must support len(), which is used for the progress bar total.
    num_cpus : int or None
        Number of worker processes, passed straight to multiprocessing.Pool.
    chunksize : int, optional
        How many groups are sent to a worker per task. When omitted, a batch
        size is derived from the input length so that each worker still gets
        several batches, capped at 256 groups per batch. Dispatching one group
        per task (the Pool.imap default) adds inter-process overhead for every
        single group, which dominates when there are many small groups.

    Returns
    -------
    list
        One processed result per group, in the same order as grouped_data.
    """
    total = len(grouped_data)

    if chunksize is None:
        # Pool(None) falls back to the machine's CPU count, so mirror that here.
        workers = num_cpus or cpu_count()
        chunksize = max(1, min(256, total // (workers * 8)))

    with Pool(num_cpus) as pool:
        # imap yields results in input order regardless of chunksize.
        results = list(tqdm(pool.imap(process_group_wrapper, grouped_data, chunksize=chunksize),
                            total=total,
                            desc="Processing in parallel"))
    return results

In [6]:
# Materialise the per-session groups as a list of (session_id, group_df) tuples
# so they can be counted for the progress bar and handed to the worker pool.
grouped_data = list(df_logs_result_test.groupby('session_anonymized'))

# Cap the pool at MAX_WORKERS processes, or fewer when the machine has fewer cores.
MAX_WORKERS = 10
num_cpus = min(MAX_WORKERS, cpu_count())

# Process every session group in parallel
processed_data = parallel_process(grouped_data, num_cpus)

# Combine all processed groups into a single DataFrame with a fresh index
united_df = pd.concat(processed_data, ignore_index=True)

Processing in parallel: 100%|██████████| 1458578/1458578 [28:16<00:00, 859.66it/s] 


In [7]:
# Preview the merged frame; the notebook display truncates it to the first and last rows.
united_df

Unnamed: 0,session_anonymized,ts,cat,message,message_eng,message_content,localization_id,merge_count
0,00000ED8,1685635280,UNDO,End Event: Selecteer actieve laag (32),End Event: Set Active Layer (32),End Event: Set Active Layer,(32),1
1,00000ED8,1685635296,UNDO,End Event: Dupliceer (206),End Event: Duplicate (206),End Event: Duplicate,(206),1
2,00000ED8,1685635300,Tool,Tool: Text (-200),Tool: Text (-200),Tool: Text,(-200),1
3,00000ED8,1685635524,UNDO,End Event: Verplaats (75),End Event: Drag (75),End Event: Drag,(75),32
4,00000ED8,1685635527,Menu,Menu: Group Chunk - Groepeer (-514) (1),Menu: Group Chunk - Group (-514) (1),Menu: Group Chunk - Group,(-514) (1),1
...,...,...,...,...,...,...,...,...
379985029,FFFFF929,1687233115,Menu,Menu: OpenGL Render Chunk - 繧ｷ繧ｧ繧､繝 (-529) (1),Menu: OpenGL Render Chunk - Shaded (-529) (1),Menu: OpenGL Render Chunk - Shaded,(-529) (1),1
379985030,FFFFF929,1687233155,Menu,Menu: Fit To Window - (-77) (0),Menu: Fit To Window - (-77) (0),Menu: Fit To Window -,(-77) (0),1
379985031,FFFFF929,1687233171,UNDO,End Event: 繧ｪ繝ｼ繧ｬ繝翫う繧ｶ縺ｮ邱ｨ髮 (311),End Event: Edit Organization (311),End Event: Edit Organization,(311),1
379985032,FFFFF929,1687233190,Menu,Menu: Document Preferences - (-170) (0),Menu: Document Preferences - (-170) (0),Menu: Document Preferences -,(-170) (0),1


### Drop messages that occur fewer than 10 times

In [8]:
# Tally how many rows carry each 'message_content' value, most frequent first
counts_content = (
    united_df['message_content']
    .value_counts()
    .rename_axis('message_content')
    .reset_index(name='count')
)
counts_content

Unnamed: 0,message_content,count
0,End Event: Drag,48492994
1,End Event: Delete,32555193
2,End Event: Resize,23870789
3,End Event: Shape Pane Edit,20951431
4,End Event: Set Active Layer,19596011
...,...,...
5589,End Event: BD Auto Address,1
5590,End Event: Update of the &quot;Pynk 02&quot; m...,1
5591,End Event: Create Wall Feature,1
5592,End Event: Rename Symbol Def,1


In [9]:
def drop_less_commands(df, commands_set):
    """
    Return the rows of df whose 'message_content' is not in commands_set.

    commands_set may be any collection accepted by Series.isin (list, set, ...).
    The original index labels of the surviving rows are preserved.
    """
    is_unwanted = df['message_content'].isin(commands_set)
    return df.loc[~is_unwanted]

In [11]:
# Collect the message contents seen fewer than 10 times, then remove their rows
is_rare = counts_content['count'] < 10
df_merged_deleted = counts_content.loc[is_rare, 'message_content'].tolist()
df_log_dataset_dropped = drop_less_commands(united_df, df_merged_deleted)

In [12]:
# Preview the filtered frame; rows keep their index labels from united_df (the index is not reset).
df_log_dataset_dropped


Unnamed: 0,session_anonymized,ts,cat,message,message_eng,message_content,localization_id,merge_count
0,00000ED8,1685635280,UNDO,End Event: Selecteer actieve laag (32),End Event: Set Active Layer (32),End Event: Set Active Layer,(32),1
1,00000ED8,1685635296,UNDO,End Event: Dupliceer (206),End Event: Duplicate (206),End Event: Duplicate,(206),1
2,00000ED8,1685635300,Tool,Tool: Text (-200),Tool: Text (-200),Tool: Text,(-200),1
3,00000ED8,1685635524,UNDO,End Event: Verplaats (75),End Event: Drag (75),End Event: Drag,(75),32
4,00000ED8,1685635527,Menu,Menu: Group Chunk - Groepeer (-514) (1),Menu: Group Chunk - Group (-514) (1),Menu: Group Chunk - Group,(-514) (1),1
...,...,...,...,...,...,...,...,...
379985029,FFFFF929,1687233115,Menu,Menu: OpenGL Render Chunk - 繧ｷ繧ｧ繧､繝 (-529) (1),Menu: OpenGL Render Chunk - Shaded (-529) (1),Menu: OpenGL Render Chunk - Shaded,(-529) (1),1
379985030,FFFFF929,1687233155,Menu,Menu: Fit To Window - (-77) (0),Menu: Fit To Window - (-77) (0),Menu: Fit To Window -,(-77) (0),1
379985031,FFFFF929,1687233171,UNDO,End Event: 繧ｪ繝ｼ繧ｬ繝翫う繧ｶ縺ｮ邱ｨ髮 (311),End Event: Edit Organization (311),End Event: Edit Organization,(311),1
379985032,FFFFF929,1687233190,Menu,Menu: Document Preferences - (-170) (0),Menu: Document Preferences - (-170) (0),Menu: Document Preferences -,(-170) (0),1


In [None]:
# Write the filtered frame to Parquet.
# NOTE(review): this cell has no execution count and its recorded output is
# "IOStream.flush timed out" — confirm the file was written completely.
df_log_dataset_dropped.to_parquet('data/merged_logs.parquet')

IOStream.flush timed out


In [13]:
# Re-tally 'message_content' after filtering to check that no value occurs fewer than 10 times
test = (
    df_log_dataset_dropped['message_content']
    .value_counts()
    .rename_axis('message_content')
    .reset_index(name='count')
)
test

Unnamed: 0,message_content,count
0,End Event: Drag,48492994
1,End Event: Delete,32555193
2,End Event: Resize,23870789
3,End Event: Shape Pane Edit,20951431
4,End Event: Set Active Layer,19596011
...,...,...
4924,Tool: St Frame_elevation,10
4925,End Event: Define Center Line Marker,10
4926,End Event: Generate columns and intercolumns,10
4927,Tool: DBTools,10
