In [1]:
import os
import pandas as pd
import pyarrow.parquet as pq
import string
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

### Filtering Functions: **Actual Modeling Logs** → **Aligned Logs**
This process includes:

- Translation of command names based on `command_dictionary.csv`.  
- Redundant command removal based on `command_pairs_collections.csv`, retaining only high-level commands to represent the same action.  

In [2]:
def read_unique_commands(path):
    """Load the per-command count table stored as parquet at ``path``.

    The frame's index is moved into a regular column so that callers can
    address it by name; the returned frame carries a fresh default index.
    """
    return pd.read_parquet(path).reset_index()

def drop_less_commands(df, commands_set):
    """Return the rows of ``df`` whose ``message`` value is not in ``commands_set``.

    The input frame is left untouched and the surviving rows keep their
    original index labels.
    """
    is_excluded = df['message'].isin(commands_set)
    return df.loc[~is_excluded]

def read_language_dic(path, df):
    """Attach translated command names to ``df`` and drop untranslatable rows.

    Parameters
    ----------
    path : str
        CSV file with a ``message`` column (raw command name) and a ``label``
        column (its translation).
    df : pandas.DataFrame
        Log rows containing a ``message`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with an extra ``message_eng`` column, restricted to the
        rows whose ``message`` has an entry in the dictionary. Original index
        labels are kept.

    The caller's frame is not modified: the column is added with ``assign``
    rather than by item assignment, which also avoids pandas'
    SettingWithCopyWarning when ``df`` is itself a filtered slice.
    """
    language_df = pd.read_csv(path)
    # If a message appears more than once in the CSV, the last label wins.
    translation_dict = pd.Series(language_df['label'].values, index=language_df['message']).to_dict()
    translated = df.assign(message_eng=df['message'].map(translation_dict))
    return translated.dropna(subset=['message_eng'])

def contains_non_printable(text):
    """Return True if ``text`` has any character outside ``string.printable``."""
    allowed = set(string.printable)
    for ch in text:
        if ch not in allowed:
            return True
    return False

In [3]:

def get_mapping_dict(tool_menu_dict, tool_menu_event_path):
    """Fill ``tool_menu_dict`` in place from the command-pair CSV.

    The CSV must provide ``tool/menu`` and ``event`` columns. For every row,
    the ``event`` value is appended to the list stored under the row's
    ``tool/menu`` key (the list is created on first use). Existing entries of
    ``tool_menu_dict`` are kept and extended. Nothing is returned.
    """
    pairs = pd.read_csv(tool_menu_event_path)
    for _, record in pairs.iterrows():
        # setdefault creates the list the first time a key is seen.
        tool_menu_dict.setdefault(record['tool/menu'], []).append(record['event'])

def check_message(tool_menu_dict, tool_menu_key, undo_row) -> bool:
    """Tell whether ``undo_row`` is one of the events paired with ``tool_menu_key``.

    Returns False when ``tool_menu_key`` has no entry in ``tool_menu_dict``
    (in that case ``undo_row`` is not inspected at all); otherwise returns
    whether ``undo_row['message_eng']`` is listed under that key.
    """
    return (
        tool_menu_key in tool_menu_dict
        and undo_row['message_eng'] in tool_menu_dict[tool_menu_key]
    )
            
def find_drop_rows(rows_to_drop, index, row, undo_rows, tool_menu_dict):
    """Decide which row of a Tool/Menu + UNDO pairing is redundant.

    Parameters
    ----------
    rows_to_drop : list
        Accumulator of index labels to drop; extended in place.
    index : hashable
        Index label of the Tool/Menu row being examined.
    row : mapping
        The Tool/Menu row; only ``row['message_eng']`` is read.
    undo_rows : pandas.DataFrame
        Candidate UNDO rows; each must expose ``message_eng``.
    tool_menu_dict : dict
        Maps a Tool/Menu command name to the list of event names paired
        with it.

    Outcome (exactly one of):
      * the command has no entry in ``tool_menu_dict`` -> nothing is dropped;
      * the first candidate (in ``undo_rows`` order) whose ``message_eng`` is
        paired with the command -> that candidate's index label is appended;
      * the command is in the dictionary but no candidate matches -> ``index``
        (the Tool/Menu row itself) is appended.

    The candidates are scanned once; the previous implementation re-scanned
    them a second time only to re-derive whether a match had been found.
    """
    command = row['message_eng']

    # Commands without a pairing entry are kept as they are.
    if command not in tool_menu_dict:
        return

    paired_events = tool_menu_dict[command]
    for _, undo_row in undo_rows.iterrows():
        if undo_row['message_eng'] in paired_events:
            # Matching low-level UNDO row found: it is the redundant one.
            rows_to_drop.append(undo_row.name)
            return

    # Known command, but none of the nearby UNDO rows belongs to it.
    rows_to_drop.append(index)


def replace_low_level(df, tool_menu_event_path, lookback=10, lookahead=30,
                      max_following_undo=5, max_preceding_undo=1):
    """Remove redundant rows around Tool/Menu events, session by session.

    Parameters
    ----------
    df : pandas.DataFrame
        Log rows with at least ``session_anonymized``, ``cat``, ``ts`` and
        ``message_eng`` columns.
        NOTE(review): the window below does arithmetic on index labels
        (``index - lookback``), so ``df`` is assumed to carry an integer
        index - confirm against callers.
    tool_menu_event_path : str
        CSV passed to ``get_mapping_dict`` to build the command -> paired
        events mapping.
    lookback, lookahead : int
        Bounds of the candidate window, expressed on index *labels*: rows
        whose label lies in ``[max(0, index - lookback), index + lookahead]``.
        Because labels need not be contiguous, this is not necessarily
        ``lookback + lookahead`` rows.
    max_following_undo, max_preceding_undo : int
        How many UNDO rows with ``ts >= `` / ``ts < `` the event's ``ts`` are
        considered as candidates.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the filtered sessions with a fresh default index.
        If ``df`` yields no groups (e.g. it is empty), an empty frame with the
        columns of ``df`` is returned instead of letting ``pd.concat`` raise
        ``ValueError`` on an empty list.
    """
    tool_menu_dict = {}
    get_mapping_dict(tool_menu_dict, tool_menu_event_path)  # Populate the tool_menu_dict

    grouped_data = df.groupby('session_anonymized')
    processed_data = []
    for _session_id, group_df in tqdm(grouped_data, desc="Processing grouped data filtering"):
        rows_to_drop = []
        # Walk the session backwards so that rows marked for dropping while
        # handling a later event are already excluded for earlier events.
        for index, row in reversed(list(group_df.iterrows())):
            if row['cat'] in ['Tool', 'Menu']:
                ts = row['ts']
                # Candidate window on index labels around the event.
                start_index = max(0, index - lookback)
                end_index = index + lookahead
                surrounding_rows = group_df[(group_df.index >= start_index) & (group_df.index <= end_index)]
                not_dropped = ~surrounding_rows.index.isin(rows_to_drop)
                sub_rows = surrounding_rows[(surrounding_rows['ts'] >= ts) & not_dropped]
                up_rows = surrounding_rows[(surrounding_rows['ts'] < ts) & not_dropped]

                # Candidate UNDO rows: the first few at/after the event plus
                # the first one before it (in frame order within the window).
                # NOTE(review): head() on up_rows picks the earliest preceding
                # UNDO in the window, not the nearest one - confirm intent.
                undo_rows = pd.concat([
                    sub_rows[sub_rows['cat'] == 'UNDO'].head(max_following_undo),
                    up_rows[up_rows['cat'] == 'UNDO'].head(max_preceding_undo)
                ])

                if not undo_rows.empty:
                    find_drop_rows(rows_to_drop, index, row, undo_rows, tool_menu_dict)
                else:
                    # No UNDO row nearby at all: the Tool/Menu row is dropped.
                    rows_to_drop.append(index)
        # Drop the rows that are to be filtered out
        group_df = group_df.drop(rows_to_drop).reset_index(drop=True)
        processed_data.append(group_df)

    if not processed_data:
        # pd.concat([]) raises ValueError; return an empty, same-schema frame.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(processed_data, ignore_index=True)

In [4]:
def process_file(file_path, unique_commands_path, lang_dict_path, tool_menu_event_path):
    """Run the full filtering pipeline on one parquet chunk of logs.

    Steps, in order: drop commands whose global count is 10 or less,
    translate command names (rows without a translation are dropped), drop
    rows whose translated name has non-printable characters, then remove
    redundant rows around Tool/Menu events.

    Returns the resulting DataFrame.
    """
    logs = pd.read_parquet(file_path)

    # Commands that occur 10 times or fewer are excluded up front.
    command_counts = read_unique_commands(unique_commands_path)
    is_rare = command_counts['count'] <= 10
    rare_commands = command_counts.loc[is_rare, 'message'].tolist()

    logs = drop_less_commands(logs, rare_commands)
    logs = read_language_dic(lang_dict_path, logs)

    has_non_printable = logs['message_eng'].apply(contains_non_printable)
    logs = logs[~has_non_printable]

    return replace_low_level(logs, tool_menu_event_path)


In [5]:
def process_files_in_parallel(file_paths, unique_commands_path, lang_dict_path, tool_menu_event_path, output_path, num_workers=80):
    """Filter every log chunk in a process pool and write the combined result.

    Parameters
    ----------
    file_paths : iterable of str
        Parquet chunks to process; each is handed to ``process_file``.
    unique_commands_path, lang_dict_path, tool_menu_event_path : str
        Forwarded unchanged to ``process_file`` for every chunk.
    output_path : str
        Destination parquet file (written without the index).
    num_workers : int or None
        Upper bound on the pool size. The pool never has more workers than
        there are chunks; ``None`` lets ``multiprocessing`` choose.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty. Previously this surfaced only after the
        pool had been started, as pandas' "No objects to concatenate".
    """
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths is empty: there are no parquet chunks to process")

    # Extra workers beyond the number of chunks would only sit idle.
    pool_size = None if num_workers is None else min(num_workers, len(file_paths))

    job_args = [
        (file_path, unique_commands_path, lang_dict_path, tool_menu_event_path)
        for file_path in file_paths
    ]
    with Pool(processes=pool_size) as pool:
        results = pool.starmap(process_file, job_args)

    combined_data = pd.concat(results, ignore_index=True)
    combined_data.to_parquet(output_path, index=False)


In [None]:
if __name__ == '__main__':
    # Folder holding the parquet chunks of the actual modeling logs.
    input_folder = '/data/groupby'
    unique_commands_path = '/data/message_counts.parquet'
    lang_dict_path = '/data/command_dictionary.csv'
    tool_menu_event_path = '/data/command_pairs_collections.csv'
    output_path = '/data/aligned_logs.parquet'

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the combined output reproducible from run to run.
    file_paths = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith('.parquet')
    )

    process_files_in_parallel(file_paths, unique_commands_path, lang_dict_path, tool_menu_event_path, output_path, num_workers=80)


Processing grouped data filtering: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Processing grouped data filtering: 100%|██████████| 38/38 [00:16<00:00,  2.30it/s]
Processing grouped data filtering: 100%|██████████| 9997/9997 [1:28:11<00:00,  1.89it/s]]    
Processing grouped data filtering:  87%|████████▋ | 8746/9998 [1:28:30<13:24,  1.56it/s]] 
Processing grouped data filtering: 100%|██████████| 10000/10000 [1:29:11<00:00,  1.87it/s]
Processing grouped data filtering: 100%|██████████| 9998/9998 [1:29:14<00:00,  1.87it/s]]
Processing grouped data filtering: 100%|██████████| 9999/9999 [1:29:31<00:00,  1.86it/s]] 
Processing grouped data filtering: 100%|██████████| 10000/10000 [1:29:47<00:00,  1.86it/s]
Processing grouped data filtering: 100%|██████████| 9997/9997 [1:29:48<00:00,  1.86it/s]]  
Processing grouped data filtering: 100%|██████████| 9999/9999 [1:29:37<00:00,  1.86it/s]]
Processing grouped data filtering: 100%|██████████| 9993/9993 [1:30:31<00:00,  1.84it/s]]  
Processing gro

### Postprocessing: Remove Infrequent Commands

In [None]:
from collections import Counter

# Load the logs written by the parallel filtering step.
df_filtered = pd.read_parquet(output_path)

# Frequency of every translated command after filtering.
message_list = df_filtered['message_eng'].to_list()
counts = Counter(message_list)
combined_message_count = (
    pd.DataFrame.from_dict(counts, orient='index', columns=['count'])
    .reset_index()
    .rename(columns={'index': 'message'})
    .sort_values(by='count', ascending=False)
)

# Translated commands seen fewer than 10 times. The count table names the
# command column 'message' (see the rename above), and its values are
# 'message_eng' values, so the rows are filtered on 'message_eng'.
df_merged_deleted = combined_message_count.loc[combined_message_count['count'] < 10, 'message'].tolist()

# Filter inline instead of redefining drop_less_commands: a second definition
# would shadow the helper used by process_file and break re-running it.
df_log_dataset_dropped = df_filtered[~df_filtered['message_eng'].isin(df_merged_deleted)]

## Aligned Logs

In [None]:
# Persist the post-processed logs.
# NOTE(review): this is the same path the parallel filtering step writes
# (output_path), so that intermediate result is overwritten, and re-running
# the post-processing cell afterwards would read already-filtered data.
# NOTE(review): unlike the earlier write, index=False is not passed here, so
# the frame's index is handled by to_parquet's default - confirm this is
# intended.
df_log_dataset_dropped.to_parquet('/data/aligned_logs.parquet')