In [1]:
%pip install python-Levenshtein


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import utils

import json
import os
import re
from tqdm import tqdm

import action_parser
from extract_coauthor_raw_logs import jsonl_names

import re
import Levenshtein



Logs successfully saved to: /Users/chelseashe/Thought_Toolkit/formal1_logs.json


### Set Up

In [3]:
# Resolve data files relative to the current working directory
script_dir = os.getcwd()

# Get the raw log dataset
file_path = os.path.join(script_dir, 'formal1_logs.json')

# Feel free to uncomment the line below and start with a smaller sample (20 writing sessions) to reduce runtime
# file_path = os.path.join(script_dir, 'small_logs_for_test.json')

# Open and load the JSON file.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (which is not UTF-8 everywhere).
with open(file_path, encoding="utf-8") as f:
    logs_by_session = json.load(f)

In [4]:
# Disabled alternative loader, kept as an inert string literal (it is never
# executed): it would read the raw file text with f.read() instead of reusing
# the dictionary parsed in the Set Up cell.
"""
# Load raw JSON-like data from file
file_path = os.path.join(script_dir, '/content/testing_new_site.json')
with open(file_path, 'r') as f:
    raw_data = f.read()
"""

# `raw_data` is an alias of the already-parsed logs, not a copy: both names
# refer to the same dict object until `logs_by_session` is rebound.
raw_data = logs_by_session


def fix_all_arrays(raw_json_string):
    """
    Insert missing commas between adjacent objects inside bracketed arrays.

    Every `[ ... ]` span whose content starts with `{` and ends with `}`
    (matched lazily, across newlines) is rewritten so that a `}` followed only
    by whitespace and then `{` becomes `},` + newline + `{`. Whitespace just
    inside the brackets is dropped. Text outside such spans is left as is.

    This is a purely textual, regex-based repair: it does not parse JSON and
    does not distinguish braces inside string values from structural ones.

    Args:
        raw_json_string: The JSON-like text to repair.

    Returns:
        The repaired text.
    """
    # An array whose content begins and ends with an object
    array_of_objects = re.compile(r'\[\s*({.*?})\s*\]', flags=re.DOTALL)
    # Two objects with nothing but whitespace between them
    touching_objects = re.compile(r'\}\s*\{')

    def fix_array(match):
        inner = match.group(1).strip()
        separated = touching_objects.sub('},\n{', inner)
        return f'[{separated}]'

    return array_of_objects.sub(fix_array, raw_json_string)


# `logs_by_session` was already parsed by json.load, so its serialized form is
# valid JSON in which a closing brace is never directly followed by an opening
# brace structurally (json.dumps always emits a separator between elements).
# A `}` ... `{` sequence can therefore only occur inside a string value, and
# inserting a comma there would silently alter the logged text. For that
# reason no comma-inserting substitution is applied to the serialized data.
#
# The dumps/loads round-trip itself is kept: it rebinds `logs_by_session` to a
# fresh deep copy, so `raw_data` keeps pointing at the untouched original even
# after the filtering below.
raw_data_str = json.dumps(logs_by_session)
fixed_data = raw_data_str


# Now try to load the data as JSON.
try:
    logs_by_session = json.loads(fixed_data)
    print("✅ Fixed and loaded JSON successfully!")
except json.JSONDecodeError as e:
    print("❌ Failed to parse JSON:", e)


# Drop "system-initialize" events from every session whose value is a list.
# Only existing keys are reassigned, so the dict does not change size while it
# is being iterated.
for test_user, events in logs_by_session.items():
    if isinstance(events, list):
        logs_by_session[test_user] = [
            event for event in events if event.get("eventName") != "system-initialize"
        ]

✅ Fixed and loaded JSON successfully!


In [5]:
def are_strings_similar_lev(str1, str2, max_differences=4):
    """
    Tell whether two strings are within a given Levenshtein distance.

    Args:
        str1: First string.
        str2: Second string.
        max_differences: Largest edit distance still counted as similar
            (inclusive).

    Returns:
        True if `Levenshtein.distance(str1, str2)` is at most
        `max_differences`, otherwise False.
    """
    return Levenshtein.distance(str1, str2) <= max_differences

### Parsing Raw Log JSON File into Structured Level 1 Actions

This section processes raw logs and converts them into Level 1 actions using an analyzer. Each parsed action is enriched with a **level_1_action_type** key, which specifies the action type (e.g., `insert_text`, `delete_text`, `accept_suggestion`).

**level_1_actions_per_session** is a dictionary where each session key maps to a list of parsed actions, organizing the output by session for streamlined analysis and further processing.

In [6]:
# Use the current working directory.
# `script_dir` is assigned the same way in the Set Up cell; this re-reads
# os.getcwd() at the time this cell runs.
script_dir = os.getcwd()

In [7]:
# Sentences attributed to AI insertions so far. This is module-level state:
# split_insert_text_by_delta appends to it on every AI split, so it keeps
# growing across calls until this cell is re-run.
ai_sentences = []


def _drop_prompt_sentences(sentences):
    """Return the sentences that contain fewer than two `$` characters."""
    return [s for s in sentences if s.count("$") < 2]


def split_insert_text_by_delta(action, prev_action, threshold=5):
    """
    Splits an 'insert_text' action into AI and human based on insert length from delta.

    The inserted text is read from the second op of the first log entry's
    text delta (`action["action_logs"][0]["textDelta"]["ops"][1]["insert"]`).
    The action is treated as starting with an AI insertion when the previous
    action's `level_1_action_type` is "present_suggestion" and the stripped
    inserted text is a string of at least `threshold` characters.

    Args:
        action: The 'insert_text' action dict to classify.
        prev_action: The action preceding it; must carry `level_1_action_type`.
        threshold: Minimum stripped length of the inserted text for it to
            count as an AI insertion.

    Returns:
        A list of action dicts:
        - `[action]` (the same object, modified in place) labelled
          "insert_text_human" when the delta cannot be read;
        - `[ai_action, action_human]` (shallow copies) when an AI insertion is
          detected; its sentences are also appended to the module-level
          `ai_sentences`;
        - `[action_human]` (a shallow copy) otherwise.
        Every returned action carries `sentences_temporal_order_without_prompts`;
        the human ones also carry `human_sentences_temporal_order`.

    Raises:
        KeyError: If a required key (e.g. `sentences_temporal_order`, or
            `level_1_action_type` on `prev_action`) is missing.
    """
    try:
        ops = action["action_logs"][0]["textDelta"]["ops"]
        # NOTE(review): presumably ops[0] is a retain and ops[1] the insert; a
        # delta whose insert is the first op would take the fallback below —
        # confirm against the logger's delta format.
        inserted_text = ops[1].get("insert", "")
    except (IndexError, KeyError, TypeError, AttributeError):
        # Unreadable delta (including an ops[1] that is not a mapping): label
        # the whole action as human, in place.
        action["level_1_action_type"] = "insert_text_human"
        action["human_sentences_temporal_order"] = " ".join(
            action["sentences_temporal_order"]
        )
        # Set here as well so every return path exposes the same keys.
        action["sentences_temporal_order_without_prompts"] = _drop_prompt_sentences(
            action["sentences_temporal_order"]
        )
        return [action]

    # A non-string insert payload cannot be measured or tokenized as text, so
    # it is never treated as an AI insertion (previously `.strip()` raised).
    is_ai_insert = (
        prev_action["level_1_action_type"] == "present_suggestion"
        and isinstance(inserted_text, str)
        and len(inserted_text.strip()) >= threshold
    )

    if not is_ai_insert:
        # ---- Purely human insert ----
        action_human = action.copy()
        action_human["level_1_action_type"] = "insert_text_human"

        # Drop sentences that match any AI sentence recorded so far.
        sentences_human = [
            s
            for s in action_human["sentences_temporal_order"]
            if not any(are_strings_similar_lev(s, ai_s) for ai_s in ai_sentences)
        ]
        action_human["human_sentences_temporal_order"] = " ".join(sentences_human)
        action_human["sentences_temporal_order_without_prompts"] = (
            _drop_prompt_sentences(action_human["sentences_temporal_order"])
        )
        return [action_human]

    delta = action["action_delta"]

    # ---- AI action ----
    ai_action = action.copy()
    # The AI part keeps only the first log entry (a single entry, not a list).
    ai_action["action_logs"] = ai_action["action_logs"][0]
    ai_action["action_delta"] = ["INSERT", inserted_text, delta[2], delta[3]]
    ai_action["action_modified_sentences"] = utils.sent_tokenize(inserted_text)
    ai_sentences.extend(ai_action["action_modified_sentences"])
    ai_action["action_end_writing"] = (
        ai_action["action_start_writing"] + inserted_text
    )
    ai_action["level_1_action_type"] = "insert_text_ai"

    # ---- Human action ----
    action_human = action.copy()
    # The human part starts where the AI part ended.
    action_human["action_start_writing"] = ai_action["action_end_writing"]
    remaining_text = action_human["action_end_writing"][
        len(action_human["action_start_writing"]) :
    ]
    action_human["action_delta"] = ["INSERT", remaining_text, delta[2], delta[3]]
    action_human["action_logs"] = action_human["action_logs"][1:]
    action_human["level_1_action_type"] = "insert_text_human"

    ai_modified = ai_action["action_modified_sentences"]

    def _matches_ai(sentence):
        # True when the sentence is close to one inserted by this AI action.
        return any(are_strings_similar_lev(sentence, ai_s) for ai_s in ai_modified)

    # ---- Filter modified sentences ----
    action_human["action_modified_sentences"] = [
        s for s in action_human["action_modified_sentences"] if not _matches_ai(s)
    ]

    # ---- Filter temporal order ----
    sentences_without_prompts = _drop_prompt_sentences(
        action_human["sentences_temporal_order"]
    )
    action_human["human_sentences_temporal_order"] = " ".join(
        s for s in sentences_without_prompts if not _matches_ai(s)
    )

    # Both halves share the same prompt-free sentence list object.
    action_human["sentences_temporal_order_without_prompts"] = sentences_without_prompts
    ai_action["sentences_temporal_order_without_prompts"] = sentences_without_prompts

    return [ai_action, action_human]

In [8]:
# Parsed Level 1 actions, keyed by session
level_1_actions_per_session = {}

# Run the analyzer over the raw logs of every session
for session in tqdm(logs_by_session, desc="Parsing Level 1 Actions"):

    # Each session gets its own MergeActionsAnalyzer instance
    actions_analyzer = action_parser.MergeActionsAnalyzer(
        last_action=None,
        raw_logs=logs_by_session[session],
    )

    # Turn the session's logs into structured actions
    actions_lst, last_action = actions_analyzer.parse_actions_from_logs(
        all_logs=logs_by_session[session],
        last_action=None,
        DLT_CHAR_MAX_COUNT=9,  # Optional: Specify tiny delete threshold here
    )

    level_1_actions_per_session[session] = actions_lst

# Attach a `level_1_action_type` to every action. `insert_text` actions are
# handed to split_insert_text_by_delta, which may replace one action with two,
# so the list is walked by index while it is edited in place.
for session_key, actions in level_1_actions_per_session.items():
    i = 0

    while i < len(actions):
        action = actions[i]

        # An `insert_text` at index 0 has no predecessor to compare against,
        # so it is not split and keeps the plain "insert_text" label below.
        is_insert = (
            i > 0
            and "action_type" in action
            and action["action_type"] == "insert_text"
        )

        if not is_insert:
            # Actions without an `action_type` key are labelled "NEXT_CLICKED";
            # all others reuse their parsed action type.
            action["level_1_action_type"] = (
                action["action_type"] if "action_type" in action else "NEXT_CLICKED"
            )
            i += 1
            continue

        # Replace the action with its split result and step past all of it,
        # so the next iteration sees the last split action as its predecessor.
        split_actions = split_insert_text_by_delta(action, actions[i - 1])
        actions[i:i + 1] = split_actions
        i += len(split_actions)

Parsing Level 1 Actions:   0%|          | 0/14 [00:00<?, ?it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752768728425}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752768728425}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752783091396}
NEXT_CLICKED


Parsing Level 1 Actions:  14%|█▍        | 2/14 [00:00<00:01, 11.90it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752783091396}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752797464057}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752797464057}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752862294085}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752862294085}
NEXT_CLICKED


Parsing Level 1 Actions:  29%|██▊       | 4/14 [00:00<00:00, 10.47it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753050258907}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753050258907}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753211326951}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753211326951}
NEXT_CLICKED


Parsing Level 1 Actions:  43%|████▎     | 6/14 [00:00<00:00,  8.27it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752775741979}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752775741979}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752855918299}
NEXT_CLICKED


Parsing Level 1 Actions:  57%|█████▋    | 8/14 [00:00<00:00,  9.00it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752855918299}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753017123863}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753017123863}
NEXT_CLICKED


Parsing Level 1 Actions:  71%|███████▏  | 10/14 [00:00<00:00, 10.59it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753060358599}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753060358599}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753112478228}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753112478228}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753913882361}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753913882361}
NEXT_CLICKED


Parsing Level 1 Actions:  86%|████████▌ | 12/14 [00:01<00:00, 10.58it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753972753614}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753972753614}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1754059083586}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1754059083586}
NEXT_CLICKED


Parsing Level 1 Actions: 100%|██████████| 14/14 [00:01<00:00,  9.54it/s]
