In [None]:
%pip install python-Levenshtein

In [None]:
import utils

import json
import os
import re
from tqdm import tqdm

import action_parser
from extract_coauthor_raw_logs import jsonl_names
import level_2_learning_comparisons

import re
import Levenshtein

### Setup

In [None]:
# #Extracting logs from zip file
# import zipfile

# folder_name = '/content/vibewritingpilot.zip'

# with zipfile.ZipFile(folder_name, 'r') as zip_ref:
#     zip_ref.extractall('/content/pilot2')  # This will extract to /content/pilot_2

In [None]:
def load_jsonl_files_parsed(folder_path):
    """
    Read every ``.jsonl`` file directly inside ``folder_path`` and parse it.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        dict: Maps each file name without its ``.jsonl`` extension to the list
        of JSON values parsed from that file, one per non-blank line. A file
        that cannot be read or parsed is reported on stdout and left out.
    """
    jsonl_data = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting dict order reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.jsonl'):
            continue

        full_path = os.path.join(folder_path, filename)
        try:
            with open(full_path, 'r', encoding='utf-8') as file:
                lines = file.read().splitlines()
            # Blank / whitespace-only lines are not JSON documents; parsing
            # them would raise and cause the whole file to be dropped.
            parsed_lines = [json.loads(line) for line in lines if line.strip()]
        except Exception as e:
            print(f"Error reading {filename}: {e}")
            continue

        # splitext removes only the final extension, so "a.v1.jsonl" and
        # "a.v2.jsonl" get distinct keys instead of both collapsing to "a".
        jsonl_data[os.path.splitext(filename)[0]] = parsed_lines

    return jsonl_data

# Load every raw session log found in the raw-log folder.
folder_path = "formal_raw_logs"  # Your folder path
data = load_jsonl_files_parsed(folder_path)

# Sanity check: show the event count and the first event of one file only.
# Widen (or drop) the slice to preview more files.
for file_name, events in list(data.items())[:1]:
    print(f"\nFile: {file_name}, Number of events: {len(events)}")
    if events:
        print("First event:")
        print(json.dumps(events[0], indent=2))  # Pretty-print first event

In [None]:
# Load one hard-coded session log instead of the whole folder.
jsonl_data = {}
filename = "formal_raw_logs/6df918d72046461a98d275bf3fac31d0.jsonl"

with open(filename, 'r', encoding='utf-8') as file:
    lines = file.read().splitlines()

# One JSON document per line.
parsed_lines = [json.loads(line) for line in lines]

# The key is the path up to its first dot, so it still carries the folder
# prefix ("formal_raw_logs/<id>").
jsonl_data[filename.split('.')[0]] = parsed_lines

# NOTE(review): this rebinds `data`, discarding the dict loaded from the whole
# folder in the previous cell, and prints the full log -- confirm this is intended.
data = jsonl_data
print(data)

In [None]:
# Alias, not a copy: in-place edits made through `logs_by_session` are also
# visible through `data`.
logs_by_session = data

In [None]:
# Discard the first logged event of every session.
# Not idempotent: each re-run of this cell drops one more event per session.
for key, session_events in list(logs_by_session.items()):
  logs_by_session[key] = session_events[1:]

### Loading Structured JSON File for Parsing and Analysis

Once the `extract_coauthor_raw_logs.py` script has successfully generated a structured JSON file, you can proceed to load the file in this section for further parsing and analysis.

In [None]:
# Use the current working directory
script_dir = os.getcwd()

# Get the raw log dataset
file_path = os.path.join(script_dir, 'formal1_logs.json')

# Feel free to uncomment the line below and start with a smaller sample (20 writing sessions) to reduce runtime
# file_path = os.path.join(script_dir, 'small_logs_for_test.json')

# Open and load the JSON file. The encoding is given explicitly (as for the
# .jsonl reads earlier in this notebook) so that non-ASCII text in the logs is
# decoded the same way regardless of the platform's default locale encoding.
with open(file_path, encoding='utf-8') as f:
    logs_by_session = json.load(f)

In [None]:
# The string literal below holds disabled code kept for reference; as a bare
# string expression it is evaluated and discarded, nothing in it is executed.
"""
# Load raw JSON-like data from file
file_path = os.path.join(script_dir, '/content/testing_new_site.json')
with open(file_path, 'r') as f:
    raw_data = f.read()
"""

# Second name for the already-parsed logs (no copy is made here).
raw_data = logs_by_session


def fix_all_arrays(raw_json_string):
    """
    Insert the missing commas between adjacent objects inside JSON-like arrays.

    Every ``[ {...} ... {...} ]`` span in ``raw_json_string`` is rewritten so
    that each ``}{`` boundary (optionally separated by whitespace) becomes
    ``},`` + newline + ``{``; whitespace just inside the brackets is dropped.
    Text outside such spans is returned untouched.
    """
    object_boundary = re.compile(r'\}\s*\{')
    array_of_objects = re.compile(r'\[\s*({.*?})\s*\]', flags=re.DOTALL)

    def fix_array(match):
        # group(1) is the run of objects between the brackets.
        body = match.group(1).strip()
        return '[' + object_boundary.sub('},\n{', body) + ']'

    return array_of_objects.sub(fix_array, raw_json_string)


# `logs_by_session` was parsed with json.load, so it is already valid JSON data
# and needs no comma repair. The former dump -> regex -> reload round trip was
# removed: in serialized valid JSON a "}" is never followed structurally by
# "{", so the pattern `}\s*{` could only match inside string values (e.g. the
# logged text), where inserting ", " would silently alter the data.
#
# Drop the "system-initialize" events from every session. A new dict with new
# lists is built rather than editing in place, so other names bound to the
# unfiltered logs (such as `raw_data`) keep seeing them unchanged.
# Non-list values are passed through untouched.
logs_by_session = {
    test_user: (
        [event for event in events if event.get("eventName") != "system-initialize"]
        if isinstance(events, list)
        else events
    )
    for test_user, events in logs_by_session.items()
}

### Parsing Raw Log JSON File into Structured Level 1 Actions

This section processes raw logs and converts them into Level 1 actions using an analyzer. Each parsed action is enriched with a **level_1_action_type** key, which specifies the action type (e.g., `insert_text`, `delete_text`, `accept_suggestion`).

**level_1_actions_per_session** is a dictionary where each session key maps to a list of parsed actions, organizing the output by session for streamlined analysis and further processing.

In [None]:
def are_strings_similar_lev(str1, str2, max_differences=4):
    """Return True when the Levenshtein distance between the two strings is at most ``max_differences``."""
    return Levenshtein.distance(str1, str2) <= max_differences

In [None]:
# Use the current working directory as the base directory for file paths
# built with `script_dir` in this notebook.
script_dir = os.getcwd()

In [155]:
# Every AI-inserted sentence seen so far. This list lives at module level: it
# is extended by each AI split below and is never cleared by this function, so
# it is shared across calls (and therefore across sessions).
ai_sentences = []


def split_insert_text_by_delta(action, prev_action, threshold=5):
    """
    Label an ``insert_text`` action as human- or AI-authored, splitting it if needed.

    The text inserted by the action's first log entry is read from
    ``action["action_logs"][0]["textDelta"]["ops"][1]["insert"]``. When the
    previous action is a ``present_suggestion`` and that text (stripped) has at
    least ``threshold`` characters, the action is split into an
    ``insert_text_ai`` action (that first insert) followed by an
    ``insert_text_human`` action (the remaining logs). Otherwise the whole
    action is labelled ``insert_text_human``.

    Args:
        action (dict): Level 1 action; must contain ``sentences_temporal_order``
            and, for the AI split, ``action_delta``, ``action_start_writing``,
            ``action_end_writing`` and ``action_modified_sentences``.
        prev_action (dict): The action preceding ``action``; only its
            ``level_1_action_type`` is consulted.
        threshold (int): Minimum stripped length of the first insert for it to
            count as an AI insertion.

    Returns:
        list[dict]: ``[ai_action, human_action]`` for a split, otherwise a
        one-element list. Every returned action carries ``level_1_action_type``
        and ``sentences_temporal_order_without_prompts``; human actions also
        carry ``human_sentences_temporal_order``. If the delta cannot be read,
        ``action`` itself is annotated and returned; in the other cases shallow
        copies are returned.
    """

    def _drop_prompts(sentences):
        # Sentences holding two or more "$" are treated as "$...$" prompts.
        return [s for s in sentences if s.count("$") < 2]

    def _resembles_any(sentence, candidates):
        return any(are_strings_similar_lev(sentence, c) for c in candidates)

    try:
        ops = action["action_logs"][0]["textDelta"]["ops"]
        inserted_text = ops[1].get("insert", "")
    except (IndexError, KeyError, TypeError, AttributeError):
        # No readable delta: treat the whole action as human-written.
        action["level_1_action_type"] = "insert_text_human"
        action["human_sentences_temporal_order"] = " ".join(
            action["sentences_temporal_order"]
        )
        # Set here as well so every returned action exposes the same keys.
        action["sentences_temporal_order_without_prompts"] = _drop_prompts(
            action["sentences_temporal_order"]
        )
        return [action]

    # A non-string insert (e.g. an embed object) cannot be AI-suggested text;
    # checking the type first also avoids calling .strip() on a non-string.
    is_ai_insert = (
        prev_action.get("level_1_action_type") == "present_suggestion"
        and isinstance(inserted_text, str)
        and len(inserted_text.strip()) >= threshold
    )

    if not is_ai_insert:
        # ---- Purely human insert ----
        action_human = action.copy()
        action_human["level_1_action_type"] = "insert_text_human"

        # Prompts are intentionally not filtered here (unlike the split case);
        # only sentences resembling a previously recorded AI sentence are dropped.
        sentences_human = [
            s
            for s in action_human["sentences_temporal_order"]
            if not _resembles_any(s, ai_sentences)
        ]
        action_human["human_sentences_temporal_order"] = " ".join(sentences_human)
        action_human["sentences_temporal_order_without_prompts"] = _drop_prompts(
            action_human["sentences_temporal_order"]
        )
        return [action_human]

    # ---- AI action ----
    ai_action = action.copy()
    # Note: this stores the single first log entry itself, not a one-item list.
    ai_action["action_logs"] = ai_action["action_logs"][0]
    ai_action["action_delta"] = [
        "INSERT",
        inserted_text,
        action["action_delta"][2],
        action["action_delta"][3],
    ]
    ai_action["action_modified_sentences"] = utils.sent_tokenize(inserted_text)
    ai_sentences.extend(ai_action["action_modified_sentences"])
    # The AI text is appended to the starting text to form the AI end state.
    ai_action["action_end_writing"] = (
        ai_action["action_start_writing"] + inserted_text
    )
    ai_action["level_1_action_type"] = "insert_text_ai"

    # ---- Human action ----
    action_human = action.copy()
    action_human["action_start_writing"] = ai_action["action_end_writing"]
    remaining_text = action_human["action_end_writing"][
        len(action_human["action_start_writing"]):
    ]
    action_human["action_delta"] = [
        "INSERT",
        remaining_text,
        action["action_delta"][2],
        action["action_delta"][3],
    ]
    action_human["action_logs"] = action_human["action_logs"][1:]
    action_human["level_1_action_type"] = "insert_text_human"

    ai_modified = ai_action["action_modified_sentences"]

    # ---- Filter modified sentences ----
    action_human["action_modified_sentences"] = [
        s
        for s in action_human["action_modified_sentences"]
        if not _resembles_any(s, ai_modified)
    ]

    # ---- Filter temporal order ----
    sentences_human = [
        s
        for s in action_human["sentences_temporal_order"]
        if s.count("$") < 2 and not _resembles_any(s, ai_modified)
    ]
    action_human["human_sentences_temporal_order"] = " ".join(sentences_human)

    sentences_without_prompts = _drop_prompts(
        action_human["sentences_temporal_order"]
    )
    action_human["sentences_temporal_order_without_prompts"] = sentences_without_prompts
    ai_action["sentences_temporal_order_without_prompts"] = sentences_without_prompts

    return [ai_action, action_human]

In [162]:
# Initialize an empty dictionary to store parsed actions
level_1_actions_per_session = {}

# Iterate through all sessions in the raw logs and parse actions
for session in tqdm(logs_by_session, desc="Parsing Level 1 Actions"):

    # Initialize the MergeActionsAnalyzer for each session
    actions_analyzer = action_parser.MergeActionsAnalyzer(
        last_action=None,
        raw_logs=logs_by_session[session]
    )

    # Parse the logs for the session into structured actions
    actions_lst, last_action = actions_analyzer.parse_actions_from_logs(
        all_logs=logs_by_session[session],
        last_action=None,
        DLT_CHAR_MAX_COUNT=9  # Optional: Specify tiny delete threshold here
    )

    # Store the parsed actions in the output dictionary
    level_1_actions_per_session[session] = actions_lst

# Add `level_1_action_type` to every action. `insert_text` actions are split
# into AI / human parts; all other actions keep their `action_type`, and
# actions without one are labelled "NEXT_CLICKED".
for session_key, actions in level_1_actions_per_session.items():
    i = 0

    # A while loop is used because splitting grows the list being walked.
    while i < len(actions):
        action = actions[i]
        if action.get("action_type") == "insert_text":
            # The first action of a session has no predecessor. It used to be
            # left with the generic "insert_text" label, so it was never
            # classified as human or AI; it is now classified against a
            # placeholder predecessor that is not a suggestion.
            prev_action = actions[i - 1] if i > 0 else {"level_1_action_type": None}
            split_actions = split_insert_text_by_delta(action, prev_action)
            # Replace the action with its one or two classified parts and skip
            # past them so they are not classified again.
            actions[i:i + 1] = split_actions
            i += len(split_actions)
        else:
            action["level_1_action_type"] = action.get("action_type", "NEXT_CLICKED")
            i += 1

Parsing Level 1 Actions:  21%|██▏       | 3/14 [00:00<00:00, 23.43it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752768728425}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752768728425}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752783091396}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752783091396}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752797464057}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752797464057}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752862294085}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752862294085}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753050258907}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSou

Parsing Level 1 Actions:  57%|█████▋    | 8/14 [00:00<00:00, 16.27it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753211326951}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752775741979}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752775741979}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752855918299}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1752855918299}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753017123863}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753017123863}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753060358599}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753060358599}
NEXT_CLICKED


Parsing Level 1 Actions: 100%|██████████| 14/14 [00:00<00:00, 18.67it/s]

Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753112478228}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753112478228}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753913882361}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753913882361}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753972753614}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1753972753614}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1754059083586}
NEXT_CLICKED
Error: {'eventName': 'NEXT_CLICKED', 'eventSource': 'user', 'eventTimestamp': 1754059083586}
NEXT_CLICKED





Making Human AI Edit Action

In [163]:
from nltk import sent_tokenize

from difflib import SequenceMatcher
import copy
def most_similar_sentence(insert_text, sentences):
    """
    Pick the sentence whose words best match ``insert_text``.

    Similarity is the ``SequenceMatcher`` ratio over whitespace-split tokens.
    Returns ``(sentence, ratio)``; the earliest sentence wins a tie, and
    ``(None, 0)`` is returned when no sentence scores above zero.
    """
    query_tokens = insert_text.split()
    winner, top_ratio = None, 0
    for candidate in sentences:
        ratio = SequenceMatcher(None, query_tokens, candidate.split()).ratio()
        if ratio > top_ratio:
            winner, top_ratio = candidate, ratio
    return winner, top_ratio
def get_inserted(before: str, after: str):
    """
    Return the words of ``after`` that are new relative to ``before``.

    Both texts are compared as whitespace-split word lists; words from
    "insert" and "replace" diff spans are collected in order and joined with
    single spaces.
    """
    old_words = before.split()
    new_words = after.split()
    matcher = SequenceMatcher(None, old_words, new_words)
    added = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        # A replacement is counted as a deletion plus an insertion.
        if tag in ("insert", "replace"):
            added += new_words[j1:j2]
    return " ".join(added)
def get_deleted(before: str, after: str):
    """
    Return the words of ``before`` that are gone in ``after``.

    Both texts are compared as whitespace-split word lists; words from
    "delete" and "replace" diff spans are collected in order and joined with
    single spaces.
    """
    old_words = before.split()
    new_words = after.split()
    matcher = SequenceMatcher(None, old_words, new_words)
    removed = []
    for tag, i1, i2, _j1, _j2 in matcher.get_opcodes():
        # A replacement is counted as a deletion plus an insertion.
        if tag in ("delete", "replace"):
            removed += old_words[i1:i2]
    return " ".join(removed)
def split_action_delta(action_delta: str, delta: str, min_ratio=0.6):
    """
    Fuzzily locate ``delta`` inside ``action_delta``.

    Every substring of ``action_delta`` up to ``len(delta) + 5`` characters long
    is scored against ``delta`` with ``SequenceMatcher``; the first substring
    reaching the highest ratio is the match.

    Returns:
        ``(before, match, after)`` when the best ratio is at least
        ``min_ratio``, otherwise ``(action_delta, "", "")``.
    """
    top_ratio = 0.0
    top_span = None
    text_len = len(action_delta)
    max_width = len(delta) + 5

    for lo in range(text_len):
        last_end = min(text_len, lo + max_width)
        for hi in range(lo + 1, last_end + 1):
            candidate = action_delta[lo:hi]
            similarity = SequenceMatcher(None, candidate, delta).ratio()
            if similarity > top_ratio:
                top_ratio, top_span = similarity, (lo, hi, candidate)

    # Fallback: nothing matched well enough.
    if not top_span or top_ratio < min_ratio:
        return action_delta, "", ""

    lo, hi, candidate = top_span
    return action_delta[:lo], candidate, action_delta[hi:]
    
def make_end_writing_insert(start, modified_sentences, end):
    """
    Rebuild an end-of-action text from ``start`` and a list of modified sentences.

    ``start`` and ``end`` are sentence-tokenized. Walking the start sentences in
    order, a sentence that also occurs in ``end`` is kept, while one that does
    not is replaced by the next unused entry of ``modified_sentences``. Once the
    modified sentences are used up, the remaining start sentences are appended
    unchanged; if the start sentences run out first (and ``end`` has more
    sentences than ``start``), the remaining modified sentences are appended.
    The result is joined with single spaces.
    """
    start_sents = sent_tokenize(start)
    end_sents = sent_tokenize(end)
    total_modified = len(modified_sentences)

    rebuilt = []
    cursor = 0  # index of the next unused modified sentence

    for pos in range(max(len(start_sents), len(end_sents))):
        if pos >= len(start_sents):
            # Past the original text: whatever is left was appended.
            rebuilt += modified_sentences[cursor:]
            break

        if start_sents[pos] in end_sents:
            rebuilt.append(start_sents[pos])
        else:
            rebuilt.append(modified_sentences[cursor])
            cursor += 1

        if cursor == total_modified:
            # Nothing left to substitute: keep the rest of the start text.
            rebuilt += start_sents[pos + 1:]
            break

    return " ".join(rebuilt)

In [None]:
def split_human_ai_edit(level_1_actions_per_session):
    """
    Add word-level deltas to Level 1 actions and flag human edits of AI text.

    For each session the actions are walked in order:

    * INSERT / DELETE actions (whose ``action_delta`` has more than one item)
      get an ``action_delta_by_word`` tuple
      ``(kind, changed_words, len(changed_words), word_count)``.
    * The first modified sentence of every ``insert_text_ai`` action is
      remembered as a past suggestion.
    * When a remembered suggestion is in an action's ``action_start_writing``
      but no longer in its ``action_end_writing``, the action is treated as a
      human edit of AI text: a DELETE action, or an INSERT action with a single
      modified sentence, gets ``level_2_action_type = "human_edit_ai"``; an
      INSERT action with several modified sentences is replaced in the list by
      up to three deep copies (before / edited sentence / after), of which only
      the middle one is marked ``"human_edit_ai"``.

    Errors raised while handling one action are printed with a traceback and
    that action is skipped.

    Args:
        level_1_actions_per_session (dict): Maps each session id to its list of
            Level 1 action dicts. The lists and dicts are modified in place.

    Returns:
        dict: The same ``level_1_actions_per_session`` object that was passed in.
    """
    for session_id, actions_lst in tqdm(
        level_1_actions_per_session.items(), desc="Parsing Level 1 present_suggestion Actions"
    ):
        
        # NOTE(review): last_insert_action, past_arguments and past_prompts are
        # assigned but never read in this function.
        last_insert_action = ""
        past_arguments = []
        past_prompts = []
        # First modified sentence of each AI insert seen so far in this session.
        past_suggestions = []
        # NOTE(review): actions_lst is modified (remove/insert) inside this
        # loop while enumerate is iterating over it, so newly inserted copies
        # may be visited again -- confirm this is intended.
        for idx, action in enumerate(actions_lst):
            try:
                if len(action["action_delta"]) > 1:
                    if action["action_delta"][0] == 'INSERT':
                        # This will compute a word level delta insert
                        delta = get_inserted(action["action_start_writing"], action["action_end_writing"])
                        action["action_delta_by_word"] = ('INSERT', delta, len(delta), len(delta.split()))
                    elif action["action_delta"][0] == 'DELETE':
                        # Word-level delta of what was deleted.
                        delta = get_deleted(action["action_start_writing"], action["action_end_writing"])
                        action["action_delta_by_word"] = ('DELETE', delta, len(delta), len(delta.split()))
                # NOTE(review): past_suggestions is modified by .remove() below
                # while this loop iterates over it, which makes the iterator
                # skip the element that follows a removed one.
                for suggestion in past_suggestions: 
                    # Does not account for movements, just check if the same string is still in the writing after current action
                    if suggestion in action["action_start_writing"] and suggestion not in action["action_end_writing"]:
                        if action["action_delta"][0] == "INSERT":
                            #print(suggestion)
                            sent,score = most_similar_sentence(suggestion, action["action_modified_sentences"])
                            # No usable match (none found, or two words or fewer):
                            # skip; the suggestion stays in past_suggestions.
                            if sent is None or len(sent.split()) <= 2:
                                continue
                            #sent_idx = action["action_modified_sentences"].index(sent)
                            # Words the human added to the suggestion.
                            delta = get_inserted(suggestion, sent)
                            # Making end writing on a sentence level, assume linear writing within a block (might not be true but good enough proxy?)
                            # Splits the action modified sentences by before human edit ai, human edit ai sentence, and after human edit ai 
                            # Replace start writing with every modified sentence before human edit ai to be start writing for mid and repeat
                            if len(action["action_modified_sentences"]) > 1:
                                action_mid = copy.deepcopy(action)
                                action_mid["level_2_action_type"] = "human_edit_ai"
                                action_mid["action_delta_by_word"] = ("INSERT", delta, len(delta), len(delta.split()))
                                # Modified sentences re-ordered to follow sentences_temporal_order.
                                temporal_modified_sentences = []
                                for s in action["sentences_temporal_order"]:
                                    if s in action["action_modified_sentences"]:
                                        temporal_modified_sentences.append(s)
                                # Raises ValueError (handled by the except below) when
                                # `sent` is not among the temporally ordered sentences.
                                sent_idx = temporal_modified_sentences.index(sent)
                                #after = action["action_modified_sentences"][sent_idx:]
                                #before = action["action_modified_sentences"][:sent_idx]
                                # `after` starts with the edited sentence itself.
                                after = temporal_modified_sentences[sent_idx:]
                                before = temporal_modified_sentences[:sent_idx]
                                #print("before")
                                #print(before)
                                #print(after[1:])
                                # list.remove deletes the first element that compares
                                # equal to `action`.
                                if action in actions_lst:
                                    actions_lst.remove(action)
                                if len(before) > 0:
                                    action_before = copy.deepcopy(action)
                                    action_before["action_modified_sentences"] = before
                                    action_before["action_end_writing"] = make_end_writing_insert(action_before["action_start_writing"], action_before["action_modified_sentences"], action_before["action_end_writing"])
                                    action_mid["action_start_writing"] = action_before["action_end_writing"]
                                    actions_lst.insert(idx, action_before)
                                    # Only moves the local insert position; enumerate
                                    # keeps its own counter and rebinds idx next pass.
                                    idx+=1
                                action_mid["action_modified_sentences"] = [sent]#action["action_modified_sentences"][sent_idx]
                                action_mid["action_end_writing"] = make_end_writing_insert(action_mid["action_start_writing"], action_mid["action_modified_sentences"], action_mid["action_end_writing"])
                                actions_lst.insert(idx, action_mid)
                                idx+=1
                                if len(after) > 1: 
                                    # The copy keeps the original action_end_writing.
                                    action_after = copy.deepcopy(action)
                                    action_after["action_modified_sentences"] = after[1:]
                                    action_after["action_start_writing"] = action_mid["action_end_writing"]
                                    actions_lst.insert(idx, action_after)
                                    idx+=1
                            else:
                                #action["level_1_action_type"] = "human_edit_ai_insert"
                                action["level_2_action_type"] = "human_edit_ai"
                            
                        # Kind of just colored the whole block as human edit ai delete since people usually delete in whole block so doesn't make as much sense to split. 
                        elif action["action_delta"][0] == 'DELETE':
                            action["level_2_action_type"] = "human_edit_ai"
                        # The suggestion is no longer intact in the text; stop tracking it.
                        past_suggestions.remove(suggestion)
                # Track the first sentence of each AI insert for later actions.
                if action["level_1_action_type"] == "insert_text_ai":
                    past_suggestions.append(action["action_modified_sentences"][0])
                
            except Exception as e:
                import traceback

                # Best effort: report the failure and move on to the next action.
                print(f"\n!! Error in session {session_id}, action index {idx} !!")
                traceback.print_exc()
                continue

    return level_1_actions_per_session

In [165]:
# split_human_ai_edit modifies its argument in place and returns that same
# dict, so both names below refer to one object.
level_1_actions_per_session_suggestions = split_human_ai_edit(
      level_1_actions_per_session
  )

Parsing Level 1 present_suggestion Actions: 100%|██████████| 14/14 [00:00<00:00, 78.07it/s]

before
['While corporate personhood has controversial implications within the sphere of campaign finance, it has also been applied to other areas of corporations.']
['This can change the type of court cases that corporations are affected by.', 'Typically, corporations are mostly affected by consumer protection cases (individual vs.', 'corporation), but if corporate personhood allows for corporations to be sued as individual vs.', 'individual, then it can fall into']
before
[]
['$example of corporate personhood$']
before
['$other benefits of corporate personhood$']
[]
before
['In this sense, people believed that Whether that was having the freedom to fund the candidate that they desire to or express political opinions without fear of government retaliation.']
[]
before
['These zero-price markets have increased in variety and number, with examples such as social media, travel booking, or software.']
['$add two sentences about google lawsuit about dominating google search engine$']
before




In [154]:
# Output file: level_1_actions_per_session.json inside `script_dir`.
output_file = os.path.join(script_dir, "level_1_actions_per_session.json")

# Serialize the parsed actions; values json cannot encode natively are handed
# to utils.custom_serializer.
with open(output_file, "w") as f:
    f.write(json.dumps(level_1_actions_per_session, default=utils.custom_serializer))

### Level 2 Learning Parsing Based on Generated Level 1 Actions and Outcome Measures Documentation

#### Key Features:

The pipeline consists of two main steps:

1. **Level 2 Parsing**: Augments Level 1 actions with detailed Level 2 attributes.
    <br/>Attributes include:
- vibe_writing: prewriting opinions are opposed to the genAI writing task
- minor_vibe_writing: prewriting opinions are potentially opposed to the genAI writing task
- constructive_learning: applied prewriting learning to genAI writing task (over 3/4 of the writing was mentioned before in the prewriting)
- minor_constructive_learning: applied prewriting learning to some of genAI writing task (over 1/4 of the writing was mentioned before in the prewriting)
    
2. **Outcome Measures Computation**: Computes Stance Detection and Natural Language Inference

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "roberta-large-mnli" checkpoint.
# NOTE(review): presumably paired with the NLI model used for the outcome
# measures described above -- confirm where it is consumed.
nli_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

In [None]:
def getFrontstageText(action, past_prompts_with_ai_inserts):
    """
    Return the action's sentences with prompts and their AI inserts removed.

    Works on a copy of ``action["sentences_temporal_order"]``. Every sentence
    holding two or more "$" (a prompt) is removed; if the sentence that then
    sits at the prompt's position equals the AI insert recorded for that prompt
    in ``past_prompts_with_ai_inserts``, it is removed as well.
    """
    ordered_sentences = action["sentences_temporal_order"]
    frontstage_text = ordered_sentences.copy()

    for sentence in ordered_sentences:
        if sentence.count("$") < 2:
            continue

        position = frontstage_text.index(sentence)
        deleted_ai_prompt = frontstage_text.pop(position)

        # The prompt was the last remaining sentence: nothing follows it.
        if position >= len(frontstage_text):
            continue
        if deleted_ai_prompt not in past_prompts_with_ai_inserts:
            continue
        if past_prompts_with_ai_inserts[deleted_ai_prompt] == frontstage_text[position]:
            frontstage_text.pop(position)

    return frontstage_text

Level 2 Action Types: 

**insert_human_backstage** -- A: Human adds texts to the backstage

**insert_ai_backstage** -- B: AI adds texts to the backstage
* AI following a prompt
* AI suggestions shown

**move_frontstage** -- C: Human elevates AI texts from backstage to front stage
* moves or copies AI responses to the writing (not implemented) 
* Accept inline 
* Deleting prompt to make suggestion frontstage 

**human_edit_ai** -- D: Humans edit AI texts 

**insert_human_frontstage** -- E: Human adds texts to the front stage. 


In [None]:
def parse_level_2_actions(level_1_actions_per_session):
    """
    Parses Level 2 actions based on Level 1 actions, but only for insert_text actions.

    Args:
        level_1_actions_per_session (dict): A dictionary where each session key maps to a list of level 1 actions.

    Returns:
        dict: The updated actions dictionary with Level 2 attributes added to each action.
    """

    for session_id, actions_lst in tqdm(
        level_1_actions_per_session.items(), desc="Parsing Level 2 Actions"
    ):
        past_ai_suggestions = []
        past_prompts = []
        past_prompts_with_ai_inserts = {}
        frontstage_text = []
        backstage_text = []
        prewriting = False

        prewriting_ids = {
            "legislation_corporate_1": "ab7b7355bfde4657b68f7a2985e494a9",
            "legislation_antitrust_1": "4d85d96c21494339bfe570b87f354fec",
            "legislation_corporate_2": "4517db08add74465b0888b3002170e59",
            "legislation_corporate_3": "769f457ab1f2458abf376c4d248ae34e",
            "legislation_antitrust_2": "7204e89b27a744c3a256197c77b33f19",
            "legislation_corporate_4": "8e633d2f2e9246828ab498f5388559bc",
            "legislation_antitrust_3": "039142481529449685e10dd7d2a250d6",
            "legislation_corporate_5": "81d7a2e4bdf94cb3a2376afc93410a10",
            "legislation_antitrust_4": "4eedf1ef4ea14d769e65d657f4145f0c",
            "legislation_antitrust_5": "9f742fd07a35428eacee633b2a284120",
            "legislation_corporate_6": "ada424f7f72e46c18c906f14ac4b0e2e",
            "legislation_antitrust_6": "e626c3e15f9641ef94298979e83f9bed",
            "legislation_antitrust_7": "9c861b5263f24c33832ae6dee287cb42",
            "legislation_antitrust_8": "b6a1710464d047b2aef7979a1f66dd5c",
        }
        filename = utils.get_filename(
            "prewriting_content/", prewriting_ids[session_id]
        )
        prewriting_content = ""

        with open(filename, "r", encoding="utf-8") as file:
            for line in file:
                data = json.loads(line)
                if "content" in data:
                    prewriting_content += data["content"]

        for idx, action in enumerate(actions_lst):
            try:
                if action["action_type"] == None:
                    prewriting = True
                if prewriting:
                    if action["action_type"] == "present_suggestion" and action["action_logs"][0]["eventName"] == "suggestion-open":
                        last_ai_suggestions = []
                        for suggestion in action["action_logs"][0]["currentSuggestions"]:
                            last_ai_suggestions.append(suggestion["trimmed"])
                        backstage_text.append(" ".join(last_ai_suggestions))

                    # detecting human prompting AI
                    if (
                        action["level_1_action_type"] == "insert_text_human"
                        and "action_start_writing" in action
                        and "action_end_writing" in action
                    ):
                        start_text = action["action_start_writing"]
                        end_text = action["action_end_writing"]

                        start_prompts = utils.extract_prompts(start_text)
                        end_prompts = utils.extract_prompts(end_text)

                        for end_prompt in end_prompts:
                            if not end_prompt in start_prompts:
                                action["level_1_action_type"] = "insert_text_human_ai_prompt"
                                past_prompts.append(end_prompt)
                                action["past_ai_prompts"] = past_prompts.copy()
                                backstage_text.append(end_prompt)
                                break

                    # detecting all human text inserts as frontstage
                    if(action["level_1_action_type"] == "insert_text_human"):
                        action["writing_type"] = "frontstage"
                        action["level_2_action_type"] = "insert_human_frontstage"
                        frontstage_text = getFrontstageText(
                            action, past_prompts_with_ai_inserts
                        )

                    # detecting AI inserts after prompts labeled as "unused"
                    if (action["level_1_action_type"] == "insert_text_ai" 
                        and "action_start_writing" in action
                        and "action_end_writing" in action):
                        inserted_text = action["action_delta"][1]
                        
                        sentIndex = -1
                        for index in range(len(action["sentences_temporal_order"])):
                            if inserted_text in action["sentences_temporal_order"][index]:
                                sentIndex = index
                        if sentIndex>0 and action["sentences_temporal_order"][sentIndex-1].count("$")>=2:
                            action["writing_type"] = "backstage"
                            action["level_2_action_type"] = "insert_ai_backstage"
                            past_prompts_with_ai_inserts[action["sentences_temporal_order"][sentIndex - 1]] = inserted_text
                            backstage_text.append(action["sentences_temporal_order"][sentIndex])
                        else:
                            action["writing_type"] = "frontstage"
                            action["level_2_action_type"] = "move_frontstage"
                            frontstage_text = getFrontstageText(
                                action, past_prompts_with_ai_inserts
                            )

                    # detecting deletes of prompts to make AI inserted suggestions as frontstage
                    if (action["level_1_action_type"] == "delete_text"):
                        action["writing_type"] = "frontstage"
                        frontstage_text = getFrontstageText(
                                action, past_prompts_with_ai_inserts
                            )
                        if action["action_delta"][1].count("$")>=2:
                            if (
                                action["action_delta"][1]
                                in past_prompts_with_ai_inserts
                                and past_prompts_with_ai_inserts[action["action_delta"][1]]
                                in backstage_text
                            ):
                                action["writing_type"] = "frontstage"
                                action["level_2_action_type"] = "move_frontstage"
                                backstage_text.remove(
                                    past_prompts_with_ai_inserts[
                                        action["action_delta"][1]
                                    ]
                                )
                    
                    if (action["level_1_action_type"] == "insert_text_human_ai_prompt"):
                        action["level_2_action_type"] = "insert_human_backstage"
                    
                    if (action["action_type"] == "present_suggestion" and action["action_logs"][0]["eventName"] == "suggestion-open"):
                        action["level_2_action_type"] = "insert_ai_backstage"

                    action["frontstage_text"] = frontstage_text.copy()
                    action["backstage_text"] = backstage_text.copy()
                    # differentiating backstage and frontstage writing
                    # 1. Backstage writing: human written prompts for AI between 2 $s, inserted AI suggestions right after a prompt, and AI generated suggestions
                    if(action["level_1_action_type"] == "insert_text_human_ai_prompt" or (action["action_type"] == "present_suggestion" and action["action_logs"][0]["eventName"] == "suggestion-open")):
                        action["writing_type"] = "backstage"

                    # print(action["level_1_action_type"])

                    if "writing_type" in action:
                        print(action["level_1_action_type"])
                        print(action["writing_type"])
                        print("frontstage: ", action["frontstage_text"])
                        print("backstage: ", action["backstage_text"])
                else:
                    action["level_2_action_type"] = "insert_human_frontstage"
            except Exception as e:
                import traceback

                print(f"\n!! Error in session {session_id}, action index {idx} !!")
                traceback.print_exc()
                continue

    return level_1_actions_per_session


# Parse Level 2 actions
# Load the Level 1 actions previously written to level_1_actions_per_session.json
# and hand them to parse_level_2_actions.
# NOTE(review): parse_level_2_actions ends with `return level_1_actions_per_session`;
# if that is its own parameter, both names below refer to the same dictionary
# object after this call -- confirm.
with open("level_1_actions_per_session.json", "r", encoding="utf-8") as f:
    level_1_actions_per_session = json.load(f)

level_2_actions_per_session = parse_level_2_actions(
    level_1_actions_per_session
)

In [None]:
# Snapshot the Level 2 actions before later cells annotate them in place.
# A deep copy is needed here: dict.copy() is shallow, so the per-session lists
# and the action dictionaries inside them would be shared with
# level_2_actions_per_session and the snapshot would change whenever an action
# is modified.
import copy

parsed_level_2_actions = copy.deepcopy(level_2_actions_per_session)

In [None]:
# Restore the working dictionary from the snapshot kept in parsed_level_2_actions.
# NOTE(review): this binds the name to the snapshot object itself (no copy is
# made), so later in-place edits made through level_2_actions_per_session are
# also visible through parsed_level_2_actions.
level_2_actions_per_session = parsed_level_2_actions

In [None]:
def _label_sentence_location(cursor, writing):
    """Return the ``sentence_location`` label for a suggestion requested at ``cursor`` in ``writing``."""
    sentence_enders = (".", "!", "?")

    if cursor == len(writing) + 1:
        # Cursor is at the very end of the text.
        stripped_writing = writing.strip()
        if stripped_writing == "":
            # Nothing but whitespace has been written. Indexing the last
            # character would raise IndexError; label it the same way the
            # mid-text branch below labels an empty prefix.
            return "start_writing"
        # NOTE(review): `writing[-2:] == "\n"` compares a slice of up to two
        # characters with a single newline, so it can only match when the whole
        # text is "\n" (already handled above). Kept as written because the
        # intended newline check is unclear -- confirm.
        if stripped_writing[-1] in sentence_enders or writing[-2:] == "\n":
            return "end_writing_end_sentence"
        return "end_writing_middle_sentence"

    if cursor == 1:
        return "start_writing"

    # Cursor is somewhere inside the text: look at what precedes it.
    writing_before = writing[: cursor + 1].strip()
    if writing_before == "":
        return "start_writing"
    # NOTE(review): `writing_before` has just been stripped, so the newline
    # comparison below cannot match; kept as written -- confirm intent.
    if writing_before[-1] in sentence_enders or writing_before[-2:] == "\n":
        return "middle_writing_end_sentence"
    return "middle_writing_middle_sentence"


def parse_level_2_actions_suggestions(level_2_actions_per_session):
    """
    Annotates suggestion-query actions with where in the text they were requested.

    For every action whose ``level_1_action_type`` is ``"query_suggestion"`` (and
    that has a non-empty ``action_type`` and ``action_logs``), the ``currentCursor``
    of the first log entry is compared with ``action_start_writing`` and the
    resulting label is stored under ``"sentence_location"``. If the most recent
    insert action in the session was an ``insert_text_human_ai_prompt`` with
    non-empty ``past_arguments``, the latest prompt is also classified and a
    non-empty result is stored under ``"text_classification"``.

    Actions that lack ``level_1_action_type``, ``action_type`` or ``action_logs``
    are skipped. Any other error raised while handling an action is printed with
    its traceback and that action is skipped.

    Args:
        level_2_actions_per_session (dict): A dictionary where each session key maps to a list of level 2 actions from insert_text actions

    Returns:
        dict: The same dictionary object that was passed in, with the attributes above added to the matching actions in place.
    """

    for session_id, actions_lst in tqdm(
        level_2_actions_per_session.items(), desc="Parsing Level 2 present_suggestion Actions"
    ):
        last_insert_action = ""
        past_arguments = []
        past_prompts = []
        for idx, action in enumerate(actions_lst):
            try:
                # Read optional keys with .get so unlabeled actions are skipped
                # quietly instead of raising KeyError into the handler below.
                level_1_type = action.get("level_1_action_type") or ""

                if action.get("action_type") and "insert" in level_1_type:
                    last_insert_action = level_1_type
                    if level_1_type == "insert_text_human_ai_prompt":
                        past_arguments = action.get("past_arguments", [])
                        past_prompts = action["past_ai_prompts"]
                # ai_suggestions=[]
                # if (
                #     action["action_type"] == "present_suggestion"
                #     and action["action_logs"][0]["eventName"] == "suggestion-open"
                # ):
                #     for suggestion in action["action_logs"][0]["currentSuggestions"]:
                #         ai_suggestions.append(suggestion["trimmed"])
                #     external_example_dict = level_2_learning_comparisons.get_external_example("".join(ai_suggestions))
                #     action["external_examples_dict"] = external_example_dict
                #     action["external_examples"] = level_2_learning_comparisons.parse_level_2_external_examples(external_example_dict) * 5
                #     continue
                if (
                    not action.get("action_type")
                    or not action.get("action_logs")
                    or level_1_type != "query_suggestion"
                    # and action["action_logs"][0]["eventName"] != "suggestion-get"
                ):
                    continue

                suggestion_open = action["action_logs"][0]
                action["sentence_location"] = _label_sentence_location(
                    suggestion_open["currentCursor"], action["action_start_writing"]
                )

                # last_suggestion_index = utils.find_last_suggestion(action["action_end_writing"])
                # end_suggestion_index = action["action_end_writing"].rfind("$")
                # if last_suggestion_index == 0:
                #     last_suggestion_index = utils.find_last_punctuation(action["action_end_writing"])
                #     end_suggestion_index = len(action["action_end_writing"])

                if last_insert_action == "insert_text_human_ai_prompt" and past_arguments != []:
                    classification = level_2_learning_comparisons.parse_classify_text(
                        past_prompts[-1]
                    )
                    if classification != "":
                        action["text_classification"] = classification

                    # Context for the NLI / stance comparison that is disabled
                    # below: at most the last four arguments, reduced to the
                    # last two when the joined text exceeds 450 tokens. The
                    # value is currently unused.
                    past_args = past_arguments.copy()
                    if len(past_args) > 4:
                        past_args = past_args[-4:]
                    past_args = " ".join(past_args)
                    if (len(nli_tokenizer.tokenize(past_args))>450):
                        past_args = " ".join(past_arguments.copy()[-2:])

                    # nli, nli_info_dct = level_2_learning_comparisons.get_NLI(
                    #     past_args, past_prompts[-1]
                    # )
                    # stance = level_2_learning_comparisons.get_stance_difference(
                    #     past_args, past_prompts[-1]
                    # )
                    # action["nli_info"] = nli_info_dct
                    # if stance["labels"][0] == "disagrees" and nli == "contradiction":
                    #     action["stance"] = "counter prompt"
                    # elif (stance["labels"][0] == "disagrees" and nli != "entailment") or (nli == "contradiction" and stance["labels"][0] != "agrees"):
                    #     action["stance"] = "minor counter prompt"

            except Exception as e:
                import traceback

                print(f"\n!! Error in session {session_id}, action index {idx} !!")
                traceback.print_exc()
                continue

    return level_2_actions_per_session

# Parse Level 2 present_suggestions actions
# parse_level_2_actions_suggestions adds its attributes to the action
# dictionaries in place and returns the dictionary it was given, so
# level_2_actions_per_session_suggestions and level_2_actions_per_session refer
# to the same object after this call.
level_2_actions_per_session_suggestions = parse_level_2_actions_suggestions(
    level_2_actions_per_session
)

In [None]:
def parse_level_2_actions_semantic(level_2_actions_per_session_suggestions):
    """
    Adds a ``semantic_expansion`` score to every action that has a ``writing_type``.

    For each session the reference text is built from the session's background
    info (``utils.background_info[0]`` when the session id contains
    ``"antitrust"``, otherwise ``utils.background_info[1]``) followed by the
    ``content`` fields of the session's prewriting JSONL file. Each action's
    ``frontstage_text`` (plus, for backstage actions, the sentences of its last
    ``backstage_text`` entry) is scored against that reference with
    ``level_2_learning_comparisons.compute_novelty_paragraphB_vs_A`` and the
    change relative to the running baseline is stored under
    ``"semantic_expansion"``.

    Errors raised while scoring a single action are printed with their
    traceback and that action is skipped. A session id without an entry in the
    prewriting table raises ``KeyError``.

    Args:
        level_2_actions_per_session_suggestions (dict): A dictionary where each session key maps to a list of level 2 actions.

    Returns:
        dict: The same dictionary object that was passed in, with ``semantic_expansion`` added in place to each action that has a ``writing_type``.
    """

    # Session id -> id of the file that holds the session's prewriting content.
    # Constant, so it is built once rather than on every session iteration.
    prewriting_ids = {
        "legislation_corporate_1": "ab7b7355bfde4657b68f7a2985e494a9",
        "legislation_antitrust_1": "4d85d96c21494339bfe570b87f354fec",
        "legislation_corporate_2": "4517db08add74465b0888b3002170e59",
        "legislation_corporate_3": "769f457ab1f2458abf376c4d248ae34e",
        "legislation_antitrust_2": "7204e89b27a744c3a256197c77b33f19",
        "legislation_corporate_4": "8e633d2f2e9246828ab498f5388559bc",
        "legislation_antitrust_3": "039142481529449685e10dd7d2a250d6",
        "legislation_corporate_5": "81d7a2e4bdf94cb3a2376afc93410a10",
        "legislation_antitrust_4": "4eedf1ef4ea14d769e65d657f4145f0c",
        "legislation_antitrust_5": "9f742fd07a35428eacee633b2a284120",
        "legislation_corporate_6": "ada424f7f72e46c18c906f14ac4b0e2e",
        "legislation_antitrust_6": "e626c3e15f9641ef94298979e83f9bed",
        "legislation_antitrust_7": "9c861b5263f24c33832ae6dee287cb42",
        "legislation_antitrust_8": "b6a1710464d047b2aef7979a1f66dd5c",
    }

    for session_id, actions_lst in tqdm(
        level_2_actions_per_session_suggestions.items(), desc="Parsing Level 2 Actions"
    ):
        filename = utils.get_filename("prewriting_content/", prewriting_ids[session_id])
        prewriting_content = ""

        with open(filename, "r", encoding="utf-8") as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
                    continue
                record = json.loads(line)
                if "content" in record:
                    prewriting_content += record["content"]

        if "antitrust" in session_id:
            background_info = utils.background_info[0]
        else:
            background_info = utils.background_info[1]

        total_prewriting = background_info + prewriting_content
        A_sents = utils.sent_tokenize(total_prewriting)
        A_emb = level_2_learning_comparisons.prepare_paragraph_A(A_sents)

        # Novelty total that the next action's score is measured against.
        last_semantic_expansion_score = 0.0

        for idx, action in enumerate(actions_lst):
            try:
                if "writing_type" not in action:
                    continue

                is_backstage = action["writing_type"] == "backstage"

                frontstage_text = action["frontstage_text"].copy()
                if is_backstage:
                    # Backstage text is scored together with the frontstage
                    # text for this action only; it is appended to a copy.
                    frontstage_text += utils.sent_tokenize(action["backstage_text"][-1])

                semantic_expansion_df = level_2_learning_comparisons.compute_novelty_paragraphB_vs_A(
                    A_sents, A_emb, frontstage_text
                )
                novelty = semantic_expansion_df["novelty"]
                is_suggestion = action["level_1_action_type"] == "present_suggestion"

                if is_suggestion:
                    # NOTE(review): the last five rows are treated as the
                    # presented suggestions and averaged rather than summed;
                    # presumably five suggestions are always shown -- confirm.
                    current_score = novelty.iloc[:-5].sum() + novelty.iloc[-5:].mean()
                else:
                    current_score = novelty.sum()
                action["semantic_expansion"] = current_score - last_semantic_expansion_score

                # The original condition compared the literals "writing_type"
                # and "backstage", which is always False, so the backstage
                # baseline below was never used.
                if is_backstage:
                    # Leave the rows appended for the backstage text out of
                    # the baseline carried to the next action.
                    # NOTE(review): `[:-1]` assumes the backstage addition is
                    # one sentence -- confirm.
                    if is_suggestion:
                        last_semantic_expansion_score = novelty.iloc[:-5].sum()
                    else:
                        last_semantic_expansion_score = novelty.iloc[:-1].sum()
                else:
                    last_semantic_expansion_score = novelty.sum()

                print(action["level_1_action_type"])
                print(action["writing_type"])
                print(action["semantic_expansion"])

            except Exception as e:
                import traceback

                print(f"\n!! Error in session {session_id}, action index {idx} !!")
                traceback.print_exc()
                continue

    # Return the dictionary that was annotated, not an unrelated global.
    return level_2_actions_per_session_suggestions


# Parse Level 2 actions semantic expansion.
# The Level 1 JSON is not reloaded here: this pass consumes the dictionary
# annotated by the previous steps, and rebinding the global
# `level_1_actions_per_session` to freshly loaded data would discard the
# in-memory annotations held under that name.
level_2_actions_per_session_semantic = parse_level_2_actions_semantic(
    level_2_actions_per_session_suggestions
)

In [None]:
# Inspect the "legislation_corporate_6" session: collect the (index, label)
# pairs for both label levels, then list the actions that carry a Level 1 type
# but never received a writing_type.
level_1_actions = [
    (position, entry["level_1_action_type"])
    for position, entry in enumerate(level_2_actions_per_session["legislation_corporate_6"])
    if "level_1_action_type" in entry
]

level_2_actions = [
    (position, entry["level_2_action_type"])
    for position, entry in enumerate(level_2_actions_per_session["legislation_corporate_6"])
    if "level_2_action_type" in entry
]

only_level1 = [
    (position, entry["level_1_action_type"], entry["action_delta"])
    for position, entry in enumerate(level_2_actions_per_session["legislation_corporate_6"])
    if entry.get("level_1_action_type") and not entry.get("writing_type")
]

# One line per unlabelled action: index, Level 1 type, delta.
for i, j, k in only_level1:
    print(i, j, k)

# Level 2 entries labelled "human_edit_ai".
print([pair for pair in level_2_actions if pair[1] == "human_edit_ai"])



In [None]:
# Use the current working directory (the directory the kernel is running in).
# The next cell recomputes this value before building the output path.
script_dir = os.getcwd()

In [None]:
# Define the output file path (inside the current working directory)
script_dir = os.getcwd()
output_file = os.path.join(script_dir, "level_2_actions_per_session_semantic.json")

# Save the parsed actions to the JSON file. The encoding is stated explicitly,
# like every other open() call in this notebook, so the result does not depend
# on the platform's default encoding. Values json cannot serialize natively are
# passed to utils.custom_serializer.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(
        level_2_actions_per_session_semantic, f, default=utils.custom_serializer
    )

# Confirm successful save
print(f"Level 2 actions successfully saved to: {output_file}")