In [None]:
import os
import pandas as pd
import re

folder_path = "data"

def clean_split(data):
    """ Cleans and properly splits a comma-separated string while preserving spaces in words like 'long-circle'. """
    return [x.strip() for x in re.split(r",\s(?=[a-zA-Z])", data.strip(", "))] if data else []

def parse_file(file_path):
    with open(file_path, "r") as f:
        content = f.read()

    print(f"Processing {file_path}")

    timestamp_match = re.search(r"^([\d\-T:.Z]+), click,", content)
    timestamp = timestamp_match.group(1) if timestamp_match else "Unknown"

    times = re.findall(r"Times \(Movement Time\):\s*([\d., ]+) seconds", content)
    distances = re.findall(r"Distances:\s*([\d., ]+)", content)
    shapes = re.findall(r"Target Shapes:\s*([\w\- ,]+)", content)
    positions = re.findall(r"\(([\d.]+)\s*,\s*([\d.]+)\)", content)
    positions = [(float(x), float(y)) for x, y in positions] if positions else []

    times = [float(x) for x in times[0].strip(", ").split(", ")] if times else []
    distances = [float(x) for x in distances[0].strip(", ").split(", ")] if distances else []
    shapes = clean_split(shapes[0]) if shapes else []

    errors_section = re.search(r"Errors:\s*\n([\s\S]+?)\n\s*Overshoots", content)
    errors = {}
    if errors_section:
        error_trials = re.findall(r"Trial (\d+):\s*([\d.]+(?:,\s*[\d.]+)*|None)", errors_section.group(1))
        for trial, val in error_trials:
            trial_num = int(trial)
            errors[trial_num] = [float(x) for x in val.split(", ")] if val.lower() != "none" else []

    overshoots_section = re.search(r"Overshoots \(correction times\):\s*\n([\s\S]+?)\n\s*Total Hover", content)
    overshoots = {}
    if overshoots_section:
        overshoot_trials = re.findall(r"Trial (\d+):\s*([\d.]+(?:,\s*[\d.]+)*|None)", overshoots_section.group(1))
        for trial, val in overshoot_trials:
            trial_num = int(trial)
            overshoots[trial_num] = [float(x) for x in val.split(", ")] if val.lower() != "none" else []

    hover_times = re.findall(r"Total Hover Times \(per trial\):\s*([\d., ]+) seconds", content)
    hover_times = [float(x) for x in hover_times[0].strip(", ").split(", ")] if hover_times else []

    first_entry_click_delay = re.findall(r"First Entry Click Delays:\s*([\d., ]+) seconds", content)
    print(first_entry_click_delay)
    first_entry_click_delay = [float(x) for x in first_entry_click_delay[0].strip(", ").split(", ")] if first_entry_click_delay else []

    most_recent_click_delay = re.findall(r"Most Recent Entry Click Delays:\s*([\d., ]+) seconds", content)
    most_recent_click_delay = [float(x) for x in most_recent_click_delay[0].strip(", ").split(", ")] if most_recent_click_delay else []

    # Parse cursor path and segment it per trial
    cursor_path_match = re.search(r"Cursor Path:\n([\s\S]+)", content)
    trial_cursor_paths = []
    current_path = []

    
    if cursor_path_match:
        lines = cursor_path_match.group(1).strip().split("\n")
        prev_time = None
        
        for line in lines:
            parts = line.split(", ")
            if len(parts) == 3:
                try:
                    time, x, y = float(parts[0]), float(parts[1]), float(parts[2])

                    # If the time decreases, it's a new trial
                    if prev_time is not None and time < prev_time:
                        trial_cursor_paths.append(current_path)
                        current_path = []  # Start a new trial path

                    current_path.append((time, x, y))
                    prev_time = time
                except ValueError:
                    pass  # Skip malformed lines
        
        # Append last detected trial's cursor path
        if current_path:
            trial_cursor_paths.append(current_path)

    # Make sure we have as many cursor paths as trials
    while len(trial_cursor_paths) < len(times):
        trial_cursor_paths.append([])  # Fill in missing trials with empty paths

    # Create trials list
    trials = []
    for i in range(len(times)):
        trials.append({
            "timestamp": timestamp,
            "trial": i + 1,
            "click_time": times[i] if i < len(times) else None,
            "distance": distances[i] if i < len(distances) else None,
            "target_shape": shapes[i] if i < len(shapes) else None,
            "target_position": positions[i] if i < len(positions) else None,
            "errors": errors.get(i + 1, []),
            "overshoot_correction_times": overshoots.get(i + 1, []),
            "hover_time": hover_times[i] if i < len(hover_times) else None,
            "first_entry_click_delay": first_entry_click_delay[i] if i < len(first_entry_click_delay) else None,
            "most_recent_click_delay": most_recent_click_delay[i] if i < len(most_recent_click_delay) else None,
            "session_cursor_path": trial_cursor_paths[i] if i < len(trial_cursor_paths) else []
        })

    return trials

# Process all .txt files
all_trials = []
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        all_trials.extend(parse_file(file_path))

# Convert to DataFrame
df = pd.DataFrame(all_trials)
df


Processing data\glove_test_log (2).txt
[]
Processing data\glove_test_log (3).txt
[]


Unnamed: 0,timestamp,trial,click_time,distance,target_shape,target_position,errors,overshoot_correction_times,hover_time,first_entry_click_delay,most_recent_click_delay,session_cursor_path
0,2025-03-21T18:21:37.879Z,1,4.348,1613.56,circle,"(1506.1, 579.1)",[],[],0.517,,,"[(8.4, 898.0, 520.0), (267.5, 898.0, 519.0), (..."
1,2025-03-21T18:21:37.879Z,2,0.791,570.92,circle,"(525.5, 223.2)",[],[],0.27,,,"[(7.5, 1524.0, 747.0), (39.0, 1523.0, 747.0), ..."
2,2025-03-21T18:21:37.879Z,3,0.75,1379.2,long-circle,"(1373.3, 126.9)",[],[],0.222,,,"[(28.3, 551.0, 383.0), (49.4, 552.0, 383.0), (..."
3,2025-03-21T18:21:37.879Z,4,0.948,1829.47,circle,"(1725.6, 607.7)",[],[],0.177,,,"[(21.4, 1410.0, 298.0), (28.1, 1409.0, 298.0),..."
4,2025-03-21T18:21:37.879Z,5,0.7,1435.94,long-rectangle,"(1406.5, 289.3)",[],[],0.232,,,"[(33.6, 1764.0, 768.0), (39.8, 1763.0, 768.0),..."
5,2025-03-21T18:24:28.682Z,1,3.483,1932.21,circle,"(1844.3, 576.1)",[],"[0.604, 0.485, 0.77]",0.819,,,"[(217.2, 932.0, 541.0), (221.6, 932.0, 540.0),..."
6,2025-03-21T18:24:28.682Z,2,0.833,1752.87,circle,"(1715.3, 361.1)",[],[],0.165,,,"[(117.5, 1866.0, 748.0), (120.6, 1865.0, 748.0..."
7,2025-03-21T18:24:28.682Z,3,1.997,580.38,long-circle,"(256.5, 520.6)",[],[0.693],0.372,,,"[(48.2, 1733.0, 536.0), (68.3, 1732.0, 535.0),..."
8,2025-03-21T18:24:28.682Z,4,1.265,151.3,long-rectangle,"(118.7, 93.9)",[],[],0.433,,,"[(186.6, 307.0, 686.0), (193.8, 308.0, 685.0),..."
9,2025-03-21T18:24:28.682Z,5,2.135,1424.85,long-rectangle,"(1419.9, 118.3)","[136.73, 92.3]",[],0.251,,,"[(257.2, 147.0, 291.0), (260.6, 148.0, 291.0),..."


In [31]:
import numpy as np

# Assume screen center at (960, 540)
screen_center = (960, 540)

# Add new columns
df["prev_target_distance"] = np.nan
df["velocity"] = np.nan
df["error_count"] = df["errors"].apply(len)
df["total_overshoot_time"] = df["overshoot_correction_times"].apply(sum)
df["actual_path_length"] = np.nan
df["optimal_path_ratio"] = np.nan
df["extra_movement_percentage"] = np.nan

# Compute distances and path efficiency
for i in range(len(df)):
    if df.loc[i, "target_position"] is not None:
        if df.loc[i, "trial"] % 5 == 0:  # First trial in set
            prev_x, prev_y = screen_center
        else:  # Use previous target position
            prev_x, prev_y = df.loc[i - 1, "target_position"] if i > 0 else screen_center

        # Compute Euclidean distance
        x, y = df.loc[i, "target_position"]
        df.loc[i, "prev_target_distance"] = np.sqrt((x - prev_x) ** 2 + (y - prev_y) ** 2)

        # Compute velocity (distance/time)
        if df.loc[i, "click_time"] > 0:
            df.loc[i, "velocity"] = df.loc[i, "prev_target_distance"] / df.loc[i, "click_time"]

        # Compute path efficiency
        if df.loc[i, "session_cursor_path"]:
            path = np.array(df.loc[i, "session_cursor_path"])[:, :2]  # Extract only (x, y)
            actual_path_length = np.sum(np.linalg.norm(np.diff(path, axis=0), axis=1))
            df.loc[i, "actual_path_length"] = actual_path_length

            if df.loc[i, "prev_target_distance"] > 0:
                df.loc[i, "optimal_path_ratio"] = actual_path_length / df.loc[i, "prev_target_distance"]
                df.loc[i, "extra_movement_percentage"] = (actual_path_length - df.loc[i, "prev_target_distance"]) / df.loc[i, "prev_target_distance"]

# Reorder columns
cols = [col for col in df.columns if col != "session_cursor_path"] + ["session_cursor_path"]
df = df[cols]

df.head()


Unnamed: 0,timestamp,trial,click_time,distance,target_shape,target_position,errors,overshoot_correction_times,hover_time,first_entry_click_delay,most_recent_click_delay,prev_target_distance,velocity,error_count,total_overshoot_time,actual_path_length,optimal_path_ratio,extra_movement_percentage,session_cursor_path
0,2025-03-21T18:21:37.879Z,1,4.348,1613.56,circle,"(1506.1, 579.1)",[],[],0.517,,,547.497963,125.919495,0,0.0,4765.437026,8.704027,7.704027,"[(8.4, 898.0, 520.0), (267.5, 898.0, 519.0), (..."
1,2025-03-21T18:21:37.879Z,2,0.791,570.92,circle,"(525.5, 223.2)",[],[],0.27,,,1043.187984,1318.821724,0,0.0,1301.373894,1.247497,0.247497,"[(7.5, 1524.0, 747.0), (39.0, 1523.0, 747.0), ..."
2,2025-03-21T18:21:37.879Z,3,0.75,1379.2,long-circle,"(1373.3, 126.9)",[],[],0.222,,,853.251739,1137.668985,0,0.0,1145.617763,1.342649,0.342649,"[(28.3, 551.0, 383.0), (49.4, 552.0, 383.0), (..."
3,2025-03-21T18:21:37.879Z,4,0.948,1829.47,circle,"(1725.6, 607.7)",[],[],0.177,,,596.056986,628.752094,0,0.0,1165.873087,1.955976,0.955976,"[(21.4, 1410.0, 298.0), (28.1, 1409.0, 298.0),..."
4,2025-03-21T18:21:37.879Z,5,0.7,1435.94,long-rectangle,"(1406.5, 289.3)",[],[],0.232,,,512.067125,731.524464,0,0.0,728.918866,1.423483,0.423483,"[(33.6, 1764.0, 768.0), (39.8, 1763.0, 768.0),..."
