In [1]:
import json
import re
from pathlib import Path
from PIL import Image
from IPython.display import display
from datetime import datetime
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Collect the screenshots as a list of (datetime, path_to_image) tuples,
# sorted by datetime. The capture time is parsed from the timestamp that is
# embedded in each screenshot's file name.

# Directory path
screenshots_dir = Path('../mining/logs/screenshots')

# Regular expression to match timestamp patterns like 20250424_173019_991476 in the screenshot name
timestamp_pattern = re.compile(r'\d{8}_\d{6}_\d{6}')

# Accepted file extensions (lower-case, including the leading dot).
# This has to be a real tuple: `('.jpg')` is just the string '.jpg', and
# `suffix in '.jpg'` is a substring test that also accepts files with no
# extension at all (suffix == '') as well as '.j' / '.jp'.
# Add '.png', '.jpeg', '.gif' here to pick up other image formats.
image_suffixes = ('.jpg',)

# Store (datetime, path) pairs
images_with_timestamps = []

for image_path in screenshots_dir.iterdir():
    # Skip directories and anything that is not an accepted image type
    if not (image_path.is_file() and image_path.suffix.lower() in image_suffixes):
        continue

    # Search the file name for the timestamp
    filename = image_path.name
    match = timestamp_pattern.search(filename)
    if match is None:
        print(f"No timestamp found in {filename}")
        continue

    # Convert the string to a datetime object (%f keeps the microseconds)
    timestamp_str = match.group()
    dt = datetime.strptime(timestamp_str, '%Y%m%d_%H%M%S_%f')

    images_with_timestamps.append((dt, image_path))

# Sort the list in-place by datetime; the click matching relies on this order
images_with_timestamps.sort(key=lambda x: x[0])

print("number of screenshots: ", len(images_with_timestamps))

number of screenshots:  257


In [3]:
# Go through the sanitized json log, find the single-click events, and match
# each one with the most recent screenshot taken at or before the click.
# The matches are stored as a Hugging Face dataset on disk.

sanitized_dir = Path('../mining/logs/sanitized_json')

# TODO: go through all sanitized jsons
# TODO: group by session and add session prompt to json

# Load the JSON file
json_path = sanitized_dir / 'sanitized_20250428_170208_0001.json'

# Read as UTF-8 explicitly instead of relying on the platform default encoding
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

click_events = [
    event for event in data['events']
    if event['event'] == 'MOUSE' and event['event_type'] == 'SINGLE_CLICK'
]

# The screenshot pointer below only ever moves forward, so the clicks must be
# visited in chronological order. Sort them by their parsed timestamp instead
# of trusting the order of the log; the sort is stable, so an already ordered
# log is processed exactly as before.
# NOTE(review): the comparison with the screenshot datetimes assumes the log
# timestamps are naive (no UTC offset), like the ones parsed from the file
# names - confirm for logs from other sources.
timed_clicks = sorted(
    ((datetime.fromisoformat(event['timestamp']), event) for event in click_events),
    key=lambda pair: pair[0],
)

# For each click, find the nearest screenshot before it
screenshot_idx = 0
last_valid_screenshot = None
matches = []

for click_time, click_event in timed_clicks:
    # Move screenshot pointer forward as long as the screenshot time is <= click time
    while (screenshot_idx < len(images_with_timestamps) and
           images_with_timestamps[screenshot_idx][0] <= click_time):
        last_valid_screenshot = images_with_timestamps[screenshot_idx]
        screenshot_idx += 1

    # None means that no screenshot was taken at or before this click
    if last_valid_screenshot is not None:
        match = {
            "click_timestamp": click_time.isoformat(),
            "click_x": click_event["x"],
            "click_y": click_event["y"],
            "matched_screenshot_timestamp": last_valid_screenshot[0].isoformat(),
            "matched_screenshot_filename": last_valid_screenshot[1].name
        }
        matches.append(match)
        print(f"Click at {click_time} matched with screenshot taken at {last_valid_screenshot[0]}")
    else:
        print(f"No screenshot before click at {click_time}")


# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_list(matches)

# Save to disk
hf_dataset.save_to_disk('../data/processed/hf_click_image_dataset')
print("Dataset saved to disk!")


# # Save to a JSON file
# output_path = Path('../data/processed/matched_clicks.json')

# with open(output_path, 'w') as f:
#     json.dump(matches, f, indent=4)

# print(f"Saved {len(matches)} matches to {output_path}")


Click at 2025-04-28 17:02:08.530037 matched with screenshot taken at 2025-04-28 17:02:08.452197
Click at 2025-04-28 17:02:10.646456 matched with screenshot taken at 2025-04-28 17:02:10.530986
Click at 2025-04-28 17:02:15.363940 matched with screenshot taken at 2025-04-28 17:02:15.223790
Click at 2025-04-28 17:02:16.578514 matched with screenshot taken at 2025-04-28 17:02:16.304256
Click at 2025-04-28 17:02:17.466413 matched with screenshot taken at 2025-04-28 17:02:17.262577


Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 38.44 examples/s]

Dataset saved to disk!



