In [9]:
import json
import re
from pathlib import Path
from PIL import Image
from IPython.display import display
from datetime import datetime
from datasets import Dataset, Features, Image as HFImage, Value

In [10]:
# Build `images_with_timestamps`: a list of (datetime, Path) pairs, one per
# screenshot file, sorted chronologically by the timestamp embedded in the name.

# Directory that is scanned (non-recursively) for screenshots
screenshots_dir = Path('../mining/logs/screenshots')

# Lower-cased file suffixes that count as screenshots.
# This has to be a real tuple: the earlier `suffix in ('.jpg')` compared against
# a plain string, i.e. a substring test, so files without any extension
# ('' is a substring of every string) or with suffixes like '.j' slipped through.
SCREENSHOT_SUFFIXES = ('.jpg',)

# Matches timestamp patterns like 20250424_173019_991476 in the screenshot name
timestamp_pattern = re.compile(r'\d{8}_\d{6}_\d{6}')

# strptime layout of the matched text: date, time, microseconds
TIMESTAMP_FORMAT = '%Y%m%d_%H%M%S_%f'

# Store (datetime, path) pairs
images_with_timestamps = []

for image_path in screenshots_dir.iterdir():
    # Ignore directories and anything that is not a screenshot file
    if not image_path.is_file() or image_path.suffix.lower() not in SCREENSHOT_SUFFIXES:
        continue

    filename = image_path.name

    # Search for the timestamp anywhere in the file name
    match = timestamp_pattern.search(filename)
    if match is None:
        print(f"No timestamp found in {filename}")
        continue

    # Convert the matched text to a datetime object
    timestamp_str = match.group()
    dt = datetime.strptime(timestamp_str, TIMESTAMP_FORMAT)

    images_with_timestamps.append((dt, image_path))

# Sort the list in-place by datetime
images_with_timestamps.sort(key=lambda x: x[0])

print("number of screenshots: ", len(images_with_timestamps))

number of screenshots:  33098


In [None]:
# Go through all sanitized json logs, find click events, and match them with the
# screenshot right before and add session prompts
# store everything in a Hugging Face dataset

sanitized_dir = Path('../mining/logs/sanitized_json')
session_prompts_file = Path('../mining/logs/session_prompts.log')

# Accumulates one dict per successfully matched click
all_matches = []

# Set the maximum allowed time difference (1 second)
MAX_TIME_DIFF_SECONDS = 1.0

# Load session prompts from log file.
# Each non-blank line is sliced by fixed columns: characters 0-18 hold the
# timestamp (YYYY-MM-DD HH:MM:SS) and the prompt text starts at column 21.
session_prompts = []

# The encoding is pinned so the fixed-width slicing below does not depend on the
# platform's default codec.
with open(session_prompts_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines
        if not line.strip():
            continue

        # The date separators may be ASCII hyphens or U+2011 non-breaking
        # hyphens (the earlier format string only accepted the latter);
        # normalise so that both variants parse.
        timestamp_str = line[:19].replace('\u2011', '-')
        prompt = line[21:].strip()

        # Convert timestamp string to datetime object
        timestamp = datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')

        session_prompts.append({
            'timestamp': timestamp,
            'prompt': prompt
        })

print(f"Loaded {len(session_prompts)} session prompts from log file")

# Sort session prompts by timestamp
session_prompts.sort(key=lambda x: x['timestamp'])

# Gather the sanitized JSON logs. Their names contain timestamps, so sorting
# the paths processes them in chronological order.
json_files = sorted(
    candidate
    for candidate in sanitized_dir.iterdir()
    if candidate.is_file() and candidate.suffix.lower() == '.json'
)

# Clicks from every log are pooled here first so the matching step can work on
# a single, globally ordered list.
all_click_events = []

for json_path in json_files:
    print(f"Loading click events from {json_path.name}...")

    # Parse the whole log file
    with open(json_path, 'r') as log_file:
        data = json.load(log_file)

    # Keep only single mouse clicks, tagging each with the log it came from
    click_events = []
    for event in data['events']:
        if event['event'] != 'MOUSE':
            continue
        if event['event_type'] != 'SINGLE_CLICK':
            continue
        click_events.append({**event, "source_file": json_path.name})

    all_click_events += click_events
    print(f"Found {len(click_events)} click events in {json_path.name}")

# Order the pooled clicks by their ISO-format timestamp
all_click_events.sort(key=lambda x: datetime.fromisoformat(x['timestamp']))
print(f"Total click events across all files: {len(all_click_events)}")

# Match every click event to
#   (a) the latest screenshot taken at or before the click, and
#   (b) the session the click happened in.
#
# Clicks, screenshots and session prompts are all sorted chronologically, so
# each list only needs one pointer that moves forward; nothing is rescanned.
screenshot_idx = 0
session_idx = 0
last_valid_screenshot = None
skipped_events = 0

for click_event in all_click_events:
    click_time = datetime.fromisoformat(click_event['timestamp'])

    # Move screenshot pointer forward as long as the screenshot time is <= click time;
    # afterwards `last_valid_screenshot` is the newest screenshot not after the click.
    while (screenshot_idx < len(images_with_timestamps) and
           images_with_timestamps[screenshot_idx][0] <= click_time):
        last_valid_screenshot = images_with_timestamps[screenshot_idx]
        screenshot_idx += 1

    # Session timestamps represent the END of a session, so a click belongs to
    # the first session whose end timestamp lies strictly after the click.
    # Skip every session that had already ended by the time of the click ...
    while (session_idx < len(session_prompts) and
           session_prompts[session_idx]['timestamp'] <= click_time):
        session_idx += 1

    # ... and take the one the pointer now rests on. (Taking the entry at
    # `session_idx - 1` instead would pick a session that was already over
    # before the click happened.) A click after the last logged session end
    # has no session.
    # NOTE(review): relies on session_prompts.log timestamps being written when
    # a session ends, as the field name `session_end_timestamp` implies - confirm.
    current_session = session_prompts[session_idx] if session_idx < len(session_prompts) else None

    if last_valid_screenshot is None:
        print(f"No screenshot before click at {click_time}")
        skipped_events += 1
        continue

    screenshot_time, screenshot_file = last_valid_screenshot

    # Age of the screenshot relative to the click, in seconds (never negative)
    time_diff = (click_time - screenshot_time).total_seconds()

    # Only add to dataset if screenshot is within the allowed time range
    if time_diff > MAX_TIME_DIFF_SECONDS:
        print(f"Skipping click at {click_time}: Screenshot too old ({time_diff:.3f}s > {MAX_TIME_DIFF_SECONDS}s)")
        skipped_events += 1
        continue

    match = {
        "click_timestamp": click_time.isoformat(),
        "click_x": click_event["x"],
        "click_y": click_event["y"],
        "matched_screenshot_timestamp": screenshot_time.isoformat(),
        "matched_screenshot_filename": screenshot_file.name,
        "screenshot_image": str(screenshot_file),  # Store full path for HFImage feature
        "source_file": click_event["source_file"],
        "time_diff_seconds": time_diff
    }

    # Session columns are always present; they are empty strings when the
    # click could not be attributed to a session.
    if current_session:
        match["session_end_timestamp"] = current_session['timestamp'].isoformat()
        match["session_prompt"] = current_session['prompt']
    else:
        match["session_end_timestamp"] = ""
        match["session_prompt"] = ""

    all_matches.append(match)
    print(f"Click at {click_time} matched with screenshot taken at {screenshot_time} (diff: {time_diff:.3f}s)")

    if current_session:
        print(f"  Session: '{current_session['prompt']}' (ends at {current_session['timestamp']})")
    else:
        print("  No session prompt found for this click")

print(f"Total matches: {len(all_matches)}")
print(f"Skipped events: {skipped_events} (no screenshot within {MAX_TIME_DIFF_SECONDS}s)")

# Dataset schema, listed in column order. The `screenshot_image` column uses the
# Hugging Face Image feature and is populated from the screenshot's file path.
schema_columns = [
    ("click_timestamp", Value("string")),
    ("click_x", Value("float")),
    ("click_y", Value("float")),
    ("matched_screenshot_timestamp", Value("string")),
    ("matched_screenshot_filename", Value("string")),
    ("screenshot_image", HFImage()),
    ("source_file", Value("string")),
    ("time_diff_seconds", Value("float")),
    ("session_end_timestamp", Value("string")),
    ("session_prompt", Value("string")),
]
features = Features(dict(schema_columns))

# Build the Hugging Face Dataset from the collected matches using that schema
hf_dataset = Dataset.from_list(all_matches, features=features)

# Persist the dataset to disk
hf_dataset.save_to_disk('../data/processed/hf_click_image_dataset')
print("Dataset saved to disk with actual images!")

Loaded 5 session prompts from log file
Loading click events from sanitized_20250501_154830_0001.json...
Found 5 click events in sanitized_20250501_154830_0001.json
Loading click events from sanitized_20250501_154830_0002.json...
Found 2 click events in sanitized_20250501_154830_0002.json
Loading click events from sanitized_20250501_154830_0003.json...
Found 2 click events in sanitized_20250501_154830_0003.json
Loading click events from sanitized_20250501_154830_0004.json...
Found 1 click events in sanitized_20250501_154830_0004.json
Loading click events from sanitized_20250501_154830_0005.json...
Found 0 click events in sanitized_20250501_154830_0005.json
Loading click events from sanitized_20250501_154830_0006.json...
Found 1 click events in sanitized_20250501_154830_0006.json
Loading click events from sanitized_20250501_154830_0007.json...
Found 0 click events in sanitized_20250501_154830_0007.json
Loading click events from sanitized_20250501_154830_0008.json...
Found 1 click events 

Saving the dataset (1/1 shards): 100%|██████████| 1833/1833 [00:09<00:00, 187.70 examples/s]

Dataset saved to disk with actual images!



