# Normalization Process Documentation

This notebook normalizes the mining data collected in the `logs` directory:

1. **For Screenshots**:
   - Resizes images to max 1120x1120 pixels while preserving aspect ratio
   - Keeps original filenames
   - Only processes screenshots taken during user activity (within 15 seconds of events)
   
2. **For JSON Files**:
   - Normalizes click coordinates to 0-1 range (if not already normalized)
   - Normalizes by dividing x by the screen width and y by the screen height of the machine running this notebook (as reported by `pyautogui.size()`)
   - Preserves all other data
   
3. **Output Structure**:
   - Saves to a specific folder in `../data/normalized/[hostname]_[timestamp]/`
   - Maintains the same folder structure as the original logs directory
   - Copies session_prompts.log if it exists

## How to Use

1. Run all cells in order
2. The normalized data will be saved to a new directory in `../data/normalized/`
3. Each normalization run creates a subfolder named `[hostname]_[timestamp]`

Once the run completes, the normalized data can be used with your analysis or training scripts.

In [10]:
import os
import json
import glob
import re
import shutil
import socket
from datetime import datetime
from PIL import Image
import pyautogui
from tqdm import tqdm
from datasets import Dataset

In [11]:
# Configuration
# Resolution of the display attached to the machine running this notebook;
# the JSON normalization step divides pixel coordinates by these values.
screen_width, screen_height = pyautogui.size()
print(f"Screen size: {screen_width}x{screen_height}")

# Input locations (all derived from the repository root one level up)
ROOT_DIR = os.path.abspath('../')
MINING_DIR = os.path.join(ROOT_DIR, 'mining')
# Alternative input location, currently disabled:
# LOGS_DIR = os.path.join(MINING_DIR, 'logs')
LOGS_DIR = os.path.join(ROOT_DIR, 'data', 'old 5-1-2025')
SCREENSHOTS_DIR, JSON_DIR = (
    os.path.join(LOGS_DIR, leaf) for leaf in ('screenshots', 'sanitized_json')
)

# Each run writes into its own "<hostname>_<timestamp>" subfolder
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
hostname = socket.gethostname().replace(".", "_")
NORMALIZED_SUBFOLDER = f"{hostname}_{timestamp}"

# Output locations mirror the layout of the input directory
OUTPUT_DIR = os.path.join(ROOT_DIR, 'data', 'normalized', NORMALIZED_SUBFOLDER)
OUTPUT_SCREENSHOTS_DIR, OUTPUT_JSON_DIR = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ('screenshots', 'sanitized_json')
)

# Make sure every output directory exists before any worker writes to it
for _out_dir in (OUTPUT_DIR, OUTPUT_SCREENSHOTS_DIR, OUTPUT_JSON_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print(f"Input: {LOGS_DIR}\nOutput: {OUTPUT_DIR}")

Screen size: 1440x900
Input: /Users/jacob/git-repos/visual-data-mining/mining/logs_jacob_may_6_2025
Output: /Users/jacob/git-repos/visual-data-mining/data/normalized_my_mac_20250509_005206


In [12]:
# Core functions for normalization and filtering
def normalize_image_size(img, max_size=(1120, 1120), resample=None):
    """Shrink an image to fit inside ``max_size`` while preserving aspect ratio.

    Images that already fit are returned unchanged (the same object); images
    are never enlarged.

    Args:
        img: Object exposing ``size`` as ``(width, height)`` and a
            ``resize(size, resample)`` method (a PIL image in this notebook).
        max_size: ``(max_width, max_height)`` bounding box in pixels.
        resample: Resampling filter passed to ``img.resize``. Defaults to
            ``Image.Resampling.LANCZOS`` when omitted.

    Returns:
        ``img`` itself when no downscaling is needed, otherwise the result of
        ``img.resize``.
    """
    original_width, original_height = img.size
    resize_ratio = min(max_size[0] / original_width, max_size[1] / original_height)

    if resize_ratio >= 1:
        return img

    if resample is None:
        resample = Image.Resampling.LANCZOS

    # int() truncates, so a very elongated image could end up with a side of
    # 0 pixels; clamp both sides to at least one pixel.
    new_size = (
        max(1, int(original_width * resize_ratio)),
        max(1, int(original_height * resize_ratio)),
    )
    return img.resize(new_size, resample)

def extract_activity_timestamps(filepath):
    """Collect the timestamps of all events listed in a JSON log file.

    The file is expected to hold an object with an ``events`` list. For each
    event the ``timestamp`` field is converted to seconds:

    * strings are parsed as ISO-8601 (a trailing ``Z`` is accepted);
    * numbers above ``1e10`` are divided by 1000, smaller numbers are kept
      as-is.

    Events without a usable timestamp, and entries that are not JSON objects,
    are skipped instead of discarding the whole file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A list of numeric timestamps. An empty list is returned (after
        printing a message) when the file cannot be read or parsed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        timestamps = []
        for event in data.get('events', []):
            if not isinstance(event, dict):
                # A malformed entry should not cost us the valid ones.
                continue

            ts = event.get('timestamp')
            if not ts:
                continue

            if isinstance(ts, str):
                try:
                    dt = datetime.fromisoformat(ts.replace('Z', '+00:00'))
                    timestamps.append(dt.timestamp())
                except (ValueError, OverflowError, OSError):
                    # Unparseable or out-of-range date string: skip this event.
                    continue
            elif isinstance(ts, (int, float)):
                timestamps.append(ts / 1000 if ts > 1e10 else ts)

        return timestamps
    except Exception as e:
        print(f"Error extracting timestamps from {filepath}: {e}")
        return []

def merge_time_ranges(ranges):
    """Collapse overlapping or touching ``(start, end)`` ranges.

    The input is sorted first, then each range is either folded into the
    previously kept range (when it starts at or before that range's end) or
    kept as a new range.

    Returns:
        A new list of ranges in ascending order; ``[]`` for empty input.
    """
    if not ranges:
        return []

    ordered = list(ranges)
    ordered.sort()

    result = []
    for span in ordered:
        if result and span[0] <= result[-1][1]:
            previous = result[-1]
            # Extend the kept range; a fully contained span leaves it unchanged.
            result[-1] = (previous[0], max(previous[1], span[1]))
        else:
            result.append(span)

    return result

def extract_timestamp_from_filename(filename):
    """Parse the capture time out of a screenshot filename.

    Filenames are expected to contain ``screen_<YYYYMMDD>_<HHMMSS>_<n>.jpg``.
    The date and time are parsed as a naive ``datetime`` and converted with
    ``datetime.timestamp()``.

    Args:
        filename: Screenshot file name (or path) to inspect.

    Returns:
        The timestamp as a float, or ``None`` when the name does not match
        the pattern or the digits do not form a valid date/time.
    """
    match = re.search(r'screen_(\d{8})_(\d{6})_\d+\.jpg', filename)
    if not match:
        return None

    try:
        dt = datetime.strptime(f"{match.group(1)}_{match.group(2)}", "%Y%m%d_%H%M%S")
        return dt.timestamp()
    except (ValueError, OverflowError, OSError):
        # The digits matched the pattern but are not a real date (e.g. month
        # 13); treat the file as having no timestamp rather than aborting the
        # caller's loop.
        return None

def is_in_activity_range(timestamp, activity_ranges):
    """Return True when ``timestamp`` lies inside any ``(start, end)`` range.

    Both range bounds are inclusive.
    """
    for start, end in activity_ranges:
        if start <= timestamp <= end:
            return True
    return False

def process_screenshot(filepath):
    """Resize one screenshot and save it under ``OUTPUT_SCREENSHOTS_DIR``.

    The output keeps the input's file name. Failures are reported in the
    returned dict rather than raised, so one bad file does not stop a batch.

    Args:
        filepath: Path of the screenshot to process.

    Returns:
        ``{"status": "success", "filename": ...}`` on success, or
        ``{"status": "error", "filename": ..., "error": <message>}`` when
        anything goes wrong while opening, resizing or saving.
    """
    filename = os.path.basename(filepath)
    try:
        output_path = os.path.join(OUTPUT_SCREENSHOTS_DIR, filename)

        # The context manager releases the source file handle even when
        # resizing or saving fails.
        with Image.open(filepath) as img:
            normalized_img = normalize_image_size(img)
            normalized_img.save(output_path, quality=95)

        return {"status": "success", "filename": filename}
    except Exception as e:
        return {"status": "error", "filename": filename, "error": str(e)}

def process_json_file(filepath, output_dir=None, screen_size=None):
    """Normalize mouse coordinates in one JSON log and write the result.

    For every event whose ``event`` field is ``'MOUSE'``, the ``x`` and ``y``
    values are divided by the screen width and height respectively when they
    look like pixel values (any ``int``, or any number greater than 1).
    Values that are floats no greater than 1 are left untouched. Everything
    else in the file is written back unchanged, under the same file name.

    Args:
        filepath: Path of the JSON file to process.
        output_dir: Directory to write to. Defaults to the notebook-level
            ``OUTPUT_JSON_DIR``.
        screen_size: ``(width, height)`` used as divisors. Defaults to the
            notebook-level ``(screen_width, screen_height)``.

    Returns:
        On success: ``{"status": "success", "filename", "normalized",
        "events_normalized"}`` where ``events_normalized`` is the number of
        mouse events that had at least one coordinate rescaled and
        ``normalized`` is True when that number is non-zero.
        On failure: ``{"status": "error", "filename", "error"}``; nothing is
        raised.
    """
    try:
        filename = os.path.basename(filepath)
        if output_dir is None:
            output_dir = OUTPUT_JSON_DIR
        if screen_size is None:
            screen_size = (screen_width, screen_height)
        divisors = {'x': screen_size[0], 'y': screen_size[1]}
        output_path = os.path.join(output_dir, filename)

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        events_normalized = 0

        for event in data.get('events', []):
            if event.get('event') != 'MOUSE':
                continue

            event_changed = False
            for coord, divisor in divisors.items():
                if coord not in event:
                    continue
                value = event[coord]
                if isinstance(value, int) or value > 1:
                    event[coord] = value / divisor
                    event_changed = True

            # Count each event once, even when both x and y were rescaled.
            if event_changed:
                events_normalized += 1

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        return {
            "status": "success",
            "filename": filename,
            "normalized": events_normalized > 0,
            "events_normalized": events_normalized
        }
    except Exception as e:
        return {"status": "error", "filename": os.path.basename(filepath), "error": str(e)}

In [13]:
# Execute the optimized normalization process
# Screenshots taken within this many seconds of any logged event are kept.
ACTIVITY_BUFFER_SECONDS = 15

# os.cpu_count() may return None when the count cannot be determined.
num_cores = max(1, (os.cpu_count() or 1) // 2)
print(f"Using {num_cores} cores for parallel processing")

# First, extract activity timestamps and build activity ranges
json_files = glob.glob(os.path.join(JSON_DIR, '*.json'))
print(f"Analyzing user activity from {len(json_files)} JSON files...")

# Extract activity timestamps and create ranges
print("Extracting activity timestamps...")
all_timestamps = []
for filepath in tqdm(json_files, desc="Processing JSON files"):
    all_timestamps.extend(extract_activity_timestamps(filepath))

print(f"Found {len(all_timestamps)} activity events")
time_ranges = [
    (t - ACTIVITY_BUFFER_SECONDS, t + ACTIVITY_BUFFER_SECONDS) for t in all_timestamps
]
merged_ranges = merge_time_ranges(time_ranges)
print(f"Created {len(merged_ranges)} activity ranges after merging")

# Calculate total activity time
total_activity_seconds = sum(end - start for start, end in merged_ranges)
total_activity_hours = total_activity_seconds / 3600
print(f"Total activity time: {total_activity_hours:.2f} hours")

# Pre-filter screenshot files
print("Pre-filtering screenshots based on activity...")
screenshot_files = glob.glob(os.path.join(SCREENSHOTS_DIR, "*.jpg"))
print(f"Found {len(screenshot_files)} total screenshots")

filtered_screenshot_files = []
for filepath in tqdm(screenshot_files, desc="Filtering screenshots"):
    # Use a dedicated name so the run-level `timestamp` string from the
    # configuration cell is not overwritten.
    shot_timestamp = extract_timestamp_from_filename(os.path.basename(filepath))
    if shot_timestamp is not None and is_in_activity_range(shot_timestamp, merged_ranges):
        filtered_screenshot_files.append(filepath)

# Avoid dividing by zero when the screenshots directory is empty or missing.
filtering_ratio = (
    len(filtered_screenshot_files) / len(screenshot_files) * 100 if screenshot_files else 0.0
)
print(f"Filtered to {len(filtered_screenshot_files)} screenshots (keeping {filtering_ratio:.2f}%)")

# Process filtered screenshots and JSON files in parallel
screenshots_dataset = Dataset.from_dict({"filepath": filtered_screenshot_files})
json_dataset = Dataset.from_dict({"filepath": json_files})

# NOTE(review): the saved output of this cell shows the screenshot step
# stopping with a TimeoutError about 5% of the way through; the cause is not
# visible from this notebook and should be investigated before relying on a
# full run.
print(f"Processing {len(filtered_screenshot_files)} filtered screenshots...")
screenshot_results = screenshots_dataset.map(
    lambda example: process_screenshot(example["filepath"]),
    num_proc=num_cores, batched=False, desc="Processing screenshots"
)

print(f"Processing {len(json_files)} JSON files...")
json_results = json_dataset.map(
    lambda example: process_json_file(example["filepath"]),
    num_proc=num_cores, batched=False, desc="Processing JSON files"
)

# Extract statistics
# .get() keeps the summary working even when a result set has no "status"
# column (for example when there was nothing to process).
success_screenshots = sum(1 for result in screenshot_results if result.get("status") == "success")
error_screenshots = sum(1 for result in screenshot_results if result.get("status") == "error")
success_json = sum(1 for result in json_results if result.get("status") == "success")
error_json = sum(1 for result in json_results if result.get("status") == "error")
files_with_nonnormalized_coords = sum(1 for result in json_results if result.get("normalized", False))
total_events_normalized = sum(result.get("events_normalized", 0) or 0 for result in json_results)

# Print summary
print("\n===== Normalization Summary =====")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Screenshots filtered: {len(filtered_screenshot_files)} out of {len(screenshot_files)} ({filtering_ratio:.2f}%)")
print(f"Screenshots normalized: {success_screenshots} (Errors: {error_screenshots})")
print(f"JSON files processed: {success_json} (Errors: {error_json})")
print(f"Files with coordinates normalized: {files_with_nonnormalized_coords}")
print(f"Total events normalized: {total_events_normalized}")

# Copy session_prompts.log if it exists
session_prompts_path = os.path.join(LOGS_DIR, "session_prompts.log")
if os.path.exists(session_prompts_path):
    shutil.copy2(session_prompts_path, os.path.join(OUTPUT_DIR, "session_prompts.log"))
    print("Copied session_prompts.log to output directory")

print("\nNormalization complete! The normalized data is ready for use.")

Using 4 cores for parallel processing
Analyzing user activity from 2222 JSON files...
Extracting activity timestamps...


Processing JSON files: 100%|██████████| 2222/2222 [00:00<00:00, 2614.51it/s]


Found 196451 activity events
Created 371 activity ranges after merging
Total activity time: 15.29 hours
Pre-filtering screenshots based on activity...
Found 378469 total screenshots


Filtering screenshots: 100%|██████████| 378469/378469 [00:10<00:00, 36050.07it/s]

Filtered to 131385 screenshots (keeping 34.71%)
Processing 131385 filtered screenshots...



Processing screenshots (num_proc=4):   5%|▌         | 6916/131385 [03:51<1:09:23, 29.90 examples/s]


TimeoutError: 