# Normalization Process Documentation

This notebook normalizes the mining data collected in the `logs` directory:

1. **For Screenshots**:
   - Shrinks images larger than 1120x1120 pixels to fit within that size while preserving aspect ratio (smaller images are left unchanged)
   - Keeps original filenames
   
2. **For JSON Files**:
   - Normalizes mouse event coordinates to the 0-1 range (if not already normalized)
   - Normalizes by dividing x by the screen width and y by the screen height, as reported by `pyautogui` on the machine running this notebook
   - Preserves all other data
   
3. **Output Structure**:
   - Saves to a specific folder in `../data/normalized/[hostname]_[timestamp]/`
   - Maintains the same folder structure as the original logs directory
   - Copies session_prompts.log if it exists

## How to Use

1. Run all cells in order
2. The normalized data will be saved to a new directory in `../data/normalized/`
3. Each normalization run creates a subfolder named `[hostname]_[timestamp]`

You can now use this normalized data with your analysis or training scripts.

In [None]:
import os
import json
import glob
import re
import shutil
import socket
from datetime import datetime
from PIL import Image
import pyautogui
from tqdm import tqdm  # Changed from tqdm.notebook to avoid ipywidgets dependency

In [None]:
# Configuration

# Query the current display size; the JSON step divides pixel coordinates
# by these values.
screen_width, screen_height = pyautogui.size()
print(f"Screen size: {screen_width}x{screen_height}")

# Input locations
ROOT_DIR = os.path.abspath('../')
MINING_DIR = os.path.join(ROOT_DIR, 'mining')
# Alternative input location: os.path.join(MINING_DIR, 'logs')
LOGS_DIR = os.path.join(ROOT_DIR, 'data', 'old 5-1-2025')
SCREENSHOTS_DIR = os.path.join(LOGS_DIR, 'screenshots')
JSON_DIR = os.path.join(LOGS_DIR, 'sanitized_json')

# Every run writes into its own "<hostname>_<timestamp>" subfolder
hostname = socket.gethostname().replace(".", "_")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
NORMALIZED_SUBFOLDER = f"{hostname}_{timestamp}"

# Output locations mirror the input layout
OUTPUT_DIR = os.path.join(ROOT_DIR, 'data', 'normalized', NORMALIZED_SUBFOLDER)
OUTPUT_SCREENSHOTS_DIR = os.path.join(OUTPUT_DIR, 'screenshots')
OUTPUT_JSON_DIR = os.path.join(OUTPUT_DIR, 'sanitized_json')

# Make sure the whole output tree exists before any cell writes to it
for _output_dir in (OUTPUT_DIR, OUTPUT_SCREENSHOTS_DIR, OUTPUT_JSON_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Echo the resolved paths so a misconfigured run is easy to spot
for _path_line in (
    f"Mining directory: {MINING_DIR}",
    f"Input screenshots: {SCREENSHOTS_DIR}",
    f"Input JSON: {JSON_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    f"Output screenshots: {OUTPUT_SCREENSHOTS_DIR}",
    f"Output JSON: {OUTPUT_JSON_DIR}",
):
    print(_path_line)

In [None]:
# Function to normalize image size
def normalize_image_size(img, max_size=(1120, 1120), resample=None):
    """Shrink an image to fit within ``max_size`` while preserving aspect ratio.

    Args:
        img: Image-like object exposing ``size`` as ``(width, height)`` and a
            ``resize(size, resample)`` method (e.g. a ``PIL.Image.Image``).
        max_size: ``(max_width, max_height)`` bounding box in pixels.
        resample: Resampling filter passed to ``img.resize``. Defaults to
            ``Image.Resampling.LANCZOS`` when omitted.

    Returns:
        The result of ``img.resize`` when ``img`` exceeds ``max_size`` in
        either dimension; otherwise ``img`` itself (images are never upscaled).
    """
    original_width, original_height = img.size

    # A zero-sized image has no aspect ratio to preserve and would divide by zero.
    if original_width <= 0 or original_height <= 0:
        return img

    # The smaller ratio is the one that makes both dimensions fit.
    resize_ratio = min(max_size[0] / original_width, max_size[1] / original_height)

    # Only shrink; anything already within bounds is returned untouched.
    if resize_ratio >= 1:
        return img

    if resample is None:
        resample = Image.Resampling.LANCZOS

    # Round to the nearest pixel instead of truncating so floating-point error
    # cannot shave a pixel off an exact result, and keep every dimension at
    # least 1 px so extreme aspect ratios do not yield an invalid empty size.
    new_width = max(1, round(original_width * resize_ratio))
    new_height = max(1, round(original_height * resize_ratio))
    return img.resize((new_width, new_height), resample)

# Process all screenshot images
def process_screenshots():
    """Normalize every ``*.jpg`` in ``SCREENSHOTS_DIR`` into ``OUTPUT_SCREENSHOTS_DIR``.

    Each image is passed through ``normalize_image_size`` and saved under its
    original filename. A failure on one file is reported and counted, and the
    remaining files are still processed.

    Returns:
        Tuple ``(success_count, error_count)``.
    """
    screenshot_files = glob.glob(os.path.join(SCREENSHOTS_DIR, "*.jpg"))
    print(f"Found {len(screenshot_files)} screenshots to process")

    success_count = 0
    error_count = 0

    for filepath in tqdm(screenshot_files, desc="Normalizing screenshots"):
        output_path = os.path.join(OUTPUT_SCREENSHOTS_DIR, os.path.basename(filepath))
        try:
            # The context manager releases the underlying file handle even
            # when normalization or saving fails part-way through.
            with Image.open(filepath) as img:
                normalized_img = normalize_image_size(img)
                # Save inside the block: an image that needed no resizing is
                # the opened image itself and must still be readable here.
                normalized_img.save(output_path, quality=95)
        except Exception as e:
            # Deliberately broad: one unreadable screenshot must not abort the batch.
            print(f"Error processing {filepath}: {e}")
            error_count += 1
        else:
            success_count += 1

    print(f"Screenshot processing complete: {success_count} successful, {error_count} errors")
    return success_count, error_count

In [None]:
# Helper and entry point for processing JSON files
def _normalize_coordinate(event, key, extent):
    """Rewrite ``event[key]`` from pixels to a fraction of ``extent`` if needed.

    A value is treated as a pixel coordinate when it is an integer or a float
    outside the 0-1 range. Missing, boolean and non-numeric values are left
    untouched.

    Returns:
        True when ``event[key]`` was rewritten, False otherwise.
    """
    value = event.get(key)
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    if isinstance(value, int) or not 0 <= value <= 1:
        event[key] = value / extent
        return True
    return False


def process_json_files():
    """Copy every ``*.json`` in ``JSON_DIR`` to ``OUTPUT_JSON_DIR``, normalizing coordinates.

    For each entry in the file's ``events`` list whose ``event`` field is
    ``'MOUSE'``, pixel-valued ``x``/``y`` are divided by the module-level
    ``screen_width``/``screen_height``. All other data is written back
    unchanged. A failure on one file is reported and counted, and the
    remaining files are still processed.

    NOTE(review): the divisor is the size of the current display; this assumes
    the recordings were captured at the same resolution -- confirm.

    Returns:
        Tuple ``(success_count, error_count, files_with_nonnormalized_coords)``.
    """
    json_files = glob.glob(os.path.join(JSON_DIR, "*.json"))
    print(f"Found {len(json_files)} JSON files to process")

    success_count = 0
    error_count = 0
    files_with_nonnormalized_coords = 0
    events_normalized = 0

    for filepath in tqdm(json_files, desc="Processing JSON files"):
        try:
            output_path = os.path.join(OUTPUT_JSON_DIR, os.path.basename(filepath))

            # JSON is UTF-8 by specification; do not rely on the platform default.
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            normalized_in_file = 0
            for event in data.get('events', []):
                if event.get('event') != 'MOUSE':
                    continue
                # Evaluate both axes before combining so neither is skipped.
                x_changed = _normalize_coordinate(event, 'x', screen_width)
                y_changed = _normalize_coordinate(event, 'y', screen_height)
                if x_changed or y_changed:
                    # One count per event, whichever axis needed rewriting.
                    normalized_in_file += 1

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)

        except Exception as e:
            # Deliberately broad: one malformed file must not abort the batch.
            print(f"Error processing {filepath}: {e}")
            error_count += 1
        else:
            # Statistics only reflect files that were actually written.
            success_count += 1
            events_normalized += normalized_in_file
            if normalized_in_file:
                files_with_nonnormalized_coords += 1

    print(f"JSON processing complete: {success_count} successful, {error_count} errors")
    print(f"Files with non-normalized coordinates: {files_with_nonnormalized_coords}")
    print(f"Total events normalized: {events_normalized}")
    return success_count, error_count, files_with_nonnormalized_coords

In [None]:
# Execute the normalization process

# Run both passes: screenshots first, then the JSON event files
screenshot_results = process_screenshots()
json_results = process_json_files()

# Report what was done, one line per figure
for _summary_line in (
    "\n===== Normalization Summary =====",
    f"Output directory: {OUTPUT_DIR}",
    f"Screenshots normalized: {screenshot_results[0]}",
    f"JSON files processed: {json_results[0]}",
    f"Files with coordinates normalized: {json_results[2]}",
):
    print(_summary_line)

# Carry the session prompt log over when the input directory has one
session_prompts_path = os.path.join(LOGS_DIR, "session_prompts.log")
if os.path.exists(session_prompts_path):
    output_session_prompts = os.path.join(OUTPUT_DIR, "session_prompts.log")
    shutil.copy2(session_prompts_path, output_session_prompts)
    print("Copied session_prompts.log to output directory")

print("\nNormalization complete! The normalized data is ready for use.")