# Post-processing script for GUIrilla dataset
The script below was used to populate the GUIrilla-Task dataset after graph collection and filtering. The output of graph collection is saved in the `raw-tasks` folder. The script processes that data and generates tasks from the accessibility data and screenshots using OpenAI's API.

It segments the images, identifies interactive elements, and generates tasks for each element.

The script also saves the results in a `task_dataset` folder that was later used to publish GUIrilla-Task.

In [None]:
import json
import os
import shutil
import time
from PIL import Image, ImageDraw, ImageFont
from time import sleep



# Accessibility roles that count as interactable. process_elements_with_ids
# skips every element whose role is not listed here when interactable_only is set.
INTERACTABLE_ROLES = {
    # Buttons and links
    "AXButton",
    "AXPopUpButton",
    "AXMenuButton",
    "AXTabButton",
    "AXDisclosureTriangle",
    "AXLink",
    # Text entry
    "AXTextField",
    "AXSearchField",
    "AXComboBox",
    # Toggles and value controls
    "AXCheckBox",
    "AXRadioButton",
    "AXIncrementor",
    "AXSlider",
    "AXScrollBar",
    # Menus and tabs
    "AXMenuItem",
    "AXMenuItemCheckbox",
    "AXMenuItemRadio",
    "AXMenuItemTextField",
    "AXTabGroup",
}

In [None]:
def process_elements_with_ids(element_data, draw, font, scaling_factor=1.0, parent_x=0, parent_y=0, interactable_only=True, current_id=1, id_mapping=None):
    """
    Recursively draw labelled boxes around UI elements and collect their metadata.

    Every element that has a four-value ``bbox`` (and, when ``interactable_only``
    is set, a role listed in ``INTERACTABLE_ROLES``) gets a red rectangle with a
    numeric label centred below it. A numeric ID is only registered once the
    element was drawn successfully, so ``id_mapping`` and the returned element
    dict always contain the same keys. Children are visited even when the
    element itself is skipped.

    Args:
        element_data (dict): Element data from accessibility JSON
        draw (ImageDraw): PIL ImageDraw object
        font (ImageFont): Font for drawing element IDs
        scaling_factor (float): Factor applied to bbox coordinates before drawing
        parent_x (int): Horizontal offset added to every drawn coordinate
        parent_y (int): Vertical offset added to every drawn coordinate
        interactable_only (bool): If True, only show interactable elements
        current_id (int): Next numeric ID to assign
        id_mapping (dict): Mapping of numeric IDs to original IDs; updated in place

    Returns:
        tuple: (dict of visualized elements keyed by numeric ID,
                mapping of numeric IDs to original IDs)
    """
    visualized_elements = {}
    if id_mapping is None:
        id_mapping = {}

    bbox = element_data.get("bbox")
    role = element_data.get("role", "unknown")

    # A missing, null or malformed bbox cannot be drawn; the subtree is still walked.
    has_usable_bbox = isinstance(bbox, (list, tuple)) and len(bbox) == 4
    wanted = not interactable_only or role in INTERACTABLE_ROLES

    if has_usable_bbox and wanted:
        original_id = element_data.get("id", "")
        # All boxes share one colour; color_for_role is not used here.
        color = "red"
        numeric_id = current_id
        label = str(numeric_id)

        try:
            x1, y1, x2, y2 = bbox
            top_left = (x1 * scaling_factor + parent_x, y1 * scaling_factor + parent_y)
            bottom_right = (x2 * scaling_factor + parent_x, y2 * scaling_factor + parent_y)
            draw.rectangle([top_left, bottom_right], outline=color, width=2)

            # getbbox replaces the deprecated textsize for measuring the label.
            text_bbox = font.getbbox(label)
            text_width = text_bbox[2] - text_bbox[0]

            text_position = (
                x1 * scaling_factor + (x2 - x1) * scaling_factor / 2 - text_width / 2 + parent_x,  # Center horizontally
                y2 * scaling_factor + 5 + parent_y  # Position below the box
            )
            draw.text(text_position, label, fill=color, font=font)
        except Exception as e:
            # Best effort: one undrawable element must not abort the whole screen.
            print(f"Error drawing rectangle for {role}: {e}")
        else:
            # Register the ID only now, so a failed draw leaves no orphan entry
            # in id_mapping and does not consume a number.
            id_mapping[numeric_id] = original_id
            current_id += 1
            visualized_elements[numeric_id] = {
                "original_id": original_id,
                "name": element_data.get("name", ""),
                "role": role,
                "description": element_data.get("description", ""),
                "role_description": element_data.get("role_description", ""),
                "value": element_data.get("value", ""),
                "absolute_position": element_data.get("absolute_position", ""),
                "position": element_data.get("position", ""),
                "size": element_data.get("size", ""),
                "bbox": bbox,
                "visible_bbox": element_data.get("visible_bbox", bbox),
                "enabled": element_data.get("enabled", False),
                "children": []
            }

    # Process children recursively ("children": null is treated as no children)
    for child in element_data.get("children") or []:
        child_elements, id_mapping = process_elements_with_ids(
            child, draw, font, scaling_factor, parent_x, parent_y, interactable_only,
            current_id=current_id, id_mapping=id_mapping
        )
        visualized_elements.update(child_elements)

        # The recursive call does not report the next free ID, so derive it
        # from the largest ID registered so far.
        if id_mapping:
            current_id = max(id_mapping) + 1

    return visualized_elements, id_mapping

# Function to load accessibility data from JSON file
def load_accessibility_data(json_path):
    """
    Load accessibility data from a JSON file

    The file is always decoded as UTF-8 so the result does not depend on the
    platform's default encoding. Loading is best effort: an unreadable,
    missing or malformed file is reported on stdout and yields an empty dict.

    Args:
        json_path (str): Path to the JSON file

    Returns:
        dict: Parsed JSON data, or {} when the file could not be loaded
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    # OSError: missing/unreadable file; ValueError: invalid JSON or bytes that
    # are not UTF-8; TypeError: json_path is not a usable path (e.g. None).
    except (OSError, ValueError, TypeError) as e:
        print(f"Error loading accessibility data: {e}")
        return {}


In [None]:
# Define colors for different UI element roles
_ROLE_COLORS = {
    "AXButton": "blue",
    "AXTextField": "green",
    "AXStaticText": "yellow",
    "AXImage": "purple",
    "AXGroup": "orange",
    "AXScrollBar": "brown",
    "AXRow": "pink",
    "AXColumn": "cyan",
    "AXCell": "magenta",
    "AXTable": "lightblue",
    "AXOutline": "lightgreen",
    "AXLayoutArea": "lightyellow",
    "AXLayoutItem": "lavender",
    "AXHandle": "peachpuff",
    "AXSplitter": "lightsalmon",
    "AXIncrementor": "lightpink",
    "AXBusyIndicator": "lightcyan",
    "AXProgressIndicator": "plum",
    "AXToolbar": "darkred",
    "AXPopover": "darkblue",
    "AXMenu": "darkgreen",
    "AXMenuItem": "olive",
    "AXMenuBar": "rebeccapurple",
    "AXMenuBarItem": "darkorange",
    "AXMenuButton": "saddlebrown",
    "AXMenuItemCheckbox": "palevioletred",
    "AXMenuItemRadio": "darkcyan",
    "AXMenuItemPopover": "darkmagenta",
    "AXMenuItemSplitter": "black",
    "AXMenuItemTable": "white",
    "AXMenuItemTextField": "lightgray",
    "AXMenuItemStaticText": "darkgray",
    "AXMenuItemImage": "salmon",
    "AXMenuItemGroup": "lightblue",
    "AXMenuItemScrollBar": "lightgreen",
    "AXMenuItemRow": "lightyellow",
    "AXMenuItemColumn": "lavender",
    "AXMenuItemCell": "peachpuff",
    "AXMenuItemOutline": "burlywood",
    "AXMenuItemLayoutArea": "lightpink",
    "AXMenuItemLayoutItem": "lightcyan",
    "AXMenuItemHandle": "plum",
    "AXMenuItemIncrementor": "darkblue",
    "AXMenuItemBusyIndicator": "darkgreen",
    # Was "darkyellow", which is not a standard CSS/X11 colour name.
    "AXMenuItemProgressIndicator": "darkgoldenrod",
    "AXMenuItemToolbar": "rebeccapurple",
}


def color_for_role(role):
    """
    Return the colour name used to outline an element of the given role.

    The table keeps the first colour the original if/elif chain assigned to
    each role; the later duplicate branches for "AXMenuItemSplitter" and
    "AXMenuItemPopover" could never be reached and are dropped.

    Args:
        role: Accessibility role name, e.g. "AXButton"

    Returns:
        str: Colour name; "red" for any role without an entry
    """
    # Non-string roles can never match an entry (and may be unhashable).
    if not isinstance(role, str):
        return "red"
    return _ROLE_COLORS.get(role, "red")

# Function to segment image using accessibility data
def segment_image_with_accessibility(image_path, accessibility_data, scaling_factor=1.0, interactable_only=True):
    """
    Segment an image by drawing rectangles around UI elements based on accessibility data

    The screenshot is pasted onto a canvas that is 50 pixels larger on every
    side, so labels drawn below elements near the border are not cut off.
    All drawn coordinates are shifted by that padding.

    Args:
        image_path (str): Path to the image file
        accessibility_data (dict): JSON data containing UI element information
        scaling_factor (float): Screen scaling factor for retina displays
        interactable_only (bool): If True, only show interactable elements

    Returns:
        tuple: (padded image with boxes drawn, dict of visualized elements keyed
                by numeric ID, mapping of numeric IDs to original IDs).
                On any failure the result is (None, {}, {}).
    """
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        # Same empty shape as the success path: the elements slot is a dict.
        return None, {}, {}

    try:
        # Add padding to the image to prevent text from being cut off
        padding = 50  # pixels added on all sides

        # The context manager releases the file handle once the pixels are copied.
        with Image.open(image_path) as source:
            image = Image.new(
                source.mode,
                (source.width + 2 * padding, source.height + 2 * padding),
                (255, 255, 255, 0),
            )
            image.paste(source, (padding, padding))

        draw = ImageDraw.Draw(image)

        # Label font size: 2% of the smaller image dimension, minimum 12px
        font_size = max(int(min(image.width, image.height) * 0.02), 12)
        try:
            font = ImageFont.truetype("Arial.ttf", font_size)
        except IOError:
            try:
                font = ImageFont.truetype("DejaVuSans.ttf", font_size)
            except IOError:
                font = ImageFont.load_default()

        # Process the accessibility data and draw rectangles with IDs
        visualized_elements, id_mapping = process_elements_with_ids(
            accessibility_data, draw, font, scaling_factor, interactable_only=interactable_only,
            parent_x=padding, parent_y=padding  # Adjust for padding
        )

        return image, visualized_elements, id_mapping

    except Exception as e:
        # Best effort: report the problem and hand back the empty result.
        print(f"Error segmenting image: {e}")
        return None, {}, {}
    

# Main function to segment window components
def segment_window_components(image_path, json_path, scaling_factor=1.0, interactable_only=True):
    """
    Load the accessibility JSON for a screenshot and segment the screenshot with it.

    Args:
        image_path (str): Path to the screenshot image
        json_path (str): Path to the accessibility data JSON file
        scaling_factor (float): Screen scaling factor for retina displays
        interactable_only (bool): If True, only show interactable elements

    Returns:
        tuple: (segmented_image, visualized_elements, id_mapping);
               (None, {}, {}) when either input file is missing
    """
    image_available = bool(image_path) and os.path.exists(image_path)
    if not image_available:
        print(f"Image not found: {image_path}")
        return None, {}, {}

    a11y_available = bool(json_path) and os.path.exists(json_path)
    if not a11y_available:
        print(f"Accessibility data not found: {json_path}")
        return None, {}, {}

    # Both inputs exist: parse the tree, then draw it onto the screenshot.
    a11y_tree = load_accessibility_data(json_path)
    annotated, elements, ids = segment_image_with_accessibility(
        image_path, a11y_tree, scaling_factor, interactable_only
    )
    return annotated, elements, ids

def describe_elements(element_data):
    """
    Render each element as a single " | "-separated line of text.

    A line always starts with "Element <id>" and contains the role. The name
    and description are added only when truthy; the value is added whenever
    it is not None (so an empty string still produces "value: ").

    Args:
        element_data (dict): Dictionary of element dictionaries keyed by numeric ID

    Returns:
        list: One text description per element, in dictionary order
    """
    def _summarize(numeric_id, element):
        fields = [f"Element {numeric_id}"]
        if element.get('name'):
            fields.append(f"name: {element['name']}")
        fields.append(f"role: {element['role']}")
        if element.get('description'):
            fields.append(f"description: {element['description']}")
        if element.get('value') is not None:
            fields.append(f"value: {element['value']}")
        return " | ".join(fields)

    return [_summarize(numeric_id, element) for numeric_id, element in element_data.items()]


In [None]:
# Prompt sent together with a screenshot by get_interactive_elements. It asks
# the model to pick up to 15 well-segmented elements and to answer with a JSON
# object of the form {"perfect_element_ids": [...]}.
prompt_text = """
## Task
Evaluate accessibility elements in the screenshot and identify those with good segmentation.

## Segmentation Criteria
- Box completely contains the element (no cutting through)
- Box contains an element, not just empty space

## Selection Rules
- Choose up to 15 segmented elements
- Prioritize icons and unique elements non text elements
- Ignore the three window controls (close, minimize, maximize)
- For nested elements, pick only one
- Be extremely critical - accessibility elements are often incorrectly segmented
- Include all the good elements, but up to 15

## Output example
Input elements: 1, 6, 3 - icons without text, 8, 9 - low quality segmentation.

```json
{"perfect_element_ids": [1, 6, 3, 2, 5, 4, 7, 10]}
```

Return empty array if no perfect segmentations exist: `{"perfect_element_ids": []}`
"""

In [None]:
# Template used by improve_task to turn a task description into a text-input
# task plus a "type ..." action. It is filled in with str.format, so
# {task_string} is the only placeholder and the literal braces of the JSON
# samples are written doubled ({{ ... }}).
improvement_prompt = """
You are given:
- An original task description for a UI interaction: {task_string}
- A screenshot showing the full interface with a red-highlighted element
- A cropped view focusing on just the highlighted element

Your goal:
Change the original task into a natural-language instruction FULLY in English that only involves inputting text, and precisely instructs what to do. Output an action representing a solution to this task, that is "type" + the exact text to input. Upon generation, it must be clear to you that the generated action is solving the previously formulated task, if not - edit the task to make it clear.

Key Principles:
- Make instructions sound like natural human instructions rather than technical commands
- Always include specific input text (never placeholders or generic descriptions), don't interprete the text in the action, rather make sure it is clear in the task formulation that the action is solving it.
- Consider real-world context for what the user might be trying to accomplish

## Requirements
- Each improved task must sound conversational and natural, as CONCISE as possible
- Each action must start with "type" followed by the precise text to enter
- The task should be a realistic instruction including the precise text that MUST match the input in the action
- both task and action MUST be FULLY in English - both the formulation and the input text
- NEVER use placeholder text like "your email" or "a number" in neither task nor action
- Keep the focus on what the user wants to accomplish, not just the UI action
- The task should match real-world usage patterns for the application shown
- Do not analyze the input if it involves a comples text (for example, link, or programming line), rather be direct in the task formulation about what to input
- the task NEVER involves clicking, pressing or selecting an element
- the task formulation should allow a reader to have a clue about what to input, and thus, must be obviuos
- NEVER add "by entering it" or "by typing it" to the task formulation

## Format
Return a JSON with two fields:
- "task": A natural language instruction 
- "action": The specific typing action (always "type" + exact text to input)

## Examples of Good Transformations

Too direct → More natural:
- ❌ "Type John Smith in the name field" 
- ✅ "Put down John Smith as your full name"

Too vague → Specific but natural:
- ❌ "Enter your password" 
- ✅ "Use SecurePass456 as your account password"

Too mechanical → Goal-oriented:
- ❌ "Input 555-123-4567 in the phone field" 
- ✅ "Add your mobile number as 555-123-4567"


## Sample Format

{{"task": "Look up flights to Barcelona for your vacation", "action": "type Barcelona"}}
{{"task": "Send Jane a message about tomorrow's meeting", "action": "type Jane"}}
{{"task": "Use john.doe@example.com as your login email", "action": "type john.doe@example.com"}}
{{"task": "Set your new password to BlueSky92!", "action": "type BlueSky92!"}}
{{"task": "Search for that chocolate chip cookie recipe", "action": "type chocolate chip cookie recipe"}}

## Examples of good and bad final formulations:
Too mysterious formulation → Clear and direct:
- BAD : task: "Enter coded message", action: "type 1234"
- GOOD: task: "Enter 1234 as your coded message", action: "type 1234"

Non matching task and action:
- BAD: 'task': 'Save your converted files to the Desktop folder', 'action': 'type /Users/yourname/Desktop'
- GOOD: 'task': 'Use /Users/yourname/Desktop as your destination folder', 'action': 'type /Users/yourname/Desktop'

Confusing task and action formulations → Matching formulaiton of task an action:
- BAD :  'task': 'Enter your VISA card number to make a payment', 'action': 'type 4111111111111111'
- GOOD:  'task': 'Use 859022930 as your VISA card number', 'action': 'type 859022930'

Task formulation involves clicking → Text-based only task formulation:
- BAD :  'task': 'Check the box labeled Include borders and shadings', 'action': 'type Include borders and shadings'
- GOOD:  'task': 'Select Include borders and shadings as your option', 'action': 'type Include borders and shadings'

## Avoid These Common Mistakes
- ❌ Using placeholder text: "Enter your name" → "Enter Maria Garcia"
- ❌ Being too mechanical: "Type password in password field" → "Use TrustNo1 as your password"
- ❌ Focusing only on the UI: "Fill the search box" → "Find information about electric cars"
- ❌ Being vague: "Type the code" → "Enter 8294 as your verification code"
- ❌ Using impersonal language: "Input required" → "Add your birthday as 03/15/1988"
"""

In [None]:
import base64
import pandas as pd
import logging
from pydantic import BaseModel
import openai
import json

# Read the OpenAI API key from a file two directories above the current working
# directory; the whole file content, stripped of surrounding whitespace, is the key.
with open("../../config_open_ai.env", "r") as f:
    API_KEY = f.read().strip()

# OpenAI client created once at cell execution and used by the request helpers below.
client = openai.OpenAI(api_key=API_KEY)

def encode_image(image_path):
    """Read the file at image_path and return its bytes base64-encoded as a str."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Response schema for get_interactive_elements: its JSON schema is passed to
# the API as the required response format.
class ElementIds(BaseModel):
    # Numeric element IDs the model selected as well segmented.
    perfect_element_ids: list[int]

def get_interactive_elements(image_path):
    """
    Ask the model which labelled elements in an annotated screenshot are well segmented.

    The image is sent inline as a base64 data URL together with ``prompt_text``;
    the response is constrained to the ``ElementIds`` JSON schema.

    Args:
        image_path (str): Path to the screenshot with numbered boxes drawn on it

    Returns:
        list: Numeric element IDs chosen by the model; [] when the request is
              rejected or the response cannot be parsed
    """
    import mimetypes

    base64_image = encode_image(image_path)

    # Declare the real media type of the file instead of always claiming JPEG;
    # fall back to JPEG when the extension is unknown.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        { "type": "text", "text": prompt_text},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            temperature=0.0,
            response_format={
                'type': 'json_schema',
                'json_schema': {
                    "name": "elements",
                    "schema": ElementIds.model_json_schema()
                }
            }
        )
    except openai.BadRequestError as e:
        print(f"Error identifying interactive elements: {e}")
        return []

    try:
        result = json.loads(completion.choices[0].message.content)
        return result["perfect_element_ids"]
    # JSONDecodeError: body is not JSON; TypeError: no text content or a
    # non-object payload; KeyError: the expected field is missing.
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f"Error parsing interactive elements response: {e}")
        return []

In [None]:
# Prompt used by predict_task_openai. It is sent with two images (the screenshot
# with the element boxed in red, and the cropped element) and asks for a JSON
# object with "task", "task_category" and "element_category"; an empty task
# string signals that the element should be rejected.
prompt_task = """
You are given a UI screenshot, an image of the clicked UI element. 
The clicked element is highlighted in red.
Your task is to describe the action needed to click this element.

Guidelines:

    0. If the element is not perfectly selected (ex. partially), the box is strangely located, or no human would do this task - return empty string.

    1. The task must describe the function, not the appearance of the element.
    For example, prefer "Create a new document" over "Click the grey + button." Repeating the element's text is acceptable.

    2. The task must be unique to this screen.
    For example, if there are two buttons labeled "Open," you must specify which "Open" button is meant.
    
    3. The task must consider the app context, but not imagine extra information.
    For example, if the app is an image editor and the button is "Delete," the better task is "Delete an image", not just a generic "delete."

    4. Use the fewest words possible without sacrificing clarity.

    5. Write the task in straightforward English only.

    6. Select a category for each task. Must be one of Navigation (go back), Settings (adjust volume), Files (save file), Apps (open edge), Search & Information (check weather), Media (play music), Accounts (sign in), Communication (share file), Input (enlarge font), Connectivity (connect wifi), Modes (dark mode), E-commerce (add to cart)

    7. Select a category for each element. Must be one of Image, Text, Checkbox/Control, Menu item, Input field, Button, Group, Link.


Important notes:

    The click is based on accessibility information. Metadata may be incorrect or the element may not exist.

    Rely primarily on the images.

    The element image should show a single element with a unique function.
    If the element is obstructed, covered by a window or pop-up, or if multiple cropped elements are shown — return an empty string.

    Inspect the red box carefully: if the element is not visible, return an empty string.

    If there is no red box - return empty string.

Return your answer in JSON format, with no extra text.

Example: {"task": "Open the menu to see tutorials", "task_category": "Search & Information", "element_category": "Button"}

"""

In [None]:
import json
import json
import pandas as pd
import base64
from pydantic import BaseModel
import json
import openai

def encode_image(image_path):
    """Return the content of the file at image_path as base64 text.

    Redefines the identically named helper from the earlier cell with the
    same behavior.
    """
    with open(image_path, "rb") as image_file:
        payload = image_file.read()
    return str(base64.b64encode(payload), 'utf-8')
    
# Response schema for predict_task_openai (passed to the API as a JSON schema).
class Task(BaseModel):
    # Task description; prompt_task asks for an empty string when the element is unusable.
    task: str
    # One of the task categories listed in prompt_task (e.g. "Navigation").
    task_category: str
    # One of the element categories listed in prompt_task (e.g. "Button").
    element_category: str
    
# Response schema for improve_task (passed to the API as a JSON schema).
class InputTask(BaseModel):
    # Natural-language instruction for a text-input task.
    task: str
    # Typing action; improvement_prompt asks for "type" followed by the exact text.
    action: str 
    

def predict_task_openai(image_path="screenshot_bbox.jpg", element_path="element.jpg"):
    """
    Ask the model to describe the click task for a highlighted element.

    Sends prompt_task followed by two JPEG data URLs: the screenshot with the
    element boxed, then the cropped element. The answer is constrained to the
    Task JSON schema.

    Args:
        image_path (str): Path to the screenshot with the bounding box drawn
        element_path (str): Path to the cropped element image

    Returns:
        str: JSON text produced by the model, or a JSON object with empty
             fields when the API rejects the request
    """
    # Encode in the same order the images appear in the message.
    image_parts = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encode_image(path)}",
            },
        }
        for path in (image_path, element_path)
    ]
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": f"{prompt_task}"}, *image_parts],
    }
    schema_format = {
        'type': 'json_schema',
        'json_schema': {
            "name": "task",
            "schema": Task.model_json_schema(),
        },
    }

    try:
        completion = client.chat.completions.create(
            model="gpt-4.1",
            messages=[user_message],
            response_format=schema_format,
        )
    except openai.BadRequestError:
        print(f"Error predicting task, bad request.")
        return '{"task": "", "task_category": "", "element_category": ""}'
    return completion.choices[0].message.content
    

def improve_task(original_task, image_path, element_path):
    """
    Rewrite a task as a natural text-input instruction with a matching "type" action.

    improvement_prompt is filled with the original task and sent with the
    boxed screenshot and the cropped element; the answer is constrained to
    the InputTask JSON schema.

    Args:
        original_task (str): Task description to rewrite
        image_path (str): Path to the screenshot with the element highlighted
        element_path (str): Path to the cropped element image

    Returns:
        str: JSON text with "task" and "action". When the API rejects the
             request, a JSON object with both fields empty is returned, so
             the result is a JSON string on every path (the same convention
             as predict_task_openai).
    """
    text_input = improvement_prompt.format(
        task_string=original_task
    )

    base64_image = encode_image(image_path)
    base64_element = encode_image(element_path)

    try:
        completion = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {
                    "role": "user",
                    "content": [
                        { "type": "text", "text": text_input},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                            },
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_element}",
                            },
                        },
                    ],
                }
            ],
            response_format={
                'type': 'json_schema',
                'json_schema': {
                    "name": "task",
                    "schema": InputTask.model_json_schema()
                }
            }
        )
        return completion.choices[0].message.content
    except openai.BadRequestError:
        print("Error predicting task, bad request.")
        # Previously a dict was returned here while the success path returns
        # JSON text; a caller that parses the result would fail on the dict.
        return '{"task": "", "action": ""}'

def predict_task(image_path, bbox):
    """
    Highlight an element on a screenshot and ask the model for its click task.

    The screenshot is loaded as RGB, the bounding box is drawn in red, and both
    the annotated screenshot and the cropped element are written to
    "screenshot_bbox.jpg" and "element.jpg" in the working directory before
    predict_task_openai is called on them.

    Args:
        image_path (str): Path to the screenshot
        bbox (sequence): (x1, y1, x2, y2) in pixel coordinates of the screenshot

    Returns:
        dict: {"formatted_task", "task_category", "element_category"}; all
              values are empty strings when the box is unusable or the
              prediction cannot be parsed
    """
    def _empty_result():
        return {"formatted_task": "", "task_category": "", "element_category": ""}

    # convert() returns an independent copy, so the file handle can be released.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # Check if bbox is inside the image
    if bbox[0] < 0 or bbox[1] < 0 or bbox[2] > image.width or bbox[3] > image.height:
        print(f"Bounding box is not inside the image: {image_path}")
        return _empty_result()

    # An inverted or empty box can neither be outlined nor cropped.
    if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
        print(f"Bounding box is not valid for image: {image_path}")
        return _empty_result()

    # Draw first, crop second: the cropped element keeps the red outline.
    draw = ImageDraw.Draw(image)
    draw.rectangle(bbox, outline='red', width=3)
    element_image = image.crop(bbox)

    # Save temporary images
    image.save("screenshot_bbox.jpg")
    try:
        element_image.save("element.jpg")
    except ValueError:
        print(f"Bounding box is not valid for image: {image_path}")
        return _empty_result()

    print(f"Processing tasks for image: {image_path}")

    # Kept outside json.loads so the error message can always show what came back.
    raw_prediction = None
    try:
        raw_prediction = predict_task_openai("screenshot_bbox.jpg", "element.jpg")
        prediction = json.loads(raw_prediction)
        print(prediction)
        result = {
            "formatted_task": prediction['task'],
            "task_category": prediction['task_category'],
            "element_category": prediction['element_category'],
        }
    # JSONDecodeError: not JSON; TypeError: no text or non-object payload;
    # KeyError: an expected field is missing.
    except (json.JSONDecodeError, KeyError, TypeError):
        print(f"Error predicting task: {raw_prediction}")
        return _empty_result()

    print(f"Extracted task: {result['formatted_task']} {result['task_category']} {result['element_category']}")
    return result

In [None]:
import pandas as pd
import json
import os
from PIL import Image
import io
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import random

def process_element(element_id, visualized_elements, img_cropped_path, scaling_factor):
    """
    Predict the task for one visualized element and build its click action.

    The bounding box is multiplied by scaling_factor before it is handed to
    predict_task, while the click point of the action is the centre of the
    unscaled box, rounded to two decimals.

    Returns:
        dict with "task", "task_category", "element_category", "action" and
        "element_data", or None when the element has no "bbox" entry.
    """
    element_data = visualized_elements[element_id]
    bbox = element_data.get("bbox") if "bbox" in element_data else None
    if "bbox" not in element_data:
        return None

    left, top, right, bottom = bbox

    # Predict task for this element on the scaled box
    scaled_box = [coordinate * scaling_factor for coordinate in (left, top, right, bottom)]
    prediction = predict_task(img_cropped_path, scaled_box)

    # Centre of the unscaled box is the click target
    center_x = round((left + right) / 2, 2)
    center_y = round((top + bottom) / 2, 2)

    return {
        "task": prediction['formatted_task'],
        "task_category": prediction['task_category'],
        "element_category": prediction['element_category'],
        "action": f"left click ({center_x}, {center_y})",
        "element_data": element_data
    }

def save_image_files(screen_id, img, img_cropped, accessibility_data, segmented_image):
    """
    Write the images and accessibility JSON of one screen into task_dataset/<screen_id>/.

    The folder is created when missing. Files are written in this order:
    image.png, image_cropped.png, a11y.json, segmented_image.png.

    Returns:
        dict: Paths of the written files under the keys "img_path",
              "img_cropped_path", "a11y_path" and "segmented_image_path"
    """
    task_dir = f"task_dataset/{screen_id}"
    os.makedirs(task_dir, exist_ok=True)

    paths = {
        "img_path": f"{task_dir}/image.png",
        "img_cropped_path": f"{task_dir}/image_cropped.png",
        "a11y_path": f"{task_dir}/a11y.json",
        "segmented_image_path": f"{task_dir}/segmented_image.png",
    }

    img.save(paths["img_path"])
    img_cropped.save(paths["img_cropped_path"])

    with open(paths["a11y_path"], "w") as handle:
        json.dump(accessibility_data, handle)

    segmented_image.save(paths["segmented_image_path"])

    return paths

def determine_scaling_factor(img_width):
    """Return 1 for images narrower than 2000 pixels and 2 for all others."""
    if img_width < 2000:
        return 1
    return 2

def process_task_item(item_data):
    """Process a single task item from the dataset.

    The item's screenshots and accessibility data are written to
    ``task_dataset/<id>/``, the cropped screenshot is segmented, a task is
    predicted for every interactive element, and the collected rows are
    written to ``task_dataset/<id>/results.csv``.

    An item whose output folder already exists is treated as done and
    skipped, which makes the pipeline resumable. To keep that check honest,
    a folder created by a call that subsequently fails is removed again, so
    the item is retried on the next run instead of being skipped forever.

    Args:
        item_data: One dataset row exposing the attributes ``id``,
            ``app_name``, ``task``, ``raw_task``, ``action``, ``element``
            (JSON string), ``accessibility`` (JSON string), ``image`` and
            ``image_cropped`` (mappings with a ``'bytes'`` entry).

    Returns:
        list: One dict per generated task (the original task, if any, plus
        one per interactive element). Empty when the item was skipped or
        processing failed. Rows with an empty task are left out of the CSV
        but are still present in the returned list.
    """
    # Set right before this call starts writing to disk, so the error
    # handler only ever removes a folder that this call created.
    output_dir = None

    try:
        # Extract data
        screen_id = item_data.id
        screen_dir = f"task_dataset/{screen_id}"

        # Check if the screen_id is already in the dataset
        if os.path.exists(screen_dir):
            return []

        app_name = item_data.app_name
        task = item_data.task
        raw_task = item_data.raw_task
        action = item_data.action
        element_data = json.loads(item_data.element)
        accessibility_data = json.loads(item_data.accessibility)

        # Process images
        img = Image.open(io.BytesIO(item_data.image['bytes'])).convert("RGB")
        img_cropped = Image.open(io.BytesIO(item_data.image_cropped['bytes'])).convert("RGB")

        # Determine scaling factor
        scaling_factor = determine_scaling_factor(img.size[0])

        # Save all image files first: segmentation below reads the cropped
        # image from its path, so it has to exist on disk beforehand.
        output_dir = screen_dir
        file_paths = save_image_files(
            screen_id,
            img,
            img_cropped,
            accessibility_data,
            Image.new('RGB', img_cropped.size)  # Temporary placeholder image
        )

        # Segment the image
        segmented_image, visualized_elements, id_mapping = segment_image_with_accessibility(
            file_paths["img_cropped_path"],
            accessibility_data,
            scaling_factor=scaling_factor
        )

        # Replace the placeholder with the actual segmented image
        segmented_image.save(file_paths["segmented_image_path"])

        results = []

        # Add original task if it exists
        if task is not None and task != "":
            results.append({
                "screen_id": screen_id,
                "app_name": app_name,
                "task": task,
                "raw_task": raw_task,
                "action": action,
                "image_path": file_paths["img_path"],
                "image_cropped_path": file_paths["img_cropped_path"],
                "segmented_image_path": file_paths["segmented_image_path"],
                "a11y_path": file_paths["a11y_path"],
                "scaling_factor": scaling_factor,
                "element_data": element_data,
                "original_task": True,
                "task_category": "",
                "element_category": ""
            })

        # Process interactive elements
        if visualized_elements:
            element_ids = get_interactive_elements(file_paths["segmented_image_path"])

            for element_id in element_ids:
                element_result = process_element(
                    element_id,
                    visualized_elements,
                    file_paths["img_cropped_path"],
                    scaling_factor
                )

                if element_result:

                    if element_result["element_category"] == "Input field":
                        # Input fields get a second pass that rewrites both
                        # the task text and the action.
                        improved_task = json.loads(improve_task(element_result["task"], file_paths["img_path"], file_paths["img_cropped_path"]))
                        element_result["task"]  = improved_task["task"]
                        element_result["action"] = improved_task["action"]

                    results.append({
                        "screen_id": screen_id,
                        "app_name": app_name,
                        "task": element_result["task"],
                        "raw_task": "",
                        "action": element_result["action"],
                        "image_path": file_paths["img_path"],
                        "image_cropped_path": file_paths["img_cropped_path"],
                        "segmented_image_path": file_paths["segmented_image_path"],
                        "a11y_path": file_paths["a11y_path"],
                        "scaling_factor": scaling_factor,
                        "element_data": element_result["element_data"],
                        "original_task": False,
                        "task_category": element_result["task_category"],
                        "element_category": element_result["element_category"]
                    })

        # Save results to a CSV file using pandas
        if results:
            results_df = pd.DataFrame(results)
            # Rows without a task text are dropped from the CSV only.
            results_df = results_df[results_df['task'] != ''].reset_index(drop=True)
            os.makedirs(f"task_dataset/{screen_id}", exist_ok=True)
            results_df.to_csv(f"task_dataset/{screen_id}/results.csv", index=False)

        return results

    except Exception as e:
        # getattr keeps the handler itself from raising when the item has
        # no ``id`` attribute at all.
        print(f"Error processing item {getattr(item_data, 'id', None)}: {e}")
        if output_dir is not None:
            # Drop the partial output; otherwise the "already in the
            # dataset" check above would skip this item on every later run.
            shutil.rmtree(output_dir, ignore_errors=True)
        return []

# Main processing function
def process_parquet_files(parquet_files):
    """Process all parquet files and generate task data.

    Each file is loaded with pandas and every row is handed to
    ``process_task_item``. A failure on one row is reported and does not
    stop the remaining rows or files.

    Args:
        parquet_files: Iterable of parquet file paths.

    Returns:
        list: All result dicts returned by ``process_task_item``, in
        processing order. Empty when there were no files or no results.
    """
    all_data = []

    for file_path in parquet_files:
        print(f"Processing {file_path}...")
        df = pd.read_parquet(file_path, engine='pyarrow')

        for i in tqdm(range(len(df))):
            try:
                all_data.extend(process_task_item(df.iloc[i]))
            except Exception as e:
                print(f"Error processing item {i} in file {file_path}: {e}")

    # The accumulated results used to be discarded; hand them to the caller.
    return all_data

# List of parquet files
# Every *.parquet file under raw-tasks/data/, sorted by path so the
# processing order is the same on every run.
parquet_files = sorted(glob('raw-tasks/data/*.parquet'))

# Process all files
# Runs the whole pipeline as soon as this cell/script executes.
process_parquet_files(parquet_files)