# AI-Driven Test Automation with BLIP-2

This notebook trains a multimodal model to generate GUI test steps from screenshots and functional descriptions.

**Architecture:** BLIP-2 (Vision Encoder + Q-Former + FLAN-T5)

**Dataset:** SuperAGI/GUIDE

---

## Table of Contents
1. Setup & Installation
2. Load Dataset
3. Initialize Model
4. Training
5. Evaluation
6. Interactive Demo
7. Sequence Generation
8. Save Model

## 1. Setup & Installation

In [None]:
# Check GPU: run nvidia-smi in a shell to list the GPU(s) visible to this runtime
!nvidia-smi

In [None]:
# Install dependencies (-q keeps pip's output short)
# First line: core training stack; second line: metrics, logging and demo UI.
!pip install -q torch torchvision transformers datasets pillow accelerate bitsandbytes
!pip install -q rouge-score nltk wandb gradio evaluate
# Upgrade transformers last so the newest release is the one left installed.
!pip install -q -U transformers  # Ensure latest version

In [None]:
# Clone repository (skipped when a 'diebold-cap' checkout already exists)
# NOTE: YOUR_REPO_URL is a placeholder; replace it with the real repository
# URL before running this cell.
import os
if not os.path.exists('diebold-cap'):
    !git clone YOUR_REPO_URL diebold-cap
    
# Make the checkout the working directory; later cells use relative paths
# such as ./src and ./outputs.
%cd diebold-cap

In [None]:
# Import modules
import sys
sys.path.append('./src')

import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Custom modules
from data.dataset import GUITestDataset, get_dataloaders
from models.blip2_model import GUITestBLIP2
from training.trainer import GUITestTrainer
from utils.evaluation import GUITestEvaluator, evaluate_model

# Report the PyTorch build and, when CUDA is usable, the name of device 0
print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Load Dataset

Loading the SuperAGI/GUIDE dataset from HuggingFace.

In [None]:
# Instantiate the training split once so it can be inspected in the cells below
dataset = GUITestDataset(split='train', max_history_length=10)
print(f"Training samples: {len(dataset)}")

# Print the text fields of the first sample between two rules
sample = dataset[0]
rule = "=" * 60
print(f"\n{rule}")
print("Example Sample")
print(rule)
print(f"\nWorkflow: {sample['workflow']}")
print(f"\nInput Text:\n{sample['input_text']}")
print(f"\nTarget Text: {sample['target_text']}")

# Render the sample's screenshot without axis ticks
print("\nImage:")
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(sample['image'])
ax.axis('off')
ax.set_title(f"Screenshot - {sample['workflow']}")
plt.show()

In [None]:
# Show more examples: a 2x3 grid of screenshots taken 100 samples apart
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Highest valid index; used to clamp the stride below so that a split with
# fewer than 501 samples does not index past the end of the dataset.
last_index = len(dataset) - 1

for i, ax in enumerate(axes):
    sample = dataset[min(i * 100, last_index)]  # Sample every 100th
    ax.imshow(sample['image'])
    # Titles show only the first 50 characters of the target action
    ax.set_title(f"Action: {sample['target_text'][:50]}...", fontsize=9)
    ax.axis('off')

plt.tight_layout()
plt.show()

## 3. Initialize Model

Loading BLIP-2 with a FLAN-T5-base language model.

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# NOTE(review): confirm this checkpoint id resolves on the Hugging Face Hub;
# the BLIP-2 FLAN-T5 checkpoints I am aware of are published as -xl / -xxl.
model = GUITestBLIP2(
    model_name="Salesforce/blip2-flan-t5-base",
    device=device,
    freeze_vision=True,       # vision encoder stays frozen (saves memory)
    freeze_qformer=False,     # Q-Former is trained
    freeze_lm_encoder=True,   # T5 encoder frozen, decoder trained
)

print(f"\nModel loaded on {device}")

In [None]:
# Smoke-test generation on the first training sample and print the result
# next to the ground-truth action
print("Testing model generation...\n")
sample = dataset[0]

generation_args = {'max_length': 50, 'num_beams': 2}
prediction = model.generate(
    images=[sample['image']],
    prompts=[sample['input_text']],
    **generation_args
)

# Only the first 200 characters of the prompt are echoed
input_preview = sample['input_text'][:200]
print(f"Input: {input_preview}...")
print(f"\nPredicted action: {prediction[0]}")
print(f"Ground truth: {sample['target_text']}")

## 4. Training

Training with memory-efficient settings for Colab.

In [None]:
# Build the train/validation loaders from one named set of options
loader_options = dict(
    batch_size=4,              # small batch for memory efficiency
    num_workers=2,
    processor=model.processor,
    device=device,
    max_history_length=10,
    use_cot=False,             # no chain-of-thought initially
)
train_loader, val_loader = get_dataloaders(**loader_options)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

In [None]:
# Hyperparameters forwarded to the trainer as keyword arguments
training_config = dict(
    learning_rate=5e-5,
    weight_decay=0.01,
    num_epochs=3,
    gradient_accumulation_steps=4,  # effective batch = 4 * 4 = 16
    max_grad_norm=1.0,
    warmup_steps=100,
    use_wandb=False,  # set True to enable WandB logging
    device=device,
)

# Echo the configuration, one indented "name: value" line per entry
print("Training configuration:")
for name, setting in training_config.items():
    print(f"  {name}: {setting}")

In [None]:
# Initialize trainer
# Hand the model and both loaders to the project trainer. Every entry of
# training_config is forwarded as a keyword argument, so its keys must not
# repeat any of the four arguments spelled out here (Python raises TypeError
# on a duplicated keyword).
trainer = GUITestTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    output_dir='./outputs',
    **training_config
)

In [None]:
# Train!
# Runs the trainer configured in the previous cell.
trainer.train()

## 5. Evaluation

Evaluate the trained model on the validation set.

In [None]:
# Load best model
# Rebuild the wrapper around the same base checkpoint that was fine-tuned in
# the "Initialize Model" cell rather than relying on the constructor's default
# model_name, then load the saved weights from ./outputs/best_model
# (presumably written by the trainer, whose output_dir is ./outputs).
best_model = GUITestBLIP2(
    model_name="Salesforce/blip2-flan-t5-base",
    device=device,
)
best_model.load_model('./outputs/best_model')
print("Best model loaded!")

In [None]:
# Evaluate on validation set
# The outcome is stored in `results`; this cell does not display it itself.
evaluator = GUITestEvaluator()
results = evaluate_model(
    model=best_model,
    dataloader=val_loader,
    evaluator=evaluator,
    max_batches=50  # Evaluate on subset for speed
)

In [None]:
# Visualize some predictions: screenshot on the left, predicted vs. expected
# action on the right, one row per sample.
# Use the same max_history_length as the training dataset and dataloaders so
# the prompts shown here are built the same way as the ones trained on.
val_dataset = GUITestDataset(split='validation', max_history_length=10)

num_examples = 3
fig, axes = plt.subplots(num_examples, 2, figsize=(15, 18))

# Highest valid index; clamps the stride of 50 for small validation splits.
last_index = len(val_dataset) - 1

for i in range(num_examples):
    sample_idx = min(i * 50, last_index)
    sample = val_dataset[sample_idx]

    # Get prediction (generate returns one string per input; take the first)
    prediction = best_model.generate(
        images=[sample['image']],
        prompts=[sample['input_text']],
        max_length=128,
        num_beams=4
    )[0]

    # Compare ignoring case and surrounding whitespace so a stray leading or
    # trailing space in the generated text does not count as a mismatch.
    is_match = prediction.strip().lower() == sample['target_text'].strip().lower()

    # Display image
    axes[i, 0].imshow(sample['image'])
    axes[i, 0].axis('off')
    axes[i, 0].set_title(f"Screenshot {i+1}", fontsize=12, fontweight='bold')

    # Display text (task is cut to its first 100 characters)
    text_content = f"""Task: {sample['raw_data']['question'][:100]}...
    
PREDICTED:
{prediction}

GROUND TRUTH:
{sample['target_text']}

MATCH: {is_match}
"""
    axes[i, 1].text(0.05, 0.95, text_content,
                    transform=axes[i, 1].transAxes,
                    fontsize=10, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    axes[i, 1].axis('off')

plt.tight_layout()
plt.show()

## 6. Interactive Demo

Test the model interactively with Gradio.

In [None]:
import gradio as gr

def predict_next_action(image, task, history_text, current_action):
    """
    Predict next action given screenshot and context.

    Args:
        image: Screenshot supplied by the Gradio image component, or None
            when nothing has been uploaded.
        task: Task description inserted into the prompt.
        history_text: Previous steps as free text; blank or None is sent to
            the model as the literal "None".
        current_action: Current action as free text; blank or None is sent
            to the model as the literal "None".

    Returns:
        The first generated string from best_model, or a short instruction
        for the user when no screenshot was provided.
    """
    # Gradio passes None for an empty image input; answer in the output box
    # instead of letting the model call fail on a missing image.
    if image is None:
        return "Please upload a screenshot first."

    # Format input (`or ""` guards against None coming from cleared fields)
    if not (history_text or "").strip():
        history_text = "None"
    if not (current_action or "").strip():
        current_action = "None"

    prompt = f"""Task: {task}

Previous steps:
{history_text}

Current action: {current_action}

Predict the next action:"""

    # Generate prediction
    # NOTE(review): temperature is passed together with num_beams=4; confirm
    # that best_model.generate enables sampling, otherwise it likely has no
    # effect on beam search.
    prediction = best_model.generate(
        images=[image],
        prompts=[prompt],
        max_length=128,
        num_beams=4,
        temperature=0.7
    )[0]

    return prediction

# Assemble the demo UI: one image input and three text inputs feed
# predict_next_action, whose return value is shown in a single text box.
screenshot_input = gr.Image(type="pil", label="Screenshot")
task_input = gr.Textbox(label="Task Description", placeholder="e.g., Login to the application")
history_input = gr.Textbox(
    label="Previous Steps (one per line)",
    lines=5,
    placeholder="1. Open homepage\n2. Click login",
)
action_input = gr.Textbox(label="Current Action", placeholder="e.g., Click login button")
prediction_output = gr.Textbox(label="Predicted Next Action")

# Input order must match the parameter order of predict_next_action
demo = gr.Interface(
    fn=predict_next_action,
    inputs=[screenshot_input, task_input, history_input, action_input],
    outputs=prediction_output,
    title="GUI Test Automation - Next Step Predictor",
    description="Upload a screenshot and provide context to predict the next test step.",
    examples=[
        # You can add example images here
    ],
)

# share=True asks Gradio for a shareable link in addition to the local one
demo.launch(share=True)

## 7. Sequence Generation

Generate full test sequences autoregressively.

In [None]:
# Take one validation sample and use its question as the task description
sample = val_dataset[10]
task = sample['raw_data']['question']

print(f"Task: {task}\n")
print("Generating test sequence...\n")

# Ask the model for a multi-step sequence starting from the sample's screenshot
sequence_args = {'max_steps': 10, 'max_length': 128, 'num_beams': 4}
sequence = best_model.generate_sequence(
    initial_image=sample['image'],
    question=task,
    **sequence_args
)

# Print each returned step between two rules
divider = "=" * 60
print(divider)
print("Generated Test Sequence")
print(divider)
for step in sequence:
    number, action = step['step_num'], step['action']
    print(f"\nStep {number}: {action}")
print(f"\n{divider}")

In [None]:
# Show the starting screenshot with the task as its title
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(sample['image'])
ax.axis('off')
ax.set_title(f"Task: {task}\n", fontsize=14, fontweight='bold')

# Place the numbered steps in a box below the image (axes coordinates,
# y = -0.15 sits under the bottom edge)
sequence_text = "\n".join(f"{s['step_num']}. {s['action']}" for s in sequence)
box_style = dict(boxstyle='round', facecolor='lightblue', alpha=0.8)
ax.text(0.5, -0.15, f"Generated Sequence:\n{sequence_text}",
        ha='center', va='top', transform=ax.transAxes,
        fontsize=11, bbox=box_style)

plt.tight_layout()
plt.show()

## 8. Save Model

Save the model for later use.

In [None]:
# Save the model: to Google Drive when running on Colab, otherwise to a
# local folder so the cell also works outside Colab.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside a Colab runtime
    local_path = './outputs/gui_test_automation_model'
    best_model.save_model(local_path)
    print(f"✓ Model saved to {local_path}")
else:
    drive.mount('/content/drive')

    # Save model
    best_model.save_model('/content/drive/MyDrive/gui_test_automation_model')
    print("✓ Model saved to Google Drive!")

## Summary

This notebook demonstrated:
1. ✅ Loading GUIDE dataset
2. ✅ Fine-tuning BLIP-2 for GUI test automation
3. ✅ Evaluating with multiple metrics (BLEU, ROUGE, EM)
4. ✅ Interactive demo with Gradio
5. ✅ Autoregressive sequence generation

**Next steps:**
- Experiment with different hyperparameters
- Try larger models (BLIP-2 with T5-XL)
- Add chain-of-thought reasoning
- Fine-tune on domain-specific data