In [1]:
import os
from unsloth import FastVisionModel
import torch
from datasets import load_dataset
from transformers import TextStreamer
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
import torch
from torch.utils.data import DataLoader

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
HF_TOKEN = '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'
login(token=HF_TOKEN)

# 1. Load the model
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

# Enable gradient checkpointing for memory savings
model.config.gradient_checkpointing = True

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# 2. Load and split the dataset
dataset = load_dataset("miketes/Web-filtered-english-wave-ui-25k")
train_dataset = dataset["train"].shuffle(seed=42).select(range(int(0.8 * len(dataset["train"]))))
val_dataset = dataset["train"].shuffle(seed=42).select(range(int(0.8 * len(dataset["train"])), int(0.9 * len(dataset["train"]))))
test_dataset = dataset["train"].shuffle(seed=42).select(range(int(0.9 * len(dataset["train"])), len(dataset["train"])))

def convert_to_conversation(sample):
    instruction = (
        f"In this user interface image, locate the text element '{sample['OCR']}' and determine its precise "
        f"bounding box coordinates. The coordinates should be formatted as [x1, y1, x2, y2] where:"
        f"\n- x1, y1 is the top-left corner"
        f"\n- x2, y2 is the bottom-right corner"
        f"\nThe box should tightly enclose only this specific text element. Return only the coordinates."
    )
    
    combined_text = f"{instruction}\nTarget Text: {sample['OCR']}"
    return {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": sample["image"]},
                    {"type": "text", "text": combined_text},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": sample["bbox"]},
                ],
            },
        ]
    }

train_dataset = [convert_to_conversation(sample) for sample in train_dataset]
val_dataset = [convert_to_conversation(sample) for sample in val_dataset]

# 3. Before training
FastVisionModel.for_inference(model)
image = dataset["train"][0]["image"]
ocr_label = dataset["train"][0]["OCR"]

instruction = (
    f"In this user interface image, locate the text element {ocr_label} and determine its precise "
    f"bounding box coordinates. The coordinates should be formatted as [x1, y1, x2, y2] where:"
    f"\n- x1, y1 is the top-left corner"
    f"\n- x2, y2 is the bottom-right corner"
    f"\nThe box should tightly enclose only this specific text element. Return only the coordinates."
)
combined_text = f"{instruction}\nTarget Text: {ocr_label}"

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": combined_text}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

print("\nBefore training:\n")
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True, temperature=0.3, num_beams=1)

# 4. Training with Updated Configuration
FastVisionModel.for_training(model)

data_collator = UnslothVisionDataCollator(model, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)

training_args = SFTConfig(
    per_device_train_batch_size=14,  # Optimized for multi-GPU
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    warmup_ratio=0.05,
    learning_rate=1e-3,
    bf16=True,  # Mixed precision
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    output_dir="outputs",
    seed=3407,
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=3,
    dataset_num_proc=16,  # Increased for faster data loading
    max_seq_length=2048,
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
)

torch.cuda.empty_cache()
trainer.train()

# 5. After training
print("\nAfter training:\n")
FastVisionModel.for_inference(model)

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": combined_text}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True, temperature=0.3, num_beams=1)

# 6. Save the model
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

model.push_to_hub_merged(
    "miketes/Llama-3.2-11B-finetuned-lora-improved",
    tokenizer,
    save_method="merged_16bit",
    token=HF_TOKEN,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
[2024-12-20 13:31:24,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlvsym'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlopen'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlclose'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlerror'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


==((====))==  Unsloth 2024.12.4: Fast Mllama vision patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.138 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Before training:

To determine the precise bounding box coordinates of the "About Us" text element, we need to analyze the image and identify the text's position.

**Step 1: Identify the Text Element**
The "About Us" text element is located in the top-right corner of the image, within the navigation bar.

**Step 2: Determine the Text's Position**
The text is positioned between the "About Us" and "Resources" links, with the "About Us" link on the left and the "Resources" link on the right.

**Step 3: Calculate the Bounding Box Coordinates**
To calculate the bounding box coordinates,


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 12,749 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 14 | Gradient Accumulation steps = 2
\        /    Total batch size = 28 | Total steps = 910
 "-____-"     Number of trainable parameters = 134,348,800
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
50,1.1583
100,0.9799
150,0.5886


KeyboardInterrupt: 

In [None]:
# 2nd upgrade

In [1]:
import os
from unsloth import FastVisionModel
import torch
from datasets import load_dataset
from transformers import TextStreamer
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from huggingface_hub import login
import torch
from torch.utils.data import DataLoader

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
HF_TOKEN = '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'
login(token=HF_TOKEN)

# 1. Load the model (same as before)
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

model.config.gradient_checkpointing = True

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# 2. Data preparation function
def prepare_dataset(dataset):
    """Filter and clean the dataset for better quality training"""
    def is_valid_sample(sample):
        # Check if all required fields are present
        if not all(field in sample for field in ['image', 'OCR', 'bbox']):
            return False
            
        # Verify bbox format
        if not isinstance(sample['bbox'], (list, tuple)) or len(sample['bbox']) < 4:
            return False
            
        # Verify OCR text is not empty
        if not sample['OCR'] or len(sample['OCR'].strip()) == 0:
            return False
            
        return True
    
    return dataset.filter(is_valid_sample)

# 3. Enhanced conversation format
def convert_to_conversation(sample):
    """Enhanced conversation format using additional context"""
    
    # Get additional context
    element_type = sample.get('type', 'text')
    element_purpose = sample.get('purpose', '')
    platform = sample.get('platform', '')
    
    # Create detailed instruction
    instruction = (
        f"In this user interface image{' from ' + platform if platform else ''}, "
        f"locate the {element_type} element with text '{sample['OCR']}'. "
        f"{element_purpose + '. ' if element_purpose else ''}"
        f"Determine its precise bounding box coordinates formatted as [x1, y1, x2, y2] where:"
        f"\n- x1, y1 is the top-left corner"
        f"\n- x2, y2 is the bottom-right corner"
        f"\nThe box should tightly enclose only this specific UI element. Return only the coordinates."
    )
    
    combined_text = f"{instruction}\nTarget Text: {sample['OCR']}"
    return {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": sample["image"]},
                    {"type": "text", "text": combined_text},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": str(sample["bbox"])},
                ],
            },
        ]
    }

# 4. Load and prepare dataset
dataset = load_dataset("miketes/Web-filtered-english-wave-ui-25k")
cleaned_dataset = prepare_dataset(dataset["train"])

# Split the cleaned dataset
train_dataset = cleaned_dataset.shuffle(seed=42).select(range(int(0.8 * len(cleaned_dataset))))
val_dataset = cleaned_dataset.shuffle(seed=42).select(range(int(0.8 * len(cleaned_dataset)), int(0.9 * len(cleaned_dataset))))
test_dataset = cleaned_dataset.shuffle(seed=42).select(range(int(0.9 * len(cleaned_dataset)), len(cleaned_dataset)))

train_dataset = [convert_to_conversation(sample) for sample in train_dataset]
val_dataset = [convert_to_conversation(sample) for sample in val_dataset]

# 5. Training setup
FastVisionModel.for_inference(model)
image = dataset["train"][0]["image"]
ocr_label = dataset["train"][0]["OCR"]

# Create example instruction
sample = dataset["train"][0]
messages = [
    {"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": convert_to_conversation(sample)["messages"][0]["content"][1]["text"]}
    ]}
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

print("\nBefore training:\n")
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True, temperature=0.3, num_beams=1)

# 6. Training configuration
FastVisionModel.for_training(model)

data_collator = UnslothVisionDataCollator(model, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)

training_args = SFTConfig(
    # Increase batch size and reduce accumulation for faster training
    per_device_train_batch_size=14,
    gradient_accumulation_steps=2,
    
    # Reduce to 1 epoch
    num_train_epochs=1,
    
    # Keep learning rate settings for stability
    learning_rate=5e-4,
    warmup_ratio=0.05,
    
    # Keep optimization settings
    bf16=True,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    
    # Reduce evaluation frequency
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    save_total_limit=2,
    
    # Other settings
    output_dir="outputs",
    seed=3407,
    dataset_num_proc=16,
    max_seq_length=2048,
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
)

torch.cuda.empty_cache()
trainer.train()

# 7. After training and saving (same as before)
print("\nAfter training:\n")
FastVisionModel.for_inference(model)

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": convert_to_conversation(sample)["messages"][0]["content"][1]["text"]}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True, temperature=0.3, num_beams=1)

# Save the model
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

model.push_to_hub_merged(
    "miketes/Llama-3.2-11B-finetuned-lora-improved",
    tokenizer,
    save_method="merged_16bit",
    token=HF_TOKEN,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
[2024-12-20 14:49:35,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlvsym'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlopen'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlclose'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlerror'
/Users/923676946/cuda-12.4/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


==((====))==  Unsloth 2024.12.4: Fast Mllama vision patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.138 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Before training:

The link element with text 'About Us' is located in the top-right corner of the image. The precise bounding box coordinates are [1004, 71, 1103, 93].<|eot_id|>


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 12,749 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 14 | Gradient Accumulation steps = 2
\        /    Total batch size = 28 | Total steps = 455
 "-____-"     Number of trainable parameters = 134,348,800
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
50,1.3921
100,0.6142
150,0.5802
200,0.5612
250,0.5443
300,0.524
350,0.522
400,0.5054
450,0.5105



After training:

[855.0, 36.0, 931.0, 72.0]<|eot_id|>
Unsloth: Merging QLoRA weights directly to the 16bit version of unsloth/llama-3.2-11b-vision-instruct.


chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  20%|██        | 1/5 [00:46<03:07, 46.91s/it]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  40%|████      | 2/5 [01:23<02:02, 40.97s/it]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  60%|██████    | 3/5 [02:04<01:21, 40.83s/it]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  80%|████████  | 4/5 [02:45<00:41, 41.02s/it]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 5/5 [03:02<00:00, 36.48s/it]
