In [1]:
import os

# Restrict this kernel to a single GPU slice. The identifier below is specific to the
# machine the notebook was run on; it is set here, before torch is imported in a later cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': 'MIG-3712e803-2ec9-5b7e-97c4-3e6b4be34b53'})

# Optional environment setup, left disabled:
# !pip install accelerate peft bitsandbytes transformers trl --upgrade
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [3]:
# Name of the directory the fine-tuned model and tokenizer are saved under
new_model = "llama-2-7b-bb-descriptions-dota-1_5"

# Reference instruction dataset on the Hugging Face hub
# (only used by the commented-out exploration cell further down)
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Base checkpoint pulled from the Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

In [4]:
# Load the DOTA v1.5 description files with the generic "text" builder;
# the validation file is exposed under the "test" split name.
dataset = load_dataset(
    "text",
    data_files={
        "train": "../datasets/DOTAv1.5/descriptions/train.txt",
        "test": "../datasets/DOTAv1.5/descriptions/val.txt",
    },
)

In [5]:
# Keep only the training split for fine-tuning.
# `load_dataset` with a `data_files` mapping returns a dict-like DatasetDict; once this
# cell has run, `dataset` is already a single split. The guard makes re-running the
# cell a no-op instead of raising on a second `['train']` lookup.
if isinstance(dataset, dict):
    dataset = dataset['train']

In [6]:
# other_dataset = load_dataset(guanaco_dataset, split="train")
# other_dataset

In [7]:
# Dtype handed to bitsandbytes as the 4-bit compute dtype.
compute_dtype = torch.float16

# 4-bit NF4 quantisation settings for loading the base model; double quantisation is off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [8]:
# Load the quantised base checkpoint; the device map {"": 0} assigns the whole
# model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)

# Config tweaks applied before training.
# NOTE(review): pretraining_tp=1 presumably selects the regular (non tensor-parallel) linear path — confirm.
model.config.pretraining_tp = 1
# Turn off the key/value cache flag on the model config (presumably not wanted while training).
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [9]:
# Load the tokenizer that belongs to the base checkpoint.
# `trust_remote_code` is left at its default: the model is loaded from the same repo
# without it, and enabling it would permit repository-supplied Python code to run locally.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Reuse the EOS token for padding and pad on the right-hand side of each sequence.
# NOTE(review): with pad == EOS the data collator may mask EOS positions in the labels,
# which could keep the model from learning when to stop — confirm against generated output.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [10]:
# LoRA adapter hyper-parameters for causal language-model fine-tuning.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,               # adapter rank
    lora_alpha=16,      # LoRA scaling parameter
    lora_dropout=0.1,   # dropout applied inside the adapter layers
    bias="none",        # bias setting passed to LoRA
)

In [11]:
# Hyper-parameters for the supervised fine-tuning run, grouped by concern.
training_params = TrainingArguments(
    # --- run length and batching ---
    num_train_epochs=4,
    max_steps=-1,                       # -1: length is driven by num_train_epochs
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimiser ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- learning-rate schedule ---
    # NOTE(review): with the plain "constant" scheduler the warmup_ratio below likely has
    # no effect; "constant_with_warmup" may have been intended — confirm.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # --- precision (mixed-precision training disabled) ---
    fp16=False,
    bf16=False,
    # --- checkpoints and logging ---
    output_dir="./results_DOTA1_5_4epochs",
    save_steps=25,
    logging_steps=25,
    report_to="tensorboard",
)

In [12]:
# Assemble the supervised fine-tuning trainer: quantised base model + LoRA config,
# trained on the "text" column of the training split without sequence packing.
trainer = SFTTrainer(
    model=model,
    args=training_params,
    peft_config=peft_params,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    packing=False,
    max_seq_length=None,    # no explicit limit given; the library default applies
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
25,1.4659
50,0.6724
75,0.7491
100,0.5969
125,0.7257
150,0.5837
175,0.7024
200,0.5592
225,0.7017
250,0.603


TrainOutput(global_step=1412, training_loss=0.6510872874651684, metrics={'train_runtime': 2324.3773, 'train_samples_per_second': 2.43, 'train_steps_per_second': 0.607, 'total_flos': 3.847311311536128e+16, 'train_loss': 0.6510872874651684, 'epoch': 4.0})

In [14]:
# Write the trained model and its tokenizer into the `new_model` directory.
# NOTE(review): because a peft_config was passed to the trainer, this presumably stores
# only the LoRA adapter weights rather than a merged full model — confirm.
trainer.model.save_pretrained(new_model)
# Last expression: the tuple of written tokenizer file paths is shown as the cell output.
trainer.tokenizer.save_pretrained(new_model)

('llama-2-7b-bb-descriptions-dota-1_5/tokenizer_config.json',
 'llama-2-7b-bb-descriptions-dota-1_5/special_tokens_map.json',
 'llama-2-7b-bb-descriptions-dota-1_5/tokenizer.json')

In [15]:
# Only let CRITICAL messages through the transformers logger while generating.
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline over the in-memory model and tokenizer.
# NOTE(review): max_length presumably counts the prompt tokens as well, so longer
# descriptions leave less room for the generated answer — confirm.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Prompt text is kept verbatim (including its spelling).
# NOTE(review): check it against the training data before correcting it.
prompt = "Genereate the object bounding box properties for a remote sensing image with the following description: A remote sensing image containing 20 small vehicles, 4 tennis courts, 1 basketball courts, 1 soccer ball fields."

# Wrap the prompt in the [INST] chat markers and print the full generated text.
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]['generated_text'])



<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description: A remote sensing image containing 20 small vehicles, 4 tennis courts, 1 basketball courts, 1 soccer ball fields. [/INST] [{'class':'small-vehicle', 'count': 20, 'avg_spread': 102.2959492492492}, {'class': 'tennis-court', 'count': 4, 'avg_spread': 22.9210992492492}, {'class': 'basketball-court', 'count': 1, 'avg_spread': None}, {'class':'soccer-ball-field', 'count': 1, 'avg_spread': None}]

Here are the object properties for the


In [17]:
# Second sample description, sent through the pipeline created in the earlier cell.
prompt = "Genereate the object bounding box properties for a remote sensing image with the following description: A remote sensing image containing 3 roundabouts, 1 swimming pools."

# Same [INST] wrapping as the first query; print the full generated text.
result = pipe("<s>[INST] " + prompt + " [/INST]")
print(result[0]['generated_text'])

<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description: A remote sensing image containing 3 roundabouts, 1 swimming pools. [/INST] [{'class': 'roundabout', 'count': 3, 'avg_spread': 1027.295929249242}, {'class':'swimming-pool', 'count': 1, 'avg_spread': None}]

The above object bounding box properties are based on the description provided. The properties are:

* class: the type of object, in this case roundabouts and swimming pools
* count: the number of objects of that type in the image
* avg_spread: the average distance between objects of that type, calculated using the mean of the distances between all pairs of objects of that type.

