In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = 'GPU-5b10304e-0077-d028-9962-f02c0fba85fa'
# !pip install accelerate peft bitsandbytes transformers trl --upgrade
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTConfig
from trl import SFTTrainer

In [3]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

# Fine-tuned model
new_model = "llama-2-7b-bb-descriptions-dota-1_5"

In [4]:
dataset = load_dataset("text", data_files={"train": "../datasets/DOTAv1.5/descriptions/train.txt", "test": "../datasets/DOTAv1.5/descriptions/val.txt"})

In [5]:
dataset = dataset['train']

In [6]:
# other_dataset = load_dataset(guanaco_dataset, split="train")
# other_dataset

In [7]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 31.5M/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [10]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [11]:
training_params = SFTConfig(
    output_dir="./results_DOTA1_5_4epochs",
    num_train_epochs=4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
trainer.train()

Step,Training Loss
25,1.3654
50,0.6069
75,0.6555
100,0.5256
125,0.6327
150,0.5109
175,0.6053
200,0.4796
225,0.609
250,0.512


TrainOutput(global_step=1412, training_loss=0.5631243054657414, metrics={'train_runtime': 2382.3446, 'train_samples_per_second': 2.371, 'train_steps_per_second': 0.593, 'total_flos': 4.460914353016013e+16, 'train_loss': 0.5631243054657414, 'epoch': 4.0})

In [14]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('llama-2-7b-bb-descriptions-dota-1_5/tokenizer_config.json',
 'llama-2-7b-bb-descriptions-dota-1_5/special_tokens_map.json',
 'llama-2-7b-bb-descriptions-dota-1_5/tokenizer.json')

In [15]:
logging.set_verbosity(logging.CRITICAL)

prompt = "Genereate the object bounding box properties for a remote sensing image with the following description as JSON only: A remote sensing image containing 20 small vehicles, 4 tennis courts, 1 basketball courts, 1 soccer ball fields."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description as JSON only: A remote sensing image containing 20 small vehicles, 4 tennis courts, 1 basketball courts, 1 soccer ball fields. [/INST] [{'class':'small-vehicle', 'count': 20, 'avg_dist': None}, {'class': 'tennis-court', 'count': 4, 'avg_dist': None}, {'class': 'basketball-court', 'count': 1, 'avg_dist': None}, {'class':'soccer-ball-field', 'count': 1, 'avg_dist': None}]


In [16]:
prompt = "Genereate the object bounding box properties for a remote sensing image with the following description as JSON only, do not give an explanation: A remote sensing image containing 3 roundabouts, 1 swimming pools."
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description as JSON only, do not give an explanation: A remote sensing image containing 3 roundabouts, 1 swimming pools. [/INST] [{'class': 'roundabout', 'count': 3, 'avg_dist': None}, {'class':'swimming-pool', 'count': 1, 'avg_dist': None}]
