In [1]:
from transformers import TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset, Dataset
import polars as pl
from unsloth import FastLanguageModel


In [2]:
# Maximum sequence length; passed both to the model loader and to the trainer below.
# (Template note from the original author: any value may be chosen, RoPE scaling is
# said to be handled internally by Unsloth.)
max_seq_length = 2048

# None lets the loader auto-detect the dtype. The original note suggests Float16 for
# Tesla T4 / V100 and Bfloat16 for Ampere or newer GPUs when setting it by hand.
dtype = None

# Load the weights 4-bit quantized to reduce memory usage; may be switched to False.
load_in_4bit = True

In [3]:
# Load the pre-quantized Mistral-7B-Instruct checkpoint together with its tokenizer.
# Another checkpoint name can be substituted here (the original note gives
# teknium/OpenHermes-2.5-Mistral-7B as an example).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)

# Attach LoRA adapters to the attention (q/k/v/o) and MLP (gate/up/down) projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Per the original notes, any dropout / bias setting is supported, but
    # dropout 0 and bias "none" are the optimized configurations.
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=3407,
    # Rank-stabilized LoRA and LoftQ are available options; both are left disabled here.
    use_rslora=False,
    loftq_config=None,
)

==((====))==  Unsloth: Fast Mistral patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
from unsloth.chat_templates import get_chat_template

# Re-wrap the tokenizer so that apply_chat_template renders conversations with the
# "mistral" template. Template names listed in the original note: zephyr, chatml,
# mistral, llama, alpaca, vicuna, vicuna_old, unsloth.
# NOTE(review): the original comment on map_eos_token ("Maps <|im_end|> to </s> instead")
# appears to come from a ChatML example — confirm what the flag does for "mistral".
tokenizer = get_chat_template(
    tokenizer,
    chat_template="mistral",
    map_eos_token=True,
)

In [5]:
# Labels placed in front of the two input texts inside the user turn.
orig_prefix = "Original Text:"
rewrite_prefix = "Rewritten Text:"

# Fixed lead-in that opens every assistant answer.
response_start = "The prompt was:"

# Instruction text placed at the start of the user turn. It is assembled from explicit
# pieces so the whitespace is visible: the first paragraph ends with a trailing space,
# followed by one blank line before the second paragraph.
sys_prompt = (
    'You are an expert in "Reverse Prompt Engineering". '
    "You are able to reverse-engineer prompts used to rewrite text. "
    "\n\n"
    'I will be providing you with an "original text" and "rewritten text". '
    "Please try to be as specific as possible and come up with a prompt that is based on "
    "the tone, style, and any other properties you consider relevant."
)

def format_prompts(x):
    """Render one dataset row as a single chat-formatted training string.

    Args:
        x: Mapping that provides the keys ``original_text``, ``rewritten_text``
            and ``rewrite_prompt``.

    Returns:
        A dict with one key, ``"texts"``, holding the untokenized output of the
        module-level ``tokenizer``'s chat template for a two-turn conversation
        (user instruction plus both texts, then the assistant's answer).
    """
    # User turn: instructions first, then the labelled original and rewritten texts,
    # each on its own line.
    user_turn = "\n".join([
        sys_prompt,
        f"{orig_prefix} {x['original_text']}",
        f"{rewrite_prefix} {x['rewritten_text']}",
    ])
    # Assistant turn: the fixed lead-in followed by the prompt to be recovered.
    assistant_turn = f"{response_start} {x['rewrite_prompt']}"

    conversation = [
        {"role": "user", "content": user_turn},
        {"role": "assistant", "content": assistant_turn},
    ]
    # The conversation already contains the answer, so no generation prompt is
    # appended, and the result is kept as text rather than token ids.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"texts": rendered}

In [6]:
# Read every CSV matching the glob into one polars frame.
# NOTE(review): ignore_errors=True presumably lets polars keep reading past rows it
# cannot parse — confirm that silently tolerating such rows is intended.
df = pl.read_csv('./data/predictions/combined-filtered_*.csv', ignore_errors=True)

# Convert the frame to a Hugging Face Dataset (one dict per row) and add the "texts"
# column produced by format_prompts.
dataset = Dataset.from_list(df.to_dicts()).map(format_prompts)

Map:   0%|          | 0/6856 [00:00<?, ? examples/s]

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Decide the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./outputs",
    seed=3407,
    # Schedule: three epochs (a fixed `max_steps = 60` cap was left disabled by the author),
    # with a checkpoint written at the end of each epoch.
    num_train_epochs=3,
    save_strategy="epoch",
    # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step
    # on the single GPU shown in the run log.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimization settings (an `optim = "adamw_8bit"` override was left disabled).
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    # Exactly one of the two precision flags is enabled.
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=200,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # Column produced by format_prompts.
    dataset_text_field="texts",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    # Packing is off; the original note says enabling it can make training up to 5x
    # faster for short sequences.
    packing=False,
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/6856 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Run the fine-tuning loop configured on `trainer`; whatever train() returns is kept
# in `trainer_stats` for later inspection.
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,856 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,571
 "-____-"     Number of trainable parameters = 41,943,040


  0%|          | 0/2571 [00:00<?, ?it/s]

{'loss': 1.2628, 'grad_norm': 0.579164445400238, 'learning_rate': 0.00018516204607575167, 'epoch': 0.23}
{'loss': 1.1619, 'grad_norm': 0.4446599781513214, 'learning_rate': 0.00016954314720812183, 'epoch': 0.47}
{'loss': 1.1565, 'grad_norm': 0.44324904680252075, 'learning_rate': 0.000153924248340492, 'epoch': 0.7}
{'loss': 1.1464, 'grad_norm': 0.44189155101776123, 'learning_rate': 0.0001383053494728622, 'epoch': 0.93}
{'loss': 1.014, 'grad_norm': 0.52754807472229, 'learning_rate': 0.00012268645060523235, 'epoch': 1.17}
{'loss': 0.9399, 'grad_norm': 0.6610332131385803, 'learning_rate': 0.0001070675517376025, 'epoch': 1.4}
{'loss': 0.9322, 'grad_norm': 0.6497707962989807, 'learning_rate': 9.144865286997268e-05, 'epoch': 1.63}
{'loss': 0.9279, 'grad_norm': 0.6296718120574951, 'learning_rate': 7.582975400234283e-05, 'epoch': 1.87}
{'loss': 0.8416, 'grad_norm': 0.7901496887207031, 'learning_rate': 6.021085513471301e-05, 'epoch': 2.1}
{'loss': 0.6892, 'grad_norm': 0.8700945973396301, 'learnin

In [9]:
# Directory that receives the fine-tuned model files (same location as before).
# NOTE(review): this is an absolute, machine-specific path — consider moving it to a
# configuration cell or deriving it from a project-relative base directory.
lora_output_dir = '/home/lawrence/Projects/my_models/mistral_pr_lora'

# Persist the trained model via its save_pretrained method.
model.save_pretrained(lora_output_dir)

# Also persist the tokenizer next to the model: it was re-wrapped with the "mistral"
# chat template earlier in this notebook, so saving it keeps the prompt format used
# for training available when the model is reloaded for inference.
tokenizer.save_pretrained(lora_output_dir)