In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Set experiment and data paths
experiment_name = 'gemma_public_data_sft'
data_file_path = '../external/public_10k_unique_rewrite_prompt.csv'
model_dir_path = 'google/gemma-7b-it'
cache_dir = '/tmp/k7/'
output_dir_path = f'../logs/train_outputs_{model_dir_path}'
model_save_dir_path =  f'{experiment_name}_adapter'

# Set BitsAndBytesConfig for quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=False,
)

# Set LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
)

# Set training hyperparameters
num_epochs = 5
batch_size = 1
max_sequence_length = 512
learning_rate = 1e-4

# Set training arguments
training_args = TrainingArguments(
    output_dir=output_dir_path,
    fp16=True,
    learning_rate=learning_rate,
    optim="adafactor",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    gradient_accumulation_steps=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=50,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    report_to='none',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [3]:
# Load and split data
data_df = pd.read_csv(data_file_path)
train_df, val_df = train_test_split(data_df, test_size=0.3, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [4]:
# Load tokenizer and set padding token
tokenizer = AutoTokenizer.from_pretrained(model_dir_path, cache_dir=cache_dir)
tokenizer.pad_token = tokenizer.eos_token

# Load and quantize model
model = AutoModelForCausalLM.from_pretrained(
    model_dir_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_auth_token=True,
    cache_dir=cache_dir,
)

model.config.gradient_checkpointing = False

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards: 100%|██████████| 4/4 [03:49<00:00, 57.38s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]


In [5]:
# Function to get token length of text
def get_token_length(text):
    tokenized = tokenizer(text, return_length=True)
    length = tokenized['length'][0]
    return length

# Function to format prompts
def format_prompts(example):
    output_texts = []
    for i in range(len(example['rewritten_text'])):
        original_text = example['original_text'][i]
        rewritten_text = example['rewritten_text'][i]
        rewrite_prompt = example['rewrite_prompt'][i]
        text = f"Instruct: Original Text:{original_text}\nRewritten Text:{rewritten_text}\nWrite a prompt that was likely given to the LLM to rewrite original text into rewritten text.Output: {rewrite_prompt}"
        if get_token_length(text) > max_sequence_length:
            continue
        output_texts.append(text)
    return output_texts

In [6]:
response_template = "Output:"
data_collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template, 
    tokenizer=tokenizer
)

In [7]:
# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    max_seq_length=max_sequence_length,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    formatting_func=format_prompts,
    data_collator=data_collator,
    peft_config=lora_config,
)

# Train the model
trainer.train()

Map: 100%|██████████| 7399/7399 [00:09<00:00, 807.18 examples/s]
Map: 100%|██████████| 3172/3172 [00:05<00:00, 586.86 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,1.1521,1.101506
2,0.8819,1.009603
3,0.7184,1.007249
4,0.5089,1.087927
5,0.3717,1.170157


Could not locate the best model at outputs/checkpoint-1596/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


FileNotFoundError: [Errno 2] No such file or directory: 'outputs/checkpoint-1596'

In [8]:
# Save the trained model and tokenizer
trainer.save_model(model_save_dir_path)
tokenizer.save_pretrained(model_save_dir_path)

('gemma_public_data_sft_adapter/tokenizer_config.json',
 'gemma_public_data_sft_adapter/special_tokens_map.json',
 'gemma_public_data_sft_adapter/tokenizer.model',
 'gemma_public_data_sft_adapter/added_tokens.json',
 'gemma_public_data_sft_adapter/tokenizer.json')

: 