In [1]:
import os
import torch


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# SFT on IMBD

In [2]:
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
# Load dataset
dataset = load_dataset("imdb", split="train")

model_name = "EleutherAI/pythia-160m"
# model_name = "facebook/opt-350m"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Define SFT configuration
sft_config = SFTConfig(
    dataset_text_field="text",
    output_dir=f"{model_name}_imbd",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=1,
    max_steps=500,
    fp16=True,
    save_steps=500,
)

peft_config = LoraConfig(
            r=16,
            lora_alpha=32,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )   

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    max_seq_length=512,
    peft_config=peft_config

)

# Train the model
trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
50,3.7608
100,3.7683
150,3.7215
200,3.737
250,3.7814
300,3.7734
350,3.735
400,3.759
450,3.7175
500,3.7409


TrainOutput(global_step=500, training_loss=3.7494828796386717, metrics={'train_runtime': 273.3132, 'train_samples_per_second': 14.635, 'train_steps_per_second': 1.829, 'total_flos': 1433970309685248.0, 'train_loss': 3.7494828796386717, 'epoch': 0.16})

## SFT on CodeAlpaca

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Load dataset
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

model_name = "EleutherAI/pythia-160m"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Define a function to format the prompts
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

# Define response template and data collator
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Define SFT configuration
sft_config = SFTConfig(
    output_dir="opt_350m_finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=1,
    max_steps=500,
    fp16=True,
    save_steps=100,
)

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=512
)

# Train the model
trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
50,3.6054
100,2.4515
150,2.2239
200,2.055
250,1.8748
300,1.7555
350,1.6815
400,1.5703
450,1.3405
500,1.3345


TrainOutput(global_step=500, training_loss=1.9892971649169922, metrics={'train_runtime': 149.3994, 'train_samples_per_second': 26.774, 'train_steps_per_second': 3.347, 'total_flos': 4286187457413120.0, 'train_loss': 1.9892971649169922, 'epoch': 0.1997602876548142})

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Load dataset
dataset = load_dataset("tatsu-lab/alpaca", split="train")

model_name = "EleutherAI/pythia-1b"

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


# # Define a function to format the prompts
# def formatting_prompts_func(example):
#     output_texts = []
#     for i in range(len(example['instruction'])):
#         text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
#         output_texts.append(text)
#     return output_texts


# Define a function to format the prompts
def formatting_prompts_func(example):
    output_texts = []
    for instruction, output in zip(example['instruction'], example['output']):
        text = (f"### Question: {instruction}\n ### Answer:\n{output}")
        output_texts.append(text)
    return {"text": output_texts}


# Define response template and data collator
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Define SFT configuration
sft_config = SFTConfig(
    output_dir="opt_350m_finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=1,
    max_steps=500,
    fp16=True,
    save_steps=100,
)

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=512
)

# Train the model
trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Load dataset
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# Define a function to format the prompts
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

# Define response template and data collator
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Define SFT configuration
sft_config = SFTConfig(
    output_dir="opt_350m_finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=1,
    max_steps=500,
    fp16=True,
    save_steps=100,
)

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=512
)

# Train the model
trainer.train()


In [None]:
import json
import pandas as pd


data = []
with open('../databricks-dolly-15k.jsonl', 'r') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2,random_state = 42)

# Save test dataset to CSV
test.to_csv('test.csv', index=False)

In [None]:
from datasets import Dataset

def create_prompt(row):
    prompt = f"Instruction: {row['instruction']}\nContext: {row['context']}\nResponse: {row['response']}"
    return prompt

train['text'] = train.apply(create_prompt, axis=1)
data_df = train

data = Dataset.from_pandas(data_df)

In [None]:
data

Dataset({
    features: ['instruction', 'context', 'response', 'category', 'text', '__index_level_0__'],
    num_rows: 12008
})

In [None]:
from huggingface_hub import notebook_login

notebook_login()


import torch
import pandas as pd
from datasets import Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
import os

data = Dataset.from_pandas(data_df)

# model_id="facebook/opt-350m"
model_id="facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# hf_Yd Vt Ni Nl uc lS ir RZ Kw ng wu ro tw Tc KZ Pu LV
# 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

In [None]:
# no quantization, full model training

model = AutoModelForCausalLM.from_pretrained(
                                model_id,
                                quantization_config=None,
                                device_map="auto"
                            )

# Modify model configuration parameters
model.config.use_cache=False
model.config.pretraining_tp=1
model.gradient_checkpointing_enable()

# Define training arguments
training_arguments = TrainingArguments(
                            output_dir=f"{model_id.replace('-', '_')}-finetuned-dolly-with-exp",
                            per_device_train_batch_size=8,
                            gradient_accumulation_steps=1,
                            optim="paged_adamw_32bit",
                            learning_rate=2e-4,
                            lr_scheduler_type="cosine",
                            save_strategy="epoch",
                            logging_steps=50,
                            num_train_epochs=1,
                            max_steps=500,
                            fp16=True,
                            save_steps=500,
                        )

# Initialize SFTTrainer for training
trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            dataset_text_field="text",
            args=training_arguments,
            tokenizer=tokenizer,
            packing=False,
            max_seq_length=512
    )



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/12008 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Train the model
trainer.train()

Step,Training Loss
50,3.096
100,3.1504
150,3.1175
200,3.0868
250,3.061
300,3.0044
350,2.9534
400,2.9364
450,2.8655
500,2.8654


TrainOutput(global_step=500, training_loss=3.0136783447265625, metrics={'train_runtime': 88.2284, 'train_samples_per_second': 45.337, 'train_steps_per_second': 5.667, 'total_flos': 820073207808000.0, 'train_loss': 3.0136783447265625, 'epoch': 0.3331112591605596})

In [None]:
quantization_config_loading = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Load LLAMA2 13B model with quantization configurations
model = AutoModelForCausalLM.from_pretrained(
                                model_id,
                                quantization_config=quantization_config_loading,
                                device_map="auto"
                            )

# Modify model configuration parameters
model.config.use_cache=False
model.config.pretraining_tp=1
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Configure PEFT (Parameter Efficient Fine-Tuning)
peft_config = LoraConfig(
                    r=16,
                    lora_alpha=16,
                    lora_dropout=0.05,
                    bias="none",
                    task_type="CAUSAL_LM",
                    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
                )

# Apply PEFT configurations to the model
model = get_peft_model(model, peft_config)



pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Define training arguments
training_arguments = TrainingArguments(
                            output_dir=f"{model_id.replace('-', '_')}-finetuned-dolly-with-exp",
                            per_device_train_batch_size=8,
                            gradient_accumulation_steps=1,
                            optim="paged_adamw_32bit",
                            learning_rate=2e-4,
                            lr_scheduler_type="cosine",
                            save_strategy="epoch",
                            logging_steps=50,
                            num_train_epochs=1,
                            max_steps=500,
                            fp16=True,
                            save_steps=500,
                        )

# Initialize SFTTrainer for training
trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            peft_config=peft_config,
            dataset_text_field="text",
            args=training_arguments,
            tokenizer=tokenizer,
            packing=False,
            max_seq_length=512
    )


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/12008 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Train the model
trainer.train()



Step,Training Loss
50,3.054
100,2.942
150,2.8614
200,2.8416
250,2.8166
300,2.8553
350,2.8092
400,2.8488
450,2.8428
500,2.7971




TrainOutput(global_step=500, training_loss=2.8668854064941405, metrics={'train_runtime': 98.8712, 'train_samples_per_second': 40.457, 'train_steps_per_second': 5.057, 'total_flos': 828603450114048.0, 'train_loss': 2.8668854064941405, 'epoch': 0.3331112591605596})

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

# Load dataset
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# Define a function to format the prompts
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

# Define response template and data collator
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Define SFT configuration
sft_config = SFTConfig(
    output_dir="opt_350m_finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=50,
    num_train_epochs=1,
    max_steps=500,
    fp16=True,
    save_steps=500,
)

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=sft_config,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=512
)

# Train the model
trainer.train()



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/20022 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


OutOfMemoryError: CUDA out of memory. Tried to allocate 450.00 MiB. GPU 