# Create Dataset for training

In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModelWithLMHead, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
import transformers
import torch
transformers.set_seed(42)

import dotenv
import wandb
import os

dotenv.load_dotenv("./.env", override=True)
wandb.login(key=os.getenv('WANDB_API_KEY'))
os.environ['WANDB_NOTEBOOK_NAME'] = 'just_finetuning_leo-hessianai.ipynb'


  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfelix-ml[0m ([33mfml-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/felix/.netrc


In [2]:
# Specify the path to your JSON file
#file_path = 'data/berufslexikon_regex_cleaned.json'
file_path = 'data/chunks.json'

# Load the dataset
ds = load_dataset('json', data_files=file_path)

# Specify the model checkpoint
model_checkpoint = "LeoLM/leo-hessianai-7b-chat"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
ds["train"][1]

{'profession': '24-Stunden-BetreuerIn',
 'url': 'https://www.berufslexikon.at/berufe/3045-24-Stunden-BetreuerIn/',
 'id': '24-Stunden-BetreuerIn_2_4',
 'chunk': 1,
 'text': '(Kurz-/Spezialausbildung) \nDiplom-SozialbetreuerIn für Familienarbeit\n(Mittlere/Höhere Schulen) \nHeimhelferIn\n(Kurz-/Spezialausbildung) \nAnforderungen\nBereitschaft, am Wochenende zu arbeiten\nBereitschaft, an unterschiedlichen Orten zu arbeiten\nFreude am Kontakt mit Menschen\nInteresse für Soziales\nKommunikationsfähigkeit\nPhysische Ausdauer\nPsychische Belastbarkeit\nSelbstständiges Arbeiten\nSinn für Sauberkeit und Hygiene\nVerantwortungsbewusstsein\nBeschäftigungsmöglichkeiten\n24-Stunden-Betreuung bedeutet, dass die Betreuungskräfte bis auf entsprechende Erholungszeiten 24 Stunden dienstbereit sind. Dienstbereitschaft bedeutet aber nicht einen 24-Stunden-Arbeitstag. Das Betreuungsverhältnis kann verschiedene rechtliche Formen haben. Die Betreuungskraft kann\nein direktes Dienstverhältnis zu der pflegebe

In [4]:
# count the number of words in the dataset
total_words = 0
for example in ds["train"]:
    total_words += len(example["profession"].split())
    total_words += len(example["text"].split())
print(f"Total number of words in the dataset: {total_words}")


Total number of words in the dataset: 1049814


In [5]:
# As this dataset has no validation split, we will create one
#ds = ds["train"].train_test_split(test_size=0.2, seed=42)

# Models
* https://huggingface.co/LeoLM/leo-hessianai-13b - did not fit on 24GB VRAM
    - WARNING:accelerate.big_modeling:You shouldn't move a model when it is dispatched on multiple devices.
    - needs FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn==v2.1.1 --no-build-isolation
* https://huggingface.co/LeoLM/leo-hessianai-7b
* https://huggingface.co/LeoLM/leo-hessianai-7b-chat
    - needs FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation

In [6]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch

# model = AutoModelForCausalLM.from_pretrained(
#     pretrained_model_name_or_path=model_checkpoint,
#     torch_dtype=torch.float16,
#     trust_remote_code=True  # True for flash-attn2 else False
# )


In [7]:
# # We'll create a tokenizer from model checkpoint
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)#, use_fast=False)

# # We'll need padding to have same length sequences in a batch
# tokenizer.pad_token = tokenizer.eos_token

# # Define a tokenization function that first concatenates text and target
# def tokenize_function(example):
#     # concatenate all texts and tokenize as one sample
#     # merged = str(example["profession"]) + " " + str(example["url"]) + " " + str(example["content"])
#     # print(type(merged))
#     # print(merged)
#     merged =  example["content"]
#     batch = tokenizer(merged, padding='max_length', truncation=True, max_length=128)
#     batch["labels"] = batch["input_ids"].copy()
#     return batch

# # def tokenize_function(example):
# #     # If the fields are lists of strings, join the strings. Otherwise, use the fields as they are.
# #     profession = ' '.join(example["profession"]) if isinstance(example["profession"], list) else example["profession"]
# #     url = ' '.join(example["url"]) if isinstance(example["url"], list) else example["url"]
# #     content = ' '.join(example["content"]) if isinstance(example["content"], list) else example["content"]

# #     merged = profession + " " + url + " " + content
# #     batch = tokenizer(merged, padding='max_length', truncation=True, max_length=2048)
# #     batch["labels"] = batch["input_ids"].copy()
# #     return batch

# # Apply it on our dataset, and remove the text columns
# tokenized_datasets = ds.map(tokenize_function, batched=True, remove_columns=["profession", "url", "content"])

In [8]:
# tokenized_datasets

In [9]:
# # Let's check out one prepared example
# print(tokenizer.decode(tokenized_datasets["train"][ 1]["input_ids"]))

# Training

## QLORA
https://freedium.cfd/https%3A%2F%2Fmedium.com%2F%40newhardwarefound%2Fqlora-with-llama-2-ca1b4bcf26f0

In [10]:
# Start a new wandb run
model_name = model_checkpoint.split("/")[-1]
run = wandb.init(project=f"{model_name}-ams-finetuned", job_type="train")

In [11]:
import torch

# If CUDA is available, use it
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.backends.cudnn.benchmark = True
    print(f"Using {torch.cuda.device_count()} GPUs.")
else:
    device = torch.device('cpu')
    print("No GPU available, using the CPU instead.")

Using 1 GPUs.


In [12]:
# # Import the necessary library for loading datasets
# from datasets import load_dataset

# # Specify the name of the dataset
# dataset_name = "timdettmers/openassistant-guanaco"

# # Load the dataset from the specified name and select the "train" split
# dataset = load_dataset(dataset_name, split="train")

In [13]:
dataset = ds["train"]

In [14]:
dataset

Dataset({
    features: ['profession', 'url', 'id', 'chunk', 'text'],
    num_rows: 6153
})

In [15]:
# Code from taprosoft's github: https://github.com/taprosoft/llm_finetuning/blob/efa6df245fee4faf27206d84802d8f58d4b6e77d/inference.py#L20
from transformers import (AutoModelForCausalLM,
    BitsAndBytesConfig,
    LlamaTokenizer)
import torch
import os

#os.environ["HUGGING_FACE_HUB_TOKEN"] = "{{your_huggingface_hub_token}}"

def load_hf_model(
    base_model,
    mode=8,
    gradient_checkpointing=False,
    device_map="auto",
):
    kwargs = {"device_map": device_map}
    if mode == 8:
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=0.0,
        )
    elif mode == 4:
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
    elif mode == 16:
        kwargs["torch_dtype"] = torch.float16

    model = AutoModelForCausalLM.from_pretrained(base_model, **kwargs)

    # setup tokenizer
    # tokenizer = LlamaTokenizer.from_pretrained(base_model)

    # tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
    # tokenizer.padding_side = "left"  # Allow batched inference
    return model#, tokenizer


In [16]:
# Code from taprosoft's github
from dataclasses import dataclass, field
import transformers
import torch
import copy
from typing import Dict, Sequence
from torch.nn.utils.rnn import pad_sequence

IGNORE_INDEX = -100

@dataclass
class DataCollatorForCausalLM(object):
    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        # Extract elements
        sources = [f"{self.tokenizer.bos_token}{example['profession']}" for example in instances]
        targets = [f"{example['text']}{self.tokenizer.eos_token}" for example in instances]
        # Tokenize
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_targets = self.tokenizer(
            targets,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        # Build the input and labels for causal LM
        input_ids = []
        labels = []
        for tokenized_source, tokenized_target in zip(
            tokenized_sources_with_prompt['input_ids'],
            tokenized_targets['input_ids']
        ):
            if not self.predict_with_generate:
                input_ids.append(torch.tensor(tokenized_source + tokenized_target))
                if not self.train_on_source:
                    labels.append(
                        torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
                    )
                else:
                    labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
            else:
                input_ids.append(torch.tensor(tokenized_source))
        # Apply padding
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
        data_dict = {
            'input_ids': input_ids,
            'attention_mask':input_ids.ne(self.tokenizer.pad_token_id),
        }
        if labels is not None:
            data_dict['labels'] = labels
        return data_dict

# Code from taprosoft's github
from dataclasses import dataclass, field
import transformers
import torch
import copy
from typing import Dict, Sequence
from torch.nn.utils.rnn import pad_sequence

IGNORE_INDEX = -100

@dataclass
class DataCollatorForCausalLM(object):
    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        # Extract elements
        sources = [f"{self.tokenizer.bos_token}{example['profession']}" for example in instances]
        targets = [f"{example['text']}{self.tokenizer.eos_token}" for example in instances]
        # Tokenize
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_targets = self.tokenizer(
            targets,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        # Build the input and labels for causal LM
        input_ids = []
        labels = []
        for tokenized_source, tokenized_target in zip(
            tokenized_sources_with_prompt['input_ids'],
            tokenized_targets['input_ids']
        ):
            if not self.predict_with_generate:
                input_ids.append(torch.tensor(tokenized_source + tokenized_target))
                if not self.train_on_source:
                    labels.append(
                        torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
                    )
                else:
                    labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
            else:
                input_ids.append(torch.tensor(tokenized_source))
        # Apply padding
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
        data_dict = {
            'input_ids': input_ids,
            'attention_mask':input_ids.ne(self.tokenizer.pad_token_id),
        }
        if labels is not None:
            data_dict['labels'] = labels
        return data_dict

In [17]:
import bitsandbytes as bnb
import torch
import peft

# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {100 * trainable_params / all_param}"
    )


# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")
    return list(lora_module_names)


def create_peft_model(model, gradient_checkpointing=True, bf16=True):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_kbit_training,
    )
    from peft.tuners.lora import LoraLayer

    # prepare int-4 model for training
    model = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=gradient_checkpointing
    )
    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # get lora target modules
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")

    peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=modules,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    model = get_peft_model(model, peft_config)

    # pre-process the model by upcasting the layer norms in float 32 for
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if bf16:
                module = module.to(torch.bfloat16)
        if "norm" in name:
            module = module.to(torch.float32)
        if "lm_head" in name or "embed_tokens" in name:
            if hasattr(module, "weight"):
                if bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)

    model.print_trainable_parameters()
    return model

In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import glob

# Defining the model
model_name = model_checkpoint

# # setup tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)

tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
tokenizer.padding_side = "left"  # Allow batched inference

# Get a list of all checkpoint files in the results directory
checkpoints = glob.glob('./results/checkpoint-*')

if checkpoints:
    # Sort the checkpoints by modification time
    checkpoints.sort(key=os.path.getmtime)
    
    # Get the latest checkpoint
    latest_checkpoint = checkpoints[-1]
    
    #print checkpoint and latest checkpoint
    print("Found the following checkpoints:")
    print(checkpoints)
    print(f"Loading the latest checkpoint: {latest_checkpoint}")
    

else:
    print("No checkpoints found in the results directory.")
    latest_checkpoint = model_name

model = load_hf_model(
    latest_checkpoint,
    mode=4,
    gradient_checkpointing=False,
    device_map='auto')


# create peft config
model = create_peft_model(
    model, gradient_checkpointing=False, bf16=False
)

# get all linear layer names
modules = find_all_linear_names(model)


from transformers import TrainingArguments
# Define training args
output_dir = "./results"
training_args = TrainingArguments(
    report_to="wandb", # we need one line to track experiments in wandb
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    bf16=False,  # Use BF16 if available
    fp16=True,
    learning_rate=2e-4,
    num_train_epochs=3,
    optim="paged_adamw_8bit", #"adamw_torch" if not mode = 4,8
    gradient_checkpointing=False,
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    remove_unused_columns=False,
)

data_collator = DataCollatorForCausalLM(
    tokenizer=tokenizer,              # Ensure this is the right tokenizer for your model
    source_max_len=660,               # Adjust based on your data's typical length
    target_max_len=None,              # Not using explicit target sequences
    train_on_source=False,             # Focusing on training using the source text
    predict_with_generate=False        # Generating output based on input sequences
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Found the following checkpoints:
['./results/checkpoint-500', './results/checkpoint-1000']
Loading the latest checkpoint: ./results/checkpoint-1000


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:32<00:00, 16.24s/it]


Found 1 modules to quantize: ['base_layer']
trainable params: 159,907,840 || all params: 7,059,279,872 || trainable%: 2.2652146238635487


In [19]:
# Get the longest sequence
longest_sequence = max(dataset["text"], key=len)

# Print the length of the longest sequence
print(len(longest_sequence))

# Tokenize the longest sequence
tokens = tokenizer.tokenize(longest_sequence)

# Print the number of tokens
print(len(tokens))

2481
624


In [20]:
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [21]:
device = next(model.parameters()).device
print(device)

cuda:0


In [22]:
# move model to device
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32128, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=64, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=64, out_features=4096, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
         

In [23]:
import torch

try:
    trainer.train()
except RuntimeError as e:
    if "CUBLAS_STATUS_EXECUTION_FAILED" in str(e):
        print("CUDA error: Out of memory or hardware issue. Trying to free up memory.")
        torch.cuda.empty_cache()
    else:
        raise e  # Re-raise the exception if it's not a CUDA error

# Save the model
trainer.save_model(f"{model_name}-ams-finetuned")

#full 1120 epochs 57m 22.2s

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss
10,0.8675
20,0.7033
30,0.7794
40,0.7686
50,0.6567
60,0.9825
70,0.8491
80,0.6555
90,0.6663
100,0.8663


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 

In [None]:
wandb.finish()