In [1]:
!pip install transformers[torch] datasets evaluate seqeval bitsandbytes accelerate peft trl
!pip install torch
!pip install --upgrade huggingface_hub
!pip install tqdm
!pip install sentencepiece
!pip install protobuf

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting transformers[torch]
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-

In [1]:
# Set hyperparameters
batch_size = 1
learning_rate = 2e-5
num_epochs = 4

In [2]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig
from huggingface_hub import login
import torch

login(token="FILL IN")
base_model_id = "llava-hf/llava-v1.6-34b-hf"#"llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(base_model_id)
response_start = "<|im_start|>assistant\n\n"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


# Download Dataset and Get Processed Prompt

In [3]:
from datasets import load_dataset, Dataset

train_dataset = load_dataset("ekolasky/EntityExtractionFromEmails", split="train")
val_dataset = load_dataset("ekolasky/EntityExtractionFromEmails", split="val")

In [4]:
from handler import PromptCreator
prompt_creator = PromptCreator(base_model_id=base_model_id)

def get_full_prompt(example):
    text = prompt_creator(entities=example['entities'])
    return {"text": text, "image": example["image"]}

train_dataset = train_dataset.map(get_full_prompt)
val_dataset = val_dataset.map(get_full_prompt)

In [5]:
# Check prompt format
print(train_dataset["text"][3])

<|im_start|>system
Answer the questions.<|im_end|><|im_start|>user
Return a list of all the names of companies and people in the email. If the full name of the person or company is not given, do not return the name. Also do not return any duplicate names. Each name should be categorized as either a company or a person. Here is an example of how I would like you to format your output:

- Name: Elon Musk | Category: person
- Name: SpaceX | Category: company
- Name: Gwen Shotwell | Category: person

Return your output in the same format. Do not include any text outside of the format. Obviously replace the names given by the names found in the email. If you cannot find any names return "NO NAMES FOUND". Return your output below:
<image><|im_end|><|im_start|>assistant

- Name: Ed Schneider | Category: person
- Name: Jeremy Schneider | Category: person
- Name: Webb Investment Network | Category: company


# Load Model with QLoRA

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = LlavaNextForConditionalGeneration.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config
)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [7]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()

    multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

config = LoraConfig(
    r=32, # Default 32
    lora_alpha=64, # Default 64
    target_modules=find_all_linear_names(model),
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# print(find_all_linear_names(model))

trainable params: 251199488 || all params: 18087655424 || trainable%: 1.3887896585352377


In [8]:
# for param in model.parameters():
#     if param.dtype in [torch.float16, torch.float32, torch.bfloat16, torch.complex64, torch.complex128]:
#         param.requires_grad = True

# Train with SFT

In [8]:
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer

class LLavaDataCollator:
    def __init__(self, processor, completion_only=True):
        self.processor = processor
        self.completion_only = completion_only

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            if ('text' not in example):
                print(example.keys())
                raise ValueError("Missing prompt text")
            if ('image' not in example):
                raise ValueError("Missing image")
                
            texts.append(example['text'] + self.processor.tokenizer.eos_token)
            images.append(example["image"])

        batch = self.processor(texts, images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()

        # Mask labels for instructions (allows for completion only SFT
        if (self.completion_only):
            for i, text in enumerate(texts):
                seperated_text = text.split(response_start)
                if (len(seperated_text) != 2):
                    raise ValueError("Tokenized input does not include response header")
                
                instruction_ids = self.processor.tokenizer(seperated_text[0]+response_start, add_special_tokens=False)["input_ids"]                
                labels[i, :len(instruction_ids)] = -100

        # Mask labels for padded tokens
        if self.processor.tokenizer.pad_token_id is not None:
            labels[labels == self.processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels

        return batch

data_collator = LLavaDataCollator(processor)

In [9]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling


args = TrainingArguments(
    output_dir="./llava_email_entity_extraction",
    overwrite_output_dir=False,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="no",
    # eval_steps=4000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=1,
    learning_rate=learning_rate,
    warmup_ratio=0.1,
    fp16=True,
    num_train_epochs=num_epochs,
    run_name="llava_email_entity_extraction",
    remove_unused_columns=False
)

print("Batch Size", args.train_batch_size)
print("Parallel Mode", args.parallel_mode)

trainer = SFTTrainer(
    model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    packing=False,
    max_seq_length=8000,
    data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True}
)



Batch Size 1
Parallel Mode ParallelMode.NOT_PARALLEL


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.265684
2,No log,0.266507
3,No log,0.255227
4,No log,0.256018


TrainOutput(global_step=52, training_loss=0.1857000864469088, metrics={'train_runtime': 266.4004, 'train_samples_per_second': 0.195, 'train_steps_per_second': 0.195, 'total_flos': 2202768750993408.0, 'train_loss': 0.1857000864469088, 'epoch': 4.0})

# Save to HuggingFace

In [11]:
model.push_to_hub("llava-v1.6-34b-email-entities")
processor.push_to_hub("llava-v1.6-34b-email-entities")



adapter_model.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ekolasky/llava-v1.6-34b-email-entities/commit/762b08c72af5eb5ce507352339c3f558a4aa56e5', commit_message='Upload processor', commit_description='', oid='762b08c72af5eb5ce507352339c3f558a4aa56e5', pr_url=None, pr_revision=None, pr_num=None)