In [1]:
DATASET_PATH = './data/english_embeddings.json'

In [2]:
from datasets import Dataset
import json

with open(DATASET_PATH) as f:
    data = json.load(f)
    dataset = Dataset.from_list(data)
    dataset_dict = dataset.train_test_split(test_size=0.2)

In [3]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 221
    })
    test: Dataset({
        features: ['text'],
        num_rows: 56
    })
})

In [4]:
print(dataset_dict["train"][1]["text"])

AJP-3
3
3-26
Edition C Version 1 + UK national elements (Change 1)
h. Joint collection management board. J2 chairs the joint collection
management board to coordinate collection activities between
components, contributing nations, and complementary national agency
activity. The overall purpose of the joint collection management board
is to review, validate, de-conflict and prioritize all joint intelligence,
surveillance and reconnaissance (JISR) collection requirements and
assigned capabilities. The joint collection management board seeks to
prioritize, coordinate and synchronize the JISR activity between the joint
level and the subordinate formations (land, maritime, air, and special
operations forces components). At the joint level, subordinate formation
component collection management elements participate in the joint
collection management board. The board should include, but is not
limited to, representation from targeting, current operations, current
plans, future plans, electroni

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model_id = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load 4-bit quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)

# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [6]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
# Add LoRA adapters to model
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=
    [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    lora_dropout=0.05,
    bias="none",
    modules_to_save=["lm_head", "embed_tokens"],  # needed because we added new tokens to tokenizer/model
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 283115520 || all params: 4035186688 || trainable%: 7.016169062064471


In [10]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): MistralForCausalLM(
          (model): MistralModel(
            (embed_tokens): ModulesToSaveWrapper(
              (original_module): Embedding(32000, 4096)
              (modules_to_save): ModuleDict(
                (default): Embedding(32000, 4096)
              )
            )
            (layers): ModuleList(
              (0-31): 32 x MistralDecoderLayer(
                (self_attn): MistralAttention(
                  (q_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4096, out_features=8, bias=False)
                    )
                    (lora_B): Mo

In [13]:
import transformers
import os
tokenizer.pad_token = tokenizer.eos_token


def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )


dataset_tokenized = dataset_dict.map(
    tokenize,
    batched=True,
    num_proc=os.cpu_count(),  # multithreaded
    remove_columns=["text"]  # don't need this anymore, we have tokens from here on
)

Map (num_proc=10):   0%|          | 0/221 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/56 [00:00<?, ? examples/s]

In [14]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 221
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 56
    })
})

In [15]:
# define collate function - transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to single batch dictionary { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    tokenlist = [e["input_ids"] for e in elements]
    tokens_maxlen = max([len(t) for t in tokenlist])

    input_ids, labels, attention_masks = [], [], []
    for tokens in tokenlist:
        pad_len = tokens_maxlen - len(tokens)

        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content otherwise 0
        input_ids.append(tokens + [tokenizer.pad_token_id] * pad_len)
        labels.append(tokens + [-100] * pad_len)
        attention_masks.append([1] * len(tokens) + [0] * pad_len)

    batch = {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch

In [16]:
bs = 1  # batch size
ga_steps = 1  # gradient acc. steps
epochs = 5
steps_per_epoch = len(dataset_tokenized["train"]) // (bs * ga_steps)

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch,  # eval and save once per epoch  	
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    group_by_length=True,
    fp16=True,
    ddp_find_unused_parameters=False,
)

In [None]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=collate,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    args=args,
)

trainer.train()



Step,Training Loss,Validation Loss


