In [14]:
%pip install datasets protobuf sentencepiece accelerate peft --verbose

Using pip 25.1.1 from c:\Users\adminuser\Desktop\Code\.venv\lib\site-packages\pip (python 3.10)
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
#Ensure GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

Using device cuda


In [2]:
from datasets import load_dataset
data_cache_directory = "D:\DatasetCache"
#Step1: Load the dataset from HuggingFace
dataset_name = "chibbss/fitness-chat-prompt-completion-dataset"
dataset = load_dataset(dataset_name,split="train",cache_dir=data_cache_directory)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Step2: Load pretrained mistral7b model with trust_remote_core=True
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
model_cache_directory = "D:\ModelCache"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=model_cache_directory)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype = torch.float16 if device.type == "cuda" else torch.float32,
    trust_remote_code = True,
    cache_dir = model_cache_directory
).to(device)

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.77s/it]


In [4]:
#Disable cache to help wtih hidden state shape
model.config.use_cache = False
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-0

In [5]:
#SteP3: Define the LoRA configuration
from peft import LoraConfig
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [6]:
#Apply LoRA adaptation using PEFT
from peft import get_peft_model
model = get_peft_model(model,lora_config)
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [7]:
#Tokenize the dataset with proper prompt formatting
def tokenize_function(examples):
    texts = []
    #process each example in the batch
    for instr, out in zip(examples["instruction"], examples["output"]):
        #Format the instructions column items as prompts that Mistral expects
        text = f"~~[INST] {instr} [/INST] {out}~~"
        texts.append(text)

    #Tokenize the list of texts
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512) 

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["instruction","output"])

#print a sample tokenized prompt
print("Sample Tokenized prompt:")
print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))

Sample Tokenized prompt:
</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s

In [8]:
#Define data collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
#Define the training Arguments

from transformers import TrainingArguments, TrainerCallback

training_args = TrainingArguments(
    output_dir = "D:\FTModelOutput",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    num_train_epochs = 2,
    weight_decay = 0.01,
    save_total_limit = 1,
    logging_dir = "D:\FTModelOutput\logs",
    logging_steps = 100,
    eval_steps = 500,
    fp16 = True if torch.cuda.is_available() else False,
    optim = "adamw_torch",
    report_to = "none",
    gradient_accumulation_steps = 2
)
print(type(tokenized_dataset))
#Custom Callback for printing progress every 500 steps
class PrintProgressCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 500 == 0 and state.global_step > 0:
            print(f"Training step {state.global_step}/{state.max_steps} - Loss: {state.log_history[-1]['loss']:.4f}")


#Memory Saver Callback implementation
class SaveMemoryCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        torch.cuda.empty_cache()

<class 'datasets.arrow_dataset.Dataset'>


In [10]:
#Initialize Trainer

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[PrintProgressCallback(), SaveMemoryCallback()]
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [11]:
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  13837 MiB |  13837 MiB |  13844 MiB |   6656 KiB |
|       from large pool |  13824 MiB |  13824 MiB |  13824 MiB |      0 KiB |
|       from small pool |     13 MiB |     13 MiB |     20 MiB |   6656 KiB |
|---------------------------------------------------------------------------|
| Active memory         |  13837 MiB |  13837 MiB |  13844 MiB |   6656 KiB |
|       from large pool |  13824 MiB |  13824 MiB |  13824 MiB |      0 KiB |
|       from small pool |     13 MiB |     13 MiB |     20 MiB |   6656 KiB |
|---------------------------------------------------------------

In [12]:
#Train the model and save it

trainer.train()

#save thie model
trainer.save_model("D:\FTModelOutput\mistral_7b_gym_finetuned")
print("fine tuning complete")

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 15.86 GiB of which 5.94 MiB is free. Of the allocated memory 15.46 GiB is allocated by PyTorch, and 15.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)