In [1]:
!pip install --upgrade transformers datasets bitsandbytes peft huggingface_hub accelerate
!pip install transformers[torch]



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from huggingface_hub import notebook_login, login
import bitsandbytes as bnb
import os
import accelerate


In [3]:
# Verify versions
import transformers
import datasets
import peft
import accelerate
import huggingface_hub

print("Transformers version:", transformers.__version__)
print("Datasets version:", datasets.__version__)
print("BitsAndBytes version:", bnb.__version__)
print("PEFT version:", peft.__version__)
print("Accelerate version:", accelerate.__version__)
print("Hugging Face Hub version:", huggingface_hub.__version__)

Transformers version: 4.40.2
Datasets version: 2.19.1
BitsAndBytes version: 0.43.1
PEFT version: 0.10.0
Accelerate version: 0.30.1
Hugging Face Hub version: 0.23.0


In [4]:

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [5]:
#notebook_login()
#login()

In [6]:
# Load the token from environment variable
#hf_token = os.getenv("HUGGINGFACE_API_TOKEN")

In [7]:
# Ensure the quantized model is moved to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [8]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, load_in_4bit=False, torch_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=False, torch_dtype=torch.float16)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Optional Quantize the model
model = bnb.nn.quantization.quantize(model, weight_bits=4)  # 4-bit quantization

AttributeError: module 'bitsandbytes.nn' has no attribute 'quantization'

In [10]:
model = model.to(device)

In [None]:
# Write a Text Generation Function
def generate_text(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(inputs.input_ids, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Test the model with an actual question
prompt = "What are the benefits of using renewable energy sources?"
output = generate_text(prompt)
print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What are the benefits of using renewable energy sources? Renewable energy sources, such as solar, wind, and hydro power, offer a number of benefits over traditional fossil fuel-based energy sources. Some of the most significant advantages include:
  1. Sustainability: Renewable energy sources are sustainable and can be replenished naturally, unlike fossil fuels, which are finite and will eventually run out.
  2. Reduced greenhouse gas emissions: Renewable energy sources produce little to no greenhouse gas emissions, which helps to


In [11]:
# Load Dataset for Fine-Tuning
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')

In [12]:
# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

In [13]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [14]:
lora_config = LoraConfig(
    r=8,       # The rank of the low-rank matrices
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Target modules in the model for LoRA
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(model, lora_config)

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=True,  # Enable mixed precision training
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

Detected kernel version 4.14.343, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
# Clear GPU cache
torch.cuda.empty_cache()

In [18]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 21.96 GiB of which 35.06 MiB is free. Including non-PyTorch memory, this process has 21.92 GiB memory in use. Of the allocated memory 21.66 GiB is allocated by PyTorch, and 41.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)