In [1]:
!pip install transformers datasets accelerate huggingface_hub

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12=

In [4]:
import os
import json
from getpass import getpass
from huggingface_hub import login

In [5]:
token = getpass(prompt="enter huggingface token")
login(token=token)

enter huggingface token··········


In [6]:
dir_name = "sample_data"

if os.path.isfile(dir_name + '/topics.json'):
    with open(dir_name + '/topics.json') as t:
        topics = json.load(t)

    topics = topics.keys()
    tfiles = [dir_name + f"""/{"-".join(t.lower().split(' '))}.json""" for t in topics]
    tfiles = [tf for tf in tfiles if os.path.isfile(tf)]

    data = []
    for tf in tfiles:
        with open(tf) as t:
            data.extend(json.load(t))

    print(f'there are {len(data)} rows in the dataset')

there are 180 rows in the dataset


In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [8]:
dataset = load_dataset("json", data_files=tfiles)

In [9]:
def preprocess(row):
    row['input'] = f'<question>\n{row["question"]}\n</question>\n\n<documents>\n{row["documents"]}\n</documents>\n\n<answer>'
    return row

In [10]:
dataset = dataset.map(preprocess).shuffle(seed=42)

In [11]:
print(dataset['train'][0]['input'])

<question>
What techniques are most effective for urban gardening in small spaces?
</question>

<documents>
Document 1: Coffee varietals, distinct genetic variants within species like Coffea arabica and Coffea canephora (robusta), significantly influence flavor profiles and growing characteristics. Traditional arabica varietals include Typica (noted for sweet, clean flavors but low yield) and Bourbon (known for complex acidity and moderate body). Hybridization has produced numerous cultivars addressing specific agricultural challenges while maintaining quality attributes. Examples include Caturra, a natural Bourbon mutation with compact growth allowing denser planting; Catuai, combining Mundo Novo's vigor with Caturra's size; and Gesha (or Geisha), an Ethiopian-origin varietal that gained fame after winning the 2004 Panama Cup of Excellence with its distinctive floral and tea-like characteristics. Modern breeding programs increasingly focus on developing varieties combining disease res

In [12]:
model_id = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# model = model.to(torch.device('cpu'))

In [14]:
tokenizer.pad_token = tokenizer.eos_token

In [15]:
def tokenize_function(row):
    prompt = row["input"]
    full_text = prompt + "\n" + f'row["answer"]\n</answer>'

    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=256,
        padding="max_length",
    )

    prompt_tokenized = tokenizer(
        prompt,
        truncation=True,
        max_length=256,
        add_special_tokens=False  # Ensure no extra tokens are added
    )
    prompt_length = len(prompt_tokenized["input_ids"])

    # Copy the tokenized full text to create labels and mask out prompt tokens
    labels = tokenized["input_ids"].copy()
    labels[:prompt_length] = [-100] * prompt_length  # -100 tells PyTorch to ignore these tokens in loss computation

    tokenized["labels"] = labels
    return tokenized

In [16]:
tokenized_dataset = dataset.map(tokenize_function)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [17]:
from peft import LoraConfig, get_peft_model

# Define LoRA Configuration
lora_config = LoraConfig(
    r=4,               # Low-rank dimension
    lora_alpha=32,     # Scaling factor
    lora_dropout=0.1,  # Dropout for regularization
    bias="none",       # No bias training
    task_type="CAUSAL_LM"  # Task type: Causal Language Modeling
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 425,984 || all params: 1,236,240,384 || trainable%: 0.0345


In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./llama-1b-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    num_train_epochs=30,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to=None
#    use_cpu=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],  # or the appropriate split name
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,10.2197
20,6.4045
30,2.5093
40,1.5932
50,1.1251
60,0.8332


TrainOutput(global_step=60, training_loss=3.7808291276295978, metrics={'train_runtime': 428.091, 'train_samples_per_second': 12.614, 'train_steps_per_second': 0.14, 'total_flos': 6436230606618624.0, 'train_loss': 3.7808291276295978, 'epoch': 29.711111111111112})