In [None]:
import gc
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

In [None]:

# Hugging Face access token for the gated Llama repository. It is read from the
# HF_TOKEN environment variable so that no credential is stored in the
# notebook. When the variable is unset this is None, and the Hub client is
# left to use whatever login it already has.
secret = os.environ.get("HF_TOKEN")

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=secret)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA uses no official pad token

# Base model in half precision; device placement is left to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=secret,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Prepare LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the base model with the adapter and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [None]:
# Hyperparameters handed to every Trainer built in this notebook
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./llama3-translation",
    logging_dir="./logs",
    report_to="none",
    # batch shape
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    # step-based cadence for logging, evaluation and checkpointing
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
)


In [None]:
# For German
dataset = load_dataset("wmt14", "de-en", split="train")
val_dataset = load_dataset("wmt14", "de-en", split="validation")

# Upper bound, in tokens, on prompt + target (used by the preprocessing function)
max_length = 512

# Slice the rows first and read the column from the slice, so that only the
# requested rows are turned into Python objects. Indexing the column first
# reads the entire split before the slice is applied.
train_raw = dataset[:10000]['translation']
print(train_raw[0])
val_raw = val_dataset[:960]['translation']

# The raw pairs are plain Python lists now; drop both dataset handles.
del dataset
del val_dataset

def preprocess(example):
    """Turn one English/German pair into a causal-LM training example.

    Reads the notebook-level ``tokenizer`` and ``max_length``.

    Args:
        example: mapping with the English source under ``'en'`` and the German
            reference under ``'de'``.

    Returns:
        dict with three equally long lists, each at most ``max_length`` items:
        ``input_ids`` (prompt tokens, then target tokens, then EOS),
        ``attention_mask`` (all ones) and ``labels`` (-100 over the prompt so
        only the target and its EOS contribute to the loss).
    """
    input_text = f"Translate English to German:\n{example['en']}\n"
    target_text = example['de']

    input_ids = tokenizer(input_text, truncation=True, max_length=max_length, padding=False)["input_ids"]

    # The target is a continuation of the prompt, so it must not receive its
    # own special tokens (e.g. a second sequence-start token mid-sequence).
    target_ids = tokenizer(target_text, add_special_tokens=False, padding=False)["input_ids"]

    # Fit the target into whatever room the prompt left, always keeping the
    # last slot for EOS so the model is trained to stop after the translation.
    # Trimming here also avoids asking the tokenizer for max_length <= 0.
    room = max_length - len(input_ids)
    if room > 0:
        target_ids = target_ids[:room - 1] + [tokenizer.eos_token_id]
    else:
        target_ids = []

    input_ids_combined = input_ids + target_ids
    labels = [-100] * len(input_ids) + target_ids

    attention_mask = [1] * len(input_ids_combined)

    return {
        "input_ids": input_ids_combined,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize every raw pair into model-ready features
train_data = list(map(preprocess, train_raw))
val_data = list(map(preprocess, val_raw))


{'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}


In [None]:
# Data collator: batches the variable-length examples (padding enabled)
collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=collator
)

# Train
trainer.train()

# Save under the directory name that the zip cell archives ("10k", matching
# the 10,000 training pairs and the naming used for the other languages).
trainer.save_model("./llama3-1b-10k-german-translation-lora")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
200,1.5616,1.605539
400,1.4659,1.590243
600,1.4405,1.567389
800,1.3123,1.52182
1000,1.2931,1.508652
1200,1.2767,1.496396
1400,1.246,1.500401
1600,1.233,1.500201
1800,1.2313,1.49979



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot access gated repo for u

In [None]:
# Recursively (-r) zip the German adapter directory into a single archive.
# The directory named here must already exist in the working directory, i.e.
# it has to match the path that was passed to trainer.save_model.
!zip -r llama3-1b-10k-german-translation-lora.zip llama3-1b-10k-german-translation-lora

  adding: llama3-1b-1percent-german-translation-lora/ (stored 0%)
  adding: llama3-1b-1percent-german-translation-lora/training_args.bin (deflated 52%)
  adding: llama3-1b-1percent-german-translation-lora/special_tokens_map.json (deflated 64%)
  adding: llama3-1b-1percent-german-translation-lora/README.md (deflated 66%)
  adding: llama3-1b-1percent-german-translation-lora/adapter_config.json (deflated 54%)
  adding: llama3-1b-1percent-german-translation-lora/adapter_model.safetensors (deflated 8%)
  adding: llama3-1b-1percent-german-translation-lora/tokenizer.json (deflated 85%)
  adding: llama3-1b-1percent-german-translation-lora/tokenizer_config.json (deflated 96%)


In [None]:

# Release the German run before the next model is loaded. gc.collect() and
# torch.cuda.empty_cache() can only hand back memory that nothing references
# any more, so the notebook-level names that keep the model alive are dropped
# first. pop() is used instead of `del` so the cell stays safe to re-run.
for _stale in ("trainer", "collator", "model"):
    globals().pop(_stale, None)

gc.collect()
torch.cuda.empty_cache()

**Czech**

In [None]:
# Load model and tokenizer
# `userdata` is Colab's secrets helper. No other cell imports it, so it is
# imported here; otherwise this cell raises NameError on a fresh kernel.
from google.colab import userdata

secret = userdata.get('llama3accesssecret')
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=secret)
tokenizer.pad_token = tokenizer.eos_token  # LLaMA uses no official pad token

# Fresh base model in half precision for the Czech run
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=secret,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Prepare LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the base model with the adapter and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [None]:
def preprocess_czech(example):
    """Turn one English/Czech pair into a causal-LM training example.

    Reads the notebook-level ``tokenizer`` and ``max_length``.

    Args:
        example: mapping with the English source under ``'en'`` and the Czech
            reference under ``'cs'``.

    Returns:
        dict with three equally long lists, each at most ``max_length`` items:
        ``input_ids`` (prompt tokens, then target tokens, then EOS),
        ``attention_mask`` (all ones) and ``labels`` (-100 over the prompt so
        only the target and its EOS contribute to the loss).
    """
    input_text = f"Translate English to Czech: English \n{example['en']}\n Czech:"
    target_text = example['cs']

    input_ids = tokenizer(input_text, truncation=True, max_length=max_length, padding=False)["input_ids"]

    # The target is a continuation of the prompt, so it must not receive its
    # own special tokens (e.g. a second sequence-start token mid-sequence).
    target_ids = tokenizer(target_text, add_special_tokens=False, padding=False)["input_ids"]

    # Fit the target into whatever room the prompt left, always keeping the
    # last slot for EOS so the model is trained to stop after the translation.
    # Trimming here also avoids asking the tokenizer for max_length <= 0.
    room = max_length - len(input_ids)
    if room > 0:
        target_ids = target_ids[:room - 1] + [tokenizer.eos_token_id]
    else:
        target_ids = []

    input_ids_combined = input_ids + target_ids
    labels = [-100] * len(input_ids) + target_ids

    attention_mask = [1] * len(input_ids_combined)

    return {
        "input_ids": input_ids_combined,
        "attention_mask": attention_mask,
        "labels": labels
    }

# For Czech
dataset = load_dataset("wmt14", "cs-en", split="train")
val_dataset = load_dataset("wmt14", "cs-en", split="validation")

# Upper bound, in tokens, on prompt + target (read by preprocess_czech)
max_length = 512

# Slice the rows first and read the column from the slice, so that only the
# requested rows are turned into Python objects. Indexing the column first
# reads the entire split before the slice is applied.
train_raw = dataset[:10000]['translation']
print(train_raw[0])
val_raw = val_dataset[:960]['translation']
del dataset
del val_dataset

# Apply preprocessing
train_data = [preprocess_czech(example) for example in train_raw]
val_data = [preprocess_czech(example) for example in val_raw]

Resolving data files:   0%|          | 0/36 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/36 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

{'cs': 'Následný postup na základě usnesení Parlamentu: viz zápis', 'en': "Action taken on Parliament's resolutions: see Minutes"}


In [None]:
# Batch collation with padding enabled
collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Wire hyperparameters, model and Czech data together
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Run the fine-tuning loop
trainer.train()

# Write the result into the working directory
trainer.save_model("./llama3-1b-10k-czech-translation-lora")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
200,1.5474,1.911865
400,1.4585,1.889611
600,1.3743,1.843709
800,1.3146,1.830167
1000,1.2694,1.821721
1200,1.2685,1.809622
1400,1.2209,1.81959
1600,1.2011,1.817351
1800,1.2129,1.815116



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B.

Cannot access gated repo for u

In [None]:
# Recursively (-r) zip the Czech adapter directory into a single archive.
# The directory named here must already exist in the working directory, i.e.
# it has to match the path that was passed to trainer.save_model.
!zip -r llama3-1b-10k-czech-translation-lora.zip llama3-1b-10k-czech-translation-lora

  adding: llama3-1b-1percent-czech-translation-lora/ (stored 0%)
  adding: llama3-1b-1percent-czech-translation-lora/training_args.bin (deflated 52%)
  adding: llama3-1b-1percent-czech-translation-lora/special_tokens_map.json (deflated 64%)
  adding: llama3-1b-1percent-czech-translation-lora/README.md (deflated 66%)
  adding: llama3-1b-1percent-czech-translation-lora/adapter_config.json (deflated 54%)
  adding: llama3-1b-1percent-czech-translation-lora/adapter_model.safetensors (deflated 8%)
  adding: llama3-1b-1percent-czech-translation-lora/tokenizer.json (deflated 85%)
  adding: llama3-1b-1percent-czech-translation-lora/tokenizer_config.json (deflated 96%)


In [None]:

# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory. Relies on `gc` having been imported by an earlier cell.
# NOTE(review): `model`, `collator` and `trainer` are still bound when this
# runs, so the memory they hold presumably stays allocated — confirm whether
# they should be released first (the Arabic trainer cell below reads `model`).
gc.collect()
torch.cuda.empty_cache()

**Arabic**

In [None]:
# For Arabic
# NOTE(review): the Hub's wmt14 dataset presumably only ships cs-en, de-en,
# fr-en, hi-en and ru-en configurations — confirm that "ar-en" exists (or pick
# another English-Arabic corpus) before running this cell.
dataset = load_dataset("wmt14", "ar-en", split="train")
val_dataset = load_dataset("wmt14", "ar-en", split="validation")

# Upper bound, in tokens, on prompt + target (used by the preprocessing function)
max_length = 512

# Slice the rows first and read the column from the slice, so that only the
# requested rows are turned into Python objects. Indexing the column first
# reads the entire split before the slice is applied.
train_raw = dataset[:10000]['translation']
print(train_raw[0])
val_raw = val_dataset[:960]['translation']

# The raw pairs are plain Python lists now; drop both dataset handles.
del dataset
del val_dataset

def preprocess(example):
    """Turn one English/Arabic pair into a causal-LM training example.

    Reads the notebook-level ``tokenizer`` and ``max_length``. Note that this
    definition replaces any earlier ``preprocess`` in the kernel.

    Args:
        example: mapping with the English source under ``'en'`` and the Arabic
            reference under ``'ar'``.

    Returns:
        dict with three equally long lists, each at most ``max_length`` items:
        ``input_ids`` (prompt tokens, then target tokens, then EOS),
        ``attention_mask`` (all ones) and ``labels`` (-100 over the prompt so
        only the target and its EOS contribute to the loss).
    """
    input_text = f"Translate English to Arabic:\n{example['en']}\n"
    target_text = example['ar']

    input_ids = tokenizer(input_text, truncation=True, max_length=max_length, padding=False)["input_ids"]

    # The target is a continuation of the prompt, so it must not receive its
    # own special tokens (e.g. a second sequence-start token mid-sequence).
    target_ids = tokenizer(target_text, add_special_tokens=False, padding=False)["input_ids"]

    # Fit the target into whatever room the prompt left, always keeping the
    # last slot for EOS so the model is trained to stop after the translation.
    # Trimming here also avoids asking the tokenizer for max_length <= 0.
    room = max_length - len(input_ids)
    if room > 0:
        target_ids = target_ids[:room - 1] + [tokenizer.eos_token_id]
    else:
        target_ids = []

    input_ids_combined = input_ids + target_ids
    labels = [-100] * len(input_ids) + target_ids

    attention_mask = [1] * len(input_ids_combined)

    return {
        "input_ids": input_ids_combined,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize every raw pair into model-ready features
train_data = list(map(preprocess, train_raw))
val_data = list(map(preprocess, val_raw))


In [None]:
# Start the Arabic run from a fresh base model with a newly initialised LoRA
# adapter, the same way the German and Czech sections do. Without this step
# `model` is still the adapter that was just trained on Czech, and the Arabic
# run would silently continue from those weights.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=secret,
    device_map="auto",
    torch_dtype=torch.float16,
)
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Data collator
collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=collator
)

# Train
trainer.train()

# Save
trainer.save_model("./llama3-1b-10k-arabic-translation-lora")