In [None]:
from credentials import hf_token
from datasets import load_dataset, Dataset
from huggingface_hub import login
import itertools
import os
from peft import LoraConfig, get_peft_model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

login(token = hf_token)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

cuda:0


## Load filtered OSCAR Dataset

In [5]:
#-----------------------------------------------------------------------------
# Load Monolingual Dataset from OSCAR-2301 DB
#-----------------------------------------------------------------------------

target_languages = ["en", "de", "ru", "is", "cs", "zh"]

def filter_oscar_dataset(target_languages: list, max_samples_per_language: int = 1000):
    """
    Filters and extracts a subset of texts from the OSCAR dataset for specific languages.
    
    Parameters:
    - target_languages (list): List of language codes to filter.
    - max_samples_per_language (int): Maximum number of samples per language (default: 1000).
    
    Returns:
    - Dataset: A Hugging Face Dataset containing extracted text samples.
    """
    collected_sentences = []

    for language in target_languages:
        oscar_dataset = load_dataset(
            "oscar-corpus/OSCAR-2301",
            language=language,
            streaming=True,
            split="train"
        )

        samples_to_extract = 100 if language == 'en' else max_samples_per_language
        collected_sentences.extend(item['text'] for item in itertools.islice(oscar_dataset, samples_to_extract))

    return Dataset.from_dict({'text': collected_sentences})


In [6]:
monoLangDataset = filter_oscar_dataset(target_languages,1000)

README.md:   0%|          | 0.00/37.4k [00:00<?, ?B/s]

OSCAR-2301.py:   0%|          | 0.00/22.3k [00:00<?, ?B/s]

The repository for oscar-corpus/OSCAR-2301 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/oscar-corpus/OSCAR-2301.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


## Finetune LLM

In [7]:
#-----------------------------------------------------------------------------
# Training parameters
#-----------------------------------------------------------------------------

batch_size = 2 ## For 16GB GPU (OSCAR DB)

lr = 2e-4

In [8]:
#-----------------------------------------------------------------------------
# QLoRA parameters
#-----------------------------------------------------------------------------

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [None]:
# -----------------------------------------------------------------------------
# FINETUNING: Fine-tune LLAMA3.2-3B with PEFT (LoRA) using SFT.
# -----------------------------------------------------------------------------

def finetune_model(dataset: Dataset,
                   new_model: str,
                   model_name: str = "meta-llama/Llama-3.2-3B",
                   num_train_epochs: int = 2,
                   quantization:str = "4bit"):
    """
    Fine-tunes the specified LLAMA3.2-3B model on the given training texts
    using supervised fine-tuning (SFT) with a parameter-efficient adapter (LoRA).
    This function is compatible with other Llama-3.2 model with different size and also with
    other CausalLM models.

    Args:
      dataset (Dataset): Dataset with .
      new_model (str): Name of finetuned model.
      model_name (str): Hugging Face model repository ID.
      num_train_epochs (int): Number of training epochs.

    Returns:
      The fine-tuned model and tokenizer.
    """
    # Load tokenizer and base model (using FP16 and auto device mapping)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    if quantization =="4bit":
      compute_dtype = getattr(torch, "float16")
      bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False
      )
      model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
        )
    elif quantization == None:
      model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device
        )
    else:
      raise ValueError("This quantization is not compatible with LORA finetuning")
      
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    

    # Configure LoRA parameters
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA adapter to the base model using get_peft_model
    model = get_peft_model(model, peft_config)


    # Set up training arguments.
    training_args = TrainingArguments(
        output_dir=f"./results/monolingualData_finetuned_models/{new_model}",
        per_device_train_batch_size=batch_size,  # adjust based on available GPU memory
        gradient_accumulation_steps=1,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        weight_decay=0.001,
        fp16=False,
        logging_steps=50,
        save_steps=100,
        group_by_length=True,
        report_to="tensorboard" 
    )

    # Create an SFTTrainer for supervised fine-tuning.
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer.
    trainer.model.save_pretrained(new_model)
    

In [10]:
finetune_model(dataset=monoLangDataset, new_model="monolingualData_finetuned-Llama3.2-3B", model_name="meta-llama/Llama-3.2-3B", num_train_epochs= 2, quantization="4bit")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Converting train dataset to ChatML:   0%|          | 0/5100 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/5100 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5100 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (154099 > 131072). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/5100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,2.4866
100,2.2375
150,2.3204
200,2.4214
250,2.4544
300,2.3535
350,2.2947
400,2.4263
450,2.3695
500,2.4258
