In [1]:
from credentials import hf_token
from datasets import load_dataset, Dataset
from huggingface_hub import login
import itertools
import os
from peft import PeftModel, LoraConfig, get_peft_model, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, DPOTrainer, DPOConfig

login(token = hf_token)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

  from .autonotebook import tqdm as notebook_tqdm
2025-03-18 01:04:12.948844: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 01:04:12.967846: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742259852.990915   27068 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742259852.997885   27068 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742259853.014936   27068 computation_placer.cc:177] computation placer already r

cuda


## Load filtered OSCAR Dataset

In [2]:
#-----------------------------------------------------------------------------
# Load Monolingual Dataset from OSCAR-2301 DB
#-----------------------------------------------------------------------------

target_languages = ["en", "de", "ru", "is", "cs", "zh"]

def filter_oscar_dataset(target_languages: list, max_samples_per_language: int = 5000):
    """
    Filters and extracts a subset of texts from the OSCAR dataset for specific languages.
    
    Parameters:
    - target_languages (list): List of language codes to filter.
    - max_samples_per_language (int): Maximum number of samples per language (default: 5000).
    
    Returns:
    - Dataset: A Hugging Face Dataset containing extracted text samples.
    """
    collected_sentences = []

    for language in target_languages:
        oscar_dataset = load_dataset(
            "oscar-corpus/OSCAR-2301",
            language=language,
            streaming=True,
            split="train"
        )

        samples_to_extract = 1000 if language == 'en' else max_samples_per_language
        collected_sentences.extend(item['text'] for item in itertools.islice(oscar_dataset, samples_to_extract))

    return Dataset.from_dict({'text': collected_sentences})


In [3]:
monoLangDataset = filter_oscar_dataset(target_languages,3000)

## Finetune LLM with monoligual prompts

In [4]:
#-----------------------------------------------------------------------------
# Training parameters
#-----------------------------------------------------------------------------

batch_size = 2 ## For 16GB GPU (OSCAR DB)

lr = 2e-4

In [5]:
#-----------------------------------------------------------------------------
# QLoRA parameters
#-----------------------------------------------------------------------------

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [6]:
# -----------------------------------------------------------------------------
# FINETUNING: Fine-tune LLAMA3.2-3B with PEFT (LoRA) using SFT.
# -----------------------------------------------------------------------------

def finetune_model(dataset: Dataset,
                   new_model: str,
                   model_name: str = "meta-llama/Llama-3.2-3B",
                   num_train_epochs: int = 2,
                   quantization:str = "4bit"):
    """
    Fine-tunes the specified LLAMA3.2-3B model on the given training texts
    using supervised fine-tuning (SFT) with a parameter-efficient adapter (LoRA).
    This function is compatible with other Llama-3.2 model with different size and also with
    other CausalLM models.

    Args:
      dataset (Dataset): Dataset with .
      new_model (str): Name of finetuned model.
      model_name (str): Hugging Face model repository ID.
      num_train_epochs (int): Number of training epochs.

    Returns:
      The fine-tuned model and tokenizer.
    """
    # Load tokenizer and base model (using FP16 and auto device mapping)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    if quantization =="4bit":
      compute_dtype = getattr(torch, "float16")
      bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False
      )
      model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
        )
    elif quantization == None:
      model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device
        )
    else:
      raise ValueError("This quantization is not compatible with LORA finetuning")
      
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    

    # Configure LoRA parameters
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA adapter to the base model using get_peft_model
    model = get_peft_model(model, peft_config)


    # Set up training arguments.
    training_args = TrainingArguments(
        output_dir=f"./results/{new_model}",
        per_device_train_batch_size=batch_size,  # adjust based on available GPU memory
        gradient_accumulation_steps=1,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        weight_decay=0.001,
        fp16=False,
        logging_steps=100,
        save_steps=2000,
        group_by_length=True,
        report_to="tensorboard" 
    )

    # Create an SFTTrainer for supervised fine-tuning.
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer.
    trainer.model.save_pretrained(new_model)
    

In [7]:
finetune_model(dataset=monoLangDataset, new_model="monolingualData_finetuned-Llama3.2-3B", model_name="meta-llama/Llama-3.2-3B", num_train_epochs= 4, quantization="4bit")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Converting train dataset to ChatML:   0%|          | 0/16000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/16000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (140775 > 131072). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/16000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,2.4808
200,2.43
300,2.3599
400,2.4245
500,2.4131
600,2.3793
700,2.3691
800,2.3138
900,2.3726
1000,2.3878


## Finetune with parallel data

In [2]:
directions = ["cs-en", "de-en", "is-en", "ru-en", "zh-en"]

In [3]:
from datasets import concatenate_datasets

def translation_prompt_template(lang1, lang2, text):
    """Creates a formatted translation prompt."""
    return f"Translate the following text from {lang1} to {lang2}: \n{lang1}: {text} \n{lang2}: "

def format_bidirectional_prompts(batch):
    inputs = []
    outputs = []
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Islandic", "zh": "Chinese", "cs": "Czech"}
    for example in batch['translation']:  
        lang1, lang2 = list(example.keys()) 
        text1, text2 = example[lang1], example[lang2]  

        # Generate forward and reverse examples
        forward_prompt = translation_prompt_template(language_name[lang1], language_name[lang2], text1)
        reverse_prompt = translation_prompt_template(language_name[lang2], language_name[lang1], text2)

        # Add the corresponding outputs for each direction
        inputs.append(forward_prompt)
        outputs.append(text2 + ' <|end_of_text|>')  # Add EOS token 

        inputs.append(reverse_prompt)
        outputs.append(text1 + ' <|end_of_text|>')  

    return {
        'input': inputs,
        'output': outputs
    }   

def collect_parallel_data(directions: list, max_samples_per_direction: int = 1000):
    
    collected_data = []
    for direction in directions:
        Alma_Parallel_dataset = load_dataset(
            "haoranxu/ALMA-Human-Parallel",
            direction,
            split="train"
        )
        collected_data.append(Dataset.from_dict(Alma_Parallel_dataset[:max_samples_per_direction]))

    processed_datasets = [
        ds.map(format_bidirectional_prompts, remove_columns=["translation"], batched=True)
        for ds in collected_data
        ]

    return concatenate_datasets(processed_datasets)


In [4]:
Parallel_data = collect_parallel_data(directions, 1000)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 64371.28 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 84039.03 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 71153.82 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 47628.45 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 71624.04 examples/s]


In [5]:
def formatting_prompts_func(example):
    return example['input'] + " " + example['output']

In [6]:
model_fn_path = "./monolingualData_finetuned-Llama3.2-3B" 

# Load the base model
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B",device_map="auto", quantization_config=bnb_config)

# Load the LoRA model weights
ft_model = PeftModel.from_pretrained(base_model, model_fn_path, torch_dtype=torch.float16)
# Merge LoRA weights into the base model
ft_model = ft_model.merge_and_unload()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.37s/it]


In [7]:
# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [8]:
def LORA_finetune(model, dataset, new_model_name, num_epoch, format_func):
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA adapter to the base model using get_peft_model
    lora_model = get_peft_model(model, peft_config)

    # Set up training arguments.
    training_args = TrainingArguments(
        output_dir=f"./results/{new_model_name}",
        per_device_train_batch_size=6,  # adjust based on available GPU memory
        gradient_accumulation_steps=1,
        num_train_epochs=num_epoch,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=False,
        logging_steps=100,
        save_steps=500,
        group_by_length=True
        )
    
    # Create an SFTTrainer for supervised fine-tuning.
    trainer = SFTTrainer(
        model=lora_model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset,
        formatting_func=format_func
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer.
    trainer.model.save_pretrained(new_model_name)

In [9]:
LORA_finetune(ft_model, Parallel_data, "parallelData_finetuned-Llama3.2-3B", 3, formatting_prompts_func)

Applying formatting function to train dataset: 100%|██████████| 10000/10000 [00:00<00:00, 17736.19 examples/s]
Converting train dataset to ChatML: 100%|██████████| 10000/10000 [00:00<00:00, 24979.49 examples/s]
Applying chat template to train dataset: 100%|██████████| 10000/10000 [00:00<00:00, 25340.61 examples/s]
Tokenizing train dataset: 100%|██████████| 10000/10000 [00:03<00:00, 2572.40 examples/s]
Truncating train dataset: 100%|██████████| 10000/10000 [00:01<00:00, 5130.79 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,1.9761
200,1.8671
300,1.8235
400,1.7979
500,1.7656
600,1.719
700,1.6819
800,1.6728
900,1.6551
1000,1.6677


## Finetune with DPO

In [2]:
directions = ["cs-en", "de-en", "is-en", "ru-en", "zh-en",
              "en-cs", "en-de", "en-is", "en-ru", "en-zh"]

In [3]:
from datasets import concatenate_datasets

def prepare_DPO_dataset(directions: list, max_samples_per_direction: int=500):
    Preference_Dataset = load_dataset(
            "haoranxu/X-ALMA-Preference",
            split="train"
        )
    ds_list = []
    for dir in directions:
        filtered_direction = Preference_Dataset.filter(lambda x: x["directions"] == dir).select(range(max_samples_per_direction))
        ds_list.append(filtered_direction)
    DPO_dataset = concatenate_datasets(ds_list)

    return DPO_dataset


In [4]:
DPO_dataset = prepare_DPO_dataset(directions, 500)

In [5]:
def return_prompt_and_responses(sample) :
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Islandic", "zh": "Chinese", "cs": "Czech"}
    source_lang , target_lang = sample["directions"].split("-")
    return {
         "prompt": (
            f"Translate the following text from {language_name[source_lang]} to {language_name[target_lang]}:\n"
            f"{language_name[source_lang]}: {sample['source']}\n"
            f"{language_name[target_lang]}: "
        ),
        "chosen": sample["chosen"]+ ' <|end_of_text|>',
        "rejected": sample["reject"]+ ' <|end_of_text|>',
    }

In [6]:
original_columns = DPO_dataset.column_names
DPO_dataset = DPO_dataset.map(return_prompt_and_responses,remove_columns=original_columns)

In [7]:
model_fn_path = "./results/parallelData_finetuned-Llama3.2-3B" 

# Load the base model
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B",device_map="auto", quantization_config=bnb_config)

# Load the LoRA model weights
ft_model = PeftModel.from_pretrained(base_model, model_fn_path, torch_dtype=torch.float16)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.39s/it]


In [8]:
def DPO_finetune(ft_model, dataset, new_model_name, num_epochs):
    # Enable gradients only for floating point parameters
    for param in ft_model.parameters():
        if param.dtype in [torch.float16, torch.bfloat16, torch.float32]:
            param.requires_grad = True

    # Enable gradient checkpointing for 4-bit quantization models
    ft_model.gradient_checkpointing_enable()

    # hyperparameters and training arguments
    training_args = DPOConfig(
        output_dir=f"./results/{new_model_name}",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=1e-5,
        weight_decay=0.001,
        num_train_epochs=num_epochs,
        logging_steps=100,
        fp16=False,
        save_steps=200,
        beta=0.1,
        loss_type="robust", 
        bf16=True 
        )

    trainer = DPOTrainer(
        model=ft_model,
        args=training_args,
        train_dataset=dataset,
        processing_class=tokenizer
    )

    # Train the model
    trainer.train()

    ft_model.save_pretrained(f"./results/{new_model_name}")
    # Save the model
    merged_model = ft_model.merge_and_unload()
    full_model_path = f"./results/{new_model_name}_full"
    merged_model.save_pretrained(full_model_path)


In [9]:
DPO_finetune(ft_model, DPO_dataset, "DPO_finetuned-Llama3.2-3B", 1)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
100,0.3392


RuntimeError: [enforce fail at inline_container.cc:626] . unexpected pos 44544 vs 44436