In [1]:
from credentials import hf_token
from datasets import load_dataset, Dataset
from huggingface_hub import login
import itertools
import os
from peft import LoraConfig, get_peft_model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

login(token = hf_token)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

2025-03-09 14:17:40.282246: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741529860.300585    1193 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741529860.306155    1193 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-09 14:17:40.326476: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cuda


## Load filtered OSCAR Dataset

In [2]:
#-----------------------------------------------------------------------------
# Load Monolingual Dataset from OSCAR-2301 DB
#-----------------------------------------------------------------------------

target_languages = ["en", "de", "ru", "is", "cs", "zh"]

def filter_oscar_dataset(target_languages: list, max_samples_per_language: int = 5000):
    """
    Filters and extracts a subset of texts from the OSCAR dataset for specific languages.
    
    Parameters:
    - target_languages (list): List of language codes to filter.
    - max_samples_per_language (int): Maximum number of samples per language (default: 5000).
    
    Returns:
    - Dataset: A Hugging Face Dataset containing extracted text samples.
    """
    collected_sentences = []

    for language in target_languages:
        oscar_dataset = load_dataset(
            "oscar-corpus/OSCAR-2301",
            language=language,
            streaming=True,
            split="train"
        )

        samples_to_extract = 1000 if language == 'en' else max_samples_per_language
        collected_sentences.extend(item['text'] for item in itertools.islice(oscar_dataset, samples_to_extract))

    return Dataset.from_dict({'text': collected_sentences})


In [3]:
monoLangDataset = filter_oscar_dataset(target_languages,3000)

## Finetune LLM with monoligual prompts

In [4]:
#-----------------------------------------------------------------------------
# Training parameters
#-----------------------------------------------------------------------------

batch_size = 2 ## For 16GB GPU (OSCAR DB)

lr = 2e-4

In [5]:
#-----------------------------------------------------------------------------
# QLoRA parameters
#-----------------------------------------------------------------------------

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [6]:
# -----------------------------------------------------------------------------
# FINETUNING: Fine-tune LLAMA3.2-3B with PEFT (LoRA) using SFT.
# -----------------------------------------------------------------------------

def finetune_model(dataset: Dataset,
                   new_model: str,
                   model_name: str = "meta-llama/Llama-3.2-3B",
                   num_train_epochs: int = 2,
                   quantization:str = "4bit"):
    """
    Fine-tunes the specified LLAMA3.2-3B model on the given training texts
    using supervised fine-tuning (SFT) with a parameter-efficient adapter (LoRA).
    This function is compatible with other Llama-3.2 model with different size and also with
    other CausalLM models.

    Args:
      dataset (Dataset): Dataset with .
      new_model (str): Name of finetuned model.
      model_name (str): Hugging Face model repository ID.
      num_train_epochs (int): Number of training epochs.

    Returns:
      The fine-tuned model and tokenizer.
    """
    # Load tokenizer and base model (using FP16 and auto device mapping)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    if quantization =="4bit":
      compute_dtype = getattr(torch, "float16")
      bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False
      )
      model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
        )
    elif quantization == None:
      model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device
        )
    else:
      raise ValueError("This quantization is not compatible with LORA finetuning")
      
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    

    # Configure LoRA parameters
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA adapter to the base model using get_peft_model
    model = get_peft_model(model, peft_config)


    # Set up training arguments.
    training_args = TrainingArguments(
        output_dir=f"./results/{new_model}",
        per_device_train_batch_size=batch_size,  # adjust based on available GPU memory
        gradient_accumulation_steps=1,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        weight_decay=0.001,
        fp16=False,
        logging_steps=100,
        save_steps=2000,
        group_by_length=True,
        report_to="tensorboard" 
    )

    # Create an SFTTrainer for supervised fine-tuning.
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer.
    trainer.model.save_pretrained(new_model)
    

In [7]:
finetune_model(dataset=monoLangDataset, new_model="monolingualData_finetuned-Llama3.2-3B", model_name="meta-llama/Llama-3.2-3B", num_train_epochs= 4, quantization="4bit")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Converting train dataset to ChatML:   0%|          | 0/16000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/16000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (140775 > 131072). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/16000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,2.4808
200,2.43
300,2.3599
400,2.4245
500,2.4131
600,2.3793
700,2.3691
800,2.3138
900,2.3726
1000,2.3878


## Finetune with parallel data

In [2]:
directions = ["cs-en", "de-en", "is-en", "ru-en", "zh-en"]

In [3]:
from datasets import concatenate_datasets

def translation_prompt_template(lang1, lang2, text):
    """Creates a formatted translation prompt."""
    return f"Translate the following text from {lang1} to {lang2}: \n{lang1}: {text} \n{lang2}: "

def format_bidirectional_prompts(batch):
    inputs = []
    outputs = []
    language_name = {"en": "English", "de": "German", "ru": "Russian", "is": "Islandic", "zh": "Chinese", "cs": "Czech"}
    for example in batch['translation']:  
        lang1, lang2 = list(example.keys()) 
        text1, text2 = example[lang1], example[lang2]  

        # Generate forward and reverse examples
        forward_prompt = translation_prompt_template(language_name[lang1], language_name[lang2], text1)
        reverse_prompt = translation_prompt_template(language_name[lang2], language_name[lang1], text2)

        # Add the corresponding outputs for each direction
        inputs.append(forward_prompt)
        outputs.append(text2 + ' <|end_of_text|>')  # Add EOS token 

        inputs.append(reverse_prompt)
        outputs.append(text1 + ' <|end_of_text|>')  

    return {
        'input': inputs,
        'output': outputs
    }   

def collect_parallel_data(directions: list, max_samples_per_direction: int = 1000):
    
    collected_data = []
    for direction in directions:
        Alma_Parallel_dataset = load_dataset(
            "haoranxu/ALMA-Human-Parallel",
            direction,
            split="train"
        )
        collected_data.append(Dataset.from_dict(Alma_Parallel_dataset[:max_samples_per_direction]))

    processed_datasets = [
        ds.map(format_bidirectional_prompts, remove_columns=["translation"], batched=True)
        for ds in collected_data
        ]

    return concatenate_datasets(processed_datasets)


In [4]:
Parallel_data = collect_parallel_data(directions, 1000)

README.md:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

(…)-00000-of-00001-3a60b130a713425b.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

(…)-00000-of-00001-d1f9a3fc339fbc84.parquet:   0%|          | 0.00/216k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12076 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1002 [00:00<?, ? examples/s]

(…)-00000-of-00001-39460826cd7ac756.parquet:   0%|          | 0.00/2.67M [00:00<?, ?B/s]

(…)-00000-of-00001-34198d3f975c1787.parquet:   0%|          | 0.00/214k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14211 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1002 [00:00<?, ? examples/s]

(…)-00000-of-00001-f71a989f63b28d68.parquet:   0%|          | 0.00/378k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2009 [00:00<?, ? examples/s]

(…)-00000-of-00001-3ba3fad04eea46f0.parquet:   0%|          | 0.00/3.06M [00:00<?, ?B/s]

(…)-00000-of-00001-e9c97fe731036b74.parquet:   0%|          | 0.00/252k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1002 [00:00<?, ? examples/s]

(…)-00000-of-00001-6bd744feceb30dbf.parquet:   0%|          | 0.00/3.06M [00:00<?, ?B/s]

(…)-00000-of-00001-d1cc83e30e3dcdb2.parquet:   0%|          | 0.00/196k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15406 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
def formatting_prompts_func(example):
    return example['input'] + " " + example['output']

In [6]:
from peft import PeftModel

model_fn_path = "./monolingualData_finetuned-Llama3.2-3B" 

# Load the base model
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B",device_map="auto",
                                                      quantization_config=bnb_config, trust_remote_code=True,)

# Load the LoRA model weights
ft_model = PeftModel.from_pretrained(base_model, model_fn_path, torch_dtype=torch.float16)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [7]:
# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [10]:
def LORA_finetune(model, dataset, new_model_name, num_epoch, format_func):
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA adapter to the base model using get_peft_model
    lora_model = get_peft_model(model, peft_config)

    # Set up training arguments.
    training_args = TrainingArguments(
        output_dir=f"./results/{new_model_name}",
        per_device_train_batch_size=6,  # adjust based on available GPU memory
        gradient_accumulation_steps=1,
        num_train_epochs=num_epoch,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=False,
        logging_steps=50,
        save_steps=500,
        group_by_length=True
        )
    
    # Create an SFTTrainer for supervised fine-tuning.
    trainer = SFTTrainer(
        model=lora_model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset,
        formatting_func=format_func
    )

    # Start training
    trainer.train()

    # Save the fine-tuned model and tokenizer.
    trainer.model.save_pretrained(new_model_name)

In [11]:
LORA_finetune(ft_model, Parallel_data, "parallelData_finetuned-Llama3.2-3B", 3, formatting_prompts_func)

Applying formatting function to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/10000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,1.9868
100,1.8644
150,1.8184
200,1.8354
250,1.7961
300,1.7683
350,1.7562
400,1.7729
450,1.7835
500,1.6989
