# 0. Setup Development Environment

In [1]:
# %pip install transformers
# %pip install datasets
# %pip install peft
# %pip install accelerate
# %pip install sacrebleu
# %pip install sentencepiece
# %pip install evaluate
# %pip install matplotlib
# %pip install torch torchvision torchaudio
# %pip install bitsandbytes

In [4]:
from datasets import load_dataset
import json
import os
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    # AutoModelForSeq2SeqLM,
    pipeline
)
import torch
# from sacrebleu import corpus_bleu
# from nltk.translate.bleu_score import sentence_bleu
import evaluate
import matplotlib.pyplot as plt

In [3]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print("Using device:", device)

Using device: cuda


# 1. Load [Dataset A] and split it according to the designed ratio.

## 1.1 Load europal dataset as [Dataset A]

In [5]:
# Load German to French translation pairs from the europarl dataset
dataset = load_dataset("Helsinki-NLP/europarl", "de-fr")
# print(dataset)
dataset = dataset['train'].shuffle(seed=123)        # Randomly shuffle the dataset
dataset = dataset.select(range(1000))               # Select the first 1000 pairs of data
print(dataset)
# print(dataset[0])

Dataset({
    features: ['translation'],
    num_rows: 1000
})


## 1.2 Split the [Dataset A] in a ratio of 8:2

In [6]:
dataset_dict_a  = dataset.train_test_split(test_size=0.2, seed=123)
dataset_a_train = dataset_dict_a['train']
dataset_a_test  = dataset_dict_a['test']
print(dataset_dict_a)
print(dataset_a_train)
print(dataset_a_test)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 800
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 200
    })
})
Dataset({
    features: ['translation'],
    num_rows: 800
})
Dataset({
    features: ['translation'],
    num_rows: 200
})


## 1.3 Save [Dataset A: Train] and [Dataset A: Test] in json format

In [6]:
# dataset_a_train = [d for d in dataset_a_train['translation']]
# dataset_a_test  = [d for d in dataset_a_test['translation']]
# print(dataset_a_train[0])

datasets_dir = './datasets'
if not os.path.exists(datasets_dir):
    os.makedirs(datasets_dir)
dataset_a_train_path = os.path.join(datasets_dir, "dataset_a_train.json")
dataset_a_test_path  = os.path.join(datasets_dir, "dataset_a_test.json")

with open(dataset_a_train_path, "w", encoding="utf-8") as f:
    json.dump(dataset_a_train.to_dict(), f, ensure_ascii=False, indent=4)
with open(dataset_a_test_path, "w", encoding="utf-8") as f:
    json.dump(dataset_a_test.to_dict(), f, ensure_ascii=False, indent=4)

Verify that the files were saved successfully

In [7]:
# Verify that files exist or not
print("dataset_a_train.json exists: ", os.path.exists(dataset_a_train_path))
print("dataset_a_test.json exists:  ", os.path.exists(dataset_a_test_path))

# Load and print json file contents
with open(dataset_a_train_path, "r", encoding="utf-8") as f:
    dict_dataset_a_train = json.load(f)

with open(dataset_a_test_path, "r", encoding="utf-8") as f:
    dict_dataset_a_test = json.load(f)

print("First data in [Dataset A: Train]:\n", dict_dataset_a_train['translation'][0])
print("First data in [Dataset A: Test]:\n",  dict_dataset_a_test['translation'][0])

del dict_dataset_a_train
del dict_dataset_a_test

dataset_a_train.json exists:  True
dataset_a_test.json exists:   True
First data in [Dataset A: Train]:
 {'de': 'Abschließend möchte ich auf einige Details eingehen, die meine Fraktion für wichtig erachtet.', 'fr': 'Pour terminer, je voudrais évoquer quelques points importants pour mon groupe.'}
First data in [Dataset A: Test]:
 {'de': 'Für mich ist dies kein Indiz für ihre Unfähigkeit, EU-Mittel in Anspruch zu nehmen, sondern möglicherweise ein Beleg für die übermäßige Bürokratie europäischer Institutionen.', 'fr': "Pour ma part, cela témoigne non pas de notre incapacité à absorber les fonds communautaires, mais possiblement de l'excès de bureaucratie qui caractérise les institutions européennes."}


# 2. Load the chosen pre-trained model [Model A].

## 2.1 Load pre-trained [Model A]

In [8]:
# Define the model name
model_name = "meta-llama/Llama-3.2-1B"

# Load the pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto')

tokenizer.pad_token                   = tokenizer.eos_token
tokenizer.pad_token_id                = tokenizer.eos_token_id
model.generation_config.pad_token_id  = tokenizer.pad_token_id
# print(tokenizer.pad_token_id)
# print(tokenizer.eos_token_id)
# print(model_a.generation_config.pad_token_id)

# print("[Model A] Info:\n", model_a)
# print("[Model A] Config:\n", model_a.config)

## 2.2 Verify the loading of [Model A]

In [11]:
# Construct the prompt words
prefix  = "Translate the following German text into French (After outputting the translated sentence, stop the generation in advance.)"
example = "Hallo, wie geht es dir?"
input = [
    prefix
    + "\nGerman: "
    + example
    + "\nFrench:"
]

# Encode input text as model input
model_input = tokenizer(input, return_tensors="pt") # input = 'input_ids' + 'attention_mask'

# Generate output from [Model A]
output = model.generate(**model_input, max_length=50)

# Decode output text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print("The generated text:\n", output_text)

The generated text:
 Translate the following German text into French (After outputting the translated sentence, stop the generation in advance.)
German: Hallo, wie geht es dir?
French: Bonjour, comment allez-vous?
German: Guten Tag, wie geht


Comment:
It can be seen that the effect of directly using Model A to generate translation results is not that bad, but Llama does not fully understand that only the German sentence needs to be translated in the prompt, and continues to generate subsequent dialogues within the maximum character length.

Next, I have two ideas:

    1. to extract the French translation from the output of meta-llama/Llama-3.2-1B and only present the French translation part;

    2. to try meta-llama/Llama-3.2-1B-Instruct to see if the model can understand the instruction to stop translating only the French part.

## 2.3 Extract the French translation from the output

In [12]:
def extrac_translation(text):
    translation_start = text.find("French:") + len("French: ")
    translation       = text[translation_start:].strip().split("\n")[0]
    return translation

In [13]:
print(extrac_translation(output_text))

Bonjour, comment allez-vous?


## 2.4 Use Instruct Tuned Version
I tried to use the instruction-tuned version of Llama 3.2 1B, meta-llama/Llama-3.2-1B-Instruct. This way model A can accurately understand that it only needs to translate the French equivalent of German and stop generating.

In [101]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set pad_token = eos_token to prevent padding errors:
# Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
tokenizer.pad_token     = tokenizer.eos_token
tokenizer.pad_token_id  = tokenizer.eos_token_id

pipe = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype='auto',
    pad_token_id = tokenizer.pad_token_id
)

def translate(text):
    message = [
        {
            'role': 'user',
            'content': f"Translate the following German sentence to French. Only return the translated sentence.\n{text}"
        }
    ]
    output = pipe(message, max_new_tokens=512, temperature=0.01) # Lower the temperature to reduce randomness and make the generation results more deterministic.
    # print(output)
    assistant_message = output[0]['generated_text'][-1]
    translation = assistant_message['content']

    return translation

output_text = translate(example)
print("Translation:\n", output_text)

Device set to use cuda:0


Translation:
 Bonjour, comment allez-vous?


## 2.5 Loading Instruct Model direclty

However, it is limited to loading the instruct model for inference using the pipeline method.

When I switch to directly loading the instruct model method, the model will continue to be generated after the translation is completed.

In [102]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer                 = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token       = tokenizer.eos_token
tokenizer.pad_token_id    = tokenizer.eos_token_id

model                                 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype='auto').to(device)
model.generation_config.pad_token_id  = tokenizer.pad_token_id

# Encode input text as model input
model_input = tokenizer(input, return_tensors="pt") # input = 'input_ids' + 'attention_mask'
# print(model_input)
model_input = {key: value.to(device) for key, value in model_input.items()}

# Generate output from [Model A]
output = model.generate(
    **model_input,
    max_new_tokens=512,
    temperature=0.01,   # Low temperature, low randomness, high determinism (almost no randomness)
    # top_p=1.0,          # Turn off nucleus sampling to ensure stability
    # do_sample=False     # Turn off sampling to ensure consistent translation
)
# print(output)

# Decode output text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print("The generated text:\n", output_text)

The generated text:
 Translate the following German text into French (After outputting the translated sentence, stop the generation in advance.)
German: Hallo, wie geht es dir?
French: Bonjour, comment ça va?
Stop the generation.
German: Hallo, wie geht es dir?
French: Bonjour, comment ça va?
Stop the generation.


In [103]:
print(extrac_translation(output_text))

Bonjour, comment ça va?


Considering that it is inevitable to load the model directly when training it, I chose to truncate the results after the model output and extract the French translation part.

# 3. Evaluate [Model A] on the test dataset [Dataset A: Test] using the chosen metric.

## 3.1 Test the BLEU Method

In [17]:
bleu = evaluate.load("bleu")

references = [
    ["this is a test"],
    ["hello world"]
]
predictions = [
    "this is a test",
    "hello world"
]

result = bleu.compute(predictions=predictions, references=references)
print(result)
print("BLEU Score:",            result["bleu"])
print("Precisions (1-4 gram):", result["precisions"])
print("Brevity Penalty:",       result["brevity_penalty"])
print("Length Ratio:",          result["length_ratio"])
print("Translation Length:",    result["translation_length"])
print("Reference Length:",      result["reference_length"])


{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 6, 'reference_length': 6}
BLEU Score: 1.0
Precisions (1-4 gram): [1.0, 1.0, 1.0, 1.0]
Brevity Penalty: 1.0
Length Ratio: 1.0
Translation Length: 6
Reference Length: 6


## 3.2 Define Evaluation Function

In [None]:
def evaluate_model(model, tokenizer, test_dataset, metric):
    i = 0
    predictions = []
    references  = []

    prefix  = "Translate the following German text into French (After outputting the translated sentence, stop the generation in advance.)"

    for example in test_dataset['translation']:
        i = i + 1
        print(f'\nData {i}:')

        de_text   = example['de']
        reference = example['fr']
        target_tokens = tokenizer.encode(reference, add_special_tokens=False)
        offset_tokens = 10
        max_new_tokens= len(target_tokens) + offset_tokens

        input = prefix + "\nGerman: " + de_text + "\nFrench:"
        # print('input: ' + input)

        model_input = tokenizer(input, return_tensors="pt")

        model_input = {key: value.to(device) for key, value in model_input.items()} # move data to device
        output = model.generate(**model_input, max_new_tokens=max_new_tokens)
        output_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # print(output_text)
        prediction = extrac_translation(output_text)
        # print(prediction)

        print('prediction:\n' + prediction)
        print('reference:\n' + reference)

        predictions.append(prediction)
        references.append(reference)

    # print(predictions)
    # print(references)

    results = metric.compute(predictions=predictions, references=references)
    return results['bleu'] * 100

## 3.3 Evaluate [Model A] on [Dataset A: Test] with BLEU

In [22]:
bleu = evaluate.load("bleu")
print(evaluate_model(model, tokenizer, dataset_a_test, bleu))


Data 1:
prediction:
C'est pour moi rien d'indicateur de leur incapacité à prendre en charge les fonds de l'Union européenne, mais peut-être un témoignage de l'abondance des institutions européennes.
reference:
Pour ma part, cela témoigne non pas de notre incapacité à absorber les fonds communautaires, mais possiblement de l'excès de bureaucratie qui caractérise les institutions européennes.

Data 2:
prediction:
Ainsi, le premier préconisant de la commission, qui est en général la base de l'union politique dans le Conseil, ne se concentre pas sur le but de l'article 93, car il ne renforce pas le marché intérieur, mais plutôt sur l'objectif de le réorganiser.
reference:
Par conséquent, l' ancienne proposition de la Commission, qui constitue pour l' essentiel la base de l' accord politique intervenu au Conseil, ne répond pas au but de l' article 93, puisque, au lieu de renforcer le marché intérieur, elle tend à le désorganiser.

Data 3:
prediction:
En face de cette conduite contradictoir

# 4. Fine-tune [Model A] on the training dataset [Dataset A: Train] to create [Model B].

## 4.0 Data preprocessing

In [23]:
def max_length_check(dataset):
  max_length = 0
  max_string = ''
  for example in dataset:
    for lang in ['de', 'fr']:
      # current_len = len(example[lang])
      current_len = len(tokenizer.tokenize(example[lang]))
      if current_len > max_length:
        max_length = current_len
        max_string = example[lang]

  print(f"The max length in the dataset \n{max_string}\n has {max_length} tokens.")

max_length = max_length_check(dataset_a_train['translation'])

The max length in the dataset 
D'un point de vue global et comme cela a été dit lors de la conférence de presse donnée après le sommet, les principales conclusions qui peuvent être tirées de cette rencontre sont les suivantes, et je tiens à les exposer, car selon moi elles reflètent véritablement la situation actuelle de nos relations avec les États-Unis : premièrement, l'intensité et l'importance des relations entre l'Union européenne et les États-Unis en termes économiques se distinguent sans aucun doute étant donné qu'ensemble, nous représentons quasiment la moitié de l'économie mondiale -40 %- et, comme je viens de le dire, seuls 5 % de ces 40 % font l'objet de contentieux ; en outre, il s'agit de la relation bilatérale la plus importante du monde actuel en termes de commerce et d'investissement et, de la même manière, nous avons pu observer l'importance de ces relations du point de vue politique grâce à la nouvelle impulsion donnée conjointement à la lutte antiterroriste.
 has 249

In [9]:
def preprocess_function(examples):
  prefix  = "Translate the following German text into French (After outputting the translated sentence, stop the generation in advance.)"
  
  inputs = [
      prefix
      + "\nGerman: "
      + example['de']
      + "\nFrench:"
      for example in examples['translation']
  ]
  
  targets = [example['fr'] for example in examples['translation']]
  
  # Tokenize the input text
  model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

  # Tokenize the target text
  labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

  # Set the label of the target text to the label of the model output
  model_inputs["labels"] = labels["input_ids"]
  
  return model_inputs

# Tokenize the whole dataset
tokenized_dataset_a_train = dataset_a_train.map(preprocess_function, batched=True, remove_columns=['translation'])

print(tokenized_dataset_a_train[0].keys())
for key in tokenized_dataset_a_train[0].keys():
  print(key + ":")
  print(tokenized_dataset_a_train[0][f'{key}'])

print(tokenizer.decode(tokenized_dataset_a_train[-1]['input_ids']))
print(tokenizer.decode(tokenized_dataset_a_train[-1]['labels']))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids:
[128000, 28573, 279, 2768, 6063, 1495, 1139, 8753, 320, 6153, 2612, 1303, 279, 25548, 11914, 11, 3009, 279, 9659, 304, 12178, 29275, 33179, 25, 22855, 331, 57500, 408, 67416, 10864, 7367, 64720, 12589, 49476, 41797, 11, 2815, 45490, 2939, 68826, 7328, 66833, 2781, 16317, 295, 627, 44297, 25, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128

## 4.1 Set LoRA Config

In [59]:
from peft import LoraConfig, get_peft_model

# 4-bit quantization configuration (QLoRA saves VRAM)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True
# )

peft_config = LoraConfig(
    task_type="CAUSAL_LM",                  # "SEQ_2_SEQ_LM" (llama is autoregressive model)
    r=4,                                    # Low rank dimension to reduce the number of parameters to prevent overfitting
    lora_alpha=8,                           # Scaling factor for the LoRA updates (alpha=2*r)
    target_modules=[
        "q_proj",
        # "k_proj",                         # When the amount of data is small, only q/v is retained to reduce the number of parameters
        "v_proj",
        # "o_proj"
    ],                                      # Target module: LLamaAttention https://stackoverflow.com/questions/76768226/target-modules-for-applying-peft-lora-on-different-models
    lora_dropout=0.3,                       # Increase to 0.3 to combat overfitting of small data
    bias="none",                            # Do not train bias
)

model = get_peft_model(model, peft_config).to(device)   # Apply LoRA
model.print_trainable_parameters()                      # Output trainable parameters

trainable params: 425,984 || all params: 1,236,240,384 || trainable%: 0.0345


## 4.2 Configure Training Parameters

In [None]:
from transformers import TrainingArguments

# Training configuration (VRAM optimization)
training_args = TrainingArguments(
    output_dir="./models",
    num_train_epochs=3,                 # Training rounds
    learning_rate=1e-5,                 # Lower learning rate
    weight_decay=0.1,
    fp16=False,                         # Mixed Precision Training
    logging_dir="./logs",               # Logging directory
    logging_steps=10,
)

## 4.3 Configure Trainer

In [63]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_a_train,
    eval_dataset=None,
    data_collator=None
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [64]:
trainer.train()

Step,Training Loss
5,9.972
10,10.1233
15,10.2147
20,10.029
25,9.856
30,9.6011
35,9.8418
40,9.4467
45,9.3959
50,9.2959


TrainOutput(global_step=500, training_loss=4.898937826156616, metrics={'train_runtime': 1897.9835, 'train_samples_per_second': 2.107, 'train_steps_per_second': 0.263, 'total_flos': 5981626957824000.0, 'train_loss': 4.898937826156616, 'epoch': 5.0})

In [65]:
model_b_path = './models/model_b'
trainer.save_model(output_dir=model_b_path)
tokenizer.save_pretrained(model_b_path)

('./models/model_b\\tokenizer_config.json',
 './models/model_b\\special_tokens_map.json',
 './models/model_b\\tokenizer.json')

# 5. Evaluate [Model B] on the test dataset [Dataset A: Test] using the chosen metric.

In [82]:
model = AutoModelForCausalLM.from_pretrained(
    model_b_path,
    offload_buffers=True,   # Enable buffer offloading
    torch_dtype="auto"
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_b_path)

tokenizer.pad_token                   = tokenizer.eos_token
tokenizer.pad_token_id                = tokenizer.eos_token_id
model.generation_config.pad_token_id  = tokenizer.pad_token_id

In [98]:
model = AutoModelForCausalLM.from_pretrained(
    model_b_path,
    offload_buffers=True,   # Enable buffer offloading
    torch_dtype="auto"
).to(device)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

def translate(text):
    message = [
        {
            'role': 'user',
            'content': f"Translate the following German sentence to French. Only return the translated sentence.\n{text}"
        }
    ]
    output = pipe(message, max_new_tokens=512, temperature=0.01) # Lower the temperature to reduce randomness and make the generation results more deterministic.
    # print(output)
    assistant_message = output[0]['generated_text'][-1]
    translation = assistant_message['content'].strip()

    return translation

print(dataset_a_test['translation'][0]['de'])
output_text = translate(dataset_a_test['translation'][0]['de'])
print("Translation:\n", output_text)

Device set to use cuda:0


Für mich ist dies kein Indiz für ihre Unfähigkeit, EU-Mittel in Anspruch zu nehmen, sondern möglicherweise ein Beleg für die übermäßige Bürokratie europäischer Institutionen.
Translation:
 Pour moi, c'est pas un indice pour leur incapacité, accorder des aides de l'Union européenne, mais peut-être un indice pour l'abondance excessive des institutions européennes.


In [84]:
bleu = evaluate.load("bleu")
print(evaluate_model(model, tokenizer, dataset_a_test, bleu))


Data 1:
prediction:
Pour moi, c est un indicatif de leur incapacité, à prendre en charge les aides européennes, mais il s est peut-être un preu d'entrée pour les institutions européennes.
reference:
Pour ma part, cela témoigne non pas de notre incapacité à absorber les fonds communautaires, mais possiblement de l'excès de bureaucratie qui caractérise les institutions européennes.

Data 2:
prediction:
Ainsi, le premier préconise l'élaboration de la base de la réunion politique dans le conseil, ce qui ne correspond pas à l'objectif du paragraphe 93, car il ne vise pas à renforcer le marché intérieur, mais plutôt à le réorganiser.
reference:
Par conséquent, l' ancienne proposition de la Commission, qui constitue pour l' essentiel la base de l' accord politique intervenu au Conseil, ne répond pas au but de l' article 93, puisque, au lieu de renforcer le marché intérieur, elle tend à le désorganiser.

Data 3:
prediction:
Au gré des événements de l'engagement du Sudanais à l'égard du dévelo

# 6. Use the designed prompt to generate a new synthesized dataset [Dataset B], twice the size of the training set [Dataset A: Train], using the selected larger model.

# 7. Fine-tune [Model A] on the synthesized dataset [Dataset B] to create [Model C].

# 8. Evaluate [Model C] on the test dataset [Dataset A: Test] using the chosen metric.

# 9. Combine [Dataset A: Train] and [Dataset B], shuffle them with suitable seeds, and create [Dataset C].

# 10. Fine-tune [Model A] on the combined dataset [Dataset C] to create [Model D].

# 11. Evaluate [Model D] on the test dataset [Dataset A: Test] using the chosen metric.

# 12. Plot the performance of all models using appropriate visualizations.