# 0. Preparation: install and import dependencies

In [None]:
%pip install transformers
%pip install datasets
%pip install peft
%pip install accelerate
%pip install sacrebleu
%pip install sentencepiece

In [184]:
from datasets import load_dataset
import json
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from sacrebleu import corpus_bleu

# 1. Load [Dataset A] and split it according to the designed ratio.

## 1.1 Load OPUS-100 dataset as [Dataset A]

In [185]:
# Build [Dataset A] from the German-French ("de-fr") configuration of OPUS-100:
#   1. take the 'test' split,
#   2. shuffle it with a fixed seed (123) so the selection is repeatable,
#   3. keep the first 1000 translation pairs of the shuffled split.
dataset_a = (
    load_dataset("Helsinki-NLP/opus-100", "de-fr")["test"]
    .shuffle(seed=123)
    .select(range(1000))
)
print(dataset_a)

Dataset({
    features: ['translation'],
    num_rows: 1000
})


## 1.2 Split the [Dataset A] in a ratio of 8:2

In [189]:
# Hold out 20% of [Dataset A] for evaluation; the fixed seed keeps the
# train/test membership identical across runs.
train_test_split = dataset_a.train_test_split(test_size=0.2, seed=123)
dataset_a_train, dataset_a_test = train_test_split['train'], train_test_split['test']

# Show both partitions, one summary per line.
print(dataset_a_train, dataset_a_test, sep="\n")

Dataset({
    features: ['translation'],
    num_rows: 800
})
Dataset({
    features: ['translation'],
    num_rows: 200
})


## 1.3 Save [Dataset A: Train] and [Dataset A: Test] in json format

In [193]:
# Materialise the 'translation' column of each split as a plain Python list
# so it can be serialised with the json module.
dataset_a_train_list = list(dataset_a_train['translation'])
dataset_a_test_list  = list(dataset_a_test['translation'])

# Create the output directory if needed. exist_ok=True replaces the
# check-then-create pattern, which can fail if the directory appears between
# the existence check and the makedirs call.
datasets_dir = './datasets'
os.makedirs(datasets_dir, exist_ok=True)
dataset_a_train_path = os.path.join(datasets_dir, "dataset_a_train.json")
dataset_a_test_path  = os.path.join(datasets_dir, "dataset_a_test.json")

# Write both splits with identical settings: UTF-8 on disk and
# ensure_ascii=False so non-ASCII characters are stored verbatim rather than
# as \uXXXX escapes; indent=4 keeps the files human-readable.
for _path, _records in (
    (dataset_a_train_path, dataset_a_train_list),
    (dataset_a_test_path, dataset_a_test_list),
):
    with open(_path, "w", encoding="utf-8") as f:
        json.dump(_records, f, ensure_ascii=False, indent=4)

Verify that the files were saved successfully

In [212]:
# Sanity check 1: report whether each JSON file is present on disk.
print("Does dataset_a_train.json exist? ", os.path.exists(dataset_a_train_path))
print("Does  dataset_a_test.json exist? ",  os.path.exists(dataset_a_test_path))

# Sanity check 2: read both files back (text mode is the default for open)
# and parse their contents as JSON.
with open(dataset_a_train_path, encoding="utf-8") as f:
    loaded_dataset_a_train = json.loads(f.read())

with open(dataset_a_test_path, encoding="utf-8") as f:
    loaded_dataset_a_test = json.loads(f.read())

# Show the first record of each reloaded split as a spot check.
print("First data in [Dataset A: Train]:\n", loaded_dataset_a_train[0])
print("First data in [Dataset A: Test]:\n", loaded_dataset_a_test[0])

Does dataset_a_train.json exist?  True
Does  dataset_a_test.json exist?  True
First data in [Dataset A: Train]:
 {'de': 'Die Frage ist, ob Ihr mutig genug seid, das Gleiche zu tun.', 'fr': "Il est temps de vous demander si c'est votre cas."}
First data in [Dataset A: Test]:
 {'de': 'Wenn du mich aufmuntern willst, dann bring das Gras mit, dass wir auf der Farmstation hinter der Wand deines Zimmers versteckt haben.', 'fr': "Si tu veux me réconforter, quand tu seras à la Station Agro, apporte moi l'herbe qu'on a planqué derrière le mur de notre chambre."}


# 2. Load your chosen pre-trained model [Model A].

# 3. Evaluate [Model A] on the test dataset [Dataset A: Test] using the chosen metric.

# 4. Fine-tune [Model A] on the training dataset [Dataset A: Train] to create [Model B].

# 5. Evaluate [Model B] on the test dataset [Dataset A: Test] using the chosen metric.

# 6. Use the designed prompt to generate a new synthesized dataset [Dataset B], twice the size of the training set [Dataset A: Train], using the selected larger model.

# 7. Fine-tune [Model A] on the synthesized dataset [Dataset B] to create [Model C].

# 8. Evaluate [Model C] on the test dataset [Dataset A: Test] using the chosen metric.

# 9. Combine [Dataset A: Train] and [Dataset B], shuffle them with suitable seeds, and create [Dataset C].

# 10. Fine-tune [Model A] on the combined dataset [Dataset C] to create [Model D].

# 11. Evaluate [Model D] on the test dataset [Dataset A: Test] using the chosen metric.

# 12. Plot the performance of all models using appropriate visualizations.