In [1]:
!pip install -U evaluate



In [2]:
from datasets import load_dataset, DatasetDict

# Load english-korean sentence pairs
dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-ko-en", trust_remote_code=True)
#If you plan to use english-french sentence pairs, use the following
# dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-fr-en")
#If you plan to use english-chinese sentence pairs, use the following
# dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-zh-en")
#If you plan to use english-vietnamese sentence pairs, then use the following
# dataset = load_dataset("Angelectronic/IWSLT15_English_Vietnamese")

## Since the number of training data is too large, it will take a long time to train. So, let's just use a subset of training data
# You can use any number instead of 5000. But all you have to do is, achieve higher score than 0.1 BLEU score.
dataset['train'] = dataset['train'].select(range(5000))

# Possible language pairs
#'iwslt2017-en-it', 'iwslt2017-en-nl', 'iwslt2017-en-ro', 'iwslt2017-it-en', 'iwslt2017-it-nl',
#'iwslt2017-it-ro', 'iwslt2017-nl-en', 'iwslt2017-nl-it', 'iwslt2017-nl-ro', 'iwslt2017-ro-en',
#'iwslt2017-ro-it', 'iwslt2017-ro-nl', 'iwslt2017-ar-en', 'iwslt2017-de-en', 'iwslt2017-en-ar',
#'iwslt2017-en-de', 'iwslt2017-en-fr', 'iwslt2017-en-ja', 'iwslt2017-en-ko', 'iwslt2017-en-zh',
#'iwslt2017-fr-en', 'iwslt2017-ja-en', 'iwslt2017-ko-en', 'iwslt2017-zh-en'

# If you plan to use the dataset that has only a train data, then execute the following, otherwise pass it
# Split into train (70%), validation (15%), and test (15%)
# train_test_split = dataset['train'].train_test_split(test_size=0.3, seed=42)
# validation_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# Combine splits into a new DatasetDict
# dataset = DatasetDict({
#     'train': train_test_split['train'],
#     'validation': validation_test_split['train'],
#     'test': validation_test_split['test']
# })

#Do not change the below
dataset['test'] = dataset['test'].select(range(100))
dataset['validation'] = dataset['validation'].select(range(100))
print(dataset)
for i in dataset['validation']['translation'][:10]:
    print(i)


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 100
    })
})
{'en': 'Last year I showed these two slides so that  demonstrate that the arctic ice cap,  which for most of the last three million years  has been the size of the lower 48 states,  has shrunk by 40 percent.', 'ko': '작년에 이 두개의 슬라이드로 북극의 만년설이 지난 3백만년 동안 미국의 본토 48개주 크기였던 것이 40%나 줄었다는 것을 보여들였습니다.'}
{'en': "But this understates the seriousness of this particular problem  because it doesn't show the thickness of the ice.", 'ko': '하지만 이 사진은 문제를 축소시키고 있습니다. 사진으로는 빙하의 두께를 알수 없기 때문입니다.'}
{'en': 'The arctic ice cap is, in a sense,  the beating heart of the global climate system.', 'ko': '어떤 의미에서 북극 만년설은 지구 기후 시스템의 고동치는 심장입니다.'}
{'en': 'It expands in winter and contracts in summer.', 'ko': '빙산은 겨울에 늘어나고 여름에 줄어듭니다.'}
{'en': "

In [3]:
import os
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Define compute_metrics function for BLEU score
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = []
    decoded_labels = []
    for pred, label in zip(predictions, labels):
        decoded_pred = tokenizer.decode(pred, skip_special_tokens=True)
        decoded_label = tokenizer.decode(label, skip_special_tokens=True)
        decoded_preds.append(decoded_pred)
        decoded_labels.append(decoded_label)
    # Compute BLEU
    bleu = evaluate.load("bleu")
    result = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return {"bleu": result["bleu"]}

# Load pre-trained model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Define TranslationDataset class
class TranslationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]["translation"]
        inputs = self.tokenizer(
            item["ko"], max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        targets = self.tokenizer(
            item["en"], max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }

# Create datasets
train_dataset = TranslationDataset(dataset["train"], tokenizer)
validation_dataset = TranslationDataset(dataset["validation"], tokenizer)
test_dataset = TranslationDataset(dataset["test"], tokenizer)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir="./logs",
    save_total_limit=1,
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


# Save the trained model
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Bleu
1,0.3297,0.149462,0.31318
2,0.2198,0.13634,0.33899
3,0.1483,0.137163,0.346844


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/sentencepiece.bpe.model',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')

In [7]:
test_results = trainer.predict(test_dataset)
print(f"BLEU score on test data is {test_results.metrics['test_bleu']}")

BLEU score on test data is 0.138526264826469
