### For this project, we have decide to use a pretrained model NLLB-200 and finetune it to fit our project needs.
Model link: https://huggingface.co/facebook/nllb-200-distilled-600M

In [32]:
#Import packages
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, NllbTokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy
import evaluate
import pandas
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict

In [33]:
#Load model directly
tokenizer = NllbTokenizerFast.from_pretrained("facebook/nllb-200-distilled-600M", src_lang = 'en', tgt_lang = 'zh')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors='pt')

We will be using 2 datasets from Amazon to finetune the NLLB-200 model. The source dataset will be in English and the target dataset will be in Chinese.

In [34]:
#load the source dataset
en_data = pandas.read_json('en-US.jsonl', lines=True)
zh_data = pandas.read_json('zh-CN.jsonl', lines=True)

data = pandas.concat([en_data['utt'], zh_data['utt']], axis=1, keys=['en', 'zh'])
train_data, test_data = train_test_split(data)

dataset = DatasetDict({
                        'train': Dataset.from_pandas(train_data),
                        'test': Dataset.from_pandas(test_data)
                    })
dataset = dataset.remove_columns(["__index_level_0__"])
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 12390
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 4131
    })
})

In [35]:
def tokenize_function(data):
    return tokenizer(data['en'], data['zh'], truncation=True, padding="max_length", max_length=2000)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Map: 100%|██████████| 12390/12390 [00:03<00:00, 3224.80 examples/s]

[A
  0%|          | 0/1550 [06:18<?, ?it/s]

[A
[A
Map: 100%|██████████| 4131/4131 [00:01<00:00, 2750.37 examples/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask'],
        num_rows: 12390
    })
    test: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask'],
        num_rows: 4131
    })
})

We will be following the tutorial from Hugging Face to use PyTorch Trainer to finetune the pretrained model. https://huggingface.co/docs/transformers/en/training

In [36]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [43]:
training_args = TrainingArguments(
                    output_dir="test_trainer", 
                    evaluation_strategy="epoch",
                    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [45]:
trainer.train()

  0%|          | 0/4647 [00:17<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 16.54 GB, other allocations: 2.95 GB, max allowed: 18.13 GB). Tried to allocate 1.91 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).