### For this project, we have decide to use a pretrained model NLLB-200 and finetune it to fit our project needs.
Model link: https://huggingface.co/facebook/nllb-200-distilled-600M

In [1]:
#Import packages
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy
import evaluate
import pandas
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
#Load model directly
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", num_labels=2)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

  return self.fget.__get__(instance, owner)()


We will be using 2 datasets from Amazon to finetune the NLLB-200 model. The source dataset will be in English and the target dataset will be in Chinese.

In [3]:
#load the source dataset
en_data = pandas.read_json('en-US.jsonl', lines=True)
zh_data = pandas.read_json('zh-CN.jsonl', lines=True)

data = pandas.concat([en_data['utt'], zh_data['utt']], axis=1, keys=['en', 'zh'])
#data['translation'] = 'en: ' + data['en'] + ', zh: ' + data['zh']
#data = data.drop(['en', 'zh'], axis=1)
train_data, test_data = train_test_split(data)

dataset = DatasetDict({
                        'train': Dataset.from_pandas(train_data),
                        'test': Dataset.from_pandas(test_data)
                    })
dataset = dataset.remove_columns(["__index_level_0__"])
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 12390
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 4131
    })
})

In [4]:
def tokenize_function(data):
    return tokenizer(data['en'], data['zh'], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 12390/12390 [00:00<00:00, 65734.54 examples/s]
Map: 100%|██████████| 4131/4131 [00:00<00:00, 62290.30 examples/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask'],
        num_rows: 12390
    })
    test: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask'],
        num_rows: 4131
    })
})

We will be following the tutorial from Hugging Face to use PyTorch Trainer to finetune the pretrained model. https://huggingface.co/docs/transformers/en/training

In [5]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [6]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [7]:
trainer.train()

  0%|          | 0/4647 [00:00<?, ?it/s]

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds