### For this project, we have decide to use a pretrained model NLLB-200 and finetune it to fit our project needs.
Model link: https://huggingface.co/facebook/nllb-200-distilled-600M

In [2]:
#Import packages
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import numpy
import evaluate
import pandas
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict

In [3]:
#Load model directly
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang = 'en', tgt_lang = 'zh')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

We will be using 2 datasets from Amazon to finetune the NLLB-200 model. The source dataset will be in English and the target dataset will be in Chinese.

In [4]:
#load the source dataset
en_data = pandas.read_json('en-US.jsonl', lines=True)
zh_data = pandas.read_json('zh-CN.jsonl', lines=True)

data = pandas.concat([en_data['utt'], zh_data['utt']], axis=1, keys=['en', 'zh'])
train_data, test_data = train_test_split(data)
train_data, dev_data = train_test_split(train_data)

dataset = DatasetDict({
                        'train': Dataset.from_pandas(train_data),
                        'test': Dataset.from_pandas(test_data),
                        'dev': Dataset.from_pandas(dev_data)
                    })
dataset = dataset.remove_columns(["__index_level_0__"])
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 5061
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 2250
    })
    dev: Dataset({
        features: ['en', 'zh'],
        num_rows: 1688
    })
})

In [5]:
def tokenize_function(data):
    #return tokenizer(data['en'], data['zh'], truncation=True)
    padding = 'max_length'
    max_length = 100
    sources = [d for d in data['en']]
    targets = [d for d in data['zh']]
    inputs = tokenizer(sources, max_length=max_length, padding=padding, truncation=True)
    label = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)
    inputs['labels'] = label['input_ids']
    return inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 5061/5061 [00:00<00:00, 12358.18 examples/s]
Map: 100%|██████████| 2250/2250 [00:00<00:00, 20734.64 examples/s]
Map: 100%|██████████| 1688/1688 [00:00<00:00, 19973.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5061
    })
    test: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2250
    })
    dev: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1688
    })
})

We will be following the tutorial from Hugging Face to use PyTorch Trainer to finetune the pretrained model. https://huggingface.co/docs/transformers/en/training

In [6]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

In [7]:
training_args = TrainingArguments(
                    output_dir="Trained Model", 
                    evaluation_strategy="epoch"
                    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)


In [9]:
trainer.train()

Non-default generation parameters: {'max_length': 200}


{'loss': 2.4285, 'learning_rate': 3.683517640863612e-05, 'epoch': 0.79}


                                                      
 33%|███▎      | 633/1899 [1:21:44<2:13:30,  6.33s/it]

{'eval_loss': 0.11612840741872787, 'eval_runtime': 466.6431, 'eval_samples_per_second': 4.822, 'eval_steps_per_second': 0.604, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 200}


{'loss': 0.1067, 'learning_rate': 2.367035281727225e-05, 'epoch': 1.58}


                                                       
 67%|██████▋   | 1266/1899 [2:41:07<1:04:42,  6.13s/it]

{'eval_loss': 0.10672058910131454, 'eval_runtime': 466.9341, 'eval_samples_per_second': 4.819, 'eval_steps_per_second': 0.604, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 200}


{'loss': 0.0856, 'learning_rate': 1.0505529225908373e-05, 'epoch': 2.37}


                                                     
100%|██████████| 1899/1899 [3:59:35<00:00,  7.57s/it]

{'eval_loss': 0.10698454082012177, 'eval_runtime': 470.4405, 'eval_samples_per_second': 4.783, 'eval_steps_per_second': 0.599, 'epoch': 3.0}
{'train_runtime': 14375.3035, 'train_samples_per_second': 1.056, 'train_steps_per_second': 0.132, 'train_loss': 0.7063732468623622, 'epoch': 3.0}





TrainOutput(global_step=1899, training_loss=0.7063732468623622, metrics={'train_runtime': 14375.3035, 'train_samples_per_second': 1.056, 'train_steps_per_second': 0.132, 'train_loss': 0.7063732468623622, 'epoch': 3.0})

In [10]:
trainer.save_model('./Trained Model/FineTuned Model')

Non-default generation parameters: {'max_length': 200}
