### For this project, we have decide to use a pretrained model NLLB-200 and finetune it to fit our project needs.
Model link: https://huggingface.co/facebook/nllb-200-distilled-600M

In [1]:
#Import packages
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, pipeline
import numpy
import evaluate
import pandas
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict

In [2]:
#Load model directly
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang = 'en', tgt_lang = 'zh')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

We will be using 2 datasets from Amazon to finetune the NLLB-200 model. The source dataset will be in English and the target dataset will be in Chinese.

In [3]:
#load the source dataset
en_data = pandas.read_csv('en-US.csv')
zh_data = pandas.read_csv('zh-CN.csv')

data = pandas.concat([en_data['utt1'], zh_data['utt1']], axis=1, keys=['en', 'zh'])
train_data, test_data = train_test_split(data)

dataset = DatasetDict({
                        'train': Dataset.from_pandas(train_data),
                        'test': Dataset.from_pandas(test_data)
                    })
dataset = dataset.remove_columns(["__index_level_0__"])
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 4070
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 1357
    })
})

We will be following the tutorials from Hugging Face to use PyTorch Trainer to finetune the pretrained model. https://huggingface.co/docs/transformers/en/training, https://huggingface.co/docs/transformers/tasks/translation

In [4]:
def tokenize_function(data):
    #return tokenizer(data['en'], data['zh'], truncation=True)
    padding = 'max_length'
    max_length = 1000
    sources = [str(d) for d in data['en']]
    targets = [str(d) for d in data['zh']]
    inputs = tokenizer(sources, max_length=max_length, padding=padding, truncation=True)
    label = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)
    inputs['labels'] = label['input_ids']
    return inputs


In [5]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/4070 [00:00<?, ? examples/s]

Map:   0%|          | 0/1357 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4070
    })
    test: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1357
    })
})

In [6]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

In [7]:
training_args = TrainingArguments(
                    output_dir="Trained Model", 
                    evaluation_strategy="epoch"
                    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)


In [8]:
trainer.train()

  0%|          | 0/1527 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 44.85 GB, other allocations: 1.01 GB, max allowed: 45.90 GB). Tried to allocate 488.28 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

Once we finished training our model, it is saved to the Trained Model folder. However, due to the size limit of GitHub, the final version of the FineTuned Model is pushed to a Hugging Face repo so that we can easily reference it for further usage.

In [None]:
trainer.save_model('./Trained Model/FineTuned Model')

Non-default generation parameters: {'max_length': 200}


The next step is to test our fine tune model to make sure everything is working correctly.

In [None]:
translator = pipeline("translation", model="duongy18418/Multilingual_Model")

config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

In [None]:
ecommerce_dataset = pandas.read_csv("Amazon_Ecommerce_Data_2020.csv", usecols=["Category"], nrows=100)
ecommerce_dataset = ecommerce_dataset.dropna()
en_list = ecommerce_dataset['Category'].tolist()
zh_list = []

In [None]:
for i in range(len(en_list)):
    zh_list.append(translator(en_list[i], src_lang='en', tgt_lang='zh'))

In [None]:
ecommerce_dataset = pandas.DataFrame(list(zip(en_list, zh_list)), columns=['Category-en', 'Category-zh'])
print(ecommerce_dataset.to_markdown())

|    | Category-en                                                                                                                                         | Category-zh                                                                         |
|---:|:----------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------|
|  0 | Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards            | [{'translation_text': 'zheng 运动和户外娱乐滑板滑板滑板滑板滑板长板'}]              |
|  1 | Toys & Games | Learning & Education | Science Kits & Toys                                                                                           | [{'translation_text': 'zheng 玩具游戏学习教育科学套件玩具'}]                        |
|  2 | Toys & Games | Arts & Crafts | Craft Kits             