### For this project, we have decide to use a pretrained model NLLB-200 and finetune it to fit our project needs.
Model link: https://huggingface.co/facebook/nllb-200-distilled-600M

In [2]:
#Import packages
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, pipeline
import numpy
import evaluate
import pandas
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime

In [3]:
#Load model directly
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang = 'en', tgt_lang = 'zh')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

We will be using 2 datasets from Amazon to finetune the NLLB-200 model. The source dataset will be in English and the target dataset will be in Chinese.

In [4]:
#load the source dataset
en_data = pandas.read_csv('./Datasets/Train/en-US.csv')
zh_data = pandas.read_csv('./Datasets/Train/zh-CN.csv')

data = pandas.concat([en_data['Categories'], zh_data['Categories']], axis=1, keys=['en', 'zh'])
train_data, test_data = train_test_split(data)

dataset = DatasetDict({
                        'train': Dataset.from_pandas(train_data),
                        'test': Dataset.from_pandas(test_data)
                    })
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 4070
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 1357
    })
})

We will be following the tutorials from Hugging Face to use PyTorch Trainer to finetune the pretrained model. https://huggingface.co/docs/transformers/en/training, https://huggingface.co/docs/transformers/tasks/translation

In [5]:
def tokenize_function(data):
    #return tokenizer(data['en'], data['zh'], truncation=True)
    padding = 'max_length'
    max_length = 300
    sources = [str(d) for d in data['en']]
    targets = [str(d) for d in data['zh']]
    inputs = tokenizer(sources, max_length=max_length, padding=padding, truncation=True)
    label = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)
    inputs['labels'] = label['input_ids']
    return inputs

In [6]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/4070 [00:00<?, ? examples/s]

Map:   0%|          | 0/1357 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4070
    })
    test: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1357
    })
})

In [13]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

In [14]:
training_args = TrainingArguments(
                    output_dir="Trained Model", 
                    evaluation_strategy="epoch"
                    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [15]:
trainer.train()

Non-default generation parameters: {'max_length': 200}


{'loss': 2.4492, 'learning_rate': 3.362802881466929e-05, 'epoch': 0.98}


                                                      
 33%|███▎      | 509/1527 [2:38:37<4:28:27, 15.82s/it]

{'eval_loss': 0.10732854902744293, 'eval_runtime': 927.1849, 'eval_samples_per_second': 1.464, 'eval_steps_per_second': 0.183, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 200}


{'loss': 0.099, 'learning_rate': 1.7256057629338573e-05, 'epoch': 1.96}


                                                       
 67%|██████▋   | 1018/1527 [5:16:25<2:13:51, 15.78s/it]

{'eval_loss': 0.08964341878890991, 'eval_runtime': 926.6748, 'eval_samples_per_second': 1.464, 'eval_steps_per_second': 0.183, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 200}


{'loss': 0.0862, 'learning_rate': 8.840864440078585e-07, 'epoch': 2.95}


                                                     
100%|██████████| 1527/1527 [7:54:16<00:00, 18.64s/it]
Non-default generation parameters: {'max_length': 200}


{'eval_loss': 0.08631503582000732, 'eval_runtime': 928.7818, 'eval_samples_per_second': 1.461, 'eval_steps_per_second': 0.183, 'epoch': 3.0}
{'train_runtime': 28456.2952, 'train_samples_per_second': 0.429, 'train_steps_per_second': 0.054, 'train_loss': 0.8641671183535533, 'epoch': 3.0}


Once we finished training our model, it is saved to the Trained Model folder. However, due to the size limit of GitHub, the final version of the FineTuned Model is pushed to a Hugging Face repo so that we can easily reference it for further usage.

In [24]:
trainer.save_model('./Trained Model/FineTuned Model')

Non-default generation parameters: {'max_length': 200}


The next step is to test our fine tune model to make sure everything is working correctly.

In [9]:
translator = pipeline("text2text-generation", model="duongy18418/E-commerce_Translation_Model")

In [11]:
ecommerce_dataset = pandas.read_csv("./Datasets/Test/Amazon_Ecommerce_Data_2020.csv", usecols=["Category"], nrows=100)
ecommerce_dataset = ecommerce_dataset.dropna()
en_list = ecommerce_dataset['Category'].tolist()
zh_list = []

for i in range(len(en_list)):
    zh_list.append(translator(en_list[i]))

In [12]:
ecommerce_dataset = pandas.DataFrame(list(zip(en_list, zh_list)), columns=['Category-en', 'Category-zh'])
print(ecommerce_dataset.to_markdown())

|    | Category-en                                                                                                                     | Category-zh                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
|---:|:--------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
currentDateTime = datetime.now().strftime("%m-%d-%Y %H-%M-%S %p")
ecommerce_dataset.to_csv(f'./Results/result {currentDateTime}.csv', index=False)