### For this project, we have decide to use a pretrained model NLLB-200 and finetune it to fit our project needs.
Model link: https://huggingface.co/facebook/nllb-200-distilled-600M

In [28]:
#Import packages
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, pipeline
import numpy
import evaluate
import pandas
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime

In [2]:
#Load model directly
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang = 'en', tgt_lang = 'zh')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

We will be using 2 datasets from Amazon to finetune the NLLB-200 model. The source dataset will be in English and the target dataset will be in Chinese.

In [3]:
#load the source dataset
en_data = pandas.read_csv('./Datasets/Train/en-US.csv')
zh_data = pandas.read_csv('./Datasets/Train/zh-CN.csv')

data = pandas.concat([en_data['Categories'], zh_data['Categories']], axis=1, keys=['en', 'zh'])
train_data, test_data = train_test_split(data)

dataset = DatasetDict({
                        'train': Dataset.from_pandas(train_data),
                        'test': Dataset.from_pandas(test_data)
                    })
dataset = dataset.remove_columns(["__index_level_0__"])
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 4070
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 1357
    })
})

We will be following the tutorials from Hugging Face to use PyTorch Trainer to finetune the pretrained model. https://huggingface.co/docs/transformers/en/training, https://huggingface.co/docs/transformers/tasks/translation

In [19]:
def tokenize_function(data):
    #return tokenizer(data['en'], data['zh'], truncation=True)
    padding = 'max_length'
    max_length = 100
    sources = [str(d) for d in data['en']]
    targets = [str(d) for d in data['zh']]
    inputs = tokenizer(sources, max_length=max_length, padding=padding, truncation=True)
    label = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)
    inputs['labels'] = label['input_ids']
    return inputs


In [20]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/4070 [00:00<?, ? examples/s]

Map:   0%|          | 0/1357 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4070
    })
    test: Dataset({
        features: ['en', 'zh', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1357
    })
})

In [21]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = numpy.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

In [22]:
training_args = TrainingArguments(
                    output_dir="Trained Model", 
                    evaluation_strategy="epoch"
                    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [23]:
trainer.train()

  0%|          | 0/1527 [00:00<?, ?it/s]

Checkpoint destination directory Trained Model/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 200}


{'loss': 0.224, 'grad_norm': 0.6391696929931641, 'learning_rate': 3.362802881466929e-05, 'epoch': 0.98}


  0%|          | 0/170 [00:00<?, ?it/s]

{'eval_loss': 0.2343258112668991, 'eval_runtime': 52.3143, 'eval_samples_per_second': 25.939, 'eval_steps_per_second': 3.25, 'epoch': 1.0}


Checkpoint destination directory Trained Model/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 200}


{'loss': 0.1932, 'grad_norm': 0.40780875086784363, 'learning_rate': 1.7256057629338573e-05, 'epoch': 1.96}


  0%|          | 0/170 [00:00<?, ?it/s]

{'eval_loss': 0.22028295695781708, 'eval_runtime': 53.2208, 'eval_samples_per_second': 25.498, 'eval_steps_per_second': 3.194, 'epoch': 2.0}


Checkpoint destination directory Trained Model/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 200}


{'loss': 0.1909, 'grad_norm': 0.43432778120040894, 'learning_rate': 8.840864440078585e-07, 'epoch': 2.95}


  0%|          | 0/170 [00:00<?, ?it/s]

{'eval_loss': 0.21806859970092773, 'eval_runtime': 52.4701, 'eval_samples_per_second': 25.862, 'eval_steps_per_second': 3.24, 'epoch': 3.0}
{'train_runtime': 3105.3292, 'train_samples_per_second': 3.932, 'train_steps_per_second': 0.492, 'train_loss': 0.20297973766464303, 'epoch': 3.0}


TrainOutput(global_step=1527, training_loss=0.20297973766464303, metrics={'train_runtime': 3105.3292, 'train_samples_per_second': 3.932, 'train_steps_per_second': 0.492, 'train_loss': 0.20297973766464303, 'epoch': 3.0})

Once we finished training our model, it is saved to the Trained Model folder. However, due to the size limit of GitHub, the final version of the FineTuned Model is pushed to a Hugging Face repo so that we can easily reference it for further usage.

In [24]:
trainer.save_model('./Trained Model/FineTuned Model')

Non-default generation parameters: {'max_length': 200}


The next step is to test our fine tune model to make sure everything is working correctly.

In [25]:
translator = pipeline("translation", model="duongy18418/E-commerce_Translation_Model")

config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

In [26]:
ecommerce_dataset = pandas.read_csv("./Datasets/Test/Amazon_Ecommerce_Data_2020.csv", usecols=["Category"], nrows=100)
ecommerce_dataset = ecommerce_dataset.dropna()
en_list = ecommerce_dataset['Category'].tolist()
zh_list = []

In [27]:
for i in range(len(en_list)):
    zh_list.append(translator(en_list[i], src_lang='en', tgt_lang='zh'))

In [30]:
ecommerce_dataset = pandas.DataFrame(list(zip(en_list, zh_list)), columns=['Category-en', 'Category-zh'])
print(ecommerce_dataset.to_markdown())

|    | Category-en                                                                                                                     | Category-zh                                                                                                                     |
|---:|:--------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------|
|  0 | Sports Outdoors Outdoor Recreation Skates, Skateboards Scooters Skateboarding Standard Skateboards Longboards Longboards        | [{'translation_text': 'zheng/公仔 玩具 玩具车船飞机 玩具车船飞机'}]                                                             |
|  1 | Toys Games Learning Education Science Kits Toys                                                                                 | [{'translation_text': 'zheng/公仔 玩具 玩具车船飞机 玩具车船飞机'}]                           

In [32]:
currentDateTime = datetime.now().strftime("%m-%d-%Y %H-%M-%S %p")
ecommerce_dataset.to_csv(f'./Results/result {currentDateTime}.csv', index=False)