### Required Imports

In [None]:
import os
from azureml.core import Workspace

import pandas as pd
from tokenizers import decoders, models, pre_tokenizers, processors, trainers, Tokenizer
from transformers import PreTrainedTokenizerFast

## Get the workspace and datastore

In [None]:
# Connect to the Azure Machine Learning workspace.
# Running locally (outside Azure)? Name the config file explicitly:
# workspace = Workspace.from_config("../aml_workspace_config.json")
# Running on an Azure compute? The argument-free call is enough:
# workspace = Workspace.from_config()
# The call below is the argument-free form, so it only works on an Azure compute; in any other environment pass the config path explicitly.
workspace = Workspace.from_config()
# Datastore registered as the workspace default; the data is read from it further down.
default_datastore = workspace.get_default_datastore()
default_datastore

## Get the dataset from the HuggingFace datasets library.

In [None]:
# # This is commented out because it is not realistic; it is here only for didactic purposes.
# # In your scenario the data will very likely come from a Blob location, where it was put by Azure Data Factory or another copying service.
# # However, for the sake of this workshop, if you are not on the cybersai-innovation workspace, you can use this dataset instead.

# from datasets import load_dataset

# dataset = load_dataset("imdb")

# dataset

## Realistically get the data from a Blob

In [None]:
# Closer to the real flow: some upstream process has already placed the data on an
# Azure Blob you know of, usually the one associated with the Azure Machine Learning workspace.
from azureml.core import Dataset

# Path of the CSV file relative to the datastore root
csv_relative_path = 'imdb/data/imdb_unsupervised.csv'
datastore_paths = [(default_datastore, csv_relative_path)]

# Build a tabular dataset from the delimited file
dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)
dataset

In [None]:
# Show up to 200 characters per column so the review text is readable in the preview
pd.options.display.max_colwidth = 200

# Materialize the Azure ML tabular dataset as a pandas DataFrame and preview it
df = dataset.to_pandas_dataframe()
df.head()

### Train the tokenizer

In [None]:
# Step 1 - choose the tokenization model (here: BPE)
# https://huggingface.co/docs/tokenizers/python/latest/components.html#models
bpe_model = models.BPE()
tokenizer = Tokenizer(bpe_model)

# Step 2 - choose the pre-tokenizer (here: byte-level, without a prefix space)
# https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#module-tokenizers.pre_tokenizers
byte_level_pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer = byte_level_pre_tokenizer

# Step 3 - try the pre-tokenizer on a sample string
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

In [None]:
#special tokens for RoBERTa model
special_tokens = [
                "<s>",
                "<pad>",
                "</s>",
                "<mask>"
            ]

In [None]:
# A vocabulary size of 30522 is the value from the original BERT config
# (roberta-base itself uses 50265); the value is kept unchanged here.
# https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#module-tokenizers.trainers
trainer = trainers.BpeTrainer(
    vocab_size=30522,
    special_tokens=special_tokens,
    # Seed the vocabulary with every byte-level symbol. This BPE model has no
    # unk token, so a symbol that never appeared in the training data would
    # otherwise be missing from the vocabulary and dropped when encoding.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# actually train the tokenizer
tokenizer.train_from_iterator(df['text'], trainer=trainer)

# Did you use the HuggingFace dataset? Use this instead:
# tokenizer.train_from_iterator(dataset["unsupervised"]["text"], trainer=trainer)

### Test the tokenizer

In [None]:

# Quick smoke test: encode a sample sentence, then show its ids and its tokens
sample_text = "Let's test this tokenizer, or tokenization"
encoding = tokenizer.encode(sample_text)
for view in (encoding.ids, encoding.tokens):
    print(view)

In [None]:
# Byte-level decoder, used when converting ids back to text
tokenizer.decoder = decoders.ByteLevel()

# Byte-level post-processor; trim_offsets=False leaves the produced offsets untrimmed
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

# Take the first review to watch the post-processor at work
sentence = df['text'][0]
# Did you use the HuggingFace dataset? Use this instead:
# sentence = dataset["unsupervised"]["text"][0]

print("sentence: ", sentence)
encoding = tokenizer.encode(sentence)
print("encoding ids", encoding.ids)
print("encoding offsets", encoding.offsets)

# Use the offsets of the fifth token to slice the matching text out of the sentence
token_span = encoding.offsets[4]
start, end = token_span
print("start: ", start)
print("end: ", end)
sentence[start:end]

In [None]:
# Round trip: print the ids of the last encoding, then decode them back to text
token_ids = encoding.ids
print(token_ids)
tokenizer.decode(token_ids)

In [None]:
# Export step: wrap the trained tokenizer in a fast tokenizer so it can be used
# from the transformers library. The special tokens are passed by role.
roberta_special_tokens = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
}
wrapped_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **roberta_special_tokens)

# Save the wrapped tokenizer to disk
wrapped_tokenizer.save_pretrained("../src/tokenizers/imdb_tokenizer")

In [None]:
# Display the learned vocabulary of the wrapped tokenizer.
# It is worth a look: you can find all kinds of interesting things in there.
wrapped_tokenizer.vocab

### But what if you want to train a different type of tokenizer? For example, a Word Level tokenizer

In [None]:
# Pick a model for the tokenizer: WordLevel, with "[UNK]" as the unknown token
word_tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
# Pick a pre-tokenizer. BertPreTokenizer takes no constructor arguments
# (add_prefix_space is a ByteLevel option), so it is created without any.
word_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
# Test the pre-tokenizer
word_tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")


# Special tokens in the BERT naming style; "[UNK]" must match the model's unk_token above
word_special_tokens = [
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

# 30522 size comes from original BERT config
word_trainer = trainers.WordLevelTrainer(vocab_size=30522, special_tokens=word_special_tokens)

# actually train the tokenizer
word_tokenizer.train_from_iterator(df['text'], trainer=word_trainer)

# Did you use the HuggingFace dataset? Use this instead:
# word_tokenizer.train_from_iterator(dataset["unsupervised"]["text"], trainer=word_trainer)

# Encode a sample sentence and show its ids and tokens
word_encoding = word_tokenizer.encode("Let's test this tokenizer, or tokenization")
print(word_encoding.ids)
print(word_encoding.tokens)