# Installing Dependencies

In order to get started, we will install the libraries in `requirements.txt` that we will use to load any pretrained huggingface model.

In [1]:
#!pip install -r requirements.txt

# Experiment Parameters

In [2]:
# --- Processing parameters ---------------------------------------------------
# Number of processes to use for the preprocessing (forwarded as `num_proc`).
preprocessing_num_workers = None
# Overwrite the cached training and evaluation sets.
overwrite_cache = True

# --- Training parameters -----------------------------------------------------
# For debugging purposes or quicker training, truncate the number of training
# examples to this value if set.
max_train_samples = None
# Same as above, for the evaluation examples.
max_eval_samples = None
# Name of the pretrained checkpoint to start from.
model_name = "gpt2"
# Directory the fine-tuned model is written to.
output_dir = "output"

# Load dataset

We will use a small dataset for testing purposes. 

The `banking77` dataset is composed of online banking queries annotated with their corresponding intents.

`banking77` dataset provides a very fine-grained set of intents in a banking domain. It comprises 13,083 customer service queries labeled with 77 intents. 

For our purpose, we will ignore the intent label and focus on generating texts from the banking domain.

In [3]:
from datasets import load_dataset

# Alternative corpus, kept for reference (disabled):
#raw_datasets = load_dataset("wikitext", "wikitext-103-raw-v1")
# Load the `banking77` dataset used throughout this notebook.
raw_datasets = load_dataset("banking77")

Using custom data configuration default
Reusing dataset banking77 (/home/azureuser/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b)


In [4]:
# Display the loaded dataset object (splits, features and row counts).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [5]:
import random

# Show one randomly chosen row from each split (train first, then test).
for split in ("train", "test"):
    index = random.sample(range(len(raw_datasets[split])), 1)
    print(raw_datasets[split][index])

OrderedDict([('text', ['For the disposable cards, what are the restrictions?']), ('label', [29])])
OrderedDict([('text', ['How can I transfer money to my account?']), ('label', [65])])


# Tokenize dataset using gpt2 tokenizer

In [6]:
from transformers import AutoConfig, AutoTokenizer

# Load the configuration and the tokenizer that belong to `model_name`.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Column that holds the raw sentences, and the list of all original columns
# of the training split.
text_column_name = "text"
column_names = raw_datasets["train"].column_names
            
def tokenize_function(examples):
    """Tokenize the text column of a batch with the notebook-level tokenizer.

    Args:
        examples: Batch mapping; only `examples[text_column_name]` is read.

    Returns:
        The object returned by calling `tokenizer` on those texts.
    """
    batch_texts = examples[text_column_name]
    return tokenizer(batch_texts)

# Tokenize every split batch-wise. The original columns listed in `column_names`
# are removed, so only the tokenizer outputs remain.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not overwrite_cache,  # skip the cache when overwrite_cache is True
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/11 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [7]:
# Pick one random training row and show it before and after tokenization.
train_size = len(raw_datasets["train"])
index = random.sample(range(train_size), 1)

for view in (raw_datasets, tokenized_datasets):
    print(view["train"][index])

OrderedDict([('text', ['how to get new card after atm eats it']), ('label', [18])])
OrderedDict([('attention_mask', [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), ('input_ids', [[4919, 284, 651, 649, 2657, 706, 379, 76, 25365, 340]])])


# Concatenate all texts from our dataset and generate chunks of block_size

In [8]:
# Chunk length for language modelling: the tokenizer's `model_max_length`,
# capped at 1024 because some tokenizers report a very large value.
block_size = min(tokenizer.model_max_length, 1024)

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples, chunk_size=None):
    """Concatenate all tokenized examples of a batch and re-split them into fixed-size chunks.

    Args:
        examples: Mapping of column name -> list of per-example lists (the
            batch handed over by `Dataset.map(batched=True)`). It must contain
            an "input_ids" column.
        chunk_size: Length of every produced chunk. When omitted, the
            notebook-level `block_size` is used.

    Returns:
        A dict with the same keys as `examples`, each holding the list of
        chunks for that column, plus a "labels" entry that is a (shallow) copy
        of the "input_ids" chunk list.
    """
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size

    # Flatten each column in linear time. `sum(list_of_lists, [])` copies the
    # growing accumulator for every example, which is quadratic and becomes
    # noticeable because a whole split is processed as a single batch.
    concatenated = {key: list(chain.from_iterable(column)) for key, column in examples.items()}
    total_length = len(next(iter(concatenated.values())))

    # Drop the small remainder so every chunk has exactly `size` tokens. A batch
    # shorter than one chunk is kept as a single, shorter chunk.
    if total_length >= size:
        total_length = (total_length // size) * size

    # Split by chunks of `size`.
    result = {
        key: [tokens[start : start + size] for start in range(0, total_length, size)]
        for key, tokens in concatenated.items()
    }
    # The labels are the inputs themselves; this function does not shift them.
    result["labels"] = result["input_ids"].copy()
    return result

# Apply `group_texts` to every split. Each `map` batch is grouped on its own,
# so the batch size is set to the number of training rows in order to
# concatenate as many rows as possible before chunking.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=len(tokenized_datasets["train"]), # if training size is very small, like in our case.
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,  # skip the cache when overwrite_cache is True
    desc=f"Grouping texts in chunks of {block_size}",
)

Grouping texts in chunks of 1024:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 1024:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# First four raw training rows, in order.
for row in range(4):
    print(raw_datasets["train"][row])

{'text': 'I am still waiting on my card?', 'label': 11}
{'text': "What can I do if my card still hasn't arrived after 2 weeks?", 'label': 11}
{'text': 'I have been waiting over a week. Is the card still coming?', 'label': 11}
{'text': 'Can I track my card while it is in the process of delivery?', 'label': 11}


In [10]:
# The same four rows after tokenization.
for row in range(4):
    print(tokenized_datasets["train"][row])

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [40, 716, 991, 4953, 319, 616, 2657, 30]}
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2061, 460, 314, 466, 611, 616, 2657, 991, 5818, 470, 5284, 706, 362, 2745, 30]}
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [40, 423, 587, 4953, 625, 257, 1285, 13, 1148, 262, 2657, 991, 2406, 30]}
{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [6090, 314, 2610, 616, 2657, 981, 340, 318, 287, 262, 1429, 286, 7585, 30]}


In [11]:
# First 40 token ids of the first grouped chunk: the rows printed above appear
# back to back, without separators.
print(lm_datasets["train"][0]['input_ids'][:40])

[40, 716, 991, 4953, 319, 616, 2657, 30, 2061, 460, 314, 466, 611, 616, 2657, 991, 5818, 470, 5284, 706, 362, 2745, 30, 40, 423, 587, 4953, 625, 257, 1285, 13, 1148, 262, 2657, 991, 2406, 30, 6090, 314, 2610]


Alternatively, if we want each line to be treated separately, we can instead pad or truncate each line to a maximum length.

In [12]:
# Disabled alternative, kept as a bare string so it never executes: tokenize
# each line on its own, padding/truncating to the tokenizer's maximum length
# instead of concatenating the texts.
"""
from transformers import AutoTokenizer

text_column_name = "text"
column_names = raw_datasets["train"].column_names

tokenizer = AutoTokenizer.from_pretrained(model_name)
max_seq_length = tokenizer.model_max_length
padding = "max_length"

tokenizer.add_special_tokens({'pad_token': '<pad>'})

def tokenize_function(examples):
    return tokenizer(examples[text_column_name],
                     padding=padding,
                     truncation=True,
                     max_length=max_seq_length)

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,
    desc="Running tokenizer on dataset",
)
"""

# Disabled sanity check for the alternative above.
# NOTE(review): `len(raw_datasets)` counts the splits of the dataset dict, not
# the rows of `train`; use `len(raw_datasets["train"])` if this is re-enabled.
"""
index = random.sample(range(len(raw_datasets)), 1)

print(raw_datasets["train"][index])
print(tokenized_datasets["train"][index])
"""

'\nindex = random.sample(range(len(raw_datasets)), 1)\n\nprint(raw_datasets["train"][index])\nprint(tokenized_datasets["train"][index])\n'

In [13]:
# Disabled companion of the per-line tokenization, kept as a bare string so it
# never executes: copy `input_ids` into a `labels` column for every row.
"""
def add_label(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

lm_datasets = tokenized_datasets.map(
    add_label,
    batched=True,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=not overwrite_cache,
    desc=f"Adding label to each text",
)
"""

'\ndef add_label(examples):\n    examples["labels"] = examples["input_ids"].copy()\n    return examples\n\nlm_datasets = tokenized_datasets.map(\n    add_label,\n    batched=True,\n    num_proc=preprocessing_num_workers,\n    load_from_cache_file=not overwrite_cache,\n    desc=f"Adding label to each text",\n)\n'

# Prepare Training & Evaluation Datasets

<span style="color:red">Recheck script train/eval datasets! It seems training data is split even if test set is provided!</span>

In [14]:
# Grouped splits used for fitting ("train") and for validation ("test").
train_dataset, eval_dataset = (lm_datasets[split_name] for split_name in ("train", "test"))

In [15]:
# Optionally truncate the splits for debugging or quicker runs.
# The limit is clamped to the number of available rows: after grouping, a split
# can be much shorter than the raw dataset, and `select` fails on indices that
# are out of range.
if max_train_samples is not None:
    train_dataset = train_dataset.select(range(min(max_train_samples, len(train_dataset))))
if max_eval_samples is not None:
    eval_dataset = eval_dataset.select(range(min(max_eval_samples, len(eval_dataset))))

# Set Logging Level

In [16]:
import random
from importlib import reload  # `reload` has to be imported explicitly in Python 3
import logging
# Re-import the logging module before configuring it; presumably so that
# `basicConfig` below takes effect even if logging was already configured in
# this kernel -- TODO confirm.
reload(logging)
# Root logger format: "<time> <LEVEL>:<message>", with DEBUG and above emitted.
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')

# Root logger, used by the training cell for its progress messages.
logger = logging.getLogger()

# Log a few random samples from the training set (disabled):
#for index in random.sample(range(len(train_dataset)), 3):
    #logger.info(f"Sample {index} of the training set: {train_dataset[index]}. \n")
    #logger.info(f"Sample {index} of the training set shape: {len(train_dataset[index]['input_ids'])}. \n")    

# Check Training Parameters

We can customize the training arguments through `training_args` if we want, or tune some of them on a separate validation set (which might take a huge amount of time, though).

For more arguments, check: https://huggingface.co/transformers/main_classes/trainer.html#transformers.TFTrainingArguments

In [17]:
from transformers import TFTrainingArguments

# Default TensorFlow training arguments; only the output directory is set.
training_args = TFTrainingArguments(output_dir=output_dir)

num_replicas = training_args.strategy.num_replicas_in_sync
global_train_batch_size = num_replicas * training_args.per_device_train_batch_size
batches_per_epoch = len(train_dataset) // global_train_batch_size

# Summary of the values the training cell feeds to the optimizer and to `fit`.
# It is the last expression of the cell so that the notebook displays it.
dict(
    init_lr=training_args.learning_rate,
    num_replicas=num_replicas,
    num_train_epochs=training_args.num_train_epochs,
    per_device_train_batch_size=training_args.per_device_train_batch_size,
    batches_per_epoch=batches_per_epoch,
    num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
    num_warmup_steps=training_args.warmup_steps,
    adam_beta1=training_args.adam_beta1,
    adam_beta2=training_args.adam_beta2,
    adam_epsilon=training_args.adam_epsilon,
    weight_decay_rate=training_args.weight_decay,
)


05:42:18 DEBUG:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
05:42:19 DEBUG:Creating converter from 7 to 5
05:42:19 DEBUG:Creating converter from 5 to 7
05:42:19 DEBUG:Creating converter from 7 to 5
05:42:19 DEBUG:Creating converter from 5 to 7
05:42:19 INFO:PyTorch: setting up devices
05:42:19 INFO:The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
05:42:19 INFO:Tensorflow: setting up strategy


{'init_lr': 5e-05,
 'num_replicas': 1,
 'num_train_epochs': 3.0,
 'per_device_train_batch_size': 8,
 'batches_per_epoch': 16,
 'num_train_steps': 48,
 'num_warmup_steps': 0,
 'adam_beta1': 0.9,
 'adam_beta2': 0.999,
 'adam_epsilon': 1e-08,
 'weight_decay_rate': 0.0}

# Model Training

Steps:

* Load Pretrained Model 
* Resize the number of token embeddings in the model to that of the tokenizer
    * Since our model and tokenizer belong to the same model, the number of token embeddings should be the same.
    
* Generate tf.data.Dataset (s) Sample Generator:
    * Reorder the samples randomly.
    * Convert each tokenized text to a tensor.
 
* Define a callback SavePretrainedCallback that will save the model checkpoint at the end of each epoch.

* Define the neural network optimizer from the arguments set in the training_args!

* Define the loss: we use a dummy loss that simply averages the loss value output by the model itself, i.e. the difference between the predicted and the real next token.
    * There should be a smarter loss.

* Fit the model over the training dataset & evaluate the model over the eval dataset.

* Log the loss & the perplexity metric of the model.

* Save the final model to the output directory.

In [18]:
import numpy as np
import tensorflow as tf
import math
from functools import partial
from transformers import AutoConfig, TFAutoModelForCausalLM
from transformers import create_optimizer

def sample_generator(dataset, tokenizer):
    """Yield `(features, labels)` pairs from `dataset` in a freshly shuffled order.

    Args:
        dataset: Object supporting `len()` and integer indexing; every item is
            a mapping that contains a "labels" entry.
        tokenizer: Accepted by the signature but not used by this function.

    Yields:
        A tuple `(features, labels)` where `features` maps every key of the
        item to a tensor (int64 is only a dtype hint) and `labels` is
        `features["labels"]`.
    """
    for position in np.random.permutation(len(dataset)):
        row = dataset[int(position)]
        # Convert every column of the row to a tensor.
        features = {
            name: tf.convert_to_tensor(values, dtype_hint=tf.int64)
            for name, values in row.items()
        }
        # TF needs some kind of labels, even if we don't use them.
        yield features, features["labels"]

# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    """Keras callback that stores the model with `save_pretrained()` after each epoch.

    Hugging Face models have a `save_pretrained()` method that saves both the
    weights and the metadata needed to load them again as a pretrained model.
    Every epoch writes to the same directory.
    """

    def __init__(self, output_dir, **kwargs):
        """Remember the target directory; extra keyword arguments are accepted and ignored."""
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        """Save the attached model into `self.output_dir`; `epoch` and `logs` are not used."""
        target_dir = self.output_dir
        self.model.save_pretrained(target_dir)

# Build the training arguments in this cell as well, so that it does not
# depend on the summary cell having been executed.
training_args = TFTrainingArguments(output_dir=output_dir)
#training_args.per_device_train_batch_size = 32

with training_args.strategy.scope():

    # Load the pretrained causal language model that belongs to `model_name`.
    config = AutoConfig.from_pretrained(model_name)
    model = TFAutoModelForCausalLM.from_pretrained(model_name, config=config)

    # Make the number of token embeddings match the tokenizer's vocabulary.
    model.resize_token_embeddings(len(tokenizer))

    num_replicas = training_args.strategy.num_replicas_in_sync
    # Global batch sizes: per-device size times the number of replicas.
    train_batch_size = num_replicas * training_args.per_device_train_batch_size
    eval_batch_size = num_replicas * training_args.per_device_eval_batch_size
    num_epochs = int(training_args.num_train_epochs)

    # region TF Dataset preparation
    # Each dataset is fed from `sample_generator`; every feature is declared as
    # a variable-length int64 vector.
    train_generator = partial(sample_generator, train_dataset, tokenizer)
    train_signature = {
        feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
        for feature in train_dataset.features
        if feature != "special_tokens_mask"
    }
    train_sig = (train_signature, train_signature["labels"])
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_train_dataset = (
        tf.data.Dataset.from_generator(train_generator, output_signature=train_sig)
        .with_options(options)
        .batch(batch_size=train_batch_size, drop_remainder=True)
        .repeat(num_epochs)
    )
    eval_generator = partial(sample_generator, eval_dataset, tokenizer)
    eval_signature = {
        feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
        for feature in eval_dataset.features
        if feature != "special_tokens_mask"
    }
    eval_sig = (eval_signature, eval_signature["labels"])
    tf_eval_dataset = (
        tf.data.Dataset.from_generator(eval_generator, output_signature=eval_sig)
        .with_options(options)
        .batch(batch_size=eval_batch_size, drop_remainder=True)
        .repeat(num_epochs)
    )
    # endregion
    # region Optimizer and loss

    # Number of full batches in one pass over each split (the partial last
    # batch is dropped by `drop_remainder=True`).
    batches_per_epoch = len(train_dataset) // train_batch_size
    eval_batches_per_epoch = len(eval_dataset) // eval_batch_size
    # Bias and layernorm weights are automatically excluded from the decay
    optimizer, lr_schedule = create_optimizer(
        init_lr=training_args.learning_rate,
        num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
        num_warmup_steps=training_args.warmup_steps,
        adam_beta1=training_args.adam_beta1,
        adam_beta2=training_args.adam_beta2,
        adam_epsilon=training_args.adam_epsilon,
        weight_decay_rate=training_args.weight_decay,
    )

    def dummy_loss(y_true, y_pred):
        """Return the mean of `y_pred`; `y_true` is ignored.

        It is attached to the model output named "loss", which presumably
        already holds the language-modelling loss computed from the labels
        inside the input dict -- TODO confirm.
        """
        return tf.reduce_mean(y_pred)

    model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
    # endregion

    # region Training and validation
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size = {train_batch_size}")

    history = model.fit(
        tf_train_dataset,
        validation_data=tf_eval_dataset,
        epochs=num_epochs,
        steps_per_epoch=batches_per_epoch,
        # The validation dataset is repeated once per epoch. Without an explicit
        # step count, every validation run would iterate the whole repeated
        # stream instead of a single pass. `or None` keeps the previous
        # run-until-exhausted behaviour when there is no full batch.
        validation_steps=eval_batches_per_epoch or None,
        callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
    )
    # Perplexity is exp(loss); guard against overflow for very large losses.
    try:
        train_perplexity = math.exp(history.history["loss"][-1])
    except OverflowError:
        train_perplexity = math.inf
    try:
        validation_perplexity = math.exp(history.history["val_loss"][-1])
    except OverflowError:
        validation_perplexity = math.inf
    logger.info(f"  Final train loss: {history.history['loss'][-1]:.3f}")
    logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
    logger.info(f"  Final validation loss: {history.history['val_loss'][-1]:.3f}")
    logger.info(f"  Final validation perplexity: {validation_perplexity:.3f}")
    # endregion

    # Save the final model next to the per-epoch checkpoints.
    if training_args.output_dir is not None:
        model.save_pretrained(training_args.output_dir)

05:42:20 INFO:PyTorch: setting up devices
05:42:20 INFO:The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
05:42:20 INFO:Tensorflow: setting up strategy
05:42:21 INFO:loading weights file https://huggingface.co/gpt2/resolve/main/tf_model.h5 from cache at /home/azureuser/.cache/huggingface/transformers/4029f7287fbd5fa400024f6bbfcfeae9c5f7906ea97afcaaa6348ab7c6a9f351.723d8eaff3b27ece543e768287eefb59290362b8ca3b1c18a759ad391dca295a.h5

If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
05:42:23 INFO:***** Running training *****
05:42:23 INFO:  Num examples = 134
05:42:23 INFO:  Num Epochs = 3.0
05:42:23 INFO:  Instantaneous batch size per device = 8
05:42:23 INFO:  Tot

Epoch 1/3


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fd18d0f1d90> is not a module, class, method, function, traceback, frame, or code object


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7fd18d0f1d90> is not a module, class, method, function, traceback, frame, or code object



Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.








05:53:32 DEBUG:Creating converter from 5 to 3
05:53:36 INFO:Model weights saved in output/tf_model.h5


Epoch 2/3


06:04:56 INFO:Model weights saved in output/tf_model.h5


Epoch 3/3


06:16:11 INFO:Model weights saved in output/tf_model.h5
06:16:11 INFO:  Final train loss: 2.232
06:16:11 INFO:  Final train perplexity: 9.320
06:16:11 INFO:  Final validation loss: 2.255
06:16:11 INFO:  Final validation perplexity: 9.534
06:16:15 INFO:Model weights saved in output/tf_model.h5


# Use Fine-tuned Model

Now that we have trained our new language model on new data, let's give it a try! We will use the path to the directory that the training cell wrote the model file to, and load it up to see the results.

In [19]:
# Set up the imports needed to use the model.
# The Auto* classes are used (as in the rest of the notebook) so that this cell
# keeps working when `model_name` is changed to a non-GPT-2 checkpoint.
from transformers import TFAutoModelForCausalLM
from transformers import AutoTokenizer

# The fine-tuned weights come from `output_dir`; the tokenizer was not saved
# there by the training cell, so it is loaded from the original checkpoint.
model = TFAutoModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name)

06:16:15 INFO:loading weights file output/tf_model.h5
06:16:17 DEBUG:Creating converter from 3 to 5

If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [27]:
# Encode the prompt as a TensorFlow tensor.
input_ids = tokenizer.encode("Hello", return_tensors='tf')

# Sample five continuations of at most 30 tokens.
# Optional knobs that are currently disabled: no_repeat_ngram_size=2,
# repetition_penalty=1.5, top_p=0.92, temperature=.85, top_k=125.
# NOTE(review): `early_stopping` is presumably only relevant for beam search -- confirm.
generation_kwargs = dict(
    max_length=30,
    num_return_sequences=5,
    do_sample=True,
    early_stopping=True,
)
generated_text_samples = model.generate(input_ids, **generation_kwargs)

# Print every generated sequence, numbered from 1 and followed by a blank line.
for rank, sequence in enumerate(generated_text_samples, start=1):
    decoded = tokenizer.decode(sequence, skip_special_tokens=True)
    print("{}: {}".format(rank, decoded))
    print()

1: Hello by saying there is one missing link in your statement that I made so let me know. Please do not send my email to others with the same

2: Hello it's working fine. I got to check if everything is fine before I start streaming."If everything is fine, I know I have an error

3: Hello I'm a big fan of POC! It does a really good job of holding my card, and keeps my cash in-bank at the

4: Hello) I don't know why we don't know the number, so I think it's there. What does it mean to have that number?

5: Hello I want to add the option to enable the automatic reset of the lock-up history. How does that work?


I want to disable



In [28]:
# Encode the prompt as a TensorFlow tensor.
input_ids = tokenizer.encode("Hello", return_tensors='tf')

# Sample five continuations of at most 30 tokens.
# Optional knobs that are currently disabled: no_repeat_ngram_size=2,
# repetition_penalty=1.5, top_p=0.92, temperature=.85, top_k=125.
# NOTE(review): `early_stopping` is presumably only relevant for beam search -- confirm.
generation_kwargs = dict(
    max_length=30,
    num_return_sequences=5,
    do_sample=True,
    early_stopping=True,
)
generated_text_samples = model.generate(input_ids, **generation_kwargs)

# Print every generated sequence, numbered from 1 and followed by a blank line.
for rank, sequence in enumerate(generated_text_samples, start=1):
    decoded = tokenizer.decode(sequence, skip_special_tokens=True)
    print("{}: {}".format(rank, decoded))
    print()



1: Hello? If so, why is it still listed? Where is this link? Is anything else wrong?

2: Hello. How do you plan to use this update if I lose my stuff?  Do you want me to continue doing this update?  This would

3: Hello for this, my husband and I've been having problems in the process of getting a new car. I can't get some stuff delivered to me

4: Hello. I have a card? Please open the account!

My card doesn't work? Please open it. I'm not happy with my

5: Hello from Austria I'm at the place and I want to see if there's anyone here. I just saw that people are asking about the hotel that

