# Train Transformer-XL from Scratch using 🤗 Transformers

In [1]:
!pip install transformers==4.19.2
!pip install rarfile==4.0
!pip install datasets==2.4.0
!pip install sacremoses==0.0.53

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import re
import random
from tqdm.notebook import tqdm
from collections import Counter
from datasets import load_dataset, Dataset
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel, TransfoXLConfig

# 1- Find a dataset

Create your own dataset from a text file or load it from Hugging Face Hub, then preprocess it in an appropriate way.

In [3]:
data = load_dataset('arabic_billion_words', 'SaudiYoum', split="train[0%:1%]")



In [4]:
def remove_non_arabic(text):
    return ' '.join(re.sub(u"[^\u0621-\u063A\u0641-\u064A ]", " ", text,  flags=re.UNICODE).split())

In [5]:
texts = []
for text in tqdm(data['text']):
    text = remove_non_arabic(text)
    if text != '':
        texts.append(text)

  0%|          | 0/8881 [00:00<?, ?it/s]

In [6]:
del data # delete the dataset object to save some memory

In [7]:
# function that splits a list of texts into train, validation, and test datasets
# where you specify the train data ratio and the rest will be splitted into
# validation and testing equally
def split_train_val_test(texts, train_ratio):
    random.shuffle(texts)
    train_end_idx = int(train_ratio * len(texts))
    valid_end_idx = train_end_idx + int(((1 - train_ratio) / 2) * len(texts))
    train_texts = texts[0:train_end_idx]
    valid_texts = texts[train_end_idx:valid_end_idx]
    test_texts = texts[valid_end_idx:]

    train_dataset = Dataset.from_dict({'train': train_texts})
    val_dataset = Dataset.from_dict({'validation': valid_texts})
    test_dataset = Dataset.from_dict({'test': test_texts})

    return train_dataset, val_dataset, test_dataset

In [8]:
train_dataset, val_dataset, test_dataset = split_train_val_test(texts[:100], train_ratio=0.98)

We need to create a list of vocab since the Transformer-XL tokenizer is a word-level tokenizer.

In [9]:
# from a given list of texts, create a vocab list by specifying the top N most 
# common words and the minimum frequency for each word
def get_vocab(texts, min_freq=2, topn=300000, add_special_symbols=[]):
    vocab = list(add_special_symbols)
    counter = Counter()
    for text in tqdm(texts):
        text = remove_non_arabic(text)
        counter.update(text.split())
    
    counter = Counter({word: count for word, count in counter.items() if count >= min_freq})
    vocab += [word for word, _ in counter.most_common(topn)]
    return vocab

In [10]:
vocab = get_vocab(texts, min_freq=2, topn=300000, add_special_symbols=['<unk>', '<sos>', '<eos>'])

  0%|          | 0/8879 [00:00<?, ?it/s]

In [11]:
# write the vocab list to a path that will be used in the tokenizer to load
# them back
vocab_file = './vocab.txt'
with open(vocab_file, 'w') as f1:
    for word in vocab:
        f1.write(f'{word}\n')

In [12]:
tokenizer = TransfoXLTokenizer(vocab_file=vocab_file)
tokenizer

PreTrainedTokenizer(name_or_path='', vocab_size=85574, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<eos>', 'unk_token': '<unk>', 'additional_special_tokens': ['<formula>']})

In [13]:
len(tokenizer)

85574

In [14]:
# NOTE: cutoffs depends on your vocab size, so make sure that all values in the
# cutoff list are less than the actual number of vocabs you have
config = TransfoXLConfig(vocab_size=len(tokenizer), cutoffs=[20000, 40000])
model = TransfoXLLMHeadModel(config)

In [15]:
# Test the tokenizer and the model

sample_text = 'بسم الله الرحمن الرحيم'
inputs = tokenizer(sample_text, return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
loss, logits = outputs[:2]
loss

tensor([[10.1311,  9.8044,  9.7278]], grad_fn=<ViewBackward0>)

In [16]:
outputs.keys()

odict_keys(['losses', 'prediction_scores', 'mems', 'loss'])

# 2- Tokenize texts

Most of this code was borrowed from [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
).

In [17]:
preprocessing_num_workers = 4

block_size = 256
block_size = min(block_size, tokenizer.model_max_length)

In [18]:
from itertools import chain

column_names = []

def tokenize_function(examples):
    output = tokenizer(examples[column_names[0]])
    return output

column_names = list(train_dataset.features.keys())
train_tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,
    #load_from_cache_file=not data_args.overwrite_cache,
    desc="Running tokenizer on dataset",
)
column_names = list(val_dataset.features.keys())
val_tokenized_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=preprocessing_num_workers,
    remove_columns=column_names,
    #load_from_cache_file=not data_args.overwrite_cache,
    desc="Running tokenizer on dataset",
)


     

Running tokenizer on dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

   

Running tokenizer on dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]



Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
train_lm_dataset = train_tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=preprocessing_num_workers,
    #load_from_cache_file=not data_args.overwrite_cache,
    desc=f"Grouping texts in chunks of {block_size}",
)

val_lm_dataset = val_tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=preprocessing_num_workers,
    #load_from_cache_file=not data_args.overwrite_cache,
    desc=f"Grouping texts in chunks of {block_size}",
)

      

Grouping texts in chunks of 256 #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 256 #1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 256 #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Grouping texts in chunks of 256 #3:   0%|          | 0/1 [00:00<?, ?ba/s]



Grouping texts in chunks of 256:   0%|          | 0/1 [00:00<?, ?ba/s]

# 4- Train the model

In [20]:
# https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb#scrollTo=YpvnFFmZJD-N

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./transformer_xl",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
    report_to="all"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_lm_dataset,
    eval_dataset=val_lm_dataset

)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 96
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 24


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=24, training_loss=10.178698221842447, metrics={'train_runtime': 71.8144, 'train_samples_per_second': 1.337, 'train_steps_per_second': 0.334, 'total_flos': 36581178212352.0, 'train_loss': 10.178698221842447, 'epoch': 1.0})