<a href="https://colab.research.google.com/github/daspartho/prompt-extend/blob/main/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing required libraries

In [None]:
!pip install transformers sentencepiece datasets -q

### Downloading the corpus of prompts

In [2]:
from datasets import load_dataset

# Load DiffusionDB using its "text_only" configuration; the DatasetDict
# displayed below lists the splits, columns and row counts that came back.
ds = load_dataset("poloclub/diffusiondb", "text_only")
ds

Downloading builder script:   0%|          | 0.00/9.60k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Downloading and preparing dataset diffusiondb/text_only to /root/.cache/huggingface/datasets/poloclub___diffusiondb/text_only/0.9.0/c98e527fa9b265717e27788ced0971123572f432d66cbe5d80a3f1ce5111ac1c...


Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset diffusiondb downloaded and prepared to /root/.cache/huggingface/datasets/poloclub___diffusiondb/text_only/0.9.0/c98e527fa9b265717e27788ced0971123572f432d66cbe5d80a3f1ce5111ac1c. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image_name', 'prompt', 'part_id', 'seed', 'step', 'cfg', 'sampler'],
        num_rows: 2000000
    })
})

In [3]:
# Drop every column except the prompt text. The list is derived from the
# dataset itself so this cell can be re-run safely: once only 'prompt' is
# left, the list is empty and nothing is removed (a hard-coded list would ask
# for columns that no longer exist on the second run).
columns_to_remove = [name for name in ds["train"].column_names if name != "prompt"]
if columns_to_remove:
    ds = ds.remove_columns(columns_to_remove)
ds

DatasetDict({
    train: Dataset({
        features: ['prompt'],
        num_rows: 2000000
    })
})

In [4]:
# Keep the first training prompt as a fixed sample; it is tokenized before and
# after training to compare the two tokenizers.
example = ds['train'][0]['prompt']
example

'doom eternal, game concept art, veins and worms, muscular, crustacean exoskeleton, chiroptera head, chiroptera ears, mecha, ferocious, fierce, hyperrealism, fine details, artstation, cgsociety, zbrush, no background '

### Transform the dataset into an iterator of batches of prompts

In [5]:
def get_training_corpus(batch_size=1000, dataset=None):
    """Return a lazy iterator over the training prompts, grouped in batches.

    Args:
        batch_size: Number of prompts per batch. Defaults to 1000, the value
            that was previously hard-coded.
        dataset: Mapping with a "train" entry that supports ``len()`` and
            slice indexing, where a slice yields a mapping with a "prompt"
            entry. Defaults to the notebook-level ``ds``.

    Returns:
        A generator producing, for each batch, the value of
        ``dataset["train"][start : start + batch_size]["prompt"]``. The
        generator is single-use: call this function again for a fresh pass.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Fall back to the notebook-level dataset when none is passed explicitly.
    train_split = (ds if dataset is None else dataset)["train"]
    return (
        train_split[start : start + batch_size]["prompt"]
        for start in range(0, len(train_split), batch_size)
    )

# The result is a generator: it can be iterated only once, so call
# get_training_corpus() again whenever a fresh pass over the prompts is needed.
training_corpus = get_training_corpus()

### Load the tokenizer

In [6]:
from transformers import AutoTokenizer

# Pretrained GPT-2 tokenizer: used below both as the baseline for comparison
# and as the object from which the new tokenizer is trained.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Let's see how it performs before training

In [7]:
# Baseline: tokenize the sample prompt with the stock GPT-2 tokenizer and show
# the tokens together with how many there are.
tokens = old_tokenizer.tokenize(example)
tokens, len(tokens)

(['d',
  'oom',
  'Ġeternal',
  ',',
  'Ġgame',
  'Ġconcept',
  'Ġart',
  ',',
  'Ġveins',
  'Ġand',
  'Ġworms',
  ',',
  'Ġmuscular',
  ',',
  'Ġcrust',
  'ace',
  'an',
  'Ġex',
  'os',
  'keleton',
  ',',
  'Ġchirop',
  'tera',
  'Ġhead',
  ',',
  'Ġchirop',
  'tera',
  'Ġears',
  ',',
  'Ġme',
  'cha',
  ',',
  'Ġferocious',
  ',',
  'Ġfierce',
  ',',
  'Ġhyper',
  'real',
  'ism',
  ',',
  'Ġfine',
  'Ġdetails',
  ',',
  'Ġart',
  'station',
  ',',
  'Ġc',
  'gs',
  'oc',
  'iety',
  ',',
  'Ġz',
  'brush',
  ',',
  'Ġno',
  'Ġbackground',
  'Ġ'],
 57)

### Training a new tokenizer

In [8]:
# Target vocabulary size for the new tokenizer.
VOCAB_SIZE = 52000

# Build a fresh corpus iterator here instead of reusing `training_corpus`:
# get_training_corpus() returns a one-shot generator, so re-running this cell
# with an already-consumed generator would give the trainer no prompts.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(), vocab_size=VOCAB_SIZE
)

### Let's see how the trained tokenizer performs

In [9]:
# Same sample prompt, now tokenized with the newly trained tokenizer.
# Note: `tokens` is rebound here, replacing the earlier baseline result.
tokens = tokenizer.tokenize(example)
tokens, len(tokens)

(['doom',
  'Ġeternal',
  ',',
  'Ġgame',
  'Ġconcept',
  'Ġart',
  ',',
  'Ġveins',
  'Ġand',
  'Ġworms',
  ',',
  'Ġmuscular',
  ',',
  'Ġcrustacean',
  'Ġexoskeleton',
  ',',
  'Ġchiroptera',
  'Ġhead',
  ',',
  'Ġchiroptera',
  'Ġears',
  ',',
  'Ġmecha',
  ',',
  'Ġferocious',
  ',',
  'Ġfierce',
  ',',
  'Ġhyperrealism',
  ',',
  'Ġfine',
  'Ġdetails',
  ',',
  'Ġartstation',
  ',',
  'Ġcgsociety',
  ',',
  'Ġzbrush',
  ',',
  'Ġno',
  'Ġbackground',
  'Ġ'],
 42)

### Saving the tokenizer

In [10]:
# Write the tokenizer files to the local "prompt-tokenizer" directory; the
# returned tuple (displayed below) lists the paths that were written.
tokenizer.save_pretrained("prompt-tokenizer")

('prompt-tokenizer/tokenizer_config.json',
 'prompt-tokenizer/special_tokens_map.json',
 'prompt-tokenizer/vocab.json',
 'prompt-tokenizer/merges.txt',
 'prompt-tokenizer/added_tokens.json',
 'prompt-tokenizer/tokenizer.json')

### Uploading the tokenizer to HuggingFace Hub

Be sure to log in with your auth token below so the tokenizer can be pushed to the Hub

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub with an auth token; needed before the
# tokenizer can be pushed in the next cell.
notebook_login()

In [None]:
# Upload the trained tokenizer to the Hub under the name "prompt-tokenizer".
tokenizer.push_to_hub("prompt-tokenizer")