<a href="https://colab.research.google.com/github/daspartho/prompt-extend/blob/main/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing required libraries

In [3]:
!pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Downloading the corpus of prompts

In [4]:
from datasets import load_dataset

# Download (or reuse from the local cache) the Stable Diffusion prompt corpus.
# In the recorded run this is a DatasetDict with "train" and "test" splits,
# each exposing a single "Prompt" column.
ds = load_dataset("Gustavosta/Stable-Diffusion-Prompts")
# Bare expression: display the dataset summary as the cell output.
ds

Downloading readme:   0%|          | 0.00/777 [00:00<?, ?B/s]



Downloading and preparing dataset parquet/Gustavosta--Stable-Diffusion-Prompts to /root/.cache/huggingface/datasets/Gustavosta___parquet/Gustavosta--Stable-Diffusion-Prompts-f4211d2c5626deea/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Gustavosta___parquet/Gustavosta--Stable-Diffusion-Prompts-f4211d2c5626deea/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Prompt'],
        num_rows: 73718
    })
    test: Dataset({
        features: ['Prompt'],
        num_rows: 8192
    })
})

In [5]:
# Keep the first training prompt as a fixed sample; later cells tokenize it
# with both the original and the retrained tokenizer for comparison.
example = ds['train'][0]['Prompt']
# Bare expression: show the sample prompt.
example

'realistic car 3 d render sci - fi car and sci - fi robotic factory structure in the coronation of napoleon painting and digital billboard with point cloud in the middle, unreal engine 5, keyshot, octane, artstation trending, ultra high detail, ultra realistic, cinematic, 8 k, 1 6 k, in style of zaha hadid, in style of nanospace michael menzelincev, in style of lee souder, in plastic, dark atmosphere, tilt shift, depth of field,'

### Transform the dataset into an iterator of batches of prompts

In [6]:
def get_training_corpus(batch_size=1000, dataset=None):
    """Return a generator that yields the training prompts in batches.

    Args:
        batch_size: Number of prompts per yielded batch (the last batch may be
            shorter). Must be a positive integer. Defaults to 1000.
        dataset: Mapping with a "train" split that supports ``len()`` and
            slicing, where a slice exposes a "Prompt" entry. Defaults to the
            notebook-level ``ds``.

    Returns:
        A one-shot generator; each item is the "Prompt" value of one slice of
        the train split. Call this function again to obtain a fresh generator.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make range() silently yield nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    source = ds if dataset is None else dataset
    train_split = source["train"]
    return (
        train_split[start : start + batch_size]["Prompt"]
        for start in range(0, len(train_split), batch_size)
    )

# Build the batch generator. It is a one-shot iterator: once it has been
# consumed it yields nothing more, so re-run this cell to get a fresh one.
training_corpus = get_training_corpus()

### Load the tokenizer

In [7]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; it serves as the baseline below and as
# the starting point from which the new tokenizer is trained.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Let's see how it performs before training

In [8]:
# Tokenize the sample prompt with the original (untrained-on-prompts) tokenizer.
tokens = old_tokenizer.tokenize(example)
# Show the tokens together with their count as a baseline for comparison.
tokens, len(tokens)

(['real',
  'istic',
  'Ġcar',
  'Ġ3',
  'Ġd',
  'Ġrender',
  'Ġsci',
  'Ġ-',
  'Ġfi',
  'Ġcar',
  'Ġand',
  'Ġsci',
  'Ġ-',
  'Ġfi',
  'Ġrobotic',
  'Ġfactory',
  'Ġstructure',
  'Ġin',
  'Ġthe',
  'Ġcoron',
  'ation',
  'Ġof',
  'Ġnap',
  'oleon',
  'Ġpainting',
  'Ġand',
  'Ġdigital',
  'Ġbillboard',
  'Ġwith',
  'Ġpoint',
  'Ġcloud',
  'Ġin',
  'Ġthe',
  'Ġmiddle',
  ',',
  'Ġunreal',
  'Ġengine',
  'Ġ5',
  ',',
  'Ġkeys',
  'hot',
  ',',
  'Ġoct',
  'ane',
  ',',
  'Ġart',
  'station',
  'Ġtrending',
  ',',
  'Ġultra',
  'Ġhigh',
  'Ġdetail',
  ',',
  'Ġultra',
  'Ġrealistic',
  ',',
  'Ġcinematic',
  ',',
  'Ġ8',
  'Ġk',
  ',',
  'Ġ1',
  'Ġ6',
  'Ġk',
  ',',
  'Ġin',
  'Ġstyle',
  'Ġof',
  'Ġz',
  'aha',
  'Ġhad',
  'id',
  ',',
  'Ġin',
  'Ġstyle',
  'Ġof',
  'Ġnan',
  'ospace',
  'Ġm',
  'ichael',
  'Ġmen',
  'zel',
  'ince',
  'v',
  ',',
  'Ġin',
  'Ġstyle',
  'Ġof',
  'Ġle',
  'e',
  'Ġsou',
  'der',
  ',',
  'Ġin',
  'Ġplastic',
  ',',
  'Ġdark',
  'Ġatmosphere',
  ',',
  '

### Training a new tokenizer

In [9]:
# Build a fresh batch generator here instead of reusing `training_corpus`:
# a generator is exhausted after one pass, so re-running this cell with the
# already-consumed one would hand the trainer an empty iterator.
# 52000 is the vocabulary size requested for the new tokenizer.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=52000)

### Let's see how the trained tokenizer performs

In [10]:
# Tokenize the same sample prompt with the newly trained tokenizer.
# Note: this rebinds `tokens`, replacing the list produced by the old tokenizer.
tokens = tokenizer.tokenize(example)
# Show the tokens and their count (91 in the recorded run).
tokens, len(tokens)

(['realistic',
  'Ġcar',
  'Ġ3',
  'Ġd',
  'Ġrender',
  'Ġsci',
  'Ġ-',
  'Ġfi',
  'Ġcar',
  'Ġand',
  'Ġsci',
  'Ġ-',
  'Ġfi',
  'Ġrobotic',
  'Ġfactory',
  'Ġstructure',
  'Ġin',
  'Ġthe',
  'Ġcoronation',
  'Ġof',
  'Ġnapoleon',
  'Ġpainting',
  'Ġand',
  'Ġdigital',
  'Ġbillboard',
  'Ġwith',
  'Ġpoint',
  'Ġcloud',
  'Ġin',
  'Ġthe',
  'Ġmiddle',
  ',',
  'Ġunreal',
  'Ġengine',
  'Ġ5',
  ',',
  'Ġkeyshot',
  ',',
  'Ġoctane',
  ',',
  'Ġartstation',
  'Ġtrending',
  ',',
  'Ġultra',
  'Ġhigh',
  'Ġdetail',
  ',',
  'Ġultra',
  'Ġrealistic',
  ',',
  'Ġcinematic',
  ',',
  'Ġ8',
  'Ġk',
  ',',
  'Ġ1',
  'Ġ6',
  'Ġk',
  ',',
  'Ġin',
  'Ġstyle',
  'Ġof',
  'Ġzaha',
  'Ġhadid',
  ',',
  'Ġin',
  'Ġstyle',
  'Ġof',
  'Ġnanospace',
  'Ġmichael',
  'Ġmenzelincev',
  ',',
  'Ġin',
  'Ġstyle',
  'Ġof',
  'Ġlee',
  'Ġsouder',
  ',',
  'Ġin',
  'Ġplastic',
  ',',
  'Ġdark',
  'Ġatmosphere',
  ',',
  'Ġtilt',
  'Ġshift',
  ',',
  'Ġdepth',
  'Ġof',
  'Ġfield',
  ','],
 91)

### Saving the tokenizer

In [11]:
# Write the trained tokenizer's files into the local "prompt-tokenizer" directory;
# the returned tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained("prompt-tokenizer")

('prompt-tokenizer/tokenizer_config.json',
 'prompt-tokenizer/special_tokens_map.json',
 'prompt-tokenizer/vocab.json',
 'prompt-tokenizer/merges.txt',
 'prompt-tokenizer/added_tokens.json',
 'prompt-tokenizer/tokenizer.json')

### Uploading the tokenizer to HuggingFace Hub

Be sure to log in with your Hugging Face auth token below so that the tokenizer can be pushed to the Hub.

In [12]:
from huggingface_hub import notebook_login

# Show the interactive login widget; the token is entered there by hand,
# so no credential is stored in the notebook source.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Upload the trained tokenizer to the Hub repository "prompt-tokenizer"
# (requires the login above; the recorded run committed to daspartho/prompt-tokenizer).
tokenizer.push_to_hub("prompt-tokenizer")

CommitInfo(commit_url='https://huggingface.co/daspartho/prompt-tokenizer/commit/1d037a86a53821fbce5d344f1cdd62a88374eb6c', commit_message='Upload tokenizer', commit_description='', oid='1d037a86a53821fbce5d344f1cdd62a88374eb6c', pr_url=None, pr_revision=None, pr_num=None)