In [4]:
# depends on g2p-en
from datasets import load_dataset
from g2p_en import G2p

# Grapheme-to-phoneme converter from g2p-en; called by the phonemizer defined later.
g2p = G2p()

# Text sources (train splits): the 'generics_kb_best' configuration of generics_kb
# and the pony-speech dataset.
generics_kb = load_dataset(
    'community-datasets/generics_kb',
    name='generics_kb_best',
    split='train',
)
ponyspeech_dataset = load_dataset('synthbot/pony-speech', split='train')

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [153]:
# Load the custom lexicon from 'horsewords.clean' into a dict.
# Each non-blank line must contain a key and a value separated by two spaces
# (presumably word -> pronunciation, CMUdict-style -- TODO confirm the format).
horsewords = {}
with open('horsewords.clean', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        # Drop the line terminator so it does not end up inside the stored value.
        entry = line.rstrip('\n')
        if not entry.strip():
            # Tolerate blank lines (e.g. an empty line at the end of the file).
            continue
        # Split on the FIRST double space only, so a value that itself contains
        # a double space is kept whole instead of being truncated.
        key, separator, value = entry.partition('  ')
        if not separator:
            raise ValueError(
                f'horsewords.clean line {line_number}: '
                f'expected "<key>  <value>", got {entry!r}')
        horsewords[key] = value

def phonemize(text):
    """Turn a sentence into space-separated ARPAbet-style phoneme strings.

    The text is split on whitespace. Each chunk is passed to the module-level
    ``g2p`` converter and the symbols it returns are concatenated without a
    separator. The converted chunks are then joined with single spaces.

    Args:
        text: Input string to convert.

    Returns:
        One string with a converted chunk per whitespace-delimited input chunk.
    """
    return ' '.join(''.join(g2p(chunk)) for chunk in text.split())
# Quick sanity check of the phonemizer on a sentence with commas and an apostrophe.
print(phonemize("But one on one, let's clean it."))

BAH1T WAH1N AA1N WAH1N , LEH1TS KLIY1N IH1T .


In [107]:
from unidecode import unidecode
from tqdm import tqdm
# Build the tokenizer training corpus: one phonemized sentence per line.
n = 240000  # number of generics_kb sentences to sample
corpus_seed = 0  # fixed shuffle seed so the sampled subset (and the corpus) is reproducible
corpus_output_txt = 'g2pen_corpus_nosep.txt'

# Never request more rows than the dataset holds; out-of-range indices would
# make select() fail.
n_generic = min(n, len(generics_kb))
generic_sentences = generics_kb.shuffle(seed=corpus_seed).select(
    range(n_generic)
)['generic_sentence']

with open(corpus_output_txt, 'w', encoding='utf-8') as f:
    for text in tqdm(generic_sentences, desc='generic_kb wiki'):
        f.write(phonemize(text)+'\n')
    for text in tqdm(ponyspeech_dataset['transcription'], desc='ponyspeech'):
        f.write(phonemize(text)+'\n')

generic_kb wiki: 100%|██████████| 240000/240000 [09:42<00:00, 412.31it/s]
ponyspeech: 100%|██████████| 64783/64783 [02:06<00:00, 511.74it/s]


In [144]:
from tokenizers import (decoders, models, normalizers,
    pre_tokenizers, processors, trainers, Tokenizer)

import re

# Train a small BPE tokenizer over the phonemized corpus.
special_tokens = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']

# The unknown token must be configured on the BPE model itself. BpeTrainer has
# no documented `unk_token` option, so passing it there does not make the model
# emit '[UNK]' for symbols outside the learned alphabet.
tokenizer = Tokenizer(models.BPE(unk_token='[UNK]'))

# Pre-tokenize on whitespace first, then apply the punctuation pre-tokenizer,
# before BPE merges are applied.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Punctuation(),
    ])

# vocab_size is the total budget, including the special tokens listed above.
trainer = trainers.BpeTrainer(vocab_size=384,
    special_tokens=special_tokens)
tokenizer.train(['g2pen_corpus_nosep.txt'], trainer=trainer)

In [102]:
from transformers import AutoTokenizer
# Tokenizer of the Parler-TTS mini checkpoint, used as a baseline in the comparisons.
parler_checkpoint = "parler-tts/parler-tts-mini-v1"
parler_tokenizer = AutoTokenizer.from_pretrained(parler_checkpoint)

In [146]:
# Everyday sentences for comparing the phoneme BPE tokenizer with the Parler tokenizer.
test_texts = [
    'I need some money, can I get some money? I need one million.',
    'Raise up, get yourself together, and drive that funky soul.',
    'Our results demonstrate high fidelity speech generation and a diverse range of accents',
    'Equestria the brave Equestrian.',
    'Dead nirik storage.',
    'Well, one on one, let\'s clean it!',
    'Amazon delivers packages quickly across the United States!',
    'Bitcoin and Ethereum are popular cryptocurrencies.',
    'Text-to-speech models 1 $200',
    'Queue queuing',
]
for test_text in test_texts:
    # Phoneme BPE tokenizer: tokens, token count, and whether '[UNK]' appears.
    encoding = tokenizer.encode(phonemize(test_text))
    print(encoding.tokens)
    print(len(encoding.tokens))
    print('[UNK]' in encoding.tokens)

    # Parler tokenizer on the raw text: tokens and number of input ids.
    print(parler_tokenizer.tokenize(test_text))
    print(len(parler_tokenizer(test_text).input_ids))

['AY1', 'N', 'IY1', 'D', 'SAH1M', 'M', 'AH1N', 'IY0', ',', 'KAE1N', 'AY1', 'G', 'EH1T', 'SAH1M', 'M', 'AH1N', 'IY0', '?', 'AY1', 'N', 'IY1', 'D', 'WAH1N', 'M', 'IH1L', 'Y', 'AH0N', '.']
28
False
['▁I', '▁need', '▁some', '▁money', ',', '▁can', '▁I', '▁get', '▁some', '▁money', '?', '▁I', '▁need', '▁one', '▁million', '.']
17
['R', 'EY1Z', 'AH1P', ',', 'G', 'EH1T', 'Y', 'ER0', 'SEH1L', 'F', 'T', 'AH0G', 'EH1', 'DHER0', ',', 'AH0ND', 'DR', 'AY1', 'V', 'DHAE1T', 'F', 'AH1NG', 'K', 'IY0', 'S', 'OW1L', '.']
27
False
['▁R', 'aise', '▁up', ',', '▁get', '▁yourself', '▁together', ',', '▁and', '▁drive', '▁that', '▁fun', 'ky', '▁soul', '.']
16
['AW1ER0', 'R', 'IH0Z', 'AH1L', 'TS', 'D', 'EH1', 'MAH0N', 'STR', 'EY2T', 'HHAY1', 'F', 'AH0D', 'EH1L', 'AH0TIY0', 'SP', 'IY1', 'CH', 'JH', 'EH2', 'N', 'ER0', 'EY1SHAH0N', 'AH0ND', 'AH0', 'D', 'AY0', 'V', 'ER1', 'S', 'R', 'EY1NJH', 'AH1V', 'AE1', 'KS', 'EH0', 'N', 'TS']
38
False
['▁Our', '▁results', '▁demonstrate', '▁high', '▁', 'fidelity', '▁speech', '▁genera

In [147]:
# Stress-test sentences: rare spellings, loanwords, diacritics and very long words.
test_texts = [
    "The phthisic sphinx stealthily eschews the mnemonic pneumatic knickknacks.",
    "Szczecin's squirrel chirped rhythmically while munching on zwitterionic quetzal quills.",
    "Bjorn's fjord-dwelling lynx phthalate psyche was quite a conundrum.",
    "The zeitgeist of Györgyike's csárdás xylophone quintet was truly unique.",
    "Tsar Xylophone's czarina whispered žuželjka while eating sauerkraut in Århus.",
    "Queuing for Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch's onomatopoeic eisteddfod.",
    "The kleptomaniac's schwannoma diagnosis left the psychiatrist nonplussed.",
    "Xerxes' xiphosura exhibited xenophobic behavior towards Zsa Zsa's Zhivago zhuzh.",
    "The phlegmatic pharmacist's emphysema was exacerbated by Phthonus' chthonic chrysanthemums.",
    "Kwyjibo's syzygy with Przybyszewski's psyche caused a hubbub in Ouagadougou.",
    "The bourgeois faux pas at the rendezvous caused quite a brouhaha amongst the hoi polloi.",
    "Nietzsche's Übermensch theory clashed with Schrödinger's quantum doppelgänger hypothesis.",
    "The pneumonoultramicroscopicsilicovolcanoconiosis diagnosis flummoxed the otorhinolaryngologist.",
    "Gnocchi-loving Giacomo's pince-nez glinted as he perused Goethe's oeuvre on the Champs-Élysées.",
    "The tsktsk of the ptarmigan's wings echoed through the cwtch as it flew over Loch Ness.",
]
for test_text in test_texts:
    # Phoneme BPE tokenizer: tokens, token count, and whether '[UNK]' appears.
    encoding = tokenizer.encode(phonemize(test_text))
    print(encoding.tokens)
    print(len(encoding.tokens))
    print('[UNK]' in encoding.tokens)

    # Parler tokenizer on the raw text: tokens and number of input ids.
    print(parler_tokenizer.tokenize(test_text))
    print(len(parler_tokenizer(test_text).input_ids))

['DHAH0', 'F', 'AH0T', 'H', 'IH1S', 'IH0K', 'S', 'F', 'IH1NG', 'KS', 'ST', 'EH1L', 'TH', 'IY0', 'LIY0', 'EH0', 'S', 'CH', 'UW1', 'Z', 'DHAH0', 'N', 'IH0', 'M', 'AA1N', 'IH0K', 'N', 'UW0', 'M', 'AE1T', 'IH0K', 'N', 'IH1K', 'N', 'AE2', 'KS', '.']
37
False
['▁The', '▁', 'p', 'h', 'this', 'ic', '▁', 's', 'phin', 'x', '▁steal', 'thi', 'ly', '▁', 'e', 'sche', 'w', 's', '▁the', '▁', 'm', 'nem', 'onic', '▁pneu', 'matic', '▁', 'k', 'nick', 'k', 'n', 'ack', 's', '.']
34
['SH', 'IY1', 'S', 'IH0N', 'Z', 'SK', 'WER1', 'AH0L', 'CH', 'ER1', 'PT', 'RIH1', 'DH', 'M', 'IH0K', 'LIY0', 'WAY1', 'L', 'M', 'AH1N', 'CH', 'IH0NG', 'AA1N', 'Z', 'W', 'IH2', 'TER0', 'SH', 'IY1N', 'IH0NG', 'KW', 'EH1T', 'Z', 'AH0L', 'K', 'WIH1', 'LZ', '.']
38
False
['▁S', 'z', 'c', 'zeci', 'n', "'", 's', '▁squirrel', '▁', 'chir', 'ped', '▁rhythm', 'ically', '▁while', '▁mun', 'ching', '▁on', '▁', 'z', 'wit', 'terio', 'nic', '▁qu', 'etz', 'al', '▁qui', 'll', 's', '.']
30
['B', 'AO1R', 'NZ', 'FER0', 'G', 'EH1D', 'L', 'HH', 'IH0N', 'L

In [149]:
# Write the trained tokenizer to 'tokenizer_g2pen.json'; the same file name is
# passed as tokenizer_file when the tokenizer is wrapped for transformers.
tokenizer.save('tokenizer_g2pen.json')

In [151]:
from transformers import PreTrainedTokenizerFast
# Wrap the saved tokenizer JSON as a transformers fast tokenizer, declaring
# which vocabulary entries fill the special-token roles.
special_token_roles = dict(
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_g2pen.json", **special_token_roles)
# Save into the 'tokenizer_g2pen' directory; the returned file paths are the cell output.
wrapped_tokenizer.save_pretrained('tokenizer_g2pen')

('tokenizer_g2pen\\tokenizer_config.json',
 'tokenizer_g2pen\\special_tokens_map.json',
 'tokenizer_g2pen\\tokenizer.json')

In [155]:
# Show the vocabulary size reported by the wrapped tokenizer, to compare with
# the vocab_size=384 requested from the trainer.
print(wrapped_tokenizer.vocab_size)

384
