In [1]:
# depends on g2p-en
from datasets import load_dataset
from g2p_en import G2p

# Grapheme-to-phoneme converter; phonemize() calls it on each whitespace-separated chunk.
g2p = G2p()

# 'train' split of the 'generics_kb_best' configuration.
generics_kb = load_dataset(
    'community-datasets/generics_kb', 'generics_kb_best', split='train'
)

# 'train' split of the pony-speech dataset.
ponyspeech_dataset = load_dataset('synthbot/pony-speech', split='train')

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [2]:
def phonemize(text):
    """Convert ``text`` to a phoneme string using the module-level ``g2p``.

    The text is split on whitespace and every chunk is passed to ``g2p``
    on its own. The symbols returned for a chunk (ignoring any ``' '``
    entries) are concatenated without a separator, and the resulting
    chunks are joined with single spaces.

    Returns the joined string with leading/trailing whitespace removed.
    """
    words = []
    for chunk in text.split():
        symbols = [sym for sym in g2p(chunk) if sym != ' ']
        words.append(''.join(symbols))
    # strip() also removes a dangling space left by an empty first/last chunk.
    return ' '.join(words).strip()
# Quick smoke test of phonemize() on a sentence with punctuation and contractions.
print(phonemize("But one on one, let's clean it. So let's get down"))

BAH1T WAH1N AA1N WAH1N, LEH1TS KLIY1N IH1T. SOW1 LEH1TS GEH1T DAW1N


In [70]:
from unidecode import unidecode
from tqdm import tqdm
import os

n = 240000  # number of generics_kb sentences to sample for the corpus
shuffle_seed = 0  # fixed seed so re-running the cell samples the same sentences
corpus_output_txt = 'g2pen_corpus.txt'
# NOTE(review): the tokenizer-training cell reads 'g2pen_corpus_nosep.txt',
# not this file — confirm which filename is intended.

# Never request more rows than the dataset holds.
sample_size = min(n, len(generics_kb))
generic_sentences = generics_kb.shuffle(seed=shuffle_seed).select(
    range(sample_size)
)['generic_sentence']

# Phonemizing the whole corpus takes many minutes. Write to a scratch file and
# swap it into place only once complete, so an interrupted run does not leave
# a truncated corpus under the final name.
partial_output_txt = corpus_output_txt + '.partial'
with open(partial_output_txt, 'w', encoding='utf-8') as f:
    # One phonemized sentence per line: generics_kb sample first, then transcriptions.
    for text in tqdm(generic_sentences, desc='generic_kb wiki'):
        f.write(phonemize(text)+'\n')
    for text in tqdm(ponyspeech_dataset['transcription'], desc='ponyspeech'):
        f.write(phonemize(text)+'\n')
os.replace(partial_output_txt, corpus_output_txt)

generic_kb wiki:  30%|███       | 73164/240000 [04:39<10:36, 262.09it/s]


KeyboardInterrupt: 

In [3]:
from tokenizers import (decoders, models, normalizers,
    pre_tokenizers, processors, trainers, Tokenizer,
    )

import re

# Special tokens reserved in the vocabulary; '[UNK]' is also the trainer's unknown token.
special_tokens = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']

# Unigram model over byte-level pre-tokenized text.
tokenizer = Tokenizer(models.Unigram())

# Pre-tokenization stages, applied in order: byte-level mapping (prefix space
# added, regex splitting disabled), then punctuation splitting.
pre_tokenization_steps = [
    pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False),
    pre_tokenizers.Punctuation(),
]
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(pre_tokenization_steps)
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=1024,
    special_tokens=special_tokens,
    unk_token='[UNK]',
)
# NOTE(review): the corpus-writing cell produces 'g2pen_corpus.txt', while this
# trains on 'g2pen_corpus_nosep.txt' — confirm the file exists / which name is intended.
tokenizer.train(['g2pen_corpus_nosep.txt'], trainer=trainer)

In [4]:
# Sanity check on a short phoneme string: show the pre-tokenization,
# the resulting token pieces, and the round-trip through decode().
text = "S OW0"
print(tokenizer.pre_tokenizer.pre_tokenize_str(text))
encoded = tokenizer.encode(text)
print(encoded.tokens)
print(tokenizer.decode(encoded.ids))

[('ĠSĠOW0', (0, 5))]
['Ġ', 'SĠ', 'OW0']
 S OW0


In [6]:
from tokenizers import Tokenizer
# Load a tokenizer definition from a local JSON file and show how it
# pre-tokenizes and tokenizes a plain-English sentence.
# NOTE(review): the following cell rebinds `parler_tokenizer` to an
# AutoTokenizer instance, so this object is only used within this cell.
parler_tokenizer = Tokenizer.from_file('parler_tokenizer.json')
sample_sentence = "Oh my god, he went to eat the local food and got really sick."
print(parler_tokenizer.pre_tokenizer.pre_tokenize_str(sample_sentence))
print(parler_tokenizer.encode(sample_sentence).tokens)

[('▁Oh', (0, 2)), ('▁my', (2, 5)), ('▁god,', (5, 10)), ('▁he', (10, 13)), ('▁went', (13, 18)), ('▁to', (18, 21)), ('▁eat', (21, 25)), ('▁the', (25, 29)), ('▁local', (29, 35)), ('▁food', (35, 40)), ('▁and', (40, 44)), ('▁got', (44, 48)), ('▁really', (48, 55)), ('▁sick.', (55, 61))]
['▁Oh', '▁my', '▁god', ',', '▁', 'he', '▁went', '▁to', '▁', 'eat', '▁the', '▁local', '▁food', '▁and', '▁got', '▁really', '▁sick', '.', '</s>']


In [7]:
from transformers import AutoTokenizer
# Rebinds `parler_tokenizer` (previously loaded from 'parler_tokenizer.json') to the
# tokenizer loaded via AutoTokenizer for "parler-tts/parler-tts-mini-v1".
# The comparison loop later in the notebook calls .tokenize() and __call__ on this object.
parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

In [8]:
# Everyday sentences plus a few names/numbers, used to eyeball the trained
# tokenizer's segmentation of phonemized text.
test_texts = [
    'I need some money, can I get some money? I need one million.',
    'Raise up, get yourself together, and drive that funky soul.',
    'Our results demonstrate high fidelity speech generation and a diverse range of accents',
    'Equestria the brave Equestrian.',
    'Dead nirik storage.',
    'Well, one on one, let\'s clean it!',
    'Amazon delivers packages quickly across the United States!',
    'Bitcoin and Ethereum are popular cryptocurrencies.',
    'Text-to-speech models 1 $200',
    'Queue queuing',
    'Twilight Sparkle Pinkie Pie Fluttershy Applejack Rarity Rainbow Dash',
]

# For each sentence print the original text followed by its token pieces.
# (tokenizer.pre_tokenizer.pre_tokenize_str(phonemes) can be printed here
# when debugging the pre-tokenization step.)
for test_text in test_texts:
    phonemes = phonemize(test_text)
    encoding = tokenizer.encode(phonemes)
    print(test_text)
    print(encoding.tokens)

I need some money, can I get some money? I need one million.
['ĠAY1Ġ', 'NIY1DĠ', 'SAH1MĠ', 'M', 'AH1N', 'IY0', ',', 'Ġ', 'KAE1NĠ', 'AY1Ġ', 'GEH1TĠ', 'SAH1MĠ', 'M', 'AH1N', 'IY0', '?', 'ĠAY1Ġ', 'NIY1DĠ', 'WAH1NĠ', 'M', 'IH1L', 'Y', 'AH0N', '.']
Raise up, get yourself together, and drive that funky soul.
['Ġ', 'REY1', 'ZĠ', 'AH1', 'P', ',', 'Ġ', 'GEH1TĠ', 'YER0', 'SEH1LF', 'Ġ', 'TAH0', 'G', 'EH1', 'DH', 'ER0', ',', 'ĠAH0NDĠ', 'DR', 'AY1VĠ', 'DHAE1TĠ', 'F', 'AH1NGK', 'IY0Ġ', 'S', 'OW1L', '.']
Our results demonstrate high fidelity speech generation and a diverse range of accents
['Ġ', 'AW1ER0Ġ', 'RIH0ZAH1LTSĠ', 'D', 'EH1M', 'AH0N', 'STR', 'EY2TĠ', 'HHAY1', 'Ġ', 'F', 'AH0D', 'EH1L', 'AH0TIY0Ġ', 'SP', 'IY1CHĠ', 'JH', 'EH2N', 'ER0', 'EY1SHAH0NĠ', 'AH0NDĠ', 'AH0Ġ', 'D', 'AY0', 'VER1', 'SĠ', 'REY1N', 'JH', 'Ġ', 'AH1VĠ', 'AE1K', 'S', 'EH0N', 'T', 'S']
Equestria the brave Equestrian.
['Ġ', 'IH', '0KWEH1STRIY0AH0Ġ', 'DHAH0Ġ', 'B', 'REY1', 'V', 'Ġ', 'IH0K', 'W', 'EH1ST', 'RIY0', 'AH0N', '.']
Dead n

In [9]:
# Sentences packed with unusual spellings, loanwords and non-ASCII characters,
# used to compare the phoneme tokenizer against the Parler tokenizer.
test_texts = [
    "The phthisic sphinx stealthily eschews the mnemonic pneumatic knickknacks.",
    "Szczecin's squirrel chirped rhythmically while munching on zwitterionic quetzal quills.",
    "Bjorn's fjord-dwelling lynx phthalate psyche was quite a conundrum.",
    "The zeitgeist of Györgyike's csárdás xylophone quintet was truly unique.",
    "Tsar Xylophone's czarina whispered žuželjka while eating sauerkraut in Århus.",
    "Queuing for Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch's onomatopoeic eisteddfod.",
    "The kleptomaniac's schwannoma diagnosis left the psychiatrist nonplussed.",
    "Xerxes' xiphosura exhibited xenophobic behavior towards Zsa Zsa's Zhivago zhuzh.",
    "The phlegmatic pharmacist's emphysema was exacerbated by Phthonus' chthonic chrysanthemums.",
    "Kwyjibo's syzygy with Przybyszewski's psyche caused a hubbub in Ouagadougou.",
    "The bourgeois faux pas at the rendezvous caused quite a brouhaha amongst the hoi polloi.",
    "Nietzsche's Übermensch theory clashed with Schrödinger's quantum doppelgänger hypothesis.",
    "The pneumonoultramicroscopicsilicovolcanoconiosis diagnosis flummoxed the otorhinolaryngologist.",
    "Gnocchi-loving Giacomo's pince-nez glinted as he perused Goethe's oeuvre on the Champs-Élysées.",
    "The tsktsk of the ptarmigan's wings echoed through the cwtch as it flew over Loch Ness.",
]

for test_text in test_texts:
    # Phoneme tokenizer: token pieces, their count, and whether any piece is '[UNK]'.
    encoding = tokenizer.encode(phonemize(test_text))
    g2p_tokens = encoding.tokens
    print(g2p_tokens)
    print(len(g2p_tokens))
    print('[UNK]' in g2p_tokens)

    # Parler tokenizer on the raw text: pieces, then the number of input ids
    # (which, in the recorded output, is one more than the number of pieces).
    parler_pieces = parler_tokenizer.tokenize(test_text)
    print(parler_pieces)
    parler_ids = parler_tokenizer(test_text).input_ids
    print(len(parler_ids))

['ĠDHAH0Ġ', 'F', 'AH0', 'TH', 'IH1S', 'IH0KĠ', 'S', 'F', 'IH1NGK', 'SĠ', 'ST', 'EH1L', 'TH', 'IY0', 'LIY0Ġ', 'EH0', 'S', 'CH', 'UW1', 'ZĠ', 'DHAH0Ġ', 'N', 'IH0M', 'AA1N', 'IH0KĠ', 'N', 'UW0', 'MAE1', 'TIH0KĠ', 'N', 'IH1K', 'N', 'AE2K', 'S', '.']
35
False
['▁The', '▁', 'p', 'h', 'this', 'ic', '▁', 's', 'phin', 'x', '▁steal', 'thi', 'ly', '▁', 'e', 'sche', 'w', 's', '▁the', '▁', 'm', 'nem', 'onic', '▁pneu', 'matic', '▁', 'k', 'nick', 'k', 'n', 'ack', 's', '.']
34
['Ġ', 'SHIY1', 'S', 'IH0N', 'ZĠ', 'SK', 'WER1', 'AH0LĠ', 'CH', 'E', 'R', '1', 'P', 'TĠ', 'R', 'IH1', 'DH', 'M', 'IH0K', 'LIY0Ġ', 'WAY1', 'LĠ', 'M', 'AH1N', 'CH', 'IH0NGĠ', 'AA1NĠ', 'Z', 'W', 'IH2', 'TER0', 'SHIY1', 'N', 'IH0NGĠ', 'KW', 'EH1T', 'Z', 'AH0LĠ', 'KW', 'IH1L', 'Z', '.']
42
False
['▁S', 'z', 'c', 'zeci', 'n', "'", 's', '▁squirrel', '▁', 'chir', 'ped', '▁rhythm', 'ically', '▁while', '▁mun', 'ching', '▁on', '▁', 'z', 'wit', 'terio', 'nic', '▁qu', 'etz', 'al', '▁qui', 'll', 's', '.']
30
['Ġ', 'B', 'AO1R', 'NZĠ', 'FER0', '

In [10]:
# Serialize the trained tokenizer to JSON; the next cell loads this same file
# via PreTrainedTokenizerFast(tokenizer_file=...).
tokenizer.save('tokenizer_g2p_v3.json')

In [11]:
from transformers import PreTrainedTokenizerFast
# Wrap the saved tokenizer JSON in a transformers fast tokenizer and tell it
# which strings play the unk/pad/cls/sep/mask roles.
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_g2p_v3.json",
    **special_token_roles,
)
# Left as the cell's last expression so the tuple of written file paths is displayed.
wrapped_tokenizer.save_pretrained('tokenizer_g2p_v3')

('tokenizer_g2p_v3\\tokenizer_config.json',
 'tokenizer_g2p_v3\\special_tokens_map.json',
 'tokenizer_g2p_v3\\tokenizer.json')

In [67]:
# Report the vocabulary size of the wrapped tokenizer.
# NOTE(review): the recorded output below (384) does not match the vocab_size=1024
# requested from the trainer, and this cell's execution count (67) is out of sequence —
# presumably a stale result from an earlier run; re-run after training to confirm.
print(wrapped_tokenizer.vocab_size)

384
