In [44]:
from transformers import AutoTokenizer 
from itertools import groupby
import numpy as np
import string
import re

# Credit to Synthbot for pruned tokenizers
class HybridPhonemeTokenizer:
    def __init__(self,
        tokenizer_eng = 'synthbot/parlertts_tokenizer_clean',
        tokenizer_g2p = 'synthbot/vul_g2pen_tokenizer_clean',
        eng_special = {
            'pad_token': "<pad>",
            'sep_token': "</s>",
            'unk_token': "<unk>",
        },
        g2p_special = {
            'unk_token': "[UNK]",
            'pad_token': "[PAD]",
            'cls_token': "[CLS]",
            'sep_token': "[SEP]",
            'mask_token':"[MASk]",
        },
         **kwargs):
        self.name_or_path = 'hybrid_phoneme_tokenizer'
        self.tokenizer_eng = AutoTokenizer.from_pretrained(
            tokenizer_eng, **eng_special)
        self.tokenizer_g2p = AutoTokenizer.from_pretrained(
            tokenizer_g2p, **g2p_special)

        # Not sure if this is actually necessary - ByteLevel pretokenizer
        # removes possibility of <unk> tokens
        self.special_tokens = {
            self.tokenizer_g2p.pad_token_id: self.tokenizer_eng.pad_token_id,
            self.tokenizer_g2p.bos_token_id: self.tokenizer_eng.bos_token_id,
            self.tokenizer_g2p.cls_token_id: self.tokenizer_eng.cls_token_id,
            self.tokenizer_g2p.eos_token_id: self.tokenizer_eng.eos_token_id,
            self.tokenizer_g2p.unk_token_id: self.tokenizer_eng.unk_token_id,
            self.tokenizer_g2p.mask_token_id: self.tokenizer_eng.mask_token_id
        }

        g2p_offset = len(self.tokenizer_eng.get_vocab())
        self.g2p_offset = g2p_offset

    def preprocess(self, text):
        # Replace multiple spaces with one space
        # And replace ñ with n
        text = re.sub(r'\s+', ' ', text).replace('ñ', 'n')
        return text

    def __call__(self, text):
        text = self.preprocess(text)
        parts = re.split(r'({.*?})', text)
        result = []
        for i, part in enumerate(parts):
            if not len(part):
                continue
            part = part.strip()
            if not (part.startswith('{') and part.endswith('}')):
                ids = self.tokenizer_eng(part)['input_ids']
                result += [i for i in ids]
            else:
                ids = self.tokenizer_g2p(part[1:-1])['input_ids']
                for i,id in enumerate(ids):
                    if id in self.special_tokens:
                        ids[i] = self.special_tokens[id] - self.g2p_offset
                result += [i + self.g2p_offset for i in ids]
        return {'input_ids': result, 'attention_mask': list(np.ones_like(result))}

    # Returns string constructed from decoded tokens with space handling
    def _list_decode(self, input_ids, skip_special_tokens=False):
        decode_args = {
            'clean_up_tokenization_spaces': True,
            'skip_special_tokens': skip_special_tokens
        }
        output = ''
        for key, group in groupby(input_ids,
            key=lambda x: x >= self.g2p_offset):
            g = list(group)
            if key:
                if len(output) == 0 or output[-1] != ' ':
                    output += ' '
                output += '{'
                output += self.tokenizer_g2p.decode(
                    [i - self.g2p_offset for i in g],
                     **decode_args)
                output += '}'
            else:
                decoded = self.tokenizer_eng.decode(
                    g, **decode_args)
                if len(output) and output[-1] == '}':
                    if len(decoded) and not decoded[0] in string.punctuation:
                        output += ' '
                output += decoded
        return output.strip()

    # Returns list of string tokens with no space handling
    def _decode_tokens(self, input_ids, skip_special_tokens=False):
        toks = []
        for key, group in groupby(input_ids,
            key=lambda x: x >= self.g2p_offset):
            g = list(group)
            if key:
                toks.extend(
                    [self.tokenizer_g2p.decode(i - self.g2p_offset) for i in g])
            else:
                toks.extend([self.tokenizer_eng.decode(i) for i in g])
        return toks
    
    def batch_decode(self, input_ids, skip_special_tokens=False):
        if not isinstance(input_ids[0], list):
            return self._decode_tokens(input_ids)

        return [self._list_decode(l, skip_special_tokens) for l in input_ids]

prompt_tokenizer = HybridPhonemeTokenizer()

In [16]:
text = "And my heaven will be a mare heaven, and I will walk through the front door."
input_ids = prompt_tokenizer(text)['input_ids']
decoded = prompt_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
print(input_ids)
print(decoded)

text = "It's {S OW0 M AH0 CH} larger than life."
input_ids = prompt_tokenizer(text)['input_ids']
decoded = prompt_tokenizer.batch_decode([input_ids], skip_special_tokens=True)
print(input_ids)
print(decoded)
print(prompt_tokenizer(decoded[0])['input_ids'])
print(prompt_tokenizer.batch_decode([prompt_tokenizer(decoded[0])['input_ids']], 
    skip_special_tokens=True))

[275, 82, 9922, 56, 36, 3, 9, 1555, 9922, 6, 11, 27, 56, 1482, 190, 8, 851, 1365, 5, 1]
['And', 'my', 'heaven', 'will', 'be', '', 'a', 'mare', 'heaven', ',', 'and', 'I', 'will', 'walk', 'through', 'the', 'front', 'door', '.', '</s>']
[5, 42, 87, 5, 14, 180, 43]
[94, 31, 7, 1, 32105, 32142, 32187, 32105, 32114, 32280, 32143, 2186, 145, 280, 5, 1]
["It's { S OW0 M AH0 CH} larger than life."]
[5, 42, 87, 5, 14, 180, 43]
[94, 31, 7, 1, 32105, 32142, 32187, 32105, 32114, 32280, 32143, 2186, 145, 280, 5, 1]
[5, 42, 87, 5, 14, 180, 43]
["It's { S OW0 M AH0 CH} larger than life."]


In [17]:
text = "And my heaven will be a mare heaven, and I will walk through the front door."
input_ids = prompt_tokenizer(text)['input_ids']
decoded = prompt_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
print(input_ids)
print(decoded)

prompt_tokenizer2 = HybridPhonemeTokenizer(tokenizer_eng='parler-tts/parler-tts-mini-v1')
text = "And my heaven will be a mare heaven, and I will walk through the front door."
input_ids = prompt_tokenizer2(text)['input_ids']
decoded = prompt_tokenizer2.batch_decode(input_ids, skip_special_tokens=True)
print(input_ids)
print(decoded)

[275, 82, 9922, 56, 36, 3, 9, 1555, 9922, 6, 11, 27, 56, 1482, 190, 8, 851, 1365, 5, 1]
['And', 'my', 'heaven', 'will', 'be', '', 'a', 'mare', 'heaven', ',', 'and', 'I', 'will', 'walk', 'through', 'the', 'front', 'door', '.', '</s>']
[275, 82, 9922, 56, 36, 3, 9, 1555, 9922, 6, 11, 27, 56, 1482, 190, 8, 851, 1365, 5, 1]
['And', 'my', 'heaven', 'will', 'be', '', 'a', 'mare', 'heaven', ',', 'and', 'I', 'will', 'walk', 'through', 'the', 'front', 'door', '.', '</s>']


In [7]:
from transformers import AutoTokenizer
tokenizer_eng = AutoTokenizer.from_pretrained('parler-tts/parler-tts-mini-v1')
tokenizer_eng_new = AutoTokenizer.from_pretrained('synthbot/parlertts_tokenizer_clean')
tokenizer_g2p = AutoTokenizer.from_pretrained('tokenizer_g2p_v2')

In [18]:
tokenizers = [tokenizer_eng, tokenizer_eng_new, tokenizer_g2p, prompt_tokenizer]
compare_prompts = ["Hi there!", " Whoa there! ", "{S OW0 }", "It's {S OW0 M AH0 CH OW0}"]
for p in compare_prompts:
    for t in tokenizers:
        print(t.name_or_path)
        tokenized = t(p.strip())
        print(tokenized)
        print(t.batch_decode([tokenized['input_ids']], skip_special_tokens=True))

parler-tts/parler-tts-mini-v1
{'input_ids': [2018, 132, 55, 1], 'attention_mask': [1, 1, 1, 1]}
['Hi there!']
synthbot/parlertts_tokenizer_clean
{'input_ids': [2018, 132, 55], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}
['▁Hi ▁there!']
tokenizer_g2p_v2
{'input_ids': [40, 21, 40, 5], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
[' H!']
hybrid_phoneme_tokenizer
{'input_ids': [2018, 132, 55, 1], 'attention_mask': [1, 1, 1, 1]}
['Hi there!']
parler-tts/parler-tts-mini-v1
{'input_ids': [2645, 9, 132, 55, 1], 'attention_mask': [1, 1, 1, 1, 1]}
['Whoa there!']
synthbot/parlertts_tokenizer_clean
{'input_ids': [2645, 9, 132, 55], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
['▁Who a ▁there!']
tokenizer_g2p_v2
{'input_ids': [130, 40, 5], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}
[' W!']
hybrid_phoneme_tokenizer
{'input_ids': [2645, 9, 132, 55, 1], 'attention_mask': [1, 1, 1, 1, 1]}
['Whoa there!']
parler-tts/parler-tts-mini-v1
{

In [29]:
test_texts = [
  'I need some money, can I get some money? I need one million.',
    'Raise up, get yourself together, and drive that funky soul.',
    'Our results demonstrate high fidelity speech generation and a diverse range of accents',
    'Equestria the brave Equestrian.',
    'Dead nirik storage.',
    'Well, one on one, let\'s clean it!',
    'Amazon delivers packages quickly across the United States!',
    'Bitcoin and Ethereum are popular cryptocurrencies.',
    'Text-to-speech models 1 $200',
    'Queue queuing',
    'Twilight Sparkle Pinkie Pie Fluttershy Applejack Rarity Rainbow Dash'
    ]
for test_text in test_texts:
    tokenized = prompt_tokenizer(test_text.strip())
    #print(tokenized)
    print(prompt_tokenizer.batch_decode(tokenized['input_ids'], skip_special_tokens=True))

['I', 'need', 'some', 'money', ',', 'can', 'I', 'get', 'some', 'money', '?', 'I', 'need', 'one', 'million', '.', '</s>']
['Rai', 's', 'e', 'up', ',', 'get', 'yourself', 'together', ',', 'and', 'drive', 'that', 'fun', 'k', 'y', 'soul', '.', '</s>']
['Our', 'results', 'demonstrate', 'high', 'fi', 'de', 'l', 'ity', 'speech', 'generation', 'and', '', 'a', 'diverse', 'range', 'of', 'accent', 's', '</s>']
['Equ', 'e', 'stria', 'the', 'brave', 'Equ', 'e', 'stria', 'n', '.', '</s>']
['Dead', '', 'n', 'i', 'r', 'i', 'k', 'storage', '.', '</s>']
['Well', ',', 'one', 'on', 'one', ',', 'let', "'", 's', 'clean', 'it', '!', '</s>']
['Amazon', 'delivers', 'packages', 'quickly', 'across', 'the', 'United', 'States', '!', '</s>']
['Bitcoin', 'and', 'Ethereum', 'are', 'popular', 'crypto', 'cu', 'r', 'r', 'en', 'c', 'ies', '.', '</s>']
['Text', '-', 'to', '-', 's', 'pe', 'e', 'ch', 'models', '1', '$200', '</s>']
['Que', 'u', 'e', 'que', 'u', 'ing', '</s>']
['Twi', 'light', 'Spark', 'l', 'e', 'Pink', 'i', 

In [107]:
import random
from itertools import groupby
from g2p_en import G2p
import time
import re
import string

g2p = G2p()

def phonemize(text):
    """Uses g2p_en to convert a string into contiguous ARPAbet characters"""
    spl = text.split()
    l = ''
    for s in spl:
        p = [arp for arp in g2p(s) if arp != ' ']
        arpabet_string = ''.join(p)
        l += arpabet_string + ' '
    return l.strip()

def clean_spaces(text):
    """Remove spaces before punctuation."""
    return re.sub(r'\s+([.,!?;:])', r'\1', text)

def random_phonemize(text, prob=0.2, grow_prob=0.5, seed=0):
    """ Randomly phonemize spans of text.
    `prob` influences the base probability of an index being phonemized
    `grow_prob` adds a probability for the previous index being phonemized."""
    text = clean_spaces(text)
    # Split including words or isolated punctuation
    spl = re.findall(r'[\w\']+|[.,!?;:]', text)
    splbits = [0 for s in spl]
    idxs = list(t[0] for t in enumerate(spl))

    random.seed(seed)
    random.shuffle(idxs)

    for idx in idxs[:int(prob*len(spl))]:
        splbits[idx] = 1
        if random.random() < grow_prob:
            if idx > 0:
                splbits[idx-1] = 1

    ret = ''

    for key, group in groupby(enumerate(splbits),
        key = lambda t: t[1] == 1):
        g = list(group)
        g = [spl[t[0]] for t in g]
        str_to_process = clean_spaces(' '.join(g))
        if key == 0:
            ret += str_to_process+' '
        else:
            ret += '{'+phonemize(str_to_process)+'} '

    return clean_spaces(ret)

for test_text in test_texts:
    #print(random_phonemize(test_text))
    rp = random_phonemize(test_text, seed=int(time.time()))
    print(rp)
    tokenized = prompt_tokenizer(rp.strip())
    print(prompt_tokenizer.batch_decode(tokenized['input_ids'], skip_special_tokens=True))
    print(prompt_tokenizer.batch_decode([tokenized['input_ids']], skip_special_tokens=True))

I {NIY1D SAH1M} money, can I get some money? {AY1 NIY1D WAH1N MIH1LYAH0N}. 
['I', '</s>', '', '', '', '', 'money', ',', 'can', 'I', 'get', 'some', 'money', '?', '</s>', '', '', '', '', '', '', '', '', '.', '</s>']
['I { NIY1D SAH1M} money, can I get some money? { AY1 NIY1D WAH1N MIH1LYAH0N}.']
Raise up, get yourself {TAH0GEH1DHER0}, and drive that {FAH1NGKIY0 SOW1L}. 
['Rai', 's', 'e', 'up', ',', 'get', 'yourself', '</s>', '', '', '', '', '', '', '', ',', 'and', 'drive', 'that', '</s>', '', '', '', '', '', '', '', '.', '</s>']
['Raise up, get yourself { TAH0GEH1DHER0}, and drive that { FAH1NGKIY0 SOW1L}.']
Our results demonstrate high fidelity {SPIY1CH} generation and a diverse {REY1NJH AH1V} accents 
['Our', 'results', 'demonstrate', 'high', 'fi', 'de', 'l', 'ity', '</s>', '', '', '', 'generation', 'and', '', 'a', 'diverse', '</s>', '', '', '', '', '', '', 'accent', 's', '</s>']
['Our results demonstrate high fidelity { SPIY1CH} generation and a diverse { REY1NJH AH1V} accents']
Eques