In [29]:
from transformers import AutoTokenizer 
import random
from huggingface_hub import hf_hub_download
from itertools import groupby
from g2p_en import G2p
import numpy as np
import string
import re
import json

def clean_spaces(text):
    """Tidy spacing around punctuation and opening braces.

    Whitespace directly before any of ``. , ! ? ; :`` is dropped, and so is
    whitespace directly after a ``{``. Everything else is left as-is.
    """
    # First pass: glue punctuation onto the preceding token.
    punct_attached = re.sub(r'\s+([.,!?;:])', r'\1', text)
    # Second pass: remove padding just inside an opening brace.
    return re.sub(r'{\s+','{', punct_attached)

# Credit to Synthbot for pruned tokenizers
class HybridPhonemeTokenizer:
    """Tokenizer for prompts that mix English text with ``{...}`` phoneme spans.

    Text outside braces is encoded with the English tokenizer. Text inside
    braces is encoded with the g2p tokenizer, and the resulting ids are
    re-mapped onto ids of English-vocab entries whose score is ``-99.0``
    (treated here as disabled slots), so both kinds of token share one id
    space without growing the vocabulary.
    """

    # NOTE: the dict defaults below are only unpacked (``**``), never mutated,
    # so sharing them between instances is harmless.
    def __init__(self,
        tokenizer_eng = 'therealvul/parlertts_tokenizer_clean',
        tokenizer_g2p = 'therealvul/g2pen_tokenizer_clean',
        eng_special = {
            'pad_token': "<pad>",
            'eos_token': "</s>",
            'unk_token': "<unk>",
        },
        g2p_special = {
            'unk_token': "[UNK]",
            'pad_token': "[PAD]",
            'cls_token': "[CLS]",
            'eos_token': "[SEP]",
            # NOTE(review): "[MASk]" (lower-case k) looks like a typo for
            # "[MASK]"; left untouched because the tokenizer repo's actual
            # token cannot be checked from here - confirm.
            'mask_token':"[MASk]",
        },
         **kwargs):
        """Load both tokenizers and build the g2p <-> external id tables.

        Args:
            tokenizer_eng: Hub repo id of the English tokenizer. Its
                ``tokenizer.json`` is also downloaded to read vocab scores.
            tokenizer_g2p: Hub repo id of the phoneme tokenizer.
            eng_special: special-token kwargs for the English tokenizer.
            g2p_special: special-token kwargs for the g2p tokenizer.
            **kwargs: accepted for call compatibility; unused.

        Raises:
            ValueError: if the g2p vocabulary has more entries than there
                are disabled slots in the English vocabulary.
        """
        self.name_or_path = 'hybrid_phoneme_tokenizer'
        self.tokenizer_eng = AutoTokenizer.from_pretrained(
            tokenizer_eng, **eng_special)
        self.tokenizer_g2p = AutoTokenizer.from_pretrained(
            tokenizer_g2p, **g2p_special)
        tokenizer_eng_vocab_path = hf_hub_download(repo_id=
            tokenizer_eng, filename="tokenizer.json")
        with open(tokenizer_eng_vocab_path, encoding='utf-8') as f:
            tokenizer_eng_scores = json.load(f)["model"]["vocab"]

        # To avoid expanding vocab size and changing embedding length,
        # we re-map g2p IDs to disabled IDs in the english tokenizer

        # maps from g2p id to external id
        self.g2p_to_ext = []
        # maps from external id to g2p id
        self.ext_to_g2p = {}
        for ext_id, (_token, score) in enumerate(tokenizer_eng_scores):
            if score == -99.0:
                # The n-th disabled slot hosts g2p id n.
                self.ext_to_g2p[ext_id] = len(self.g2p_to_ext)
                self.g2p_to_ext.append(ext_id)

        # The vocab size of the g2p tokenizer must be smaller or equal to
        # the number of disabled tokens in the eng tokenizer. Raised
        # explicitly (not asserted) so the check survives ``python -O``.
        g2p_vocab_size = len(self.tokenizer_g2p.get_vocab())
        if g2p_vocab_size > len(self.g2p_to_ext):
            raise ValueError(
                f"g2p vocab size ({g2p_vocab_size}) exceeds the number of "
                f"disabled English token ids ({len(self.g2p_to_ext)})")

        # Not sure if this is actually necessary - ByteLevel pretokenizer
        # removes possibility of <unk> tokens
        # Maps g2p special-token ids to the English tokenizer's equivalents.
        # Tokens a tokenizer does not define have id None; such keys collapse
        # into a single None entry that integer lookups never hit.
        self.special_tokens = {
            self.tokenizer_g2p.pad_token_id: self.tokenizer_eng.pad_token_id,
            self.tokenizer_g2p.bos_token_id: self.tokenizer_eng.bos_token_id,
            self.tokenizer_g2p.cls_token_id: self.tokenizer_eng.cls_token_id,
            self.tokenizer_g2p.eos_token_id: self.tokenizer_eng.eos_token_id,
            self.tokenizer_g2p.unk_token_id: self.tokenizer_eng.unk_token_id,
            self.tokenizer_g2p.mask_token_id: self.tokenizer_eng.mask_token_id
        }
        self.pad_token_id = self.tokenizer_eng.pad_token_id

    def ext_is_g2p_id(self, id):
        """Return True if external id `id` is a slot that hosts a g2p token."""
        return id in self.ext_to_g2p

    def ext_to_g2p_id(self, id):
        """Translate an external id to its g2p id (KeyError if not a g2p slot)."""
        return self.ext_to_g2p[id]

    def g2p_to_ext_id(self, id):
        """Translate a g2p id to the external id of the slot that hosts it."""
        return self.g2p_to_ext[id]

    def preprocess(self, text):
        """Collapse each whitespace run to one space and replace 'ñ' with 'n'."""
        text = re.sub(r'\s+', ' ', text).replace('ñ', 'n')
        return text

    def max_vocab_length(self):
        """Return the number of entries in the English tokenizer's vocab."""
        return len(self.tokenizer_eng.get_vocab())

    def __call__(self, text):
        """Encode `text`, routing ``{...}`` spans through the g2p tokenizer.

        Returns:
            dict with ``'input_ids'`` (list of external ids) and
            ``'attention_mask'`` (a list of 1s of the same length).
        """
        text = self.preprocess(text)
        input_ids = []
        # The capture group makes re.split keep the {...} spans in the result.
        for part in re.split(r'({.*?})', text):
            if not part:
                continue
            part = part.strip()
            if part.startswith('{') and part.endswith('}'):
                for g2p_id in self.tokenizer_g2p(part[1:-1])['input_ids']:
                    if g2p_id in self.special_tokens:
                        input_ids.append(self.special_tokens[g2p_id])
                    else:
                        input_ids.append(self.g2p_to_ext_id(g2p_id))
            else:
                input_ids.extend(self.tokenizer_eng(
                    part, add_special_tokens=False)['input_ids'])
        return {'input_ids': input_ids,
                'attention_mask': [1] * len(input_ids)}

    # Returns string constructed from decoded tokens with space handling
    def _list_decode(self, input_ids, skip_special_tokens=False):
        """Decode one id sequence to a string, re-wrapping g2p runs in braces."""
        decode_args = {
            'clean_up_tokenization_spaces': True,
            'skip_special_tokens': skip_special_tokens
        }
        output = ''
        # Consecutive ids of the same kind (g2p / English) are decoded together.
        for isg2p, group in groupby(input_ids, key=self.ext_is_g2p_id):
            g = list(group)
            if isg2p:
                # Keep exactly one space between preceding text and '{'.
                if len(output) == 0 or output[-1] != ' ':
                    output += ' '
                output += '{'
                output += self.tokenizer_g2p.decode(
                    [self.ext_to_g2p_id(i) for i in g],
                     **decode_args)
                output += '}'
            else:
                decoded = self.tokenizer_eng.decode(
                    g, **decode_args)
                # Separate '}' from following text unless it is punctuation.
                if len(output) and output[-1] == '}':
                    if len(decoded) and decoded[0] not in string.punctuation:
                        output += ' '
                output += decoded
        return clean_spaces(output.strip())

    # Returns list of string tokens with no space handling
    def _decode_tokens(self, input_ids, skip_special_tokens=False):
        """Decode every id on its own and return the list of strings.

        `skip_special_tokens` is forwarded to each underlying decode call.
        """
        toks = []
        for isg2p, group in groupby(input_ids, key=self.ext_is_g2p_id):
            if isg2p:
                toks.extend(
                    self.tokenizer_g2p.decode(
                        self.ext_to_g2p_id(i),
                        skip_special_tokens=skip_special_tokens)
                    for i in group)
            else:
                toks.extend(
                    self.tokenizer_eng.decode(
                        i, skip_special_tokens=skip_special_tokens)
                    for i in group)
        return toks

    def batch_decode(self, input_ids, skip_special_tokens=False):
        """Decode either a flat id list or a list of id lists.

        A flat list yields one string per token; a list of lists yields one
        reconstructed string per inner list. Empty input yields ``[]``.
        """
        if len(input_ids) == 0:
            return []
        if not isinstance(input_ids[0], list):
            return self._decode_tokens(input_ids, skip_special_tokens)

        return [self._list_decode(l, skip_special_tokens) for l in input_ids]

    

# Shared g2p_en converter; HorsePhonemizer.phonemize calls it for words that
# are not in its own dictionary.
g2p = G2p()

class HorsePhonemizer:
    """Turn text into ARPAbet using a custom word dictionary plus g2p_en."""

    def __init__(self, horsewords_dictionary = 'new_horsewords.clean'):
        """Load the word -> transcription dictionary.

        Every non-blank line of the file must look like
        ``WORD<two spaces>TRANSCRIPTION``; blank lines are skipped.

        Args:
            horsewords_dictionary: path to the dictionary file.

        Raises:
            ValueError: if a non-blank line has no two-space separator.
        """
        self.horsedict = {}
        with open(horsewords_dictionary, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f, start=1):
                entry = line.rstrip('\n')
                if not entry.strip():
                    continue
                # Split on the first separator only, so a transcription that
                # itself contains a double space does not break loading.
                baseword, sep, transcription = entry.partition('  ')
                if not sep:
                    raise ValueError(
                        f"{horsewords_dictionary}:{lineno}: expected "
                        f"'WORD  TRANSCRIPTION', got {entry!r}")
                self.horsedict[baseword] = transcription

    def phonemize(self, text):
        """Uses g2p_en + a dictionary to convert a string into contiguous ARPAbet characters"""
        converted = []
        for s in text.split():
            transcription = self.horsedict.get(s.upper())
            if transcription is not None:
                # Dictionary hit: drop the spaces between phonemes.
                converted.append(''.join(transcription.split()))
            else:
                # Fall back to g2p_en, discarding its word-separator entries.
                converted.append(''.join(arp for arp in g2p(s) if arp != ' '))
        return ' '.join(converted).strip()

    def random_phonemize(self, text, prob=0.2, grow_prob=0.2, seed=None):
        """ Randomly phonemize spans of text.
        `prob` influences the base probability of an index being phonemized
        `grow_prob` adds a probability for the previous index being phonemized.

        With `seed` set, a private generator seeded with it is used, so the
        selection is reproducible and the process-wide `random` state is left
        alone. With `seed=None` the module-level generator is used.

        Returns the text with the chosen spans replaced by ``{ARPAbet}``
        blocks; a non-empty result ends with a single space.
        """
        text = clean_spaces(text)
        # Split including words or isolated punctuation
        spl = re.findall(r'[\w\']+|[.,!?;:]', text)
        splbits = [False] * len(spl)
        idxs = list(range(len(spl)))

        rng = random if seed is None else random.Random(seed)
        rng.shuffle(idxs)

        for idx in idxs[:int(prob*len(spl))]:
            splbits[idx] = True
            # Draw for every chosen index, even idx 0, to keep the random
            # sequence independent of which indices were picked.
            if rng.random() < grow_prob and idx > 0:
                splbits[idx-1] = True

        pieces = []
        # Adjacent tokens with the same flag form one span.
        for phonemized, group in groupby(zip(splbits, spl),
            key = lambda t: t[0]):
            str_to_process = clean_spaces(' '.join(tok for _, tok in group))
            if phonemized:
                pieces.append('{'+self.phonemize(str_to_process)+'}')
            else:
                pieces.append(str_to_process)

        return clean_spaces(''.join(piece+' ' for piece in pieces))

# Build the hybrid tokenizer with its default tokenizer repos, then print the
# number of entries in the English tokenizer's vocabulary.
prompt_tokenizer = HybridPhonemeTokenizer()
print(prompt_tokenizer.max_vocab_length())

# NOTE(review): this cell defines HorsePhonemizer twice; this second
# definition is the one that stays bound to the name.
class HorsePhonemizer:
    """Turn text into ARPAbet using a custom word dictionary plus g2p_en."""

    def __init__(self, horsewords_dictionary = 'new_horsewords.clean'):
        """Load the word -> transcription dictionary.

        Every non-blank line of the file must look like
        ``WORD<two spaces>TRANSCRIPTION``; blank lines are skipped.

        Args:
            horsewords_dictionary: path to the dictionary file.

        Raises:
            ValueError: if a non-blank line has no two-space separator.
        """
        self.horsedict = {}
        with open(horsewords_dictionary, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f, start=1):
                entry = line.rstrip('\n')
                if not entry.strip():
                    continue
                # Split on the first separator only, so a transcription that
                # itself contains a double space does not break loading.
                baseword, sep, transcription = entry.partition('  ')
                if not sep:
                    raise ValueError(
                        f"{horsewords_dictionary}:{lineno}: expected "
                        f"'WORD  TRANSCRIPTION', got {entry!r}")
                self.horsedict[baseword] = transcription

    def phonemize(self, text):
        """Uses g2p_en + a dictionary to convert a string into contiguous ARPAbet characters"""
        converted = []
        for s in text.split():
            transcription = self.horsedict.get(s.upper())
            if transcription is not None:
                # Dictionary hit: drop the spaces between phonemes.
                converted.append(''.join(transcription.split()))
            else:
                # Fall back to g2p_en, discarding its word-separator entries.
                converted.append(''.join(arp for arp in g2p(s) if arp != ' '))
        return ' '.join(converted).strip()

    def random_phonemize(self, text, prob=0.2, grow_prob=0.2, seed=None):
        """ Randomly phonemize spans of text.
        `prob` influences the base probability of an index being phonemized
        `grow_prob` adds a probability for the previous index being phonemized.

        With `seed` set, a private generator seeded with it is used, so the
        selection is reproducible and the process-wide `random` state is left
        alone. With `seed=None` the module-level generator is used.

        Returns the text with the chosen spans replaced by ``{ARPAbet}``
        blocks; a non-empty result ends with a single space.
        """
        text = clean_spaces(text)
        # Split including words or isolated punctuation
        spl = re.findall(r'[\w\']+|[.,!?;:]', text)
        splbits = [False] * len(spl)
        idxs = list(range(len(spl)))

        rng = random if seed is None else random.Random(seed)
        rng.shuffle(idxs)

        for idx in idxs[:int(prob*len(spl))]:
            splbits[idx] = True
            # Draw for every chosen index, even idx 0, to keep the random
            # sequence independent of which indices were picked.
            if rng.random() < grow_prob and idx > 0:
                splbits[idx-1] = True

        pieces = []
        # Adjacent tokens with the same flag form one span.
        for phonemized, group in groupby(zip(splbits, spl),
            key = lambda t: t[0]):
            str_to_process = clean_spaces(' '.join(tok for _, tok in group))
            if phonemized:
                pieces.append('{'+self.phonemize(str_to_process)+'}')
            else:
                pieces.append(str_to_process)

        return clean_spaces(''.join(piece+' ' for piece in pieces))

# Phonemizer backed by the default dictionary file ('new_horsewords.clean').
horse_phonemizer = HorsePhonemizer()

32100


In [70]:
test_text = "Come on Rainbow Dash, you can do this. Just remember the routine!"
# Encode the plain sentence and decode it back to a single string.
ids = prompt_tokenizer(test_text)['input_ids']
text = prompt_tokenizer.batch_decode([ids])[0]
# Swap randomly chosen spans for {ARPAbet} blocks (no seed passed here).
text = horse_phonemizer.random_phonemize(text)
# Encode and decode the phonemized sentence, then print the result.
ids = prompt_tokenizer(text)['input_ids']
text = prompt_tokenizer.batch_decode([ids])[0]
print(text)

Come on Rainbow Dash {, YUW1 KAE1N} do {DHIH1S}. Just remember the routine!


In [16]:
text = "And my heaven will be a mare heaven, and I will walk through the front door."
input_ids = prompt_tokenizer(text)['input_ids']
# A flat id list makes batch_decode return one string per token.
decoded = prompt_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
print(input_ids)
print(decoded)

# Same experiment with an explicit {...} phoneme span in the prompt.
text = "It's {S OW0 M AH0 CH} larger than life."
input_ids = prompt_tokenizer(text)['input_ids']
# A nested list makes batch_decode return one joined string per inner list.
decoded = prompt_tokenizer.batch_decode([input_ids], skip_special_tokens=True)
print(input_ids)
print(decoded)
# Re-encode the decoded string and decode again to compare with the first pass.
print(prompt_tokenizer(decoded[0])['input_ids'])
print(prompt_tokenizer.batch_decode([prompt_tokenizer(decoded[0])['input_ids']], 
    skip_special_tokens=True))

# Per-id flag: True where the id falls in the remapped g2p range.
print([prompt_tokenizer.ext_is_g2p_id(i) for i in input_ids])

[275, 82, 3, 88, 9, 162, 29, 56, 36, 3, 9, 1555, 3, 88, 9, 162, 29, 6, 11, 27, 56, 1482, 190, 8, 851, 1365, 5]
['And', 'my', '', 'he', 'a', 've', 'n', 'will', 'be', '', 'a', 'mare', '', 'he', 'a', 've', 'n', ',', 'and', 'I', 'will', 'walk', 'through', 'the', 'front', 'door', '.']
[94, 31, 7, 93, 252, 374, 93, 171, 583, 256, 50, 52, 122, 49, 145, 280, 5]
["It's {S OW0 M AH0 CH} larger than life."]
[94, 31, 7, 93, 252, 374, 93, 171, 583, 256, 50, 52, 122, 49, 145, 280, 5]
["It's {S OW0 M AH0 CH} larger than life."]
[False, False, False, True, True, True, True, True, True, True, False, False, False, False, False, False, False]


In [17]:
text = "And my heaven will be a mare heaven, and I will walk through the front door."
input_ids = prompt_tokenizer(text)['input_ids']
# Flat id list -> per-token strings, to inspect how the sentence is split.
decoded = prompt_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
print(input_ids)
print(decoded)

# Disabled: same experiment with tokenizer_eng='parler-tts/parler-tts-mini-v1'.
#prompt_tokenizer2 = HybridPhonemeTokenizer(tokenizer_eng='parler-tts/parler-tts-mini-v1')
#text = "And my heaven will be a mare heaven, and I will walk through the front door."
#input_ids = prompt_tokenizer2(text)['input_ids']
#decoded = prompt_tokenizer2.batch_decode(input_ids, skip_special_tokens=True)
#print(input_ids)
#print(decoded)

[275, 82, 3, 88, 9, 162, 29, 56, 36, 3, 9, 1555, 3, 88, 9, 162, 29, 6, 11, 27, 56, 1482, 190, 8, 851, 1365, 5]
['And', 'my', '', 'he', 'a', 've', 'n', 'will', 'be', '', 'a', 'mare', '', 'he', 'a', 've', 'n', ',', 'and', 'I', 'will', 'walk', 'through', 'the', 'front', 'door', '.']


In [11]:
from transformers import AutoTokenizer
# Two standalone English tokenizers loaded by repo id; the next cell compares
# their output against prompt_tokenizer.
tokenizer_eng = AutoTokenizer.from_pretrained('parler-tts/parler-tts-mini-v1')
tokenizer_eng_new = AutoTokenizer.from_pretrained('synthbot/parlertts_tokenizer_clean')

In [18]:
# Tokenizers to compare on the same prompt.
tokenizers = [#tokenizer_eng, tokenizer_eng_new, tokenizer_g2p,
tokenizer_eng,
 tokenizer_eng_new,
 prompt_tokenizer,
 ]
compare_prompts = ["It's {S OW0 M AH0 CH OW0}"]
# For every prompt/tokenizer pair, print the tokenizer's name, its raw
# encoding, and the per-token decode of the flat id list.
for p in compare_prompts:
    for t in tokenizers:
        print(t.name_or_path)
        tokenized = t(p.strip())
        print(tokenized)
        print(t.batch_decode(tokenized['input_ids'], skip_special_tokens=True))

parler-tts/parler-tts-mini-v1
{'input_ids': [94, 31, 7, 3, 2, 134, 3, 15251, 632, 283, 3, 14084, 632, 9302, 3, 15251, 632, 2, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['It', "'", 's', '', '', 'S', '', 'OW', '0', 'M', '', 'AH', '0', 'CH', '', 'OW', '0', '', '']
synthbot/parlertts_tokenizer_clean
{'input_ids': [94, 31, 7, 3, 2, 134, 411, 518, 632, 283, 71, 566, 632, 9302, 411, 518, 632, 2, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['It', "'", 's', '', '', 'S', 'O', 'W', '0', 'M', 'A', 'H', '0', 'CH', 'O', 'W', '0', '', '']
hybrid_phoneme_tokenizer
{'input_ids': [94, 31, 7, 93, 252, 374, 93, 171, 583, 256, 93, 374], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['It', "'", 's', ' ', 'S ', 'OW0', ' ', 'M', ' AH0 ', 'CH', ' ', 'OW0']


In [19]:
# Sample sentences reused by the following cells.
test_texts = [
  'I need some money, can I get some money? I need one million.',
    'Raise up, get yourself together, and drive that funky soul.',
    'Our results demonstrate high fidelity speech generation and a diverse range of accents',
    'Equestria the brave Equestrian.',
    'Dead nirik storage.',
    'Well, one on one, let\'s clean it!',
    'Amazon delivers packages quickly across the United States!',
    'Bitcoin and Ethereum are popular cryptocurrencies.',
    'Text-to-speech models 1 $200',
    'Queue queuing',
    'Twilight Sparkle Pinkie Pie Fluttershy Applejack Rarity Rainbow Dash Ahuizotl Tenochtitlan Xilati Ziegfilly',
    'Reparatur',
    'Servicii',
    ]
# Print the per-token decode of each sentence (flat id list -> token strings).
for test_text in test_texts:
    tokenized = prompt_tokenizer(test_text.strip())
    #print(tokenized)
    print(prompt_tokenizer.batch_decode(tokenized['input_ids'], skip_special_tokens=True))

['I', 'need', 'some', '', 'mon', 'e', 'y', ',', 'can', 'I', 'get', 'some', '', 'mon', 'e', 'y', '?', 'I', 'need', 'one', '', 'm', 'i', 'll', 'i', 'on', '.']
['R', 'a', 'is', 'e', 'up', ',', 'get', 'yourself', 'together', ',', 'and', '', 'd', 'r', 'ive', 'that', 'fun', 'k', 'y', 'so', 'ul', '.']
['Our', '', 're', 's', 'ul', 't', 's', 'de', 'mon', 's', 't', 'r', 'at', 'e', 'high', '', 'f', 'i', 'de', 'l', 'ity', '', 's', 'pe', 'e', 'ch', '', 'g', 'en', 'er', 'ation', 'and', '', 'a', '', 'd', 'ive', 'r', 's', 'e', '', 'r', 'an', 'g', 'e', 'of', '', 'a', 'c', 'c', 'en', 't', 's']
['E', 'que', 'stria', 'the', '', 'b', 'r', 'a', 've', 'E', 'que', 'stria', 'n', '.']
['', 'D', 'e', 'a', 'd', '', 'n', 'i', 'r', 'i', 'k', '', 's', 't', 'or', 'a', 'g', 'e', '.']
['Well', ',', 'one', 'on', 'one', ',', 'let', "'", 's', 'clean', 'it', '!']
['A', 'm', 'a', 'z', 'on', 'de', 'l', 'ive', 'r', 's', '', 'p', 'a', 'c', 'k', 'a', 'g', 'e', 's', 'quick', 'ly', '', 'a', 'c', 'r', 'o', 's', 's', 'the', 'U', 'n

In [20]:
import random
from itertools import groupby
from g2p_en import G2p
import time
import re
import string

# Shared g2p_en converter; HorsePhonemizer.phonemize calls it for words that
# are not in its own dictionary.
g2p = G2p()

class HorsePhonemizer:
    """Turn text into ARPAbet using a custom word dictionary plus g2p_en."""

    def __init__(self, horsewords_dictionary = 'new_horsewords.clean'):
        """Load the word -> transcription dictionary.

        Every non-blank line of the file must look like
        ``WORD<two spaces>TRANSCRIPTION``; blank lines are skipped.

        Args:
            horsewords_dictionary: path to the dictionary file.

        Raises:
            ValueError: if a non-blank line has no two-space separator.
        """
        self.horsedict = {}
        with open(horsewords_dictionary, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f, start=1):
                entry = line.rstrip('\n')
                if not entry.strip():
                    continue
                # Split on the first separator only, so a transcription that
                # itself contains a double space does not break loading.
                baseword, sep, transcription = entry.partition('  ')
                if not sep:
                    raise ValueError(
                        f"{horsewords_dictionary}:{lineno}: expected "
                        f"'WORD  TRANSCRIPTION', got {entry!r}")
                self.horsedict[baseword] = transcription

    def phonemize(self, text):
        """Uses g2p_en + a dictionary to convert a string into contiguous ARPAbet characters"""
        converted = []
        for s in text.split():
            transcription = self.horsedict.get(s.upper())
            if transcription is not None:
                # Dictionary hit: drop the spaces between phonemes.
                converted.append(''.join(transcription.split()))
            else:
                # Fall back to g2p_en, discarding its word-separator entries.
                converted.append(''.join(arp for arp in g2p(s) if arp != ' '))
        return ' '.join(converted).strip()

    def random_phonemize(self, text, prob=0.2, grow_prob=0.2, seed=0):
        """ Randomly phonemize spans of text.
        `prob` influences the base probability of an index being phonemized
        `grow_prob` adds a probability for the previous index being phonemized.

        The selection is drawn from a private generator seeded with `seed`,
        so the process-wide `random` state is left alone. With the default
        `seed=0` the same input always gets the same spans; `seed=None`
        seeds the private generator from system entropy.

        Returns the text with the chosen spans replaced by ``{ARPAbet}``
        blocks; a non-empty result ends with a single space.
        """
        text = clean_spaces(text)
        # Split including words or isolated punctuation
        spl = re.findall(r'[\w\']+|[.,!?;:]', text)
        splbits = [False] * len(spl)
        idxs = list(range(len(spl)))

        rng = random.Random(seed)
        rng.shuffle(idxs)

        for idx in idxs[:int(prob*len(spl))]:
            splbits[idx] = True
            # Draw for every chosen index, even idx 0, to keep the random
            # sequence independent of which indices were picked.
            if rng.random() < grow_prob and idx > 0:
                splbits[idx-1] = True

        pieces = []
        # Adjacent tokens with the same flag form one span.
        for phonemized, group in groupby(zip(splbits, spl),
            key = lambda t: t[0]):
            str_to_process = clean_spaces(' '.join(tok for _, tok in group))
            if phonemized:
                pieces.append('{'+self.phonemize(str_to_process)+'}')
            else:
                pieces.append(str_to_process)

        return clean_spaces(''.join(piece+' ' for piece in pieces))

hphzr = HorsePhonemizer()
# For each sample sentence: phonemize random spans, then print the phonemized
# text, its per-token decode, and its joined single-string decode.
for test_text in test_texts:
    #print(random_phonemize(test_text))
    # Seed is the current Unix time truncated to whole seconds.
    rp = hphzr.random_phonemize(test_text, seed=int(time.time()))
    tokenized = prompt_tokenizer(rp.strip())
    print(rp)
    print(prompt_tokenizer.batch_decode(tokenized['input_ids'], skip_special_tokens=True))
    print(prompt_tokenizer.batch_decode([tokenized['input_ids']], skip_special_tokens=True))

I need some money {, KAE1N} I get some money? I need one million {.} 
['I', 'need', 'some', '', 'mon', 'e', 'y', ' ', ',', ' ', 'K', 'AE1N', 'I', 'get', 'some', '', 'mon', 'e', 'y', '?', 'I', 'need', 'one', '', 'm', 'i', 'll', 'i', 'on', ' ', '.']
['I need some money {, KAE1N} I get some money? I need one million {.}']
Raise {AH1P}, get yourself together, and {DRAY1V} that funky soul. 
['R', 'a', 'is', 'e', ' ', 'AH1', 'P', '', ',', 'get', 'yourself', 'together', ',', 'and', ' ', 'DR', 'AY1', 'V', 'that', 'fun', 'k', 'y', 'so', 'ul', '.']
['Raise {AH1P}, get yourself together, and {DRAY1V} that funky soul.']
Our {RIH0ZAH1LTS} demonstrate high fidelity speech generation and {AH0} diverse range of accents 
['Our', ' ', 'RIH0', 'Z', 'AH1L', 'T', 'S', 'de', 'mon', 's', 't', 'r', 'at', 'e', 'high', '', 'f', 'i', 'de', 'l', 'ity', '', 's', 'pe', 'e', 'ch', '', 'g', 'en', 'er', 'ation', 'and', ' ', 'AH0', '', 'd', 'ive', 'r', 's', 'e', '', 'r', 'an', 'g', 'e', 'of', '', 'a', 'c', 'c', 'en', '