In [8]:
#Llama Tokenizer - Step 1
from transformers import MimiModel, AutoFeatureExtractor, AutoTokenizer
import torch
import numpy as np

class TextTokenizer:
    def __init__(self, name='Llama_tokenizer'):
        self.tokenizer = AutoTokenizer.from_pretrained(name, legacy=False)
        print("text vocab size", self.tokenizer.vocab_size)

    def encode(self, text: str):
        tokens = self.tokenizer.encode(text)
        return tokens

    def decode(self, tokens):
        return self.tokenizer.decode(tokens)
    
class MimiTokenizer:
    def __init__(self, device):    
        self.device = device
        self.model = MimiModel.from_pretrained("kyutai/mimi")
        self.model.to(device)
        self.model.eval()
        self.feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi", device=device)
        self.sampling_rate = self.feature_extractor.sampling_rate
        self.n_codebooks = 8
        self.vocab_size = 2048

    @torch.inference_mode()
    def encode(self, waveform):
        inputs = self.feature_extractor(raw_audio=waveform, 
                                        sampling_rate=self.sampling_rate, 
                                        return_tensors="pt").to(self.device)
            
        output = self.model.encode(inputs["input_values"], inputs["padding_mask"], num_quantizers=self.n_codebooks)
        tokens = output.audio_codes[0].cpu().numpy()
        return tokens

    def decode(self, tokens):
        assert len(tokens.shape) == 2
        tokens = torch.tensor(np.expand_dims(tokens, axis=0)).to(self.device)
        output = self.model.decode(tokens)
        waveform = output.audio_values.cpu()
        return waveform

In [2]:
import json
tokenizer = TextTokenizer()
with open('/home/subhash/.cache/indri/lj_speech/annotation/metadata.jsonl') as f:
    for line in f:
        data = json.loads(line)
        text = data['raw_text']
        tokens = tokenizer.encode(text)
        print(text)
        print(tokens)
        print(tokenizer.decode(tokens))
        break

text vocab size 128000
Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
[128000, 39628, 11, 304, 279, 1193, 5647, 449, 902, 584, 527, 520, 3118, 11920, 11, 44642, 505, 1455, 422, 539, 505, 682, 279, 19071, 323, 44948, 15609, 304, 279, 68033]
<|begin_of_text|>Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition


In [3]:
# Mimi Tokeniser - Step 2
import numpy as np
mimi_tokens = np.load("/home/subhash/.cache/indri/lj_speech/tokens/mimi/LJ040-0046.npy")
print(mimi_tokens.shape)
print(mimi_tokens)

(8, 100)
[[ 995  572 1117 1069 1620   80 1134  715 1650 1532  937 1077 1352  749
   577  670 1701 1762  570 1468  157  839 1457  181 1022 1037 1067 1132
  1320 1607 1607  663 1779 1301 1618 1782  304 1219 1640 1196  505 1146
   784  917 1708  922 1137 1358  126  832 1743  663 1544  835  337 1377
  2024 1252 1260  304 1039 1640  797  262  663  384 2025 1774  178  129
  1669  421 1519 1022 1562 1252 1782  304 1882 1721 1356   96  777  505
   526 1146 1235  981 1708 1473 1358  502 1268  885 1558 1252 1260  304
  1631 1565]
 [ 277 1224 1549 1138 1795  954 1155  172 1889 1591 1776  275  338  907
   243 1111  440 1155 1349  696 1996  993 1046 1293 1624  389 1843  339
  1056  243 1211  243  243  243 1717  632  641 1342  534  739  735   15
  1623 1706  731  338  773  820  490 1211  243  243 1056  893 1139 1769
   693  192  823  369 1777  648 1211 1211  243 1349  789  931  632 1203
   285  457 1910  903  328 1579  420 1518   31  177 1940 1715 2028 1348
  1576 1478  794 1866  254  823  369 1056 

In [4]:
# Weave Tokens, codebook offset - Step 3
def weave_tokens(tokens):
    result = []
    max_length = max(len(codebook) for codebook in tokens)
    
    for i in range(max_length):
        for codebook_index, codebook in enumerate(tokens):
            if i < len(codebook):
                offset = 2048 * codebook_index + 128000
                result.append(codebook[i] + offset)                 
    return np.array(result)

weave_audio = weave_tokens(mimi_tokens.tolist())
weave_audio

array([128995, 130325, 132834, 135350, 138081, 138819, 140988, 143449,
       128572, 131272, 133038, 135276, 136552, 139416, 141865, 142522,
       129117, 131597, 133406, 134146, 136322, 139289, 141923, 143985,
       129069, 131186, 133662, 135220, 136512, 139853, 141760, 142410,
       129620, 131843, 133121, 134814, 137436, 138387, 141084, 142444,
       128080, 131002, 133038, 135147, 137077, 139665, 141897, 142752,
       129134, 131203, 132924, 134668, 136548, 140002, 140409, 143451,
       128715, 130220, 133849, 135225, 136357, 139775, 140460, 143596,
       129650, 131937, 132126, 134545, 136933, 139077, 141937, 143864,
       129532, 131639, 132479, 134835, 137883, 138246, 141974, 143201,
       128937, 131824, 133411, 134384, 136248, 138246, 141565, 143949,
       129077, 130323, 132375, 134147, 138069, 139010, 141512, 142954,
       129352, 130386, 133860, 134608, 138148, 140177, 141688, 143239,
       128749, 130955, 133781, 135938, 137985, 140133, 141270, 143942,
      

In [9]:
import os
import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModel

llama_tokenizer = TextTokenizer()
llama_model = AutoModel.from_pretrained('Llama_tokenizer/')

mimi_tokenizer = MimiTokenizer(device=torch.device('cuda'))
metadata_file = '/home/subhash/.cache/indri/lj_speech/annotation/metadata.jsonl'
audio_token_dir = '/home/subhash/.cache/indri/lj_speech/tokens/mimi/'
num_text_tokens = llama_tokenizer.tokenizer.vocab_size
num_audio_tokens = mimi_tokenizer.vocab_size
print(f"Text vocab size: {num_text_tokens}, Audio vocab size: {num_audio_tokens}")
embedding_table = np.zeros((num_text_tokens + num_audio_tokens, llama_model.config.hidden_size), dtype=np.float32)

for token_id in range(num_text_tokens):
    embedding_table[token_id] = llama_model.get_input_embeddings()(torch.tensor([token_id])).squeeze().detach().numpy()
for audio_file in os.listdir(audio_token_dir):
    audio_tokens = np.load(os.path.join(audio_token_dir, audio_file))
    for token in audio_tokens.astype(np.int32):
        embedding_table[num_text_tokens + token] = np.random.normal(size=llama_model.config.hidden_size)

text_data = []
mimi_tokens = []
with open(metadata_file, 'r') as f:
    for line in f:
        data = json.loads(line)
        text_data.append(data['raw_text'])
       
        audio_token_file = os.path.join(audio_token_dir, f"{data['id']}.npy")
        if os.path.exists(audio_token_file):
            audio_tokens = np.load(audio_token_file)
            mimi_tokens.append(audio_tokens)
        else:
            mimi_tokens.append(np.array([100] * 8))  

text_tokens = [llama_tokenizer.encode(text) for text in text_data]
print(f"Embedding table shape: {embedding_table.shape}")

text vocab size 128000


  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


Text vocab size: 128000, Audio vocab size: 2048
Embedding table shape: (130048, 2048)


In [14]:
from transformers import PreTrainedTokenizerFast

llama_vocab = llama_tokenizer.tokenizer.get_vocab()
mimi_vocab = mimi_tokenizer()

offset = len(llama_vocab)
mimi_vocab_offset = {k: v + offset for k, v in mimi_vocab.items()}

merged_vocab = {**llama_vocab, **mimi_vocab_offset}

tts_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=llama_tokenizer.tokenizer, 
    vocab=merged_vocab, 
)

tts_tokenizer.save_pretrained("TTS_tokenizer")

print("TTS_tokenizer has been created and saved.")


AttributeError: 'EncodecFeatureExtractor' object has no attribute 'get_vocab'

In [20]:
# Combine the tokens - Step 5
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('tokenizer')
task_token = tokenizer.encode('[spkr_hifi_tts_9017]')
print(task_token)

[66692]


NotImplementedError: 