In [10]:
#Llama Tokenizer - Step 1
from transformers import MimiModel, AutoFeatureExtractor, AutoTokenizer
import torch
import numpy as np

class TextTokenizer:
    def __init__(self, name='Llama_tokenizer'):
        self.tokenizer = AutoTokenizer.from_pretrained(name, legacy=False)
        print("text vocab size", self.tokenizer.vocab_size)

    def encode(self, text: str):
        tokens = self.tokenizer.encode(text)
        return tokens

    def decode(self, tokens):
        return self.tokenizer.decode(tokens)
    
class MimiTokenizer:
    def __init__(self, device):    
        self.device = device
        self.model = MimiModel.from_pretrained("kyutai/mimi")
        self.model.to(device)
        self.model.eval()
        self.feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi", device=device)
        self.sampling_rate = self.feature_extractor.sampling_rate
        self.n_codebooks = 8
        self.vocab_size = 2048

    @torch.inference_mode()
    def encode(self, waveform):
        inputs = self.feature_extractor(raw_audio=waveform, 
                                        sampling_rate=self.sampling_rate, 
                                        return_tensors="pt").to(self.device)
            
        output = self.model.encode(inputs["input_values"], inputs["padding_mask"], num_quantizers=self.n_codebooks)
        tokens = output.audio_codes[0].cpu().numpy()
        return tokens

    def decode(self, tokens):
        assert len(tokens.shape) == 2
        tokens = torch.tensor(np.expand_dims(tokens, axis=0)).to(self.device)
        output = self.model.decode(tokens)
        waveform = output.audio_values.cpu()
        return waveform

2024-12-09 13:56:21.421986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733732781.442067  115641 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733732781.447540  115641 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-09 13:56:21.468376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
import json
tokenizer = TextTokenizer()
with open('/home/subhash/.cache/indri/lj_speech/annotation/metadata.jsonl') as f:
    for line in f:
        data = json.loads(line)
        text = data['raw_text']
        tokens = tokenizer.encode(text)
        print(text)
        print(tokens)
        print(tokenizer.decode(tokens))
        break

text vocab size 128000
Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition
[128000, 39628, 11, 304, 279, 1193, 5647, 449, 902, 584, 527, 520, 3118, 11920, 11, 44642, 505, 1455, 422, 539, 505, 682, 279, 19071, 323, 44948, 15609, 304, 279, 68033]
<|begin_of_text|>Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition


In [28]:
# Mimi Tokeniser - Step 2
import numpy as np
mimi_tokens = np.load("/home/subhash/.cache/indri/lj_speech/tokens/mimi/LJ040-0046.npy")
print(mimi_tokens.shape)
#print(mimi_tokens)
type(mimi_tokens)

(8, 100)


numpy.ndarray

In [26]:
# Weave Tokens, codebook offset - Step 3
def weave_tokens(tokens):
    result = []
    max_length = max(len(codebook) for codebook in tokens)
    
    for i in range(max_length):
        for codebook_index, codebook in enumerate(tokens):
            if i < len(codebook):
                offset = 2048 * codebook_index + 128000
                result.append(codebook[i] + offset)                 
    return np.array(result)

weave_audio = weave_tokens(mimi_tokens.tolist())
weave_audio

array([128995, 130325, 132834, 135350, 138081, 138819, 140988, 143449,
       128572, 131272, 133038, 135276, 136552, 139416, 141865, 142522,
       129117, 131597, 133406, 134146, 136322, 139289, 141923, 143985,
       129069, 131186, 133662, 135220, 136512, 139853, 141760, 142410,
       129620, 131843, 133121, 134814, 137436, 138387, 141084, 142444,
       128080, 131002, 133038, 135147, 137077, 139665, 141897, 142752,
       129134, 131203, 132924, 134668, 136548, 140002, 140409, 143451,
       128715, 130220, 133849, 135225, 136357, 139775, 140460, 143596,
       129650, 131937, 132126, 134545, 136933, 139077, 141937, 143864,
       129532, 131639, 132479, 134835, 137883, 138246, 141974, 143201,
       128937, 131824, 133411, 134384, 136248, 138246, 141565, 143949,
       129077, 130323, 132375, 134147, 138069, 139010, 141512, 142954,
       129352, 130386, 133860, 134608, 138148, 140177, 141688, 143239,
       128749, 130955, 133781, 135938, 137985, 140133, 141270, 143942,
      

In [17]:
import torch
import numpy as np
from transformers import AutoTokenizer

class TTSTokenizer:
    def __init__(self, text_tokenizer_name='tts_tokenizer', audio_tokenizer_name='tts_tokenizer'):
        self.text_tokenizer = AutoTokenizer.from_pretrained(text_tokenizer_name, legacy=False)
        self.audio_tokenizer = AutoTokenizer.from_pretrained(audio_tokenizer_name, legacy=False)
        print("text vocab size", self.audio_tokenizer.vocab_size)

    def encode(self, input_data, add_special_tokens=True):
        if isinstance(input_data, str):
            encoded_tokens = self.text_tokenizer.encode(
                input_data, 
                return_tensors='pt', 
                add_special_tokens=add_special_tokens
            )
            return encoded_tokens
        elif isinstance(input_data, list) and all(isinstance(item, str) for item in input_data):
            encoded_tokens = self.audio_tokenizer.encode(
                input_data, 
                return_tensors='pt', 
                add_special_tokens=add_special_tokens
            )
            return encoded_tokens
        else:
            raise TypeError("Input must be a string or a list of strings")

    def decode(self, tokens):
        if not isinstance(tokens, torch.Tensor):
            raise TypeError("Input must be a torch tensor of tokens")
        
        try:
            decoded_text = self.text_tokenizer.decode(tokens)
            return decoded_text
        except:
            try:
                decoded_tokens = self.audio_tokenizer.decode(tokens)
                return torch.tensor(decoded_tokens)
            except:
                raise ValueError("Unable to decode the provided tokens")

In [18]:
tokenizer = TTSTokenizer(text_tokenizer_name='tts_tokenizer', audio_tokenizer_name='tts_tokenizer')
audio_decoding = tokenizer.decode(tokens=torch.tensor(weave_audio))
text_decoding = tokenizer.decode(tokens=torch.tensor([128000, 39628, 11, 304, 279, 1193, 5647, 449, 902, 584, 527, 520, 3118, 11920, 11, 44642, 505, 1455, 422, 539, 505, 682, 279, 19071, 323, 44948, 15609, 304, 279, 68033]))
print(text_decoding)
print(audio_decoding)

text vocab size 128000


NameError: name 'weave_audio' is not defined

In [37]:
#Appending the tokens to a single sequence
#text_tokens, task_tokens, speaker_tokens, audio_start_tokens, audio_tokens, common_stop_token.
tokenizer = TTSTokenizer(text_tokenizer_name='tts_tokenizer', audio_tokenizer_name='tts_tokenizer')

MIMI = '[mimi]'
CONVERT = '[convert]'
CONTINUE = '[continue]'
DEFAULT_SPEAKER = '[spkr_unk]'
COMMON_STOP = '[stop]'

import torch

@torch.no_grad()
def append_tokens(text, audio_tokens, speaker=DEFAULT_SPEAKER):
    audio_tokens = torch.tensor(audio_tokens, dtype=torch.int32).clone().detach()
    text_tokens = torch.tensor(tokenizer.encode(text), dtype=torch.int32).view(-1).clone().detach()
    convert_tokens = torch.tensor(tokenizer.encode(CONVERT, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
    continue_tokens = torch.tensor(tokenizer.encode(CONTINUE, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
    speaker_tokens = torch.tensor(tokenizer.encode(speaker, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
    mimi_tokens = torch.tensor(tokenizer.encode(MIMI, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
    stop_tokens = torch.tensor(tokenizer.encode(COMMON_STOP, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
    
    result = torch.cat([
        text_tokens,
        convert_tokens,
        #continue_tokens,
        speaker_tokens,
        mimi_tokens,
        audio_tokens,
        stop_tokens
    ])
    
    return result

In [38]:
result = append_tokens("Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition", weave_audio)

  text_tokens = torch.tensor(tokenizer.encode(text), dtype=torch.int32).view(-1).clone().detach()
  convert_tokens = torch.tensor(tokenizer.encode(CONVERT, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  continue_tokens = torch.tensor(tokenizer.encode(CONTINUE, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  speaker_tokens = torch.tensor(tokenizer.encode(speaker, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  mimi_tokens = torch.tensor(tokenizer.encode(MIMI, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  stop_tokens = torch.tensor(tokenizer.encode(COMMON_STOP, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()


In [39]:
result

tensor([128000,  39628,     11,    304,    279,   1193,   5647,    449,    902,
           584,    527,    520,   3118,  11920,     11,  44642,    505,   1455,
           422,    539,    505,    682,    279,  19071,    323,  44948,  15609,
           304,    279,  68033, 144642, 144645, 144641, 128995, 130325, 132834,
        135350, 138081, 138819, 140988, 143449, 128572, 131272, 133038, 135276,
        136552, 139416, 141865, 142522, 129117, 131597, 133406, 134146, 136322,
        139289, 141923, 143985, 129069, 131186, 133662, 135220, 136512, 139853,
        141760, 142410, 129620, 131843, 133121, 134814, 137436, 138387, 141084,
        142444, 128080, 131002, 133038, 135147, 137077, 139665, 141897, 142752,
        129134, 131203, 132924, 134668, 136548, 140002, 140409, 143451, 128715,
        130220, 133849, 135225, 136357, 139775, 140460, 143596, 129650, 131937,
        132126, 134545, 136933, 139077, 141937, 143864, 129532, 131639, 132479,
        134835, 137883, 138246, 141974, 

In [63]:
import os
import torch
import json
import numpy as np
CACHE_DIR = '/home/subhash/.cache/indri'
def load_tokens(dataset_dir):
    metadata_path = f"{CACHE_DIR}/{dataset_dir}/annotation/metadata.jsonl"
    tokens_dir = os.path.join(CACHE_DIR, dataset_dir, 'tokens', 'mimi')
    with open(metadata_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            data = json.loads(line.strip())            
            file_path = os.path.join(tokens_dir, data['id'] + '.npy')
            
            audio_tokens = np.load(file_path)
            weave_audio = weave_tokens(audio_tokens.tolist())
            yield data['raw_text'], weave_audio, data['speaker_id']

In [67]:
dataset = 'lj_speech'
for raw_text, audio_tokens, speaker in load_tokens(dataset_dir=dataset):
    with open('allowed_speakers.jsonl', 'r', encoding='utf-8') as file:
        allowed_speakers = [json.loads(line.strip()) for line in file]
    entry = next((item for item in allowed_speakers if item['dataset'] == dataset and item['speaker'] == speaker), None)
    if entry:
        combined = entry['combined']
    else:
        combined = DEFAULT_SPEAKER
    result = append_tokens(raw_text, audio_tokens, speaker=combined)
    print(result)
    print(tokenizer.decode(result))
    break

tensor([128000,  39628,     11,  ..., 140557, 142493, 144644],
       dtype=torch.int32)
<|begin_of_text|>Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition[convert][spkr_unk][mimi][aco_1442][aco_3215][aco_4335][aco_6316][aco_8867][aco_11749][aco_14000][aco_15936][aco_1463][aco_3102][aco_4741][aco_6853][aco_9438][aco_11023][aco_12637][aco_15755][aco_12][aco_3327][aco_5098][aco_7331][aco_8508][aco_11952][aco_12220][aco_14360][aco_1133][aco_3558][aco_5358][aco_6739][aco_7940][aco_11184][aco_12999][aco_15792][aco_595][aco_2796][aco_5106][aco_7093][aco_8529][aco_11455][aco_13701][aco_15488][aco_1516][aco_2061][aco_5212][aco_6954][aco_8614][aco_10897][aco_13674][aco_14271]<|reserved_special_token_178|>[aco_2240][aco_4828][aco_6706][aco_8849][aco_10478][aco_12378][aco_15799][aco_1051][aco_3045][aco_4302][aco_7390][aco_8951][aco_11910][aco_12257][aco_15170][aco_1639][aco_2699][aco_4302][aco_60

  text_tokens = torch.tensor(tokenizer.encode(text), dtype=torch.int32).view(-1).clone().detach()
  convert_tokens = torch.tensor(tokenizer.encode(CONVERT, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  continue_tokens = torch.tensor(tokenizer.encode(CONTINUE, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  speaker_tokens = torch.tensor(tokenizer.encode(speaker, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  mimi_tokens = torch.tensor(tokenizer.encode(MIMI, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()
  stop_tokens = torch.tensor(tokenizer.encode(COMMON_STOP, add_special_tokens=False), dtype=torch.int32).view(-1).clone().detach()


In [None]:
# Sample data (replace this with your actual data)


# Function to get combined ID based on dataset and speaker
def get_combined_id(data, dataset, speaker):
    entry = next((item for item in data if item['dataset'] == dataset and item['speaker'] == speaker), None)
    return entry['combined'] if entry else None

# Example usage
dataset_input = "mls_eng_10k"
speaker_input = "2156"
combined_id = get_combined_id(data, dataset_input, speaker_input)

print(combined_id)  # Output: [spkr_mls_eng_10k_2156]

In [72]:
import pickle

def load_pickle_file(file_path):
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

pickle_file_path = 'tokens/lj_speech_tokens.pkl'  
data = load_pickle_file(pickle_file_path)
data


[tensor([128000,  39628,     11,  ..., 140557, 142493, 144644],
        dtype=torch.int32),
 tensor([128000,    258,   1694,  71561,   6617,     13, 144642, 144645, 144641,
         128678, 130835, 132339, 134203, 137663, 139106, 142070, 142354, 129536,
         130238, 132125, 134707, 136590, 138658, 141753, 142495, 128362, 131923,
         133219, 135748, 137040, 138904, 140309, 142608, 129057, 131923, 132339,
         134833, 136254, 138522, 141680, 143399, 128389, 131442, 133814, 135748,
         136937, 139761, 142242, 143471, 129558, 130834, 133087, 135506, 137091,
         139920, 142253, 143824, 129054, 130238, 133462, 135701, 136499, 138697,
         141120, 144007, 129343, 130297, 132950, 135831, 137259, 139195, 141398,
         143899, 128321, 131400, 133395, 134382, 136325, 139199, 140550, 143850,
         129839, 131895, 132960, 136035, 137376, 139298, 142203, 144094, 128902,
         130505, 132643, 135349, 136999, 139242, 140509, 142800, 129389, 131868,
         134110, 

In [2]:
from llama_model import Llama, LlamaConfig
def calculate_parameters(config: LlamaConfig) -> int:
    total_params = 0

    total_params += config.vocab_size * config.dim 

    for _ in range(config.n_layers):
        total_params += (config.dim * config.n_heads * (config.dim // config.n_heads)) * 3  # wq, wk, wv
        total_params += (config.n_heads * (config.dim // config.n_heads) * config.dim)  # wo

        hidden_dim = int(4 * config.dim)  # Assuming hidden_dim is 4 * dim
        total_params += (config.dim * hidden_dim) + (hidden_dim * config.dim)  # w1 and w2
        total_params += (config.dim * hidden_dim)  # w3

    total_params += config.dim * 2 * config.n_layers

    return total_params

In [15]:
config = LlamaConfig(
    dim=1024,
    n_layers=12,
    n_heads=16,
    vocab_size=144448,
    max_seq_len=2048
)
total_params = calculate_parameters(config)
print(f"Total number of parameters: {total_params}")

Total number of parameters: 349265920
