# Testing Tokenizer

In [None]:
from espnet2.text.build_tokenizer import build_tokenizer
from espnet2.text.token_id_converter import TokenIDConverter
import torchaudio
import yaml
import os

In [None]:
# Root directory of the model files; the two paths below are resolved against it.
model_root_dir = (
    "/home/pb_deployment/espnet/asr_inference/model_files/e_branchformer_librispeech/"
)

# YAML training config (its `token_list` entry is read further down).
asr_config_path = os.path.join(
    model_root_dir,
    "exp/asr_train_asr_e_branchformer_raw_en_bpe5000_sp/config.yaml",
)

# BPE model file handed to `build_tokenizer`.
bpe_model_path = os.path.join(
    model_root_dir,
    "data/en_token_list/bpe_unigram5000/bpe.model",
)

In [None]:
# Read the ASR training config and extract its token vocabulary.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default (token lists of sub-word models commonly contain non-ASCII
# symbols, which a non-UTF-8 default would fail to decode or would corrupt).
with open(asr_config_path, 'r', encoding='utf-8') as file:
    config_data = yaml.safe_load(file)
# Falls back to an empty list when the config has no `token_list` key.
tokens_list = config_data.get('token_list', [])
#print(f"TOKEN LIST\n{tokens_list}")

In [None]:
# LibriSpeech "test-clean" split stored under the Downloads directory;
# `download=True` asks torchaudio to fetch it.
test_dataset = torchaudio.datasets.LIBRISPEECH(
    "/home/pb_deployment/Downloads",
    url="test-clean",
    download=True,
)

In [None]:
# BPE tokenizer backed by the model file: converts text <-> token strings.
tokenizer = build_tokenizer(token_type="bpe", bpemodel=bpe_model_path)

# Converter built from the config's token list: token strings <-> integer IDs.
tokCon = TokenIDConverter(token_list=tokens_list)

In [None]:
# Take element 2 of the first dataset item as the text to tokenize
# (presumably the utterance transcript — confirm against the torchaudio
# LIBRISPEECH item layout).
text = test_dataset[0][2]
print(f"Original Text : {text}")

In [None]:
# Step 1: split the text into tokens with the BPE tokenizer.
tokens = tokenizer.text2tokens(text)
print(f"Tokenized text : {tokens}")

In [None]:
# Step 2: map the tokens to IDs using the converter built from the config's
# token list.
tokenized_text_ids = tokCon.tokens2ids(tokens)
print(f"Tokenized IDs : {tokenized_text_ids}")

In [None]:
# Step 3 (inverse of step 2): turn the IDs back into tokens.
reconstructed_tokens = tokCon.ids2tokens(tokenized_text_ids)
print(f"Reconstructed tokens : {reconstructed_tokens}")

In [None]:
# Step 4 (inverse of step 1): join the tokens back into text. Compare this
# output with the "Original Text" line to check the round trip.
reconstructed_text = tokenizer.tokens2text(reconstructed_tokens)
print(f"Reconstructed text : {reconstructed_text}")

In [None]:
class TruCLeS_Tokenizer:
    """Convert between raw text and token IDs in a single call.

    Combines the BPE tokenizer returned by ``build_tokenizer`` (text <-> token
    strings) with a ``TokenIDConverter`` (token strings <-> IDs) built from
    the ``token_list`` entry of an ASR training config.
    """

    def __init__(self, asr_config_path, bpe_model_path):
        """Load the BPE model and the token list from the ASR config.

        Args:
            asr_config_path: Path to the YAML config whose ``token_list``
                entry supplies the vocabulary.
            bpe_model_path: Path to the BPE model file given to
                ``build_tokenizer``.
        """
        self.asr_config_path = asr_config_path
        self.bpe_model_path = bpe_model_path
        self.tokenizer = build_tokenizer(
            token_type='bpe',
            bpemodel=bpe_model_path,
        )
        # Pin the encoding to UTF-8 so reading the config does not depend on
        # the platform's locale default (sub-word token lists commonly contain
        # non-ASCII symbols).
        with open(self.asr_config_path, 'r', encoding='utf-8') as file:
            config_data = yaml.safe_load(file)
        # A config without a `token_list` key yields an empty list here.
        self.tokens_list = config_data.get('token_list', [])
        self.tokenIDConvertor = TokenIDConverter(token_list=self.tokens_list)

    def text2ids(self, text):
        """Tokenize ``text`` and return the IDs of the resulting tokens.

        Args:
            text: The text to encode.

        Returns:
            Whatever the converter's ``tokens2ids`` returns for the tokens
            produced by the tokenizer's ``text2tokens``.
        """
        tokens = self.tokenizer.text2tokens(text)
        return self.tokenIDConvertor.tokens2ids(tokens)

    def ids2text(self, ids):
        """Map ``ids`` back to tokens and join them into text.

        Args:
            ids: Token IDs, e.g. as returned by :meth:`text2ids`.

        Returns:
            Whatever the tokenizer's ``tokens2text`` returns for the tokens
            produced by the converter's ``ids2tokens``.
        """
        tokens = self.tokenIDConvertor.ids2tokens(ids)
        return self.tokenizer.tokens2text(tokens)

In [None]:
# Build the text <-> ID wrapper from the config and BPE model paths.
# NOTE(review): this rebinds `tokenizer`, replacing the tokenizer created with
# build_tokenizer in an earlier cell. Re-running the earlier
# text2tokens/tokens2text cells after this one will fail, because the wrapper
# only exposes text2ids/ids2text.
tokenizer = TruCLeS_Tokenizer(asr_config_path, bpe_model_path)

In [None]:
# Encode a sample string straight to token IDs with the wrapper.
ids = tokenizer.text2ids("HELLO WORLD")
print(ids)

In [None]:
# Decode the IDs back to text to check the wrapper's round trip.
text_new = tokenizer.ids2text(ids)
print(text_new)