In [2]:
import json
import logging
import os
import re
import sys
from typing import List, Optional, Tuple, Union

import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio

from espnet2.asr.frontend.s3prl import S3prlFrontend
from espnet2.bin.mt_inference import Text2Text

In [None]:
class BaseFeatureReader(object):
    """Shared audio-loading and batching helpers for feature readers.

    This class is not instantiable on its own: ``__init__`` and ``get_feats``
    raise ``NotImplementedError`` and are meant to be supplied by subclasses.
    ``load_audio`` reads ``self.sample_rate``, which this class never sets.
    """

    def __init__(self):
        raise NotImplementedError

    def load_audio(self, path: str, ref_len: Optional[int] = None):
        """Read an audio file with ``soundfile`` and return a 1-D waveform.

        Args:
            path: File to read.
            ref_len: Optional expected number of samples. A warning is logged
                when the length read differs from it by more than 160 samples.

        Returns:
            The waveform; 2-D input is averaged over its last axis.

        Raises:
            AssertionError: If the file's sampling rate differs from
                ``self.sample_rate``.
        """
        samples, rate = sf.read(path)
        assert rate == self.sample_rate, rate
        # Two-dimensional reads are collapsed by averaging the last axis.
        if samples.ndim == 2:
            samples = samples.mean(-1)
        if ref_len is not None:
            if abs(ref_len - len(samples)) > 160:
                logging.warning(f"ref {ref_len} != read {len(samples)} ({path})")
        return samples

    def preprocess_data(
        self,
        data: Union[str, np.ndarray, list, torch.Tensor],
        data_lens: Union[int, List[int], torch.Tensor],
        ref_len: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Turn one input into a ``(1, n)`` float tensor plus a length tensor.

        Args:
            data: A tensor (returned untouched together with ``data_lens``),
                a file path (loaded through ``load_audio``), or a NumPy array.
            data_lens: Length information; wrapped in a one-element list and
                converted to a long tensor for non-tensor inputs.
            ref_len: Only used for path inputs, where it is forwarded to
                ``load_audio``.

        Returns:
            ``(x, x_lens)``.

        Raises:
            TypeError: For any other ``data`` type (including ``list``).
        """
        # Tensors are assumed to be ready-made batches and pass straight through.
        if isinstance(data, torch.Tensor):
            return data, data_lens

        if isinstance(data, str):
            waveform = self.load_audio(data, ref_len=ref_len)
        elif isinstance(data, np.ndarray):
            waveform = data
        else:
            raise TypeError(f"Unexpected data type of argument 1: {type(data)}.")

        # Non-tensor inputs are always treated as a batch of exactly one item.
        batch = torch.from_numpy(waveform).view(1, -1).float()
        lengths = torch.tensor([data_lens]).long()
        return batch, lengths

    def get_feats(
        self, data: torch.Tensor, data_lens: torch.Tensor, ref_len: Optional[int] = None
    ):
        """Extract features; must be implemented by subclasses."""
        raise NotImplementedError

class S3PRLFeatureReader(BaseFeatureReader):
    """Feature reader that runs inputs through an ``S3prlFrontend`` model.

    Args:
        fs: Sampling rate forwarded to ``S3prlFrontend``. It is also stored,
            converted to an integer number of Hz when possible, as
            ``self.sample_rate`` for the rate check in ``load_audio``.
        s3prl_conf: Forwarded to ``S3prlFrontend`` as ``frontend_conf``.
        download_dir: Forwarded to ``S3prlFrontend`` unchanged.
        multilayer_feature: Forwarded to ``S3prlFrontend`` unchanged.
        layer: Forwarded to ``S3prlFrontend`` unchanged.
        use_gpu: Place the model on CUDA when requested *and* available,
            otherwise on CPU.
    """

    def __init__(
        self,
        fs: Union[int, str] = 16000,
        s3prl_conf: Optional[dict] = None,
        download_dir: str = None,
        multilayer_feature: bool = False,
        layer: int = -1,
        use_gpu: bool = True,
    ):
        self.model = S3prlFrontend(
            fs=fs,
            frontend_conf=s3prl_conf,
            download_dir=download_dir,
            multilayer_feature=multilayer_feature,
            layer=layer,
        )
        self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)

        # The inherited load_audio() asserts against self.sample_rate; without
        # this attribute every file-path input died with AttributeError.
        self.sample_rate = self._parse_fs(fs)
        if self.sample_rate is None:
            logging.warning(
                f"Could not interpret fs={fs!r} as a rate in Hz; "
                "the sampling-rate check in load_audio() will fail for path inputs."
            )

    @staticmethod
    def _parse_fs(fs: Union[int, str]) -> Optional[int]:
        """Best-effort conversion of ``fs`` to integer Hz; ``None`` if a string is unparseable."""
        if not isinstance(fs, str):
            return int(fs)
        text = fs.strip().lower()
        if text.endswith("hz"):
            text = text[:-2].rstrip()
        scale = 1
        if text.endswith("k"):
            scale, text = 1000, text[:-1].rstrip()
        try:
            return int(round(float(text) * scale))
        except ValueError:
            return None

    def get_feats(
        self,
        data: torch.Tensor,
        data_lens: torch.Tensor,
        ref_len: Optional[int] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the frontend on ``data`` and return CPU tensors.

        Args:
            data: Input accepted by ``preprocess_data`` (tensor, path or array).
            data_lens: Lengths matching ``data``.
            ref_len: Optional expected sample count, forwarded to
                ``preprocess_data`` (used only when ``data`` is a file path).

        Returns:
            ``(feats, feats_lens)`` as produced by the model, moved to CPU.
        """
        with torch.no_grad():
            # ref_len used to be accepted here but silently dropped.
            x, x_lens = self.preprocess_data(data, data_lens, ref_len=ref_len)
            x = x.to(self.device)

            feats, feats_lens = self.model(x, x_lens)
        feats = feats.cpu()
        feats_lens = feats_lens.cpu()
        return feats, feats_lens

In [None]:
# The original cell left `s3prl_conf =` without a value, which is a SyntaxError.
# NOTE(review): the upstream name is inferred from the experiment directory
# used for Text2Text in this notebook ("..._raw_wavlm_large_21_km2000_...");
# confirm it, and consider passing layer=21 if that "21" is the feature layer.
reader = S3PRLFeatureReader(
    fs=16000,
    s3prl_conf={"upstream": "wavlm_large"},
)

In [None]:
# NOTE(review): free text left in a code cell; kept verbatim as a comment so the cell parses.
# chapter one missus rachel lynde is surprised missus rachel lynde lived just where the avonlea main road dipped down into a little hollow fringed with alders and ladies eardrops and traversed by a brook

In [18]:
# Experiment directory that holds both the training config and the checkpoint.
# NOTE(review): absolute, machine-specific paths; move them to a config cell.
exp_dir = "/home/rathna/espnet/egs2/001_librispeech/my_asr2/exp/asr_train_discrete_asr_e_branchformer1_raw_wavlm_large_21_km2000_bpe_rm6000_bpe_ts5000_sp"

# Fall back to CPU instead of crashing on machines without CUDA, matching the
# device selection done in S3PRLFeatureReader.
device = "cuda" if torch.cuda.is_available() else "cpu"

text2text = Text2Text(
    f"{exp_dir}/config.yaml",
    f"{exp_dir}/valid.acc.ave_10best.pth",
    nbest=1,
    device=device,
    beam_size=10,
)

# Load the sample utterance resampled to 16 kHz (librosa is imported in the
# notebook's top import cell).
audio, rate = librosa.load("/hdd_storage/data/ASR_datasets/LJSpeech-1.1/LJSpeech-1.1/wavs/LJ001-0007.wav", sr=16000)

In [22]:
# Whitespace-separated integer ids, kept as text and parsed in the next cell.
# Triple-quoted because the literal spans two physical lines; a plain
# double-quoted string cannot contain a raw line break.
feats = """77 1373 1373 147 151 151 151 147 151 80 151 80 151 228 15 15 15 1933 15 15 147 1626 337 795 673 1443 443 1607 1387 440 440 1309 681 195 242 68 1361 1361 1361 1361 1361 698 401 889 762 907 641 1171 1171 1479 703 387 458 458 458 1994 1264 431 178 1431 644 644 644 721 246 1017 1314 230 1354 1354 576 1957 1413 963 33 159 706 173 481 788 843 316 1280 316 316 316 1545 1545 1895 1895 380 642 642 1575 1243 1893 1893 544 1994 1914 766 1254 617 617 617 617 143 1578 267 50 378 378 291 550 550 70 115 115 1587 1587 1587 302 302 542 542 302 302 377 238 147 806 1658 865 1370 338 338 338 338 1846 143 1578 1526 891 891 970 401 889 385 482 1791 1791 1008 95 1052 1052 1052 175 949 1678 394 394 5 1081 585 786 368 1154 904 904 904 904 72 579 579 371 970 970 970 1712 121 1441 788 207 207 962 336 1265 95 1270 1052 1052 756 175 1444 171 1755 1171 1171 909 909 198 938 538 438 366 1967 1967 1967 1967 1545 380 1895 376 376 6 172 1102 1434 1171 152 147 1897 875 593 593 780 219 1658 865 1370 958 1540 958 740 21 1905 267 1094 1094 1094 1094 1901 1901 1895 1895 403 403 314 154 6 39 95 1270 87 87 409 328 389 1157 571 881 613 613 613 1177 1177 163 1569 1154 1744 1744 1744 88 656 1676 1620 469 1171 437 437 271 271 271 271 271 271 145 1241 1963 1627 1471 330 121 121 901 1061 1061 1061 1061 1678 1910 5 5 152 1443 702 1903 83 83 83 83 94 280 1515 1008 406 406 406 406 807 154 646 1768 109 76 697 1443 281 124 1229 1229 1375 1314 230 1354 1354 576 576 791 374 113 1565 1565 1024 288 1648 1648 958 958 958 740 21 1267 267 216 216 216 1407 1819 1819 1802 1711 754 1362 649 649 649 649 649 649 524 602 1120 724 516 1441 1633 469 1270 22 22 1810 328 389 598 1593 1593 1240 1240 1240 1240 1240 1034 696 1926 1926 1926 1926 1926 1605 1302 1023 385 430 505 1104 1182 1104 1104 152 697 238 7 144 1903 1684 1684 1684 282 889 385 298 1167 1167 1167 1167 565 164 261 1137 204 1104 38 407 613 613 613 147 613 1177 699 699 699 498 1511 1835 1835 1835 1835 1256 1256 50 1165 1165 1307 642 881 613 613 613 1177 163 1154 
1744 1744 1744 88 1433 121 1620 1061 1061 1061 1061 215 1678 598 810 1340 1340 1787 1787 1787 51 864 64 262 659 1107 1609 1107 1168 329 329 329 656 439 1804 85 547 962 962 790 103 255 533 425 90 453 453 1310 1310 127 1016 127 1933 1099 1099 228 1099 54 54"""

In [23]:
# Parse the whitespace-separated ids into a list of ints. The isinstance guard
# keeps the cell idempotent: re-running it after `feats` has already been
# converted no longer fails on `list.split`.
if isinstance(feats, str):
    feats = [int(token) for token in feats.split()]
print(feats)

[77, 1373, 1373, 147, 151, 151, 151, 147, 151, 80, 151, 80, 151, 228, 15, 15, 15, 1933, 15, 15, 147, 1626, 337, 795, 673, 1443, 443, 1607, 1387, 440, 440, 1309, 681, 195, 242, 68, 1361, 1361, 1361, 1361, 1361, 698, 401, 889, 762, 907, 641, 1171, 1171, 1479, 703, 387, 458, 458, 458, 1994, 1264, 431, 178, 1431, 644, 644, 644, 721, 246, 1017, 1314, 230, 1354, 1354, 576, 1957, 1413, 963, 33, 159, 706, 173, 481, 788, 843, 316, 1280, 316, 316, 316, 1545, 1545, 1895, 1895, 380, 642, 642, 1575, 1243, 1893, 1893, 544, 1994, 1914, 766, 1254, 617, 617, 617, 617, 143, 1578, 267, 50, 378, 378, 291, 550, 550, 70, 115, 115, 1587, 1587, 1587, 302, 302, 542, 542, 302, 302, 377, 238, 147, 806, 1658, 865, 1370, 338, 338, 338, 338, 1846, 143, 1578, 1526, 891, 891, 970, 401, 889, 385, 482, 1791, 1791, 1008, 95, 1052, 1052, 1052, 175, 949, 1678, 394, 394, 5, 1081, 585, 786, 368, 1154, 904, 904, 904, 904, 72, 579, 579, 371, 970, 970, 970, 1712, 121, 1441, 788, 207, 207, 962, 336, 1265, 95, 1270, 1052, 1052, 

In [24]:
# Wrap the id list in a tensor, decode it, and print whatever comes back.
unit_tensor = torch.Tensor(feats)
hypotheses = text2text(unit_tensor)
print(hypotheses)

[('of tish with amoured mantelowem staant madsed a book allinged his but ofner has chisys vast haidelled the mans term monk of the new frivillar on the and mudmen eing with could was shoten with ants saphor wasss of masonity knowed five phians could et cetera raises on the and you him minc tenboat like a inhorary', ['▁of', '▁ti', 's', 'h', '▁with', '▁am', 'our', 'ed', '▁man', 't', 'el', 'ow', 'em', '▁sta', 'ant', '▁mad', 's', 'ed', '▁a', '▁book', '▁all', 'ing', 'ed', '▁his', '▁but', '▁of', 'ner', '▁has', '▁chi', 's', 'y', 's', '▁vast', '▁ha', 'id', 'el', 'led', '▁the', '▁mans', '▁term', '▁monk', '▁of', '▁the', '▁new', '▁fri', 'v', 'ill', 'ar', '▁on', '▁the', '▁and', '▁mud', 'men', '▁e', 'ing', '▁with', '▁could', '▁was', '▁shot', 'en', '▁with', '▁a', 'nt', 's', '▁sa', 'ph', 'or', '▁was', 's', 's', '▁of', '▁ma', 's', 'on', 'ity', '▁know', 'ed', '▁five', '▁p', 'h', 'ian', 's', '▁could', '▁et', '▁cetera', '▁raise', 's', '▁on', '▁the', '▁and', '▁you', '▁him', '▁min', 'c', '▁ten', 'bo', 'at'

In [None]:
# Introspection only: list the attributes of the Text2Text instance...
attribute_names = dir(text2text)
print(attribute_names)

# ...and print its `mt_model` attribute.
translation_model = text2text.mt_model
print(translation_model)

In [None]:
# NOTE(review): this hands Text2Text a file *path* string, whereas the other
# cells pass tensors - confirm that a path is an accepted input.
wav_path = "/hdd_storage/data/ASR_datasets/LJSpeech-1.1/LJSpeech-1.1/wavs/LJ001-0007.wav"
text2text(wav_path)

In [None]:
def convert2hex(x):
    """Return the hexadecimal Unicode code point of a single character.

    The result is lowercase without a ``0x`` prefix, so that
    ``chr(int(convert2hex(c), 16)) == c``.

    The previous implementation called ``.decode`` unconditionally (which
    fails for ``str``) and, for UTF-8 bytes, sliced/split the decoded text in
    a way that produced an empty string.

    Args:
        x: One character, either as ``str`` or as its UTF-8 encoded bytes.

    Returns:
        The code point as a hex string, e.g. ``'4e4d'``.

    Raises:
        TypeError: If ``x`` is not exactly one character (raised by ``ord``).
    """
    if isinstance(x, (bytes, bytearray)):
        x = x.decode('utf-8')
    return format(ord(x), 'x')

# `feats` is a list of ints after the parsing cell, but a string of CJK
# characters after the later cell that rebinds it. Accept either so this cell
# no longer depends on the order the cells were executed in.
first_item = feats[0]
# NOTE(review): the 0x4E00 offset is inferred from this notebook's data (the
# first id 77 lines up with the first character '乍', U+4E4D) - confirm it
# against the recipe that produced the character sequence.
cjk_value = chr(0x4E00 + first_item) if isinstance(first_item, int) else first_item
hex_value = convert2hex(cjk_value)

print(cjk_value)
print(hex_value)
# Round trip: the hex code point should map back to the same character.
print(chr(int(hex_value, 16)))

In [None]:
import torch
feats = "乍卝亓亗亓亗乐亗乐亗令丏喍丏亓呚佑儛傡厣侻呇卫侸匝傩仃仲乄卑傺侑兹僺冋傁劓叇傿侃俊嗊勰侯亲厗傄僑件凹匢仦半偀喥厅凃両亟僂亭信儔克似匀似吉啧佼傂吧勛啥倠嗊啺僾勦偩亏吪伋串佺伣倦乆乳吳伮倞伮佹仮亓儦呺兡博佒唶亏吪叶养凊侑兹侁俢哿凰也刜亯况咎侊丅刹偉儒佰劂冈么偃佳凊咰乹厡儔仏凂佐勱也勶刜僴亯厤享哛劓再仆冪倚侶佮喯吉佼啧佸丆京剎厚劓亘亓啩八偑儌仛呺兡博冾各冾僤丕啱伋剆啭啧侓伺亚丆丧也勶乗侙佈侅劅倻共健劙亣吡劂哐乘傐和呔俕劓侵伏云務喫呛县佊乹内別咎啶丅亘厣傾啯乓乞优叫凰侖儧亚傆哨乭乌傹厣伙乼勍卟匢仦半偀儗佶乱吝刀传呰冾僤丕勳伋付卿唛唊咯僲卒傉倌做剠僔倄厡呡俕勶世唒佈侅偖吹勘刊傸喆呅化凿侁侮俹剐办剐亘傹仮万亐啯咔会兹侁伪劏倵交伅剱仌剐並侗健亓健劙傻俲叧唫勨串劍匛傂共健劙亣劂哐乘厙乹呔別仗咎偖優匼哻丳兠乀伆傓剓呉剓劐佉傐侷唌乕倣凂儖乧仿倕侩乚俅匞乿凸乿喍剋令剋丶"
feat_list = [int(str(x.encode('utf-8')),16) for x in feats] 
audio = torch.Tensor(feat_list)

In [None]:
# Decode whatever `audio` is currently bound to: the waveform loaded with
# librosa in the Text2Text cell, or the tensor built from `feat_list` in the
# preceding cell if that one ran last. As the cell's final expression, the
# result is displayed rather than printed.
text2text(audio)