# Export Wav2vec2 Huggingface model to ONNX

In [1]:
from transformers import Wav2Vec2ForCTC
from torchaudio.models.wav2vec2.utils import import_huggingface_model
import torch.onnx
from pathlib import Path


AUDIO_MAXLEN = 160000
MODEL_OUTDIR = Path("/models/wav2vec2/1/")
MODEL_OUTDIR.mkdir(parents=True, exist_ok=True)
HF_REPO_NAME = "kresnik/wav2vec2-large-xlsr-korean"
ROOT_DIR = '/opt'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

original = Wav2Vec2ForCTC.from_pretrained(HF_REPO_NAME)
imported = import_huggingface_model(original) 
imported.eval()


Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

Wav2Vec2Model(
  (feature_extractor): FeatureExtractor(
    (conv_layers): ModuleList(
      (0): ConvLayerBlock(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
      )
      (1-4): 4 x ConvLayerBlock(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
      )
      (5-6): 2 x ConvLayerBlock(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
      )
    )
  )
  (encoder): Encoder(
    (feature_projection): FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (transformer): Transformer(
      (pos_conv_embed): ConvolutionalPositionalEmbedding(
        (

In [3]:
dummy_input = torch.randn(1, AUDIO_MAXLEN, requires_grad=True)

torch.onnx.export(
        imported,         # model being run
         dummy_input,       # model input (or a tuple for multiple inputs)
         f"{MODEL_OUTDIR}/model.onnx",       # where to save the model
         export_params=True,  # store the trained parameter weights inside the model file
         opset_version=14,    # the ONNX version to export the model to
         do_constant_folding=True,  # whether to execute constant folding for optimization
         input_names = ['input'],   # the model's input names
         output_names = ['output'], # the model's output names
         dynamic_axes={
            'input' : {
                0: 'batch_size',
                1: 'input_sequence'
                },    
            'output' : {
                0: 'batch_size',
                1: 'output_sequence'
                }
            
            })

## Inference test

In [8]:
!wget https://www.dropbox.com/s/9kpeh8eodshcqhj/common_voice_th_23646850.wav?dl=1
!mv common_voice_th_23646850.wav?dl=1 ${ROOT}/sound.wav
!wget https://huggingface.co/airesearch/wav2vec2-large-xlsr-53-th/raw/main/vocab.json 
!mv vocab.json ${ROOT}/vocab.json


--2024-01-23 15:44:41--  https://www.dropbox.com/s/9kpeh8eodshcqhj/common_voice_th_23646850.wav?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.84.18, 2620:100:6034:18::a27d:5412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.84.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/9kpeh8eodshcqhj/common_voice_th_23646850.wav [following]
--2024-01-23 15:44:45--  https://www.dropbox.com/s/dl/9kpeh8eodshcqhj/common_voice_th_23646850.wav
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc90027e405434cb4b55460bc627.dl.dropboxusercontent.com/cd/0/get/CL5n94jmUsw5bzMB2rmocpC-yk5jqRElBegT7gg_BLU_wQbVYwaeQ0TBXlfCHbV3SyQ9x7ZFOtBD165dDUNo165BrtJx3N19pJi9a2aeuT_TCt1LpBwyrnidXjpiFGbyMQwoGNgW6PuA8v8cAKv0m5O2/file?dl=1# [following]
--2024-01-23 15:44:46--  https://uc90027e405434cb4b55460bc627.dl.dropboxusercontent.com/cd/0/get/CL5n94jmUsw5bzMB2rmocpC-yk5jqRElBegT7gg_BLU_wQbVY

In [6]:
with open(f"/workspace/config/vocab.json","r",encoding="utf-8-sig") as f:
  d = eval(f.read())

In [7]:
print(d)

{'볍': 0, '칠': 1, '깊': 2, '뭔': 3, '러': 4, '르': 5, '튀': 6, '쳇': 7, '땀': 8, '픔': 9, '밌': 10, '좁': 11, '찧': 12, '뮬': 13, '했': 14, '연': 15, '핵': 16, '붐': 17, '봇': 18, '궁': 19, '뜸': 20, '넌': 21, '젖': 22, '맏': 23, '벚': 24, '락': 25, '가': 26, '롭': 27, '달': 28, '슐': 29, '컸': 30, '읍': 31, '색': 32, '맛': 33, '닦': 34, '기': 35, '악': 36, '뻗': 37, '팅': 38, '끗': 39, '깝': 40, '후': 41, '소': 42, '끽': 43, '조': 44, '겪': 45, '코': 46, '광': 47, '컨': 48, '올': 49, '큰': 50, '델': 51, '굉': 52, '교': 53, '발': 54, '랍': 55, '난': 56, '맹': 57, '킨': 58, '옵': 59, '삭': 60, '녹': 61, '엄': 62, '마': 63, '쫄': 64, '콜': 65, '넥': 66, '웹': 67, '뻔': 68, '뿐': 69, '엠': 70, '태': 71, '헝': 72, '멕': 73, '괭': 74, '멧': 75, '핀': 76, '냉': 77, '덟': 78, '덕': 79, '딪': 80, '넷': 81, '향': 82, '왜': 83, '모': 84, '협': 85, '요': 86, '궈': 87, '밑': 88, '결': 89, '덤': 90, '팩': 91, '뻑': 92, '씩': 93, '텔': 94, '및': 95, '펀': 96, '벅': 97, '케': 98, '잠': 99, '낼': 100, '팡': 101, '립': 102, '답': 103, '것': 104, '굿': 105, '허': 106, '샌': 107, '응': 108, '깃': 109, '느': 110,

## Inference

In [11]:
import onnx
import onnxruntime

In [13]:
import numpy as np
import soundfile as sf
from scipy.io import wavfile
import scipy.signal as sps
import os
from pythainlp.util import normalize

In [15]:
input_size = 100000
new_rate = 16000
AUDIO_MAXLEN = input_size
ort_session = onnxruntime.InferenceSession(f'{MODEL_OUTDIR}/model.onnx') # load onnx model


In [16]:
res = dict((v,k) for k,v in d.items())
res[69]="[PAD]"
res[68]="[UNK]"

In [17]:
def _normalize(x): #
  """You must call this before padding.
  Code from https://github.com/vasudevgupta7/gsoc-wav2vec2/blob/main/src/wav2vec2/processor.py#L101
  Fork TF to numpy
  """
  # -> (1, seqlen)
  mean = np.mean(x, axis=-1, keepdims=True)
  var = np.var(x, axis=-1, keepdims=True)
  return np.squeeze((x - mean) / np.sqrt(var + 1e-5))

In [18]:
def remove_adjacent(item): # code from https://stackoverflow.com/a/3460423
  nums = list(item)
  a = nums[:1]
  for item in nums[1:]:
    if item != a[-1]:
      a.append(item)
  return ''.join(a)

In [19]:
def asr(path):
    """
    Code from https://github.com/vasudevgupta7/gsoc-wav2vec2/blob/main/notebooks/wav2vec2_onnx.ipynb
    Fork TF to numpy
    """
    sampling_rate, data = wavfile.read(path)
    samples = round(len(data) * float(new_rate) / sampling_rate)
    new_data = sps.resample(data, samples)
    speech = np.array(new_data, dtype=np.float32)
    speech = _normalize(speech)[None]
    padding = np.zeros((speech.shape[0], AUDIO_MAXLEN - speech.shape[1]))
    speech = np.concatenate([speech, padding], axis=-1).astype(np.float32)
    ort_inputs = {"input": speech}
    ort_outs = ort_session.run(None, ort_inputs)
    prediction = np.argmax(ort_outs, axis=-1)
    # Text post processing
    _t1 = ''.join([res[i] for i in list(prediction[0][0])])
    return normalize(''.join([remove_adjacent(j) for j in _t1.split("[PAD]")]))

In [20]:
FILENAME = "/workspace/sound.wav"



In [21]:
asr(FILENAME)

  sampling_rate, data = wavfile.read(path)


'บริษัท|เรา|จะ|ต้อง|ปรับตัว|เพื่อ|ใช้งาน|เทคโนโลยี|เหล่านี้'

In [1]:
str1 = "บริษัท|เรา|จะ|ต้อง|ปรับตัว|เพื่อ|ใช้งาน|เทคโนโลยี|เหล่านี้"
str2 = "บริษัท|เรา|จะ|ต้อง|ปรับตัว|เพื่อ|ใช้งาน|เทคโนโลยี|เหล่านี้"


In [2]:
str1==str2

True