# Export Wav2vec2 Huggingface model to ONNX

In [5]:
from transformers import Wav2Vec2ForCTC
from torchaudio.models.wav2vec2.utils import import_huggingface_model
import torch.onnx
from pathlib import Path

# Hugging Face Hub repository passed to Wav2Vec2ForCTC.from_pretrained below.
HF_REPO_NAME = "kresnik/wav2vec2-large-xlsr-korean"
# NOTE(review): ROOT_DIR is not referenced by any cell visible in this notebook.
ROOT_DIR = '/opt'

# Number of samples in the dummy waveform that is traced during ONNX export.
AUDIO_MAXLEN = 160000

# Directory that receives the exported model.onnx; created up front so the
# export cell can write into it.
MODEL_OUTDIR = Path("/models/wav2vec2/1/")
MODEL_OUTDIR.mkdir(exist_ok=True, parents=True)

In [2]:

# Fetch the pretrained Hugging Face CTC checkpoint named by HF_REPO_NAME.
# NOTE(review): the load log printed under this cell reports the
# pos_conv_embed weight_g / weight_v tensors as unused and newly initialized;
# confirm the installed transformers version restores them correctly before
# trusting the exported model.
original = Wav2Vec2ForCTC.from_pretrained(HF_REPO_NAME)
# Convert the Hugging Face model into torchaudio's wav2vec2 implementation.
imported = import_huggingface_model(original)
# Switch to inference mode; as the last expression, the module repr is
# displayed as the cell output.
imported.eval()


config.json: 100%|██████████| 2.31k/2.31k [00:00<00:00, 10.5MB/s]
model.safetensors: 100%|██████████| 1.27G/1.27G [01:50<00:00, 11.5MB/s]
Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.

Wav2Vec2Model(
  (feature_extractor): FeatureExtractor(
    (conv_layers): ModuleList(
      (0): ConvLayerBlock(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
      )
      (1-4): 4 x ConvLayerBlock(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
      )
      (5-6): 2 x ConvLayerBlock(
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
      )
    )
  )
  (encoder): Encoder(
    (feature_projection): FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (transformer): Transformer(
      (pos_conv_embed): ConvolutionalPositionalEmbedding(
        (

## Convert to FP32 model

In [None]:
# Random waveform of shape (1, AUDIO_MAXLEN); it only serves as the example
# input that torch.onnx.export runs through the model.
dummy_input = torch.randn(1, AUDIO_MAXLEN, requires_grad=True)

torch.onnx.export(
    imported,                       # torchaudio model converted above
    dummy_input,                    # example input for the export run
    f"{MODEL_OUTDIR}/model.onnx",   # destination file
    export_params=True,             # embed the trained weights in the file
    opset_version=14,               # ONNX operator-set version
    do_constant_folding=True,       # pre-compute constant sub-graphs
    input_names=['input'],
    output_names=['output'],
    # Leave batch and time dimensions symbolic on both the input and output.
    dynamic_axes={
        'input': {0: 'batch_size', 1: 'input_sequence'},
        'output': {0: 'batch_size', 1: 'output_sequence'},
    },
)

## Inference test

In [None]:
import json

# Load the tokenizer vocabulary (token -> id). The file is parsed as JSON
# rather than passed to eval(): eval executes arbitrary code found in the file
# and also fails on the JSON literals true / false / null.
with open("/workspace/vocab.json", "r", encoding="utf-8") as f:
  d = json.load(f)

# Inverted mapping (id -> token) used when decoding model predictions.
res = {v: k for k, v in d.items()}


In [None]:
import onnx
import onnxruntime
import numpy as np
from scipy.io import wavfile
import scipy.signal as sps

In [None]:
# Sampling rate that asr() resamples every input file to.
new_rate = 16000

# Number of samples asr() zero-pads each waveform to. This rebinds
# AUDIO_MAXLEN, which held a different value in the export section.
input_size = 100000
AUDIO_MAXLEN = input_size

# ONNX Runtime session over the model file written by the export cell.
ort_session = onnxruntime.InferenceSession(f'{MODEL_OUTDIR}/model.onnx')


In [None]:
# Invert the vocabulary (token -> id) into an id -> token lookup table.
res = {token_id: token for token, token_id in d.items()}
# Pin the two special-token ids to the marker strings the decoder expects.
res.update({69: "[PAD]", 68: "[UNK]"})

In [None]:
def _normalize(x): #
  """You must call this before padding.
  Code from https://github.com/vasudevgupta7/gsoc-wav2vec2/blob/main/src/wav2vec2/processor.py#L101
  Fork TF to numpy
  """
  # -> (1, seqlen)
  mean = np.mean(x, axis=-1, keepdims=True)
  var = np.var(x, axis=-1, keepdims=True)
  return np.squeeze((x - mean) / np.sqrt(var + 1e-5))

In [None]:
def remove_adjacent(item):
  """Collapse every run of equal consecutive elements into a single element.

  Approach from https://stackoverflow.com/a/3460423

  Args:
    item: iterable of strings (typically a str of characters).

  Returns:
    The remaining elements joined into one string.
  """
  collapsed = []
  for symbol in item:
    # Keep a symbol only when it starts a new run.
    if not collapsed or symbol != collapsed[-1]:
      collapsed.append(symbol)
  return ''.join(collapsed)

In [None]:
def asr(path):
    """Transcribe a WAV file with the exported ONNX wav2vec2 model.

    NumPy port of the TensorFlow code at
    https://github.com/vasudevgupta7/gsoc-wav2vec2/blob/main/notebooks/wav2vec2_onnx.ipynb

    Steps: read the file, downmix to mono, resample to ``new_rate``,
    standardize, zero-pad up to ``AUDIO_MAXLEN`` samples, run the ONNX session,
    take the arg-max token per frame and collapse repeated tokens between
    "[PAD]" markers.

    Args:
        path: path of the WAV file to transcribe.

    Returns:
        Whatever ``normalize`` returns for the decoded text.
    """
    sampling_rate, data = wavfile.read(path)
    # Multi-channel files come back as (n_samples, n_channels); average the
    # channels so the statistics in _normalize are taken over time, not over
    # channels.
    if data.ndim > 1:
        data = data.mean(axis=1)
    samples = round(len(data) * float(new_rate) / sampling_rate)
    new_data = sps.resample(data, samples)
    speech = np.array(new_data, dtype=np.float32)
    speech = _normalize(speech).reshape(1, -1)
    # Pad only when the clip is shorter than AUDIO_MAXLEN; a longer clip used
    # to produce a negative pad width and crash in np.zeros. The model was
    # exported with a dynamic sequence axis, so longer input is passed as-is.
    pad_len = max(AUDIO_MAXLEN - speech.shape[1], 0)
    padding = np.zeros((speech.shape[0], pad_len))
    speech = np.concatenate([speech, padding], axis=-1).astype(np.float32)
    ort_inputs = {"input": speech}
    ort_outs = ort_session.run(None, ort_inputs)
    # The export declares a single output; take the most likely token per frame
    # for the first (only) batch item.
    token_ids = np.argmax(ort_outs[0], axis=-1)[0]
    # Text post processing: "[PAD]" separates segments, and repeated tokens
    # inside a segment are collapsed to one.
    _t1 = ''.join([res[i] for i in list(token_ids)])
    # NOTE(review): `normalize` is not defined in the visible notebook cells;
    # confirm where it is meant to come from before running this on a fresh kernel.
    return normalize(''.join([remove_adjacent(j) for j in _t1.split("[PAD]")]))

In [None]:
# Path of the WAV file that is transcribed by the asr(FILENAME) call below.
FILENAME = "/workspace/sound.wav"



In [None]:
# Transcribe the sample file; as the last expression of the cell, the returned
# value is displayed as the cell output.
asr(FILENAME)

## Convert to FP16 model

In [2]:
import onnx
from onnxconverter_common import float16
from pathlib import Path

# Input: the FP32 ONNX file written by the export cell. This used to point at
# the FP16 output file, which does not exist until this cell has finished.
model_path = "/models/wav2vec2/1/model.onnx"
# Output directory of the half-precision model, created if missing.
MODEL_FP16_OUTDIR = Path("/models/wav2vec2_fp16/1/")
MODEL_FP16_OUTDIR.mkdir(parents=True, exist_ok=True)

# Load the FP32 graph, cast its float32 tensors to float16 and save the result.
model_fp32 = onnx.load_model(model_path)
model_fp16 = float16.convert_float_to_float16(model_fp32)
onnx.save(model_fp16, f"{MODEL_FP16_OUTDIR}/model.onnx")

NameError: name 'MODEL_OUTDIR' is not defined