# Test of wav2vec for norwegian

**Author:** [Computas AS](https://github.com/computas) ([kontakt@computas.com](mailto:kontakt@computas.com))

**Achievement:** *[Short, preferably single-line, statement of what has been accomplished. For example, "Assuming ... and using ... we show that ...".]*

## Introduction

This is a simple quality test of Facebook's wav2vec ASR system.

Based on the code from: 
- https://github.com/pytorch/fairseq/tree/master/examples/wav2vec

# Reproducibility and code formatting

In [1]:
# To watermark the environment
%load_ext watermark

# For automatic code formatting in jupyter lab.
%load_ext lab_black

# For automatic code formatting in jupyter notebook
%load_ext nb_black

# ASR

In [2]:
# Download wav2vec large
!wget https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt

--2020-07-09 12:43:29--  https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)...104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response...200 OK
Length: 325396342 (310M) [application/octet-stream]
Saving to: ‘wav2vec_large.pt.1’

wav2vec_large.pt.1   17%[==>                 ]  54,39M  8,77MB/s    eta 36s^C


In [3]:
import torch
from fairseq.models.wav2vec import Wav2VecModel

In [4]:
cp = torch.load('wav2vec_large.pt',map_location=torch.device('cpu'))
model = Wav2VecModel.build_model(cp['args'], task=None)
model.load_state_dict(cp['model'])
model.eval()

Wav2VecModel(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(8,), stride=(4,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
 

Wav2VecModel(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(8,), stride=(4,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(1, 512, eps=1e-05, affine=True)
        (3): ReLU()
      )
 

In [8]:
# Test that it works
wav_input_16khz = torch.randn(1,10000)
print(wav_input_16khz)
z = model.feature_extractor(wav_input_16khz)
c = model.feature_aggregator(z)
c

tensor([[-0.4459, -0.9507, -0.4878,  ...,  1.3605, -0.5447,  0.9433]])


tensor([[[2.5149e-02, 1.8407e-02, 2.1503e-02,  ..., 1.5585e-02,
          1.8384e-02, 1.5623e-03],
         [2.5810e-04, 5.9348e-04, 1.6409e-02,  ..., 5.8323e-02,
          8.6536e-04, 6.5153e-03],
         [2.1172e-02, 0.0000e+00, 2.9426e-03,  ..., 5.8322e-03,
          4.6896e-03, 0.0000e+00],
         ...,
         [2.3478e-05, 1.1042e-03, 0.0000e+00,  ..., 2.0213e-03,
          0.0000e+00, 0.0000e+00],
         [9.0235e-02, 6.8277e-02, 1.0346e-01,  ..., 3.2361e-01,
          3.6492e-01, 2.7135e-01],
         [2.9137e-01, 3.1641e-01, 3.2684e-01,  ..., 2.4398e-01,
          1.8389e-01, 1.8778e-01]]], grad_fn=<MulBackward0>)

In [16]:
#dir(wav_input_16khz)
wav_input_16khz.size()

torch.Size([1, 10000])

We test with a custom file

In [6]:
import librosa

In [23]:
wav_input = librosa.load('data/solberg.wav')
print(wav_input)
tensors = torch.from_numpy(wav_input[0]).reshape(1,wav_input[0].size)
z = model.feature_extractor(tensors)
c = model.feature_aggregator(z)
c

(array([ 0.        ,  0.        ,  0.        , ..., -0.00349916,
       -0.00802596,  0.        ], dtype=float32), 22050)


tensor([[[0.0635, 0.0635, 0.0635,  ..., 0.0279, 0.0398, 0.0354],
         [0.0221, 0.0221, 0.0221,  ..., 0.1054, 0.0568, 0.1092],
         [0.0005, 0.0005, 0.0005,  ..., 0.0977, 0.3109, 0.3539],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0037, 0.0084, 0.0127],
         [0.4041, 0.4041, 0.4041,  ..., 0.1282, 0.0406, 0.1174],
         [0.0088, 0.0088, 0.0088,  ..., 0.0772, 0.1449, 0.2367]]],
       grad_fn=<MulBackward0>)