In [8]:
import time
import os
import soundfile as sf
import sounddevice as sd
from scipy.io import wavfile
import pandas as pd

### Record/play functions

In [9]:
# record audio
FS = 44100  # default sample rate
REC_S = 3  # defaut duration of recording

def record(fn='output.wav', duration=REC_S, fs=FS):
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    print('Recording', end='')
    for i in range(10):
        time.sleep(duration/8.0)
        print('.', end='')
    
    sd.wait()  # Wait until recording is finished
    print(' Done')
    wavfile.write(fn, fs, recording)  # Save as WAV file 
    
def play(fn):
    # Extract data and sampling rate from file
    data, fs = sf.read(fn, dtype='float32')  
    sd.play(data, fs)
    status = sd.wait()  # Wait until file is done playing

### wav2vec

Tested on:
- **transformers==4.4.0.dev** (installed via `pip install -e '.[dev]'` on latest version of [repo](https://github.com/huggingface/transformers))
- **torch==1.7.1**

In [10]:
import transformers
import torch

print(torch.__version__)
print(transformers.__version__)

1.7.1
4.4.0.dev0


In [11]:
import torch
from transformers import Wav2Vec2ForMaskedLM, Wav2Vec2Tokenizer

# load pretrained model
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")

We'll test inference on short audio (1-10s) using LJ Speech Dataset (download [here](https://keithito.com/LJ-Speech-Dataset/), read also: [TensorFlow LJSpeech](https://www.tensorflow.org/datasets/catalog/ljspeech)).

We may also choose to test via pre-recorded audio or own recording using `record()`. Note that `wav2vec2` model expects **16 kHz, single-channel** audio.

#### Get sample from LJ Speech

In [12]:
LJ_DIR = 'LJSpeech-1.1'
if not os.path.exists(LJ_DIR):
    raise Exception('Download LJ Speech first.')
    
ds = pd.read_csv(os.path.join(LJ_DIR, 'metadata.csv'), sep='|', names=['id', 'text', 'text_normalized'])
#print('Text range: {:.0f}-{:.0f}'.format(ds.text_normalized.str.len().min(),ds.text_normalized.str.len().max()))

MAX_LEN = 75
ds_short = ds[ds.text_normalized.str.len() < MAX_LEN]

samp = ds_short.sample().iloc[0]
print(samp.id, samp.text_normalized)

LJ038-0284 Although the Commission recognizes that neither expert was able to state


In [13]:
# LJ Speech is not 16kHz, so we re-sample
from scipy.signal import resample

target_fs = 16000
test_file = 'output.wav'

raw_file = os.path.join(LJ_DIR, 'wavs', samp.id+'.wav')
x, orig_fs = sf.read(raw_file)
x = resample(x, num=int(len(x)*target_fs/orig_fs))
wavfile.write(test_file, target_fs, x)  # Save as WAV file 

play(test_file)

#### Record own audio / Pre-recorded

In [20]:
# set name appropriately and comment-out record() if gonna use pre-recorded
test_file = 'output.wav'
record(fn=test_file, fs=16000, duration=3)

play(test_file)

Recording.......... Done


#### Inference

In [21]:
# load audio
audio_input, _ = sf.read(test_file) # expected 16kHz, single-channel

def transcribe(audio):
    # transcribe
    input_values = tokenizer(audio, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)[0]
    
    return transcription

transcribe(audio_input)

'ALTHOUGH THE COMMISSION RECOGNIZES THAT NEITHER EXPERT WAS ABLE TO STATE'

In [29]:
audio

NameError: name 'audio_arr' is not defined

#### Streaming

In [3]:
import time
import pyaudio as pa
from scipy.io import wavfile
import numpy as np

p = pa.PyAudio()
print('Available audio input devices:')
for i in range(p.get_device_count()):
    dev = p.get_device_info_by_index(i)
    if dev.get('maxInputChannels'):
        print(i, dev.get('name'))
print('Please type input device ID:')
dev_idx = int(input())

Available audio input devices:
12 pulse
14 default
Please type input device ID:
14


In [4]:
CHANNELS = 1
RATE = 16000
DURATION = 2.0
CHUNK_SIZE = int(DURATION*RATE)

In [5]:
def callback(in_data, frame_count, time_info, status):
    # in_data: bytes
    audio_arr = np.frombuffer(in_data, dtype=float)
    
    #print(transcribe(audio_arr))
    wavfile.write('test.wav', RATE, audio_arr)
    return (in_data, pa.paContinue)

In [6]:
stream = p.open(format=pa.paInt16,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=dev_idx,
                stream_callback=callback,
                frames_per_buffer=CHUNK_SIZE)

stream.start_stream()

# while stream.is_active():
#     time.sleep(0.1)

for i in range(10):
    time.sleep(DURATION/8.0)
    #print('.', end='')


stream.stop_stream()
stream.close()

p.terminate()