# Imports

In [59]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer

import torch
import glob
import os
import librosa
import torchaudio

# Load data

In [64]:
def load_transcripts(config="clean", split="test", base_path=""):
    """Read transcript files into an ``{utterance_id: text}`` dictionary.

    Every ``*.txt`` file located exactly two directory levels below
    ``base_path`` is parsed line by line. The first space-separated token of
    a line becomes the key; the rest of the line (without the trailing
    newline) becomes the transcript text.

    Args:
        config: Not used by this function.
        split: Not used by this function.
        base_path: Directory searched with the pattern
            ``{base_path}/*/*/*.txt``.

    Returns:
        dict mapping each utterance id to its transcript string. The dict is
        empty when no file matches the pattern.
    """
    transcripts = {}

    # The pattern has a fixed depth (no "**"), so glob's ``recursive`` flag
    # would have no effect and is left out.
    for filepath in glob.glob(f"{base_path}/*/*/*.txt"):
        # Explicit encoding so parsing does not depend on the locale default.
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                # Blank lines carry no utterance; without this guard they
                # would register a bogus "" key with an empty transcript.
                if not line.strip():
                    continue
                key, _, text = line.partition(" ")
                transcripts[key] = text

    return transcripts


def load_audio(config="clean", split="test", path="", target_sr=16000):
    """Load one audio file and return its samples at ``target_sr`` Hz.

    Args:
        config: Not used by this function.
        split: Not used by this function.
        path: Location of the audio file, handed to ``torchaudio.load``.
        target_sr: Sampling rate of the returned waveform. Defaults to 16000,
            the rate this notebook later passes to the processor.

    Returns:
        The tensor produced by ``torchaudio.load`` with all size-1 dimensions
        squeezed out, resampled to ``target_sr`` when the file's own rate
        differs.
    """
    speech, sr = torchaudio.load(path)
    # squeeze() removes every size-1 dimension.
    # NOTE(review): a multi-channel file would keep its channel dimension here;
    # presumably all inputs are mono -- confirm against the dataset.
    speech = speech.squeeze()

    # Only build a resampler when the rates differ; resampling to the same
    # rate is a no-op.
    if sr != target_sr:
        speech = torchaudio.transforms.Resample(sr, target_sr)(speech)

    return speech
    
    
def load_dataset(config="clean", split="test", data_root="../data/en/LibriSpeech"):
    """Load all transcripts of one subset together with the matching audio.

    The subset directory is ``{data_root}/{split}-{config}`` (both lowercased).
    Each audio file is expected at
    ``{subset}/{first id field}/{second id field}/{utterance_id}.flac``, where
    the id fields are the first two ``-``-separated parts of the utterance id.

    Args:
        config: Subset flavour, lowercased into the directory name.
        split: Subset split, lowercased into the directory name.
        data_root: Directory containing the ``{split}-{config}`` folders.
            Defaults to the location this notebook has always used.

    Returns:
        Tuple ``(transcripts, audio)``: two dicts sharing the same keys, the
        first mapping utterance id to text and the second mapping utterance id
        to the waveform returned by ``load_audio``.
    """
    base_path = f"{data_root}/{split.lower()}-{config.lower()}"

    transcripts = load_transcripts(config, split, base_path)

    audio = {}
    # Only the ids are needed here; the transcript text plays no part in
    # locating the audio file.
    for key in transcripts:
        subdir = "/".join(key.split("-")[:2])
        audio[key] = load_audio(config, split, f"{base_path}/{subdir}/{key}.flac")

    return transcripts, audio

# Load the default subset (split "test", config "clean"): transcript texts and
# waveforms, both keyed by utterance id.
transcripts, audio = load_dataset(config="clean", split="test")

# Load model

In [65]:
# The model and its processor are loaded from the same pretrained checkpoint.
MODEL_ID = "facebook/wav2vec2-base-960h"
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1.60k/1.60k [00:00<00:00, 641kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 378M/378M [00:15<00:00, 24.9MB/s]
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 159/159 [00:00<00:00, 65.5kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 163/163 [00:00<00:00, 67.7kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████

In [67]:
# Peek at the first ten utterance ids (iterating a dict yields its keys).
list(transcripts)[:10]

['61-70970-0000',
 '61-70970-0001',
 '61-70970-0002',
 '61-70970-0003',
 '61-70970-0004',
 '61-70970-0005',
 '61-70970-0006',
 '61-70970-0007',
 '61-70970-0008',
 '61-70970-0009']

In [70]:
# Turn the raw waveform of one sample utterance into model-ready features.
encoded = processor(
    audio["61-70970-0000"],
    sampling_rate=16000,
    return_tensors="pt",
)
input_values = encoded["input_values"]

In [71]:
# Inference only: disable autograd so no computation graph is built (and kept
# in memory) for the forward pass.
with torch.no_grad():
    logits = model(input_values)["logits"]


In [72]:
# Greedy decoding: keep the index of the highest-scoring entry along the last axis.
predicted_ids = logits.argmax(dim=-1)

In [73]:
# Decode the id sequence of the first (and only) batch item into text.
first_sequence = predicted_ids[0]
transcription = processor.decode(first_sequence)

In [74]:
# Show the text decoded from the model's prediction for the sample utterance.
transcription

"YOUNG FITZOOTH HAD BEEN COMMANDED TO HIS MOTHER'S CHAMBER SO SOON AS HE HAD COME OUT FROM HIS CONVERSE WITH THE SQUIRE"

In [75]:
# Show the transcript loaded from the dataset's .txt files for the same
# utterance, for a side-by-side comparison with the decoded text.
transcripts["61-70970-0000"]

"YOUNG FITZOOTH HAD BEEN COMMANDED TO HIS MOTHER'S CHAMBER SO SOON AS HE HAD COME OUT FROM HIS CONVERSE WITH THE SQUIRE"

In [77]:
# jiwer expects wer(reference, hypothesis): the dataset transcript goes first,
# the model output second. WER is normalised by the reference length, so
# swapping the two gives a different score whenever they differ.
wer([transcripts["61-70970-0000"]], [transcription])

0.0