In [4]:
pip install vosk

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (1.8 kB)
Collecting websockets (from vosk)
  Downloading websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hDownloading websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.5/130.5 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: websockets, vosk
Successfully installed vosk-0.3.45 websockets-12.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip

In [12]:
import os
from vosk import Model, KaldiRecognizer, SetLogLevel
from pydub import AudioSegment
import wave
import json

In [6]:
def get_vosk_model(model_path):
    # Verify the model and audio path
    if not os.path.exists(model_path):
        print("Model path does not exist")
        return None

    # Read Vosk Model
    print(f"Reading your vosk model '{model_path}'...")
    try:
        model = Model(model_path)
    except:
        print("Failed to create a model")
        return None

    return model

In [20]:
def mono_wav(audio_folder_path, input_file_name, skip=0):
    # Create tmp path
    if not os.path.isdir(os.path.join(audio_folder_path, "tmp")):
        os.mkdir(os.path.join(audio_folder_path, "tmp"))

    # Set input and output paths
    source = os.path.join(audio_folder_path, input_file_name)
    output_path = os.path.join(
        audio_folder_path,
        "tmp",
        os.path.splitext(input_file_name)[0]
        + datetime.now().strftime("_%Y%m%d_%H%M%S")
        + ".wav",
    )

    # Perform conversion
    filename, file_extension = os.path.splitext(source)
    sound = AudioSegment.from_file(source, format=file_extension[1:])  # load source
    sound = sound.set_channels(1)  # mono
    sound = sound.set_frame_rate(16000)  # 16000Hz

    audio = sound[skip * 1000 :]
    outputfile = audio.export(output_path, format="wav", codec="pcm_s16le")

    outputfile.close()

    return output_path


def generate_vosk_transcription(filepath, filename, model):
    # open audio file
    wf = wave.open(filepath, "rb")

    rec = KaldiRecognizer(model, wf.getframerate())

    # To store our results
    transcription = []

    rec.SetWords(True)
    segment_id = 0

    while True:
        data = wf.readframes(5000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # Convert json output to dict
            result_dict = json.loads(rec.Result())
            # Extract text values and append them to transcription list
            if result_dict.get("text", "") != "" and len(result_dict.get("result", [])) > 0:
                for result_word in result_dict.get("result"):
                    transcription.append(
                        [
                            filename,
                            segment_id,
                            result_dict.get("text", ""),
                            result_dict.get("result", "")[0].get("start", ""),
                            result_dict.get("result", "")[-1].get("end", ""),
                            result_word["word"],
                            result_word["start"],
                            result_word["end"]
                        ]
                    )
        segment_id+=1

    # Get final bits of audio and flush the pipeline
    final_result = json.loads(rec.FinalResult())
    if final_result.get("text", "") != "" and len(final_result.get("result", [])) > 0:
        for result_word in final_result.get("result"):
            transcription.append(
                [
                    filename,
                    segment_id,
                    final_result.get("text", ""),
                    final_result.get("result", "")[0].get("start", ""),
                    final_result.get("result", "")[-1].get("end", ""),
                    result_word["word"],
                    result_word["start"],
                    result_word["end"]
                ]
            )

    return transcription

In [16]:
model = get_vosk_model("../../who_said_that/utils/vosk-model-en-us-0.42-gigaspeech/")

Reading your vosk model '../../who_said_that/utils/vosk-model-en-us-0.42-gigaspeech/'...


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=8
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from ../../who_said_that/utils/vosk-model-en-us-0.42-gigaspeech//ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from ../../who_said_that/utils/vosk-model-en-us-0.42-gigaspeech//graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:297) Loading words from ../../who_said_that/utils/vosk-model-en-us-0.42-gigaspeech//graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:308) Loading winfo ../

In [19]:
transcriptions = generate_vosk_transcription(
    "../../output/video_temp/MagnusCarlson_542_599/pyavi/audio.wav",
    "audio.wav"
    ,model)

{'result': [{'conf': 1.0, 'end': 0.18, 'start': 0.0, 'word': "i'm"}, {'conf': 0.847572, 'end': 0.33, 'start': 0.18, 'word': 'going'}, {'conf': 0.847572, 'end': 0.388721, 'start': 0.33, 'word': 'to'}, {'conf': 1.0, 'end': 0.54, 'start': 0.388721, 'word': 'name'}, {'conf': 1.0, 'end': 0.6, 'start': 0.54, 'word': 'a'}, {'conf': 1.0, 'end': 1.14, 'start': 0.6, 'word': 'sport'}, {'conf': 1.0, 'end': 1.68, 'start': 1.53, 'word': 'you'}, {'conf': 1.0, 'end': 1.77, 'start': 1.68, 'word': 'have'}, {'conf': 1.0, 'end': 1.86, 'start': 1.77, 'word': 'to'}, {'conf': 1.0, 'end': 1.98, 'start': 1.86, 'word': 'tell'}, {'conf': 1.0, 'end': 2.07, 'start': 1.98, 'word': 'me'}, {'conf': 1.0, 'end': 2.16, 'start': 2.07, 'word': 'the'}, {'conf': 1.0, 'end': 2.46, 'start': 2.16, 'word': 'greatest'}, {'conf': 1.0, 'end': 2.52, 'start': 2.46, 'word': 'of'}, {'conf': 1.0, 'end': 2.61, 'start': 2.52, 'word': 'all'}, {'conf': 1.0, 'end': 2.88, 'start': 2.61, 'word': 'time'}, {'conf': 1.0, 'end': 3.63, 'start': 3.

In [21]:
transcriptions

[['audio.wav',
  34,
  "i'm going to name a sport you have to tell me the greatest of all time basketball lebron james soccer messi i agree with both baseball barry bonds",
  0.0,
  9.21,
  "i'm",
  0.0,
  0.18],
 ['audio.wav',
  34,
  "i'm going to name a sport you have to tell me the greatest of all time basketball lebron james soccer messi i agree with both baseball barry bonds",
  0.0,
  9.21,
  'going',
  0.18,
  0.33],
 ['audio.wav',
  34,
  "i'm going to name a sport you have to tell me the greatest of all time basketball lebron james soccer messi i agree with both baseball barry bonds",
  0.0,
  9.21,
  'to',
  0.33,
  0.388721],
 ['audio.wav',
  34,
  "i'm going to name a sport you have to tell me the greatest of all time basketball lebron james soccer messi i agree with both baseball barry bonds",
  0.0,
  9.21,
  'name',
  0.388721,
  0.54],
 ['audio.wav',
  34,
  "i'm going to name a sport you have to tell me the greatest of all time basketball lebron james soccer messi i a

In [22]:
import pandas as pd

In [23]:
df = pd.DataFrame(transcriptions, columns=["file_name", "segment_id", "segment_text", "segment_start", "segment_end", "word", "word_start", "word_end"])

In [25]:
df.to_excel("magnus.xlsx", index=False)