Modified to handle YouTube videos.

Prerequisites:
torch,
cuda,
ffmpeg, and 
distil-whisper at https://github.com/huggingface/distil-whisper.

CAUTION: MAKE SURE TO BACK UP AUDIO FILES IF THEY HAVE SPACES IN THEIR NAMES, AS THEY WILL BE RENAMED (AND THE METADATA ALTERED). For file names without spaces, the original file is not renamed and a copy is made in a compatible audio format.

In [2]:
# Display the installed PyTorch version string as the cell output.
import torch
torch.__version__

'2.2.1'

In [3]:
# CUDA is needed for vastly faster transcription.
# Author's timing notes: 47 seconds for a full 30 min podcast! 1.9s for the same file that took whisper 4m 44 seconds!
# The cell output is True when torch can use a CUDA device, False otherwise.
torch.cuda.is_available()

True

In [4]:
# conda activate py310
import torch
import subprocess
import os
import glob
import textwrap
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import ipywidgets as widgets
from pytube import YouTube
from utils import get_audio
from utils import prepare_srt
from pathlib import Path
from pytube import Playlist
import datetime

# Select the compute device once and derive every dependent setting from it:
# the first CUDA GPU with half precision when available, otherwise CPU with
# full precision.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32
print(device)

model_id = "distil-whisper/distil-large-v2"

# Keyword arguments shared by the GPU and the CPU code path.
load_kwargs = dict(
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
if use_cuda:
    # Flash Attention 2 is a GPU-only kernel that works on fp16/bf16 tensors,
    # so it is requested only on the CUDA path. On CPU the library's default
    # attention implementation is used instead of failing at load/inference.
    load_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **load_kwargs)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

2024-03-22 06:14:30.433537: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-22 06:14:30.471227: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-22 06:14:31.602838: I itex/core/wrapper/itex_cpu_wrapper.cc:52] Intel Extension for Tensorflow* AVX2 CPU backend is loaded.
2024-03-22 06:14:31.605178: W itex/core/wrapper/itex_gpu_wrapper.cc:32] Could not load dynamic library: libimf.so: cannot open shared object file: No such file or directory
2024-03-22 06:14:31.636722: W itex/core/ops/op_init.cc:58] Op: _QuantizedMaxPool

cuda:0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Specify the audio file directory.
# NOTE: `directory` is deliberately kept as a plain str with a trailing slash,
# because file paths are built from it by string concatenation.
home_directory = os.path.expanduser("~")
directory = home_directory + '/machine_learning/whisper.cpp/samples/'
# Make sure the download target exists before anything is written into it.
os.makedirs(directory, exist_ok=True)

# Copy and paste a YouTube share link below:
VIDEO_LINK = "https://youtu.be/TcoolNrruwE?feature=shared"

link = widgets.Text(
    value=VIDEO_LINK,
    placeholder="Type link for video",
    description="Video:",
    disabled=False
)
link

yt = YouTube(link.value)
# Take the video id that pytube parsed from the URL instead of slicing the
# link at fixed offsets, which only worked for
# "https://youtu.be/<id>?feature=shared"-shaped links.
name = yt.video_id
print(name)

print(f"Downloading video {link.value} started")
output_file = Path(directory + name + ".mp4")
yt.streams.get_highest_resolution().download(filename=output_file)
print(f"Video saved to {output_file}")

import subprocess

# File name (without directory) of the audio track extracted from the video.
extracted_audio_file = name + '.wav'

def extract_audio(video_path, audio_path):
    """Extract the audio track of a video into a mono 16 kHz 16-bit PCM file.

    ffmpeg is invoked with an argument list (no shell), so paths containing
    spaces or shell metacharacters are passed through verbatim.

    Args:
        video_path: Path (str or path-like) of the input video file.
        audio_path: Path (str or path-like) of the audio file to write. An
            existing file at this location is overwritten.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status,
            or, with return code 127, if the ffmpeg executable cannot be
            found.
    """
    command = [
        "ffmpeg",
        "-y",                    # overwrite without prompting (replaces piping "y" into ffmpeg)
        "-i", str(video_path),
        "-vn",                   # drop the video stream
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        str(audio_path),
    ]
    try:
        # check=True turns a failed conversion into CalledProcessError so the
        # caller can actually detect it; stdin is detached because no
        # interactive answer is needed any more.
        subprocess.run(command, check=True, stdin=subprocess.DEVNULL)
    except FileNotFoundError as exc:
        # Report a missing ffmpeg binary through the same exception type,
        # using the shell's conventional "command not found" status.
        raise subprocess.CalledProcessError(127, command) from exc

# Usage: convert the downloaded video and report the outcome.
try:
    extract_audio(output_file, directory + extracted_audio_file)
except subprocess.CalledProcessError as err:
    print(f"Audio convertion failed with error {err.returncode}.")
else:
    # Reached only when extract_audio returned without raising.
    print("Audio coverted successfully.")

TcoolNrruwE
Downloading video https://youtu.be/TcoolNrruwE?feature=shared started
Video saved to /var/home/fraser/machine_learning/whisper.cpp/samples/TcoolNrruwE.mp4


ffmpeg version 4.4 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 9.3.0 (crosstool-NG 1.24.0.133_b0863d8_dirty)
  configuration: --prefix=/root/miniconda3/envs/conda_bld/conda-bld/ffmpeg_1635335682798/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_place --cc=/root/miniconda3/envs/conda_bld/conda-bld/ffmpeg_1635335682798/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame
  libavutil      56. 70.100 / 56. 70.100
  libavcodec     58.134.100 / 58.134.100
  libavformat    58. 76.100 / 58. 76.100
  libavdevice    58. 13.100 / 58. 13.100
  libavfilter     7.110.100 /  7.110.100
  libavresample   4.  0.  0 /  4.  0.  0

Audio coverted successfully.


size=   16253kB time=00:08:40.10 bitrate= 256.0kbits/s speed=1.28e+03x    
video:0kB audio:16253kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000469%


In [6]:
# Disabled alternative: transcription through the transformers
# "automatic-speech-recognition" pipeline using the model loaded above.
# The whole body is wrapped in a triple-quoted string, so none of it executes;
# the cell only evaluates the string and echoes it as its output.
'''
# chunk_lengthS=15 and batch_size=16 is ideal
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device
)
print(directory + extracted_audio_file)

result_local = pipe(directory + extracted_audio_file)

# output transcription
wrapper = textwrap.TextWrapper(width=80,
    initial_indent=" ",
    subsequent_indent="",
    break_long_words=False,
    break_on_hyphens=False)
print(wrapper.fill(result_local["text"]))

# save transcript to the samples folder as a .md file
saved_txt=result_local["text"]
f = open(directory + extracted_audio_file + ".md", "a")
f.write(saved_txt)
f.close()
'''

'\n# chunk_lengthS=15 and batch_size=16 is ideal\npipe = pipeline(\n    "automatic-speech-recognition",\n    model=model,\n    tokenizer=processor.tokenizer,\n    feature_extractor=processor.feature_extractor,\n    max_new_tokens=128,\n    chunk_length_s=15,\n    batch_size=16,\n    torch_dtype=torch_dtype,\n    device=device\n)\nprint(directory + extracted_audio_file)\n\nresult_local = pipe(directory + extracted_audio_file)\n\n# output transcription\nwrapper = textwrap.TextWrapper(width=80,\n    initial_indent=" ",\n    subsequent_indent="",\n    break_long_words=False,\n    break_on_hyphens=False)\nprint(wrapper.fill(result_local["text"]))\n\n# save transcript to the samples folder as a .md file\nsaved_txt=result_local["text"]\nf = open(directory + extracted_audio_file + ".md", "a")\nf.write(saved_txt)\nf.close()\n'

In [7]:
# transcribe using the base model (great with CUDA enabled whisper.cpp)
# produces subtitles too
# The command is passed as an argument list without a shell, so the model and
# audio paths reach the program verbatim even if they contain spaces or shell
# metacharacters.
transcribe_command = [
    'transcribe',
    '-t', '12',
    '-m', home_directory + '/machine_learning/whisper.cpp/models/ggml-base.en.bin',
    '-f', directory + extracted_audio_file,
    # Output formats requested from the tool: txt, vtt, srt and lrc files.
    '-otxt', '-ovtt', '-osrt', '-olrc',
]
try:
    # check=True raises CalledProcessError on a non-zero exit status.
    subprocess.run(transcribe_command, check=True)
    print("Transcription executed successfully and saved in " + directory)
except FileNotFoundError:
    # Without a shell, a missing executable surfaces as FileNotFoundError
    # rather than as exit status 127.
    print("Transcription failed: the 'transcribe' command was not found on PATH.")
except subprocess.CalledProcessError as e:
    print(f"Transcription failed with error {e.returncode}.")

Starting container...                   	[32m [ OK ]
[0mInstalling basic packages...            	[32m [ OK ]
[0mSetting up devpts mounts...             	[32m [ OK ]
[0mSetting up read-only mounts...          	[32m [ OK ]
[0mSetting up read-write mounts...         	[32m [ OK ]
[0mSetting up host's sockets integration...	[32m [ OK ]
[0mIntegrating host's themes, icons, fonts...	[32m [ OK ]
[0mSetting up package manager exceptions...	[32m [ OK ]
[0mSetting up rpm exceptions...            	[32m [ OK ]
[0mSetting up distrobox profile...         	[32m [ OK ]
[0mSetting up sudo...                      	[32m [ OK ]
[0mSetting up user groups...               	[32m [ OK ]
[0mSetting up kerberos integration...      	[32m [ OK ]
[0mSetting up user's group list...         	[32m [ OK ]
[0mSetting up user home...                 	[32m [ OK ]
[0mEnsuring user's access...               	[32m [ OK ]
[0m
Container Setup Complete!
whisper_init_from_file_with_params_no_state


[00:00:00.000 --> 00:00:02.520]   "It wasn't a roast in any traditional sense,
[00:00:02.520 --> 00:00:04.400]   but that did not stop President Joe Biden
[00:00:04.400 --> 00:00:06.080]   from getting some laughs at a fundraiser
[00:00:06.080 --> 00:00:07.200]   in Dallas last night.
[00:00:07.200 --> 00:00:09.880]   And this was the joke that killed.
[00:00:09.880 --> 00:00:12.760]   Quote, "Just the other day a guy came to me and said,
[00:00:12.760 --> 00:00:14.440]   "Mr. President, I need your help.
[00:00:14.440 --> 00:00:15.600]   "I'm being crushed with debt.
[00:00:15.600 --> 00:00:17.280]   "I'm completely wiped out.
[00:00:17.280 --> 00:00:20.680]   "I had to say, Donald, I can't help you."
[00:00:20.680 --> 00:00:23.720]   As the saying goes, it's funny 'cause it's true.
[00:00:23.720 --> 00:00:26.200]   Donald Trump's financial situation is not pretty.
[00:00:26.200 --> 00:00:28.400]   There's that looming half a billion dollar bond
[00:00:28.400 --> 00:00:31.400]   due 

output_txt: saving output to '/var/home/fraser/machine_learning/whisper.cpp/samples/TcoolNrruwE.wav.txt'
output_vtt: saving output to '/var/home/fraser/machine_learning/whisper.cpp/samples/TcoolNrruwE.wav.vtt'
output_srt: saving output to '/var/home/fraser/machine_learning/whisper.cpp/samples/TcoolNrruwE.wav.srt'
output_lrc: saving output to '/var/home/fraser/machine_learning/whisper.cpp/samples/TcoolNrruwE.wav.lrc'

whisper_print_timings:     load time =   235.10 ms
whisper_print_timings:     fallbacks =   0 p /   0 h
whisper_print_timings:      mel time =   311.25 ms
whisper_print_timings:   sample time =  4641.82 ms / 12235 runs (    0.38 ms per run)
whisper_print_timings:   encode time =   100.51 ms /    19 runs (    5.29 ms per run)
whisper_print_timings:   decode time =    63.72 ms /    12 runs (    5.31 ms per run)
whisper_print_timings:   batchd time =  6343.85 ms / 12129 runs (    0.52 ms per run)
whisper_print_timings:   prompt time =   917.61 ms /  3996 runs (    0.23 ms per