In [None]:
#@title **Required settings:**
# Colab form cell: every `# @param` value below is a notebook-level global that
# the later cells read. Run this cell before any other cell.
# Print the NVIDIA GPU status of the current runtime.
!nvidia-smi

# @markdown **【IMPORTANT】:**<font size="2">Select uploaded file type.

# encoding:utf-8
# file_type = "audio"  # @param ["audio","video"]

# @markdown #### **Youtube video**
# URL of a single video or of a playlist. The download cell first tries to open
# it as a single video and falls back to treating it as a playlist.
yt_url = "https://www.youtube.com/playlist?list=PL7lx05VRzoFHJK1A7KF8mrqrNo5NSOBxr"  # @param {type:"string"}

# @markdown #### **Initial prompt**
# @markdown Prompts can be very helpful for correcting specific words or acronyms that the model often misrecognizes in the audio.
# Passed to whisperx as --initial_prompt when it is not empty.
prompt = "Game development talk"  # @param {type:"string"}

# @markdown #### Model
# Passed to whisperx as --model.
model_size = "large-v2"  # @param ["base", "base.en", "small", "small.en","medium", "medium.en", "large-v1","large-v2","large-v3"]

# @markdown #### Language
# Passed to whisperx as --language; "auto" omits the flag entirely.
language = "en" # @param ["auto", "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"]

# @markdown #### Filename Type
# @markdown Use YouTube title as file name by default
# NOTE(review): the value selected below is "id", not "title", and no cell
# visible in this notebook reads filename_type; downloaded files are always
# named after the video id.
filename_type = "id"  # @param ["title", "id"]

# @markdown #### Assign speaker labels
# @markdown Recognize speakers
# When True, the whisperx cell adds --diarize together with a Hugging Face token.
# The misspelled name ("lable") is what the whisperx cell references, so keep
# both in sync if it is ever renamed.
assign_speaker_lable = False # @param {type:"boolean"}

# @markdown #### Align whisper output
# When True, the whisperx cell adds --align_model WAV2VEC2_ASR_LARGE_LV60K_960H.
align_whisper_output = True # @param {type:"boolean"}

In [None]:
#@title **Run Whisper**
#@markdown srt files will be auto downloaded after finish.
! pip install pytube
! pip install yt_dlp

print('Start downloading videos.')

from IPython.display import clear_output
clear_output()
import os
import subprocess
from yt_dlp import YoutubeDL
import pytube
import torch
from google.colab import files
from pathlib import Path
from tqdm import tqdm
import time
import requests
import sys
import gc
import re
from IPython.display import display, Markdown, YouTubeVideo


# assert file_name != ""
# assert language != ""
# Wall-clock start of the run (not read by any cell visible in this notebook).
tic = time.time()

# Local audio paths, filled by the download loop and later rewritten to the
# converted .wav paths.
video_path_local_list = []

# Resolve yt_url into a list of pytube video objects: try it as a single video
# first and, if pytube cannot build a YouTube object from it, as a playlist.
try:
    list_video_yt = [pytube.YouTube(yt_url)]
except Exception:
    try:
        list_video_yt = list(pytube.Playlist(yt_url).videos)
    except Exception as playlist_error:
        # Chain explicitly so the underlying pytube failure stays in the traceback.
        raise RuntimeError(f"{yt_url} isn't recognized.") from playlist_error

# An empty playlist would otherwise make every later cell silently do nothing.
if not list_video_yt:
    raise RuntimeError(f"{yt_url} contains no videos.")

def _download_with_pytube(video_yt):
    """Download the 48kbps audio/mp4 stream of one video with pytube.

    Returns the path of the saved ``<video_id>.mp4`` in the current directory.
    Raises RuntimeError when no matching stream exists; pytube errors propagate.
    """
    audio_path = Path(".").resolve() / (video_yt.video_id + ".mp4")
    stream = video_yt.streams.filter(
        type="audio",
        mime_type="audio/mp4",
        abr="48kbps",
    ).first()
    if stream is None:
        # first() yields None when the filter matches nothing; fail explicitly
        # instead of relying on an AttributeError to reach the fallback.
        raise RuntimeError(f"No 48kbps audio/mp4 stream for video {video_yt.video_id}.")
    stream.download(
        output_path=audio_path.parent,
        filename=audio_path.name,
    )
    return audio_path


def _download_with_ytdlp(video_id):
    """Download one video's audio with yt-dlp and extract it to mp3 via ffmpeg.

    Returns the path of the resulting ``<video_id>.mp3`` in the current directory.
    Raises RuntimeError when yt-dlp reports a non-zero return code.
    """
    audio_path = Path(".").resolve() / (video_id + ".mp3")
    ydl_opts = {
        "outtmpl": f'./{video_id}.%(ext)s',
        'format': 'm4a/bestaudio/best',
        # See help(yt_dlp.postprocessor) for the available postprocessors.
        'postprocessors': [{  # Extract audio using ffmpeg
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }]
    }
    with YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([f'https://www.youtube.com/watch?v={video_id}'])
    if error_code:
        raise RuntimeError(f"yt-dlp exited with code {error_code} for video {video_id}.")
    return audio_path


# Download the audio of every resolved video: pytube first, yt-dlp as fallback.
for video_yt in list_video_yt:
    video_id = video_yt.video_id

    # Preview the video. An availability failure is only reported, not fatal,
    # because the download attempts below may still succeed.
    try:
        video_yt.check_availability()
        display(
            YouTubeVideo(video_id)
        )
    except Exception:
        display(
            Markdown(f"**Video {video_id} ({yt_url}) isn't available.**"),
        )

    try:
        video_path_local = _download_with_pytube(video_yt)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the cell instead of triggering the fallback.
        try:
            video_path_local = _download_with_ytdlp(video_id)
        except Exception as e:
            display(
                Markdown(f"**Video {video_id} ({yt_url}) isn't available on yt-dlp.**"),
            )
            raise RuntimeError(f"{yt_url} isn't available.") from e

    video_path_local_list.append(video_path_local)

# Convert every downloaded .mp4/.mp3 to a 16 kHz, mono, 16-bit PCM .wav next to
# it and point the list entry at the .wav. Entries that are already .wav (e.g.
# when this cell is re-run) are left untouched.
for index, video_path_local in enumerate(video_path_local_list):
    if video_path_local.suffix not in (".mp4", ".mp3"):
        continue
    wav_path = video_path_local.with_suffix(".wav")
    subprocess.run(
        [
            "ffmpeg",
            "-nostdin",  # never wait for keyboard input inside the notebook
            "-y",        # overwrite a .wav left over from a previous run
            "-i", str(video_path_local),
            "-vn",
            "-acodec", "pcm_s16le",
            "-ar", "16000",
            "-ac", "1",
            str(wav_path),
        ],
        check=True,  # surface a failed conversion here, not later in whisperx
    )
    # Only record the .wav once ffmpeg has actually produced it.
    video_path_local_list[index] = wav_path


In [None]:
! pip install git+https://github.com/m-bain/whisperx.git


In [None]:
import os


language_param = ""
if language != "auto":
    language_param = f"--language {language}"

diarize_param = ""
if assign_speaker_lable:
    diarize_param = "--diarize --hf_token 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"

align_whisper_param = ""
if align_whisper_output:
    align_whisper_param = "--align_model WAV2VEC2_ASR_LARGE_LV60K_960H"

prompt_param = ""
if prompt != "":
    prompt_param = f'--initial_prompt "{prompt}"'

for video_path_local in video_path_local_list:
    file_name = os.path.splitext(video_path_local)[0]
    input = file_name.replace(":", "\:").replace("'", "'").replace("content/","")
    run = f'whisperx \'.{input}.wav\' --model {model_size} {language_param} --output_dir . {prompt_param} {align_whisper_param} {diarize_param}'

    print(run)

    !{run}



In [None]:
import os

from google.colab import files

# Send the .srt that sits next to each processed audio file to the browser.
for video_path_local in video_path_local_list:
    # files.download takes a plain filesystem path, so the name must not be
    # shell-escaped (a backslash before ":" would point at a different file).
    base_filename = os.path.splitext(video_path_local)[0]
    srt_filename = f"{base_filename}.srt"
    print(srt_filename)
    if not os.path.exists(srt_filename):
        # One missing transcript should not block the remaining downloads.
        print(f"Skipping {srt_filename}: transcript not found.")
        continue
    files.download(srt_filename)