# Before starting:
Make sure to enable the GPU:
- Runtime
- Change runtime type
- T4 GPU

After GPU activation:
- Runtime
- Run all
- Scroll to the end of the page and wait until a visual app appears under the last cell.
- You can now use the app


# Install libraries

In [1]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Replacement for ``locale.getpreferredencoding`` that always answers UTF-8.

    The ``do_setlocale`` argument is accepted for signature compatibility
    and is ignored.
    """
    return "UTF-8"


# Install the override globally so every later caller sees UTF-8.
# NOTE(review): presumably a workaround for Colab shell commands that refuse
# to run under a non-UTF-8 locale -- confirm before removing.
locale.getpreferredencoding = getpreferredencoding

In [5]:
!pip install --upgrade insanely-fast-whisper yt_dlp optimum accelerate transformers gradio

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


# Load libraries, models

In [1]:
import tempfile
import os
import time
import gradio as gr
import spacy
import torch
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from math import floor
from pprint import pprint
from transformers.utils import is_flash_attn_2_available
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq



# spaCy English pipeline; used further down to split transcripts into sentences.
nlp = spacy.load('en_core_web_sm')

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 16
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# First CUDA device when a GPU is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Half precision is only requested on the GPU path. The CPU fallback runs in
# float32, because half-precision CPU kernels may be missing or very slow.
TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Speech-recognition pipeline shared by every transcription function below.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=TORCH_DTYPE,
    device=device, # or mps for Mac devices
    #model_kwargs={"use_flash_attention_2": is_flash_attn_2_available()},# set to False for old GPUs
)







Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Timestamp transformation

In [2]:

def format_time(time_:float) -> str:
  """Convert a time in seconds to the srt timestamp format ``HH:MM:SS,mmm``.

  Args:
    time_: non-negative time offset in seconds.

  Returns:
    The zero-padded ``hours:minutes:seconds,milliseconds`` string.
  """
  # Work in whole milliseconds: rounding once avoids the float error of
  # `(time_ % 1) * 1000` (e.g. 2.32 s used to come out as 319 ms).
  total_ms = int(round(time_ * 1000))
  total_seconds, millis = divmod(total_ms, 1000)
  total_minutes, seconds = divmod(total_seconds, 60)
  # Hours are computed as well so inputs of one hour or more stay valid.
  hours, minutes = divmod(total_minutes, 60)
  return f'{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}'

def timestamp_srt(timestamp:tuple):
  """Convert a (start, end) timestamp pair to an srt timing line.

  Args:
    timestamp: ``(start, end)`` in seconds. ``end`` may be ``None``.

  Returns:
    The line ``"<start> --> <end>\\n"`` with both times formatted by
    ``format_time``.
  """
  start, end = timestamp # unpack tuple
  if end is None:
    # NOTE(review): the ASR pipeline can leave the final chunk without an end
    # time (audio cut mid-word). Fall back to the start so the line stays
    # well-formed instead of crashing in format_time.
    end = start
  return f'{format_time(start)} --> {format_time(end)}\n'

def get_srt(chunks):
  """Build the srt document for a sequence of transcript chunks.

  Each chunk is a mapping with a ``'timestamp'`` entry (passed to
  ``timestamp_srt``) and a ``'text'`` entry. Entries are numbered from 1 in
  iteration order and each one is followed by a blank line.
  """
  entries = [
    f'{number}\n'  # chunk number
    + timestamp_srt(piece['timestamp'])
    + f"{piece['text'].strip()}\n\n"
    for number, piece in enumerate(chunks, start=1)
  ]
  return ''.join(entries)


# Download video data

In [3]:
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

def download_yt_audio(yt_url, filename):
    """Download the media behind ``yt_url`` to ``filename``.

    The video metadata is fetched first so that videos longer than
    ``YT_LENGTH_LIMIT_S`` seconds are rejected before anything is downloaded.

    Args:
        yt_url: URL of the YouTube video.
        filename: output path handed to yt-dlp as its ``outtmpl``.

    Raises:
        gr.Error: if the metadata cannot be fetched, the duration is unknown,
            the video is too long, or the download fails.
    """
    try:
        # Context manager so the metadata-only loader is closed after use.
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # Use the numeric duration instead of parsing the "H:MM:SS" display
    # string, which is absent when yt-dlp cannot determine a duration.
    duration = info.get("duration")
    if duration is None:
        raise gr.Error("Could not determine the length of this YouTube video.")
    file_length_s = int(duration)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # yt-dlp reports failed downloads as DownloadError; ExtractorError
            # is still caught for backward compatibility.
            raise gr.Error(str(err)) from err


def yt_transcribe(yt_url, task= 'transcribe', language='french',  max_filesize=75.0):
    """Download a YouTube video and transcribe (or translate) its audio.

    Args:
        yt_url: URL of the video to process.
        task: ``'transcribe'`` or ``'translate'``; forwarded to the model.
        language: language name forwarded to the model's generation options.
        max_filesize: accepted for interface compatibility; not used here.

    Returns:
        ``[sentences, text_path, chunks, srt_path]``: the sentence-split
        transcript, the path of the txt file it was written to, the srt text,
        and the path of the srt file.
    """
    assert task in ['transcribe', 'translate'], "task value should be in ['transcribe', 'translate']"
    # The embed snippet is built here but is not part of the returned values.
    _return_yt_html_embed(yt_url)

    # Download into a throw-away directory and keep only the raw bytes.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, video_path)
        with open(video_path, "rb") as video_file:
            raw_bytes = video_file.read()

    # Decode the bytes at the sampling rate the feature extractor expects.
    waveform = ffmpeg_read(raw_bytes, pipe.feature_extractor.sampling_rate)
    audio = {"array": waveform, "sampling_rate": pipe.feature_extractor.sampling_rate}

    generation_options = {"task": task, "language": language}
    result = pipe(
        audio,
        chunk_length_s=3,
        batch_size=BATCH_SIZE,
        return_timestamps=True,
        generate_kwargs=generation_options,
    )

    # Plain transcript: one sentence per paragraph.
    sentences = '\n\n'.join(sentence.text for sentence in nlp(result['text']).sents)
    text_path = './need_review.txt'
    with open(text_path, 'w', encoding="utf-8") as txt_file:
        txt_file.write(sentences)

    # Timestamped transcript in srt format.
    chunks = get_srt(result['chunks'])
    srt_path = './need_review.srt'
    with open(srt_path, 'w', encoding="utf-8") as srt_file:
        srt_file.write(chunks)

    return [sentences, text_path, chunks, srt_path]

# Local file

In [4]:
def transcribe(filepath, task = 'transcribe', language = 'french'):
    """Transcribe (or translate) a local audio/video file.

    Args:
        filepath: path of the uploaded file, or ``None`` when nothing was
            submitted.
        task: ``'transcribe'`` or ``'translate'``; forwarded to the model.
        language: language name forwarded to the model's generation options.

    Returns:
        ``[sentences, text_path, chunks, srt_path]``: the sentence-split
        transcript, the path of the txt file it was written to, the srt text,
        and the path of the srt file.

    Raises:
        gr.Error: if no file was submitted.
    """
    # Validate before touching the file. The previous check ran after `inputs`
    # had been rebuilt as a dict, so it could never fire and a missing upload
    # crashed inside open().
    if filepath is None:
      raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    with open(filepath, "rb") as f:
      inputs = f.read()

    # Decode the bytes at the sampling rate the feature extractor expects.
    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

    text = pipe(inputs,
               chunk_length_s=3,
               batch_size=BATCH_SIZE,
               return_timestamps=True,
               generate_kwargs={
            "task": task,
            "language": language,
        },)
    # transcript without timestamps, one sentence per paragraph
    transcript = text['text']
    sentences = '\n\n'.join([sent.text for sent in nlp(transcript).sents])
    text_path = './need_review.txt'
    with open(text_path, 'w', encoding="utf-8") as f1:
      f1.write(sentences)
    # transcript with timestamps, in srt format
    chunks = get_srt(text['chunks'])
    srt_path = './need_review.srt'
    with open(srt_path, 'w', encoding="utf-8") as f2:
      f2.write(chunks)
    return [sentences, text_path, chunks, srt_path]

# After review
Once you have reviewed the srt file, upload it in the "chunk orderer" tab of the app to renumber its chunks.

In [5]:
def _renumber_srt_lines(lines):
  """Return srt text whose chunks are numbered 1, 2, 3... in order of appearance."""
  stripped = [line.strip() for line in lines]
  renumbered = []
  chunk_number = 0
  for position, line in enumerate(stripped):
    following = stripped[position + 1] if position + 1 < len(stripped) else ''
    if line.isdecimal() and '-->' in following:
      # Stale index line: dropped here, regenerated with the timing line.
      continue
    if '-->' in line:
      # A timing line opens a chunk, so the fresh index goes right before it.
      chunk_number += 1
      renumbered.append(str(chunk_number))
    renumbered.append(line)
  return ''.join(f'{line}\n' for line in renumbered)


def re_order_chunks(file):
  """Renumber the chunks of a reviewed srt file.

  Numbering is driven by the timing lines (those containing ``-->``). A
  subtitle line made only of digits (e.g. "2024") is therefore kept as text,
  and a chunk added without an index line still gets one.

  Args:
    file: object whose ``name`` attribute is the path of the reviewed file.

  Returns:
    ``[corrected_content, final_file_name]``: the renumbered srt text and the
    name of the file it was written to (``ordered_<original name>`` in the
    current directory).
  """
  # basename instead of split("/") so Windows-style paths work too.
  final_file_name = f'ordered_{os.path.basename(file.name)}'
  with open(file.name, 'r', encoding="utf-8") as checked_file:
    corrected_content = _renumber_srt_lines(checked_file.readlines())
  # Create the final file
  with open(final_file_name, 'w', encoding="utf-8") as f:
    f.write(corrected_content)
  return [corrected_content, final_file_name]


# Visual demo


In [6]:

# Shut down any Gradio app left over from a previous run of this cell.
gr.close_all()


def _task_and_language_inputs():
    """Build a fresh pair of task/language input widgets for one interface."""
    return [
        gr.Radio(["transcribe", "translate"], label="task", value="transcribe"),
        # Whisper's `language` is the language spoken in the audio; the
        # translate task always produces English.
        gr.Textbox(value='french', label='language',
                   info='the language spoken in the audio/video (translation always outputs English)'),
    ]


def _transcript_outputs():
    """Build a fresh set of the four transcript output widgets for one interface."""
    return [
        gr.Textbox(label='transcript without timestamps'),
        gr.File(label="The txt file containing the transcript without timestamps"),
        gr.Textbox(label='transcript with timestamps'),
        gr.File(label="The srt file containing the transcript with timestamps"),
    ]


demo = gr.Blocks()

# Tab 1: transcribe a YouTube video from its URL.
yt = gr.Interface(
    fn=yt_transcribe,
    inputs=[gr.Textbox(label='Youtube Url',
                       value='https://www.youtube.com/watch?v=_KFZgN7MXdw&pp=ygUXY2FzdGVsbG8gbG9wZXMgZW1tZW50YWw%3D'),
            *_task_and_language_inputs()],
    outputs=_transcript_outputs(),
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length.\n"
        "The model may have a hard time with background sounds/voices and proper nouns, so check the results!"
    ),
)

# Tab 2: transcribe an uploaded audio/video file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[gr.File(label="Audio/Video file"), *_task_and_language_inputs()],
    outputs=_transcript_outputs(),
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form audio/video inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
)

# Tab 3: renumber the chunks of a reviewed srt file.
chunk_order = gr.Interface(
    fn=re_order_chunks,
    inputs = gr.File(label='The reviewed srt file'),
    outputs = [gr.Textbox(label='ordered chunks'),
              gr.File(label="The srt file containing the above ordered chunks")],
    title="Srt chunk orderer",
    description=(
        "After review of a srt file, if you deleted or added chunks, the order of chunks may be messed up."
        # Leading space added: the two sentences used to run together.
        " This tool will assign a unique number to each chunk, according to the order in which they appear in your file."
    ),
)

# Assemble the three interfaces as tabs of a single app.
with demo:
  gr.TabbedInterface([yt, file_transcribe, chunk_order],
   ['Youtube transcripter', 'Local file transcripter','chunk orderer'])

demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a8dd03b3d8ec7b6001.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


