<a href="https://colab.research.google.com/github/curtec/julie/blob/main/Trelis_Whisper_Transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font size=64px>Whisper Transcription</font>

Notebook by Trelis Research. Built upon an [original notebook](https://colab.research.google.com/github/deepgram-devs/try-whisper-in-google-collab/blob/main/try_whisper_in_three_easy_steps.ipynb) by Ross O'Connell.

Find:
- [Trelis on YouTube](https://youtube.com/@trelisresearch)
- [The Trelis Newsletter](https://blog.trelis.com)
- Fine-tuning video and scripts (upcoming)


---

Key updates since original Whisper Fine-tuning video:
- Allow transcription from uploaded mp3, m4a or wav files.
- Make whisper-turbo the default, for a lower word error rate and a 2x speed-up versus whisper-small.

## Mount Google Drive (optional)
If you mount Drive, you will need to change the paths in the cells below that locate and transcribe audio.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# import os
# cache_dir = "/content/drive/My Drive/video_transcripts"
# os.makedirs(cache_dir, exist_ok=True) # Ensure the directory exists

## Install Whisper

In the first line we install Whisper!

In [None]:
!pip install git+https://github.com/openai/whisper.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


## Transcribe uploaded mp3 or wav or m4a files
Upload your files using the file pane on the left (or adjust the path below if you have mounted Drive).

In [None]:
import os
import subprocess

AUDIO_EXTENSIONS = ('.mp3', '.wav', '.m4a')


def find_audio_files(directory, extensions=AUDIO_EXTENSIONS):
    """Return the names of the audio files in ``directory``, sorted by name.

    Args:
        directory: Folder to scan (sub-folders are not searched).
        extensions: Tuple of lowercase file suffixes to accept.

    Returns:
        A sorted list of matching file names. The list is empty when the
        folder does not exist or holds no matching file.
    """
    if not os.path.isdir(directory):
        return []
    # os.listdir() returns names in arbitrary order, so sort to make the
    # "first" file predictable; lower() lets e.g. "TALK.MP3" match as well.
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extensions)
    )


# Get the first mp3, wav, or m4a file in the /content/ directory
audio_files = find_audio_files('/content')

if audio_files:
    audio_file = f"/content/{audio_files[0]}"  # convert the first audio file
    # Call whisper with the dynamically found audio file (MP3, WAV, or M4A).
    # Arguments are passed as a list (no shell), so quotes, spaces or `$` in
    # the file name cannot break or alter the command.
    try:
        completed = subprocess.run(
            ['whisper', audio_file, '--model', 'small', '--language', 'English']
        )
    except FileNotFoundError:
        print("The whisper command was not found. Run the 'Install Whisper' cell first.")
    else:
        if completed.returncode != 0:
            print(f"whisper exited with status {completed.returncode} for {audio_file}")
else:
    print("No MP3, WAV, or M4A file found in the /content/ directory.")

In [None]:
## FOR A CUSTOM MODEL

import os
from huggingface_hub import hf_hub_download
import subprocess

# Model file to fetch from the Hugging Face repo
repo_id = "Trelis/whisper-turbo-llm-lingo"
model_file = "whisper-turbo-llm-lingo-openai.bin"

# Download the model file; model_path is where it was stored locally
model_path = hf_hub_download(repo_id=repo_id, filename=model_file)

# Get the first mp3, wav, or m4a file in the /content/ directory.
# Sorting makes "first" predictable (os.listdir order is arbitrary) and
# lower() lets upper-case extensions such as ".MP3" match as well.
audio_files = sorted(
    file for file in os.listdir('/content')
    if file.lower().endswith(('.mp3', '.wav', '.m4a'))
)

if audio_files:
    audio_file = f"/content/{audio_files[0]}"  # convert the first audio file
    # Call whisper with the dynamically found audio file (MP3, WAV, or M4A),
    # passing the local model path instead of a predefined model name.
    # Arguments are passed as a list (no shell), so quotes, spaces or `$` in
    # either path cannot break or alter the command.
    command = ['whisper', audio_file, '--model', model_path, '--language', 'English']
    result = subprocess.run(command, capture_output=True, text=True)
    # The console output is captured in `result` rather than shown live, so
    # report a failure explicitly instead of letting it pass silently.
    if result.returncode != 0:
        print(f"whisper exited with status {result.returncode}:\n{result.stderr}")
else:
    print("No MP3, WAV, or M4A file found in the /content/ directory.")

In [None]:
# Show the local path that hf_hub_download returned for the custom model file
print(model_path)

/root/.cache/huggingface/hub/models--Trelis--whisper-turbo-llm-lingo/snapshots/db89ed6efa18c99b36bf5bdb41a1fc9519ae68dc/whisper-turbo-llm-lingo-openai.bin


### Grab and transcribe audio from YouTube
Note that downloading from YouTube can be difficult outside of Colab and may require you to authenticate with YouTube.

In [None]:
!pip install yt-dlp -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.3/171.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.1/164.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# !yt-dlp https://youtube.com/video/VUKZP0ShxEM --format m4a -o "/content/%(id)s.%(ext)s"
# !whisper "/content/VUKZP0ShxEM.m4a" --model small --language English

In [None]:
import os
import subprocess
from urllib.parse import parse_qs, urlparse


def extract_video_id(url):
    """Return the video ID contained in a YouTube URL.

    Handles ``watch?v=<id>`` links (with or without further query
    parameters such as ``&t=30s``) and links that carry the ID as the last
    path segment, e.g. ``https://youtu.be/<id>``.

    Raises:
        ValueError: If no ID can be found in ``url``.
    """
    parsed = urlparse(url)
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        candidate = ids_in_query[0]
    else:
        # No "v" parameter: fall back to the last segment of the path
        candidate = parsed.path.rstrip("/").rsplit("/", 1)[-1]
    if not candidate:
        raise ValueError(f"Could not find a video ID in {url!r}")
    return candidate


# User hardcodes the YouTube link
youtube_link = "https://youtube.com/watch?v=VUKZP0ShxEM"

# Extract video ID from the YouTube link
video_id = extract_video_id(youtube_link)

# The file path for the downloaded m4a file
audio_file = f"/content/{video_id}.m4a"

# Arguments are passed as lists (no shell): a `&` in the link would otherwise
# be interpreted by the shell and cut the command short.
# check=True stops before transcription when the download fails.
try:
    # Use yt-dlp to download the audio as m4a format
    subprocess.run(
        ["yt-dlp", youtube_link, "--format", "m4a", "-o", f"/content/{video_id}.%(ext)s"],
        check=True,
    )
    # Run Whisper for transcription on the downloaded audio file
    subprocess.run(
        ["whisper", audio_file, "--model", "turbo", "--language", "English"],
        check=True,
    )
except FileNotFoundError as exc:
    print(f"Command not found: {exc.filename}. Run the install cells first.")
except subprocess.CalledProcessError as exc:
    print(f"{exc.cmd[0]} failed with exit status {exc.returncode}.")

0

In [None]:
## For a custom model

import os, subprocess
from urllib.parse import parse_qs, urlparse

# Imported in this cell too, so it does not depend on an earlier cell having run
from huggingface_hub import hf_hub_download


def extract_video_id(url):
    """Return the video ID contained in a YouTube URL.

    Handles ``watch?v=<id>`` links (with or without further query
    parameters such as ``&t=30s``) and links that carry the ID as the last
    path segment, e.g. ``https://youtu.be/<id>``.

    Raises:
        ValueError: If no ID can be found in ``url``.
    """
    parsed = urlparse(url)
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        candidate = ids_in_query[0]
    else:
        # No "v" parameter: fall back to the last segment of the path
        candidate = parsed.path.rstrip("/").rsplit("/", 1)[-1]
    if not candidate:
        raise ValueError(f"Could not find a video ID in {url!r}")
    return candidate


# User hardcodes the YouTube link
youtube_link = "https://youtube.com/watch?v=VUKZP0ShxEM"

# Extract video ID from the YouTube link
video_id = extract_video_id(youtube_link)

# Use yt-dlp to download the audio as m4a format.
# Arguments are passed as a list (no shell): a `&` in the link would
# otherwise be interpreted by the shell and cut the command short.
download = subprocess.run(
    ["yt-dlp", youtube_link, "--format", "m4a", "-o", f"/content/{video_id}.%(ext)s"]
)

# The file path for the downloaded m4a file
audio_file = f"/content/{video_id}.m4a"

# Model file to fetch from the Hugging Face repo
repo_id = "Trelis/whisper-turbo-llm-lingo"
model_file = "whisper-turbo-llm-lingo-openai.bin"

# Download the model file; model_path is where it was stored locally
model_path = hf_hub_download(repo_id=repo_id, filename=model_file)

# Stays None when the download failed and whisper was therefore never run
errors = None

if download.returncode != 0:
    print(f"yt-dlp exited with status {download.returncode}; skipping transcription.")
else:
    # Run Whisper for transcription on the downloaded audio file
    command = ["whisper", audio_file, "--model", model_path, "--language", "English"]
    errors = subprocess.run(command, capture_output=True, text=True)
    # The console output is captured in `errors` rather than shown live, so
    # report a failure explicitly instead of letting it pass silently.
    if errors.returncode != 0:
        print(f"whisper exited with status {errors.returncode}:\n{errors.stderr}")

In [None]:
# Uncomment the line below to inspect the captured whisper output (return code, stdout and stderr)
# print(errors)

