<a href="https://colab.research.google.com/github/cxctis/Google-Colab-Experiment/blob/main/Whisper_ASR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import os

# 1. Upload
print("Please upload your file:")
uploaded = files.upload()
# Nothing to work with if the upload produced no entries (e.g. the dialog was
# dismissed); stop with a readable message instead of an IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a file.")
# Only the first uploaded file is processed.
file_name = next(iter(uploaded))

# 2. Convert to MP3
# ffmpeg is invoked with an argument list (no shell), so quotes, spaces, `$`
# or backticks in the uploaded file name cannot break or alter the command.
import subprocess

audio_output = "audio_converted.mp3"
ffmpeg_command = [
    "ffmpeg", "-y",                 # -y: overwrite a previous conversion
    "-i", file_name,
    "-acodec", "libmp3lame",
    "-ar", "44100",                 # sample rate (Hz)
    "-ac", "2",                     # stereo
    "-ab", "192k",                  # audio bitrate
    audio_output,
]
conversion = subprocess.run(
    ffmpeg_command, capture_output=True, text=True, errors="replace"
)
# Stop here on failure so a stale or missing MP3 is never transcribed.
if conversion.returncode != 0:
    raise RuntimeError(
        f"ffmpeg could not convert {file_name!r} "
        f"(exit code {conversion.returncode}):\n{conversion.stderr[-2000:]}"
    )

# 3. Load Whisper
import whisper
# Note: the openai-whisper package must already be installed in this runtime.
model = whisper.load_model("medium")
print(f"Transcribing {file_name}... this may take a few minutes.")

# 4. Transcribe
# fp16=False keeps inference in full precision, which avoids the
# half-precision warning when no GPU is attached.
result = model.transcribe(audio_output, language="en", fp16=False)

print("\n---DONE!---\n")
print(result["text"])

# 5. Save
# Explicit UTF-8 so non-ASCII characters in the transcript are written the
# same way regardless of the runtime's default encoding.
with open("transcription.txt", "w", encoding="utf-8") as f:
    f.write(result["text"])

print("\nYour transcription has been saved as 'transcription.txt' in the files folder on the left.")

In [None]:
import datetime

import whisper
# Load the "medium" Whisper checkpoint for this cell's run.
model = whisper.load_model("medium")
print("Transcribing with timestamp...this might take a moment.")

# `audio_output` is not defined in this cell: it has to exist in the kernel
# already (the upload/convert cell assigns it).
result = model.transcribe(
    audio_output,
    language="en",
    fp16=False,
)

def format_timestamp(seconds: float):
  """Render an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

  The value is rounded to the nearest millisecond *before* being split into
  fields. Truncating the float fraction instead loses a millisecond on
  values such as 1.001 or 2.32, whose binary representation sits just below
  the decimal value.

  Args:
    seconds: Offset from the start of the audio, in seconds. Negative values
      are clamped to zero because SRT has no negative timestamps.

  Returns:
    The timestamp string, e.g. ``"01:01:01,500"`` for 3661.5.
  """
  total_ms = max(0, int(round(seconds * 1000)))
  hours, remainder_ms = divmod(total_ms, 3_600_000)
  minutes, remainder_ms = divmod(remainder_ms, 60_000)
  seconds_int, milliseconds = divmod(remainder_ms, 1000)
  return f"{hours:02}:{minutes:02}:{seconds_int:02},{milliseconds:03}"

# Build one SRT cue per segment: running number, time range, text, blank line.
cue_blocks = []
for cue_number, segment in enumerate(result['segments'], start=1):
  cue_start = format_timestamp(segment['start'])
  cue_end = format_timestamp(segment['end'])
  cue_text = segment['text'].strip()
  cue_blocks.append(f"{cue_number}\n{cue_start} --> {cue_end}\n{cue_text}\n\n")
caption_text = "".join(cue_blocks)

print("\n--- CAPTION PREVIEW ---\n")
# Show only the first 500 characters of the captions.
preview = caption_text[:500]
print(preview + "...")

with open("captions.srt", "w", encoding="utf-8") as srt_file:
    srt_file.write(caption_text)

print("\nSuccess! Your 'captions.srt' file is ready in the left folder.")

# Hand the finished file to the browser as a download.
from google.colab import files
files.download('captions.srt')

In [11]:
!pip install git+https://github.com/openai/whisper.git yt-dlp

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_vpl8uhz
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-_vpl8uhz
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting yt-dlp
  Downloading yt_dlp-2026.2.21-py3-none-any.whl.metadata (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.1/182.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Downloading yt_dlp-2026.2.21-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2026.2.21


In [None]:
# 1. INSTALL NECESSARY TOOLS
!pip install git+https://github.com/openai/whisper.git yt-dlp
!sudo apt update && sudo apt install ffmpeg

import yt_dlp
import whisper
import datetime
from google.colab import files

# 2. INPUT THE URL
# Strip stray whitespace from a pasted URL and refuse an empty answer early,
# before any download is attempted.
video_url = input("Enter the YouTube Video URL: ").strip()
if not video_url:
    raise ValueError("No URL entered; re-run this cell and paste a video URL.")

# 3. DOWNLOAD AUDIO ONLY
print("Downloading audio from YouTube...")
ydl_opts = {
    'format': 'm4a/bestaudio/best',
    'outtmpl': 'youtube_audio.%(ext)s',
    # The output name above is fixed, so there is room for exactly one video:
    # for a URL that points at both a video and a playlist, take the video.
    'noplaylist': True,
    # Re-encode the downloaded audio to MP3 so the file name below is known.
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])

# Name produced by 'outtmpl' plus the MP3 post-processing step.
audio_file = "youtube_audio.mp3"

# 4. TRANSCRIBE
print("Loading model and transcribing...")
model = whisper.load_model("medium")
result = model.transcribe(audio_file, fp16=True) # Set to True for GPU speed

# 5. FORMAT AS CAPTIONS (SRT)
def format_timestamp(seconds: float):
    """Render an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond *before* being split into
    fields. Truncating the float fraction instead loses a millisecond on
    values such as 1.001 or 2.32, whose binary representation sits just below
    the decimal value.

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero because SRT has no negative timestamps.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for 3661.5.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    seconds_int, milliseconds = divmod(remainder_ms, 1000)
    return f"{hours:02}:{minutes:02}:{seconds_int:02},{milliseconds:03}"

# One SRT cue per segment: running number, time range, text, blank line.
cue_blocks = []
for cue_number, segment in enumerate(result['segments'], start=1):
    cue_start = format_timestamp(segment['start'])
    cue_end = format_timestamp(segment['end'])
    cue_text = segment['text'].strip()
    cue_blocks.append(f"{cue_number}\n{cue_start} --> {cue_end}\n{cue_text}\n\n")
caption_text = "".join(cue_blocks)

# 6. SAVE AND DOWNLOAD
file_output = "youtube_transcript.srt"
with open(file_output, "w", encoding="utf-8") as srt_file:
    srt_file.write(caption_text)

print(f"\n--- DONE! ---\nFile saved as {file_output}")
# Hand the finished file to the browser as a download.
files.download(file_output)