In [1]:
# Install packages
!pip install -q yt-dlp ffmpeg-python transformers librosa noisereduce accelerate
!pip install -q --upgrade torch torchvision torchaudio
# Imports
import os
import librosa
import torch
import noisereduce as nr
import ffmpeg
# from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 1. Download YouTube Audio using yt_dlp
video_url = input("Enter YouTube Video URL: ")

# Download audio in best quality
!yt-dlp -f bestaudio --extract-audio --audio-format wav --audio-quality 0 -o "downloaded_audio.wav" "{video_url}"
downloaded_audio_path = "downloaded_audio.wav"
print(f"Downloaded and converted to WAV: {downloaded_audio_path}")

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 3.41.2 requires aiofiles<24.0,>=22.0, but you have aiofiles 24.1.0 which is incompatible.
gradio 3.41.2 requires markupsafe~=2.0, but you have markupsafe 3.0.2 which is incompatible.
gradio 3.41.2 requires websockets<12.0,>=10.0, but you have websockets 15.0.1 which is incompatible.
gradio-client 0.5.0 requires websockets<12.0,>=10.0, but you have websockets 15.0.1 which is incompatible.
langflow 1.1.4 requires certifi<2025.0.0,>=2023.11.17, but you have certifi 2025.1.31 which is incompatible.
langflow 1.1.4 requires certifi==2024.8.30, but you have certifi 2025.1.31 which is incompatible.
langflow 1.1.4 requires langchain==0.3.10, but you have langchain 0.3.17 which is incompatible.
langflow 1.1.4 requires langchain-openai==0.2.12, but you have langchain-openai 0.3.3 which is incompatible.
realtime 1

In [2]:
# 2. Denoising
import soundfile as sf
def denoise_audio(audio_path, output_path='denoised_audio.wav'):
    """Reduce noise in an audio file and write the cleaned signal to disk.

    Args:
        audio_path: Path of the audio file to clean. It is loaded with
            ``sr=None``, i.e. at the file's own sample rate.
        output_path: Destination of the denoised audio. Defaults to
            ``'denoised_audio.wav'`` (the previously hard-coded location), so
            existing callers are unaffected; pass another path to avoid
            overwriting an earlier result.

    Returns:
        A ``(denoised_path, sr)`` tuple: the path that was written and the
        sample rate of the audio.
    """
    y, sr = librosa.load(audio_path, sr=None)
    reduced_noise = nr.reduce_noise(y=y, sr=sr)
    # soundfile is used for writing; librosa.output.write_wav is not used here.
    sf.write(output_path, reduced_noise, sr)
    return output_path, sr
  
# Clean the downloaded track; the sample rate is kept alongside the new path.
(denoised_audio_path, sample_rate) = denoise_audio(audio_path=downloaded_audio_path)
print(f"Denoised audio saved at {denoised_audio_path}")

Denoised audio saved at denoised_audio.wav


In [3]:
# 3. Chunking
import soundfile as sf # import soundfile
def chunk_audio(audio_path, chunk_length_sec=30):
    """Split an audio file into consecutive fixed-length WAV chunks.

    The file is cut by sample offsets, so the whole signal is covered: a
    trailing remainder shorter than ``chunk_length_sec`` (even a fraction of a
    second) becomes its own final chunk instead of being dropped.

    Args:
        audio_path: Path of the audio file to split; loaded with ``sr=None``.
        chunk_length_sec: Length of each chunk in seconds (must be > 0).

    Returns:
        List of written chunk paths, in playback order. Files are named
        ``chunk_{start}_{end}.wav`` with start/end in seconds; ``end`` of the
        last chunk is the total duration.

    Raises:
        ValueError: If ``chunk_length_sec`` is not positive.
    """
    if chunk_length_sec <= 0:
        raise ValueError("chunk_length_sec must be positive")

    y, sr = librosa.load(audio_path, sr=None)
    total_samples = len(y)
    total_duration = total_samples / sr
    # At least one sample per chunk so the range step below is never zero.
    samples_per_chunk = max(1, int(round(chunk_length_sec * sr)))

    chunks = []
    for index, first_sample in enumerate(range(0, total_samples, samples_per_chunk)):
        last_sample = min(first_sample + samples_per_chunk, total_samples)
        # Second-based bounds are only used for the file name.
        start = index * chunk_length_sec
        end = min(start + chunk_length_sec, total_duration)
        chunk_path = f'chunk_{start}_{end}.wav'
        # Use soundfile to write the audio
        sf.write(chunk_path, y[first_sample:last_sample], sr)
        chunks.append(chunk_path)

    return chunks

# Cut the denoised recording into pieces using the default chunk length.
chunks = chunk_audio(audio_path=denoised_audio_path)
print(f"Created {len(chunks)} chunks for transcription.")

Created 16 chunks for transcription.


In [4]:
!pip install --upgrade --no-cache-dir openai-whisper



In [None]:
import whisper
import librosa
import soundfile as sf

# Whisper checkpoint to load; choose from ['tiny', 'base', 'small', 'medium', 'large'].
model_size = 'medium'
# Language code handed to Whisper for every chunk.
TARGET_LANGUAGE = "en"
whisper_model = whisper.load_model(name=model_size)

# 4. Transcribe chunks function
def transcribe_chunks(chunks):
    """Transcribe each audio chunk with the module-level Whisper model.

    Transcription is best-effort: if a chunk fails, the error is printed and
    an empty string is stored for it, so the returned list always lines up
    one-to-one with ``chunks``.

    Args:
        chunks: Iterable of audio file paths, in playback order.

    Returns:
        List of transcript strings (``""`` for chunks that failed).
    """
    # Decoding options are identical for every chunk, so build them once.
    # NOTE: no beam_size is passed. With a single temperature > 0 Whisper
    # decodes by sampling (best_of candidates) and discards beam_size, so the
    # previous beam_size=8 had no effect on the output.
    decode_options = dict(
        language=TARGET_LANGUAGE,
        temperature=0.15,
        best_of=5,
        # Half precision only when the model actually lives on a CUDA device.
        fp16=whisper_model.device.type == "cuda",
        verbose=True,
    )

    transcripts = []
    for chunk_path in chunks:
        try:
            # Transcribe with Whisper
            result = whisper_model.transcribe(chunk_path, **decode_options)
            transcripts.append(result["text"])
            print(f"✅ Successfully transcribed {chunk_path}")
        except Exception as e:
            # Deliberate best-effort: keep going and leave a placeholder.
            print(f"❗ Error transcribing {chunk_path}: {e}")
            transcripts.append("")
    return transcripts

# Transcribe all chunks
whisper_transcripts = transcribe_chunks(chunks)

# 5. Combine and save results
# Chunk texts may carry surrounding whitespace and failed chunks are stored as
# "", so strip each piece and skip empty ones to avoid doubled spaces.
final_whisper_text = ' '.join(
    text.strip() for text in whisper_transcripts if text.strip()
)
# Explicit UTF-8 so non-ASCII characters in the transcript are written the same
# way on every platform instead of depending on the locale's default encoding.
with open('whisper_youtube_transcript.txt', 'w', encoding='utf-8') as f:
    f.write(final_whisper_text)

print("✅ Full transcription complete! Saved as 'whisper_youtube_transcript.txt'.")

[00:00.000 --> 00:06.880]  Coming to India to do business is a big learning curve.
[00:06.880 --> 00:12.160]  First India is a learning curve, then business in India is a further learning curve.
[00:12.160 --> 00:17.120]  For me, it was all about letting India be India.
[00:17.120 --> 00:24.960]  If you find monotony dull, if you find comfort boring, then India is a perfect place to be.
[00:24.960 --> 00:27.640]  Nothing is predictable, every day is different.
[00:27.640 --> 00:29.880]  I took it as part of the course that...
✅ Successfully transcribed chunk_0_30.wav
[00:00.000 --> 00:03.440]  I would just have to adjust to this in order to be successful here.
[00:11.680 --> 00:14.320]  My name is Bert Mueller. I'm 35 years old.
[00:14.320 --> 00:18.720]  I live in Bengaluru, India and I moved here in December 2011.
[00:20.720 --> 00:26.240]  So when I looked at starting a Mexican-inspired restaurant in India, there was just Taco Bell.
[00:26.240 --> 00:30.000]  Even now, 13 years in t