In [1]:
# Step 1: Install Dependencies (this may take about 2-3 min)

# !pip install faster-whisper
# !pip install yt-dlp

import os, re
import torch
from pathlib import Path
from faster_whisper import WhisperModel
from whisper.utils import get_writer
from yt_dlp import YoutubeDL


## Faster-Whisper STT

In [2]:
from pathlib import Path
import torch
from faster_whisper import WhisperModel
from yt_dlp import YoutubeDL

def transcribe_audio(audio_dir="./contents", output_dir="./outputs", output_formats=["txt", "srt"], 
                     mode="youtube", whisper_model="base", if_Colab=False):
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(whisper_model, device=device, compute_type=compute_type)

    audio_dir = Path(audio_dir)
    audio_dir.mkdir(parents=True, exist_ok=True)
    
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if mode == "youtube":
        YouTube_URL = input("Enter YouTube_URL")  # @param {type:"string"}

        def download_audio_from_youtube(url, file_name=None, out_dir=audio_dir):
            print(f"\n==> Downloading audio with yt-dlp...")
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': f"{out_dir}/{file_name if file_name else '%(title)s.%(ext)s'}",
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
            }
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
                info_dict = ydl.extract_info(url, download=False)
                file_path = ydl.prepare_filename(info_dict)
                mp3_file_path = Path(file_path).with_suffix('.mp3')
            
            print(f"File downloaded to {mp3_file_path}!")
            return str(mp3_file_path)

        audio = download_audio_from_youtube(YouTube_URL)
        print("\n=======================")
        print(f"\n🔗 YouTube URL: {YouTube_URL}")
        print(f"\n🤖 Whisper Model: {whisper_model}")
        print("\n=======================")
        
    elif mode == "local":
        mp3_files = list(audio_dir.glob("*.mp3"))
        
        if len(mp3_files) == 0:
            raise FileNotFoundError(f"No .mp3 files found in the directory: {output_dir}")
        elif len(mp3_files) > 1:
            raise FileExistsError(f"Multiple .mp3 files found in the directory: {output_dir}. Please specify the file to use.")
        
        audio = mp3_files[0]
        print("\n=======================")
        print(f"\n🔗 Audio Path: {audio}")
        print(f"\n🤖 Whisper Model: {whisper_model}")
        print("\n=======================")
    else:
        raise ValueError("Invalid mode. Please select either 'youtube' or 'local'.")
    
    file_path = Path(audio)

    print(f"\n==> Transcribing audio")
    segments, info = model.transcribe(str(file_path), beam_size=5)

    # Save the transcription in the requested formats
    for format in output_formats:
        print(f"\n==> Creating .{format} file")
        output_file_path = output_dir / f"{file_path.stem}.{format}"
        
        with open(output_file_path, 'w', encoding='utf-8') as f:
            if format == "txt":
                for segment in segments:
                    f.write(f"{segment.text}\n")
            elif format == "srt":
                for i, segment in enumerate(segments, start=1):
                    f.write(f"{i}\n")
                    f.write(f"{self.format_timestamp(segment.start)} --> {self.format_timestamp(segment.end)}\n")
                    f.write(f"{segment.text}\n\n")
        
        if if_Colab:
            from google.colab import files
            files.download(str(output_file_path))
        else:
            print(f"Transcription saved as .{format} file at: {output_file_path}")

    print("\n✨ All Done!")
    print("=======================")
    return segments

def format_timestamp(seconds):
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    seconds = seconds % 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return f"{hours:02d}:{minutes:02d}:{int(seconds):02d},{milliseconds:03d}"

### 로컬 예제

In [3]:
result = transcribe_audio(mode="local", output_formats=["txt"])

model.bin:   0%|          | 0.00/145M [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]



🔗 Audio Path: contents\sample_sound.mp3

🤖 Whisper Model: base


==> Transcribing audio

==> Creating .txt file
Transcription saved as .txt file at: outputs\sample_sound.txt

✨ All Done!


## Youtube 예제

In [10]:
result = transcribe_audio(mode="youtube", output_formats=["txt"])


==> Downloading audio with yt-dlp...
[youtube] Extracting URL: https://youtu.be/GEYxeMYMtE4
[youtube] GEYxeMYMtE4: Downloading webpage
[youtube] GEYxeMYMtE4: Downloading ios player API JSON
[youtube] GEYxeMYMtE4: Downloading web creator player API JSON
[youtube] GEYxeMYMtE4: Downloading player 57c75fa4
[youtube] GEYxeMYMtE4: Downloading m3u8 information
[info] GEYxeMYMtE4: Downloading 1 format(s): 251
[download] Destination: outputs\[시계열] Ch1. 데이터와 인공지능, 그리고 데이터 분석이란？.webm
[download] 100% of   16.03MiB in 00:00:06 at 2.43MiB/s   
[ExtractAudio] Destination: outputs\[시계열] Ch1. 데이터와 인공지능, 그리고 데이터 분석이란？.mp3
Deleting original file outputs\[시계열] Ch1. 데이터와 인공지능, 그리고 데이터 분석이란？.webm (pass -k to keep)
[youtube] Extracting URL: https://youtu.be/GEYxeMYMtE4
[youtube] GEYxeMYMtE4: Downloading webpage
[youtube] GEYxeMYMtE4: Downloading ios player API JSON
[youtube] GEYxeMYMtE4: Downloading web creator player API JSON
[youtube] GEYxeMYMtE4: Downloading m3u8 information
File downloaded to outputs\[시

100%|██████████| 125648/125648 [01:19<00:00, 1570.74frames/s]


==> Creating .txt file
Transcription saved as .txt file at: outputs\[시계열] Ch1. 데이터와 인공지능, 그리고 데이터 분석이란？.txt

✨ All Done!



