## Faster-Whisper STT

```python
# !pip install faster-whisper
# !pip install yt-dlp
# !pip install validators
```

In [4]:
import torch
# Print the installed PyTorch version string.
print(torch.__version__)

# Check whether CUDA is available
print("CUDA available:", torch.cuda.is_available())

# Check whether cuDNN is enabled
print("cuDNN enabled:", torch.backends.cudnn.enabled)

2.4.1+cpu
CUDA available: False
cuDNN enabled: True


In [1]:
def clean_filename(filename):
    """Return ``filename`` with every disallowed character replaced by ``_``.

    Allowed characters are word characters, hyphen, underscore, dot and the
    space character; anything else becomes a single underscore (runs are not
    collapsed).

    NOTE: this cell does not import ``re`` itself; it relies on the import in
    the main definitions cell. A later cell also redefines ``clean_filename``
    with different rules, and that later definition wins once it has run.
    """
    disallowed = re.compile(r'[^\w\-_\. ]')
    return disallowed.sub('_', filename)

In [5]:
import os
import re
import json
import torch
from faster_whisper import WhisperModel
import validators
from pathlib import Path
from yt_dlp import YoutubeDL
from tqdm.notebook import tqdm

def clean_filename(filename):
    """Turn an arbitrary title into a filesystem-friendly file stem.

    Steps, in order:
      1. Remove parenthesised fragments together with their parentheses.
      2. Replace every character that is not a word character, whitespace,
         hyphen or underscore with ``_``. Note that this includes ``.``.
      3. Collapse runs of underscores into one.
      4. Strip surrounding whitespace, then surrounding underscores.

    Args:
        filename: The raw name (without extension) to sanitise.

    Returns:
        The sanitised name. It can be an empty string when nothing survives
        the clean-up (for example a title that is entirely parenthesised).
    """
    # Remove parentheses and everything inside them.
    filename = re.sub(r'\([^)]*\)', '', filename)
    # Replace everything except word characters (Hangul, Latin letters and
    # digits are all matched by \w), whitespace, hyphen and underscore.
    filename = re.sub(r'[^\w\s\-_가-힣]', '_', filename)
    # Collapse consecutive underscores into a single one.
    filename = re.sub(r'_{2,}', '_', filename)
    # Trim surrounding whitespace first, then surrounding underscores.
    return filename.strip().strip('_')

def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative offset in seconds (int or float).

    Returns:
        A string such as ``"01:01:01,500"``.
    """
    # Work in integer milliseconds so float error cannot drop a millisecond
    # (e.g. 0.999 s must not become 998 ms) and rounding carries cleanly
    # into the seconds/minutes/hours fields.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    # The seconds field is mandatory in SRT.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def structure_sentences(text, lang='en'):
    """Apply the language-specific sentence normaliser to ``text``.

    Args:
        text: Text to normalise.
        lang: ``'ko'`` selects the Korean normaliser; any other value selects
            the English one.

    Returns:
        Whatever the selected normaliser returns, passed through unchanged.
    """
    normalizer = normalize_korean_text if lang == 'ko' else normalize_english_text
    return normalizer(text)

def remove_extra_spaces(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Newlines and tabs count as whitespace, so they are also replaced by a
    single space. A single space between tokens (e.g. between a number and a
    unit) is kept.

    Args:
        text: Input string.

    Returns:
        The whitespace-normalised string.
    """
    # One pass is enough: once runs are collapsed, any further
    # "whitespace between non-whitespace" substitution is the identity.
    return re.sub(r'\s+', ' ', text).strip()

def normalize_korean_text(text):
    """Normalise punctuation and sentence breaks in Korean text.

    Steps, in order:
      1. Collapse repeated sentence punctuation (``..`` -> ``.``).
      2. Collapse whitespace runs into single spaces and trim.
      3. Append ``". "`` after a known sentence ending that is not already
         followed by whitespace or sentence punctuation.
      4. Insert a newline after sentence punctuation that is immediately
         followed by a non-space character, except for a mark sitting between
         two digits (``3.5``, ``2024.09.10``), which is left intact.

    Args:
        text: Raw Korean text.

    Returns:
        The normalised text, stripped of surrounding whitespace.
    """
    # Collapse repeated punctuation marks.
    text = re.sub(r'([.!?])\1+', r'\1', text)
    # Remove redundant whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    ending_patterns = r'(습니다|니다|세요|네요|어요|아요|죠|지요|에요|예요|입니까|합니까|할까요|말입니다|바랍니다|싶습니다)'
    # Raw f-string: "\s" in a plain f-string is an invalid escape sequence.
    text = re.sub(rf'{ending_patterns}(?![\s.!?])', r'\1. ', text)
    # Break the line after punctuation glued to the next token. The two
    # alternatives together mean "followed by non-space, unless the mark is
    # both preceded and followed by a digit".
    text = re.sub(r'(?<!\d)[.!?](?=\S)|[.!?](?=[^\s\d])', r'\g<0>\n', text)
    return text.strip()

def normalize_english_text(text):
    """Normalise punctuation, spacing and sentence capitalisation.

    Steps, in order:
      1. Collapse repeated sentence punctuation (``..`` -> ``.``).
      2. Collapse whitespace runs into single spaces and trim.
      3. Upper-case the first character of the text and of every sentence
         that follows ``". "``, ``"! "`` or ``"? "``.

    Only the first character of each sentence is changed; the rest keeps its
    original casing, so proper nouns, acronyms and the pronoun "I" survive.

    Args:
        text: Raw English text.

    Returns:
        The normalised text.
    """
    text = re.sub(r'([.!?])\1+', r'\1', text)
    # Collapse whitespace so a sentence boundary is always "<mark><space>".
    text = re.sub(r'\s+', ' ', text).strip()
    # str.capitalize() would lower-case everything after the first letter,
    # so upper-case just the sentence-initial character instead.
    return re.sub(
        r'(^|[.!?] )(\S)',
        lambda match: match.group(1) + match.group(2).upper(),
        text,
    )

def post_process_text(text, lang='en'):
    """Clean up transcribed text: normalise spacing, then sentence structure.

    Args:
        text: Raw transcription text.
        lang: Language code forwarded to ``structure_sentences``.

    Returns:
        The processed text with surrounding whitespace stripped.
    """
    # Normalise whitespace first so the sentence step sees tidy input.
    collapsed = remove_extra_spaces(text)
    structured = structure_sentences(collapsed, lang)
    # Trim once more in case the sentence step left whitespace at either end.
    return structured.strip()

def remove_existing_files(directory):
    """Delete every regular file directly inside ``directory``.

    The directory (including missing parents) is created first if it does
    not exist. Sub-directories and their contents are left untouched. A line
    is printed for each file removed.

    Args:
        directory: Path (str or Path) of the directory to empty.
    """
    target = Path(directory)
    # Create the directory if it is missing.
    target.mkdir(parents=True, exist_ok=True)
    for entry in target.glob('*'):
        if not entry.is_file():
            continue
        os.remove(entry)
        print(f'Removed existing file: {entry}')

def _download_audio_from_youtube(url, out_dir):
    """Download the audio of ``url`` into ``out_dir`` as MP3 and return its path.

    WARNING: every regular file already present in ``out_dir`` is deleted
    before the download starts.
    """
    print(f"\n==> Removing existing files...")
    remove_existing_files(out_dir)

    print(f"\n==> Downloading audio with yt-dlp...")
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f"{out_dir}/%(title)s.%(ext)s",
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'progress_hooks': [lambda d: tqdm.write(f"{d['status']} - {d.get('filename', 'Unknown file')}")]
    }
    with YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        downloaded_path = Path(ydl.prepare_filename(info_dict))

    # prepare_filename() reports the pre-conversion name; the audio
    # post-processor is configured above to produce an .mp3 next to it.
    mp3_file_path = downloaded_path.with_suffix('.mp3')
    # Fall back to a fixed stem if the title sanitises down to nothing.
    clean_stem = clean_filename(mp3_file_path.stem) or "audio"
    clean_file_path = mp3_file_path.parent / f"{clean_stem}.mp3"
    if clean_file_path != mp3_file_path:
        # replace() overwrites an existing target on every platform.
        mp3_file_path.replace(clean_file_path)

    print(f"File downloaded to {clean_file_path}!")
    return str(clean_file_path)


def transcribe_audio(mode="youtube", whisper_model="medium",
                    audio_dir="./contents", output_dir="./outputs",
                    output_formats=["txt", "srt"],
                    beam_size=5, temperature=0, youtube_url=None):
    """Transcribe one audio file with faster-whisper and save the result.

    Args:
        mode: ``"youtube"`` downloads the audio of a YouTube video into
            ``audio_dir`` (deleting any files already there); ``"local"``
            uses the single ``.mp3`` file found in ``audio_dir``.
        whisper_model: Model name/size passed to ``WhisperModel``.
        audio_dir: Directory holding (or receiving) the audio file.
        output_dir: Directory the transcript files are written to.
        output_formats: Iterable of ``"txt"``, ``"srt"`` and/or ``"json"``.
            The default list is never mutated by this function.
        beam_size: Beam size forwarded to ``model.transcribe``.
        temperature: Sampling temperature forwarded to ``model.transcribe``.
        youtube_url: URL to use in ``"youtube"`` mode. When ``None`` the URL
            is requested interactively with ``input()``.

    Returns:
        list: The transcribed segments, fully materialised so they can be
        iterated more than once.

    Raises:
        ValueError: Unknown ``mode``, unsupported output format, or invalid URL.
        FileNotFoundError: ``"local"`` mode and no ``.mp3`` in ``audio_dir``.
        FileExistsError: ``"local"`` mode and more than one ``.mp3`` present.
    """
    # Validate cheap arguments first so mistakes fail before any download
    # or model load.
    if mode not in ("youtube", "local"):
        raise ValueError("Invalid mode. Please select either 'youtube' or 'local'.")

    supported_formats = ("txt", "srt", "json")
    unsupported = [fmt for fmt in output_formats if fmt not in supported_formats]
    if unsupported:
        raise ValueError(
            f"Unsupported output format(s): {unsupported}. "
            f"Choose from {list(supported_formats)}."
        )

    audio_dir = Path(audio_dir)
    audio_dir.mkdir(parents=True, exist_ok=True)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if mode == "youtube":
        if youtube_url is None:
            youtube_url = input("Enter YouTube_URL: ").strip()

        if not validators.url(youtube_url):
            raise ValueError("Invalid URL. Please enter a valid YouTube URL.")

        audio = _download_audio_from_youtube(youtube_url, audio_dir)
    else:
        mp3_files = list(audio_dir.glob("*.mp3"))

        if len(mp3_files) == 0:
            raise FileNotFoundError(f"No .mp3 files found in the directory: {audio_dir}")
        elif len(mp3_files) > 1:
            raise FileExistsError(f"Multiple .mp3 files found in the directory: {audio_dir}. Please specify the file to use.")

        audio = mp3_files[0]

    file_path = Path(audio)

    # Use the GPU with half precision when available, otherwise int8 on CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(whisper_model, device=device, compute_type=compute_type)

    print(f"# ------------------------------------\n# TRANSCRIBING AUDIO {file_path.stem}\n# ------------------------------------")
    segments, info = model.transcribe(str(file_path), beam_size=beam_size, temperature=temperature)
    detected_language = info.language
    language_probability = info.language_probability
    print(f"Detected language: '{detected_language}' with probability {language_probability:.2f}")

    # Only Korean has a dedicated normaliser; everything else goes through
    # the English one.
    lang = 'ko' if detected_language == 'ko' else 'en'

    # faster-whisper yields segments lazily: decoding happens while the
    # generator is consumed, and it can be consumed only once. Materialise it
    # so the text, every output format and the return value all see the
    # segments.
    segments = list(tqdm(segments, desc="Transcribing"))
    print("Transcription completed!\n")

    full_text = " ".join(segment.text for segment in segments)

    print("# ------------------------------------\n# POSTPROCESSING AUDIO\n# ------------------------------------")
    processed_text = post_process_text(full_text, lang)
    print("Postprocessing completed!\n")

    for fmt in output_formats:
        output_file_path = output_dir / f"{file_path.stem}.{fmt}"
        with open(output_file_path, 'w', encoding='utf-8') as f:
            if fmt == "txt":
                f.write(processed_text)
            elif fmt == "srt":
                for i, segment in enumerate(segments, start=1):
                    f.write(f"{i}\n")
                    f.write(f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n")
                    f.write(f"{post_process_text(segment.text, lang)}\n\n")
            elif fmt == "json":
                json_output = {
                    "transcription": processed_text,
                    "segments": [
                        {
                            "id": i,
                            "start": format_timestamp(segment.start),
                            "end": format_timestamp(segment.end),
                            "text": post_process_text(segment.text, lang)
                        }
                        for i, segment in enumerate(segments, start=1)
                    ]
                }
                json.dump(json_output, f, ensure_ascii=False, indent=4)

        print(f"Saved {fmt.upper()} file at: {output_file_path}")

    print("\n✨ All Done!")
    return segments


### 로컬 예제

In [2]:
# result = transcribe_audio(mode="local", output_formats=["txt"])
# result = transcribe_audio(mode="local", 
#                         whisper_model="distil-large-v3",
#                         audio_dir="./meeting_rec", 
#                         output_dir="./meeting_text", 
#                         output_formats=["txt"])

### YouTube 예제

In [3]:
# YouTube example: the video URL is entered at the input() prompt, existing
# files in ./youtube_contents are removed before the download, and only a
# .txt transcript is written to ./youtube_text.
result = transcribe_audio(mode="youtube", 
                        audio_dir="./youtube_contents",
                        output_dir="./youtube_text", 
                        output_formats=["txt"])


==> Removing existing files...
Removed existing file: youtube_contents\[오늘 이 뉴스]  추석 코앞 '13호 태풍' 발동..＂어디로 가나＂ 한중일 '촉각'  (2024.09.10⧸MBC뉴스).mp3
Removed existing file: youtube_contents\_오늘 이 뉴스_  추석 코앞 _13호 태풍_ 발동.._어디로 가나_ 한중일 _촉각_  _2024.09.10_MBC뉴스_.mp3

==> Downloading audio with yt-dlp...
[youtube] Extracting URL: https://youtu.be/YvFYlDANNJ0
[youtube] YvFYlDANNJ0: Downloading webpage
[youtube] YvFYlDANNJ0: Downloading ios player API JSON
[youtube] YvFYlDANNJ0: Downloading web creator player API JSON
[youtube] YvFYlDANNJ0: Downloading m3u8 information
[info] YvFYlDANNJ0: Downloading 1 format(s): 251
[download] Destination: youtube_contents\[오늘 이 뉴스]  추석 코앞 '13호 태풍' 발동..＂어디로 가나＂ 한중일 '촉각'  (2024.09.10⧸MBC뉴스).webm
[download]   0.1% of    1.24MiB at  111.01KiB/s ETA 00:11downloading - youtube_contents\[오늘 이 뉴스]  추석 코앞 '13호 태풍' 발동..＂어디로 가나＂ 한중일 '촉각'  (2024.09.10⧸MBC뉴스).webm
[download]   0.2% of    1.24MiB at  299.71KiB/s ETA 00:04downloading - youtube_contents\[오늘 이 뉴스]  추석 코앞 '13호 태풍' 