In [None]:
# https://chat.openai.com/share/b551701e-8224-459d-af6d-7b50712e7013

In [2]:
import json
import subprocess
import os

def get_playlist_title(playlist_url: str) -> 'str | None':
    """Look up the title of a playlist by asking yt-dlp for its metadata.

    Runs ``yt-dlp -j --flat-playlist --playlist-items 1 <url>`` and reads the
    ``playlist_title`` field from the JSON it prints on stdout.

    Args:
        playlist_url: URL handed to yt-dlp unchanged.

    Returns:
        The value of ``playlist_title``, or ``None`` when yt-dlp printed
        nothing, printed something that is not a JSON object, or the object
        has no ``playlist_title`` key. Callers are expected to check for
        ``None``.
    """
    command = [
        'yt-dlp',
        '-j',  # Output JSON
        '--flat-playlist',  # Don't download the videos
        '--playlist-items',  '1',  # Only get the first video's info
        playlist_url
    ]
    # No check=True on purpose: a failed lookup is reported as None so the
    # caller's "cannot get the title" branch can handle it.
    result = subprocess.run(command, stdout=subprocess.PIPE, text=True)

    output = (result.stdout or '').strip()
    if not output:
        return None

    try:
        data = json.loads(output)
    except json.JSONDecodeError:
        # stdout may hold more than one JSON document (one per line);
        # fall back to the first line only.
        try:
            data = json.loads(output.splitlines()[0])
        except json.JSONDecodeError:
            return None

    if not isinstance(data, dict):
        return None
    return data.get('playlist_title')


def download_subtitles(playlist_url: str):
    """Download the subtitles of a playlist as .vtt files, without the videos.

    Files are written by yt-dlp under ``../youtube/<playlist title>/subtitle``
    (relative to the current working directory). Subtitle languages requested
    are ``zh-Hans`` and ``zh``.

    Args:
        playlist_url: URL handed to yt-dlp unchanged.

    Raises:
        subprocess.CalledProcessError: if the yt-dlp download command exits
            with a non-zero status.
    """
    # Get the playlist title
    playlist_title = get_playlist_title(playlist_url)

    # Treat an empty title like a missing one: it cannot name a directory.
    if not playlist_title:
        print(f"Cannot get the title of playlist: {playlist_url}")
        return

    # The title comes from remote metadata. Keep it a single path component so
    # a '/' or '\' in the title cannot create nested folders or escape the
    # '../youtube' directory.
    folder_name = playlist_title.replace('/', '_').replace('\\', '_')
    if folder_name in ('.', '..'):
        folder_name = '_'

    # Define the path for saving subtitles based on the playlist title
    save_path = os.path.join('../youtube', folder_name, 'subtitle')

    # Shell command to download subtitles
    command = [
        'yt-dlp',
        '--write-sub',
        '--skip-download',
        '--sub-lang',
        'zh-Hans,zh',
        '--sub-format',
        'vtt',
        '-P',
        save_path,
        playlist_url
    ]

    # Execute the command; check=True surfaces a failed download as an exception
    subprocess.run(command, check=True)



In [3]:
import webvtt
import edge_tts
import os
import asyncio
import nest_asyncio

# Patch asyncio to allow nested event loops. Presumably needed because the
# notebook kernel already has a loop running, and process_all_files below
# calls asyncio.run() from inside it — TODO confirm for your Jupyter setup.
nest_asyncio.apply()


def read_subtitles_from_vtt_file(file_path: str) -> str:
    """Read the caption text out of a .vtt file.

    Args:
        file_path: Path of the .vtt file to read.

    Returns:
        The text of every caption joined with newlines, or an empty string
        (after printing a notice) when ``file_path`` does not exist.
    """
    if os.path.exists(file_path):
        # Collect the text of each caption, then put one caption per line.
        cue_texts = [cue.text for cue in webvtt.read(file_path)]
        return '\n'.join(cue_texts)

    print(f"The file {file_path} does not exist.")
    return ""


async def text_to_speech(text_file: str, audio_file: str, voice: str = "zh-CN-XiaoxiaoNeural",
                         rate: str = '+75%'):
    """Synthesize the captions of one .vtt file into an audio file.

    The caption text is read with ``read_subtitles_from_vtt_file`` and passed
    to ``edge_tts.Communicate``, whose result is saved to ``audio_file``.

    Args:
        text_file: Path of the .vtt subtitle file to read.
        audio_file: Path the synthesized audio is saved to.
        voice: Voice name passed to ``edge_tts.Communicate``.
        rate: Speaking-rate adjustment passed to ``edge_tts.Communicate``
            as its ``rate`` argument. Defaults to ``'+75%'``, the value that
            was previously hard-coded.

    Returns:
        None. Nothing is synthesized when the subtitle file is missing or
        contains no non-whitespace text.
    """
    text = read_subtitles_from_vtt_file(text_file)
    # Skip empty / whitespace-only input: there is nothing to speak.
    if not text or not text.strip():
        return

    communicate = edge_tts.Communicate(text, voice, rate=rate)
    await communicate.save(audio_file)


def process_all_files(folder_name: str):
    """Convert every .vtt subtitle of one folder into an .mp3 file.

    Reads from ``../youtube/<folder_name>/subtitle`` and writes to
    ``../youtube/<folder_name>/mp3`` (both relative to the current working
    directory). Both directories are created if they do not exist yet.

    Args:
        folder_name: Name of the folder under ``../youtube`` to process.
    """
    vtt_dir, mp3_dir = (
        os.path.join('../youtube', folder_name, leaf)
        for leaf in ('subtitle', 'mp3')
    )

    # Make sure both the input and the output directory are present.
    for directory in (vtt_dir, mp3_dir):
        os.makedirs(directory, exist_ok=True)

    for entry in os.listdir(vtt_dir):
        # Only subtitle files are converted; anything else is ignored.
        if not entry.endswith('.vtt'):
            continue

        # Same file name, with the extension swapped for .mp3.
        stem, _ = os.path.splitext(entry)
        source_path = os.path.join(vtt_dir, entry)
        target_path = os.path.join(mp3_dir, f"{stem}.mp3")
        asyncio.run(text_to_speech(source_path, target_path))



ModuleNotFoundError: No module named 'webvtt'

In [None]:
# Run: download the subtitles of the playlist

# Specify the URL of the YouTube playlist
PLAYLIST_URL = 'https://www.youtube.com/watch?v=1SZOGp1D17E&list=PLiuLMb-dLdWKjX8ib9PhlCIx1jKMNxMpy'
download_subtitles(PLAYLIST_URL)

In [53]:

# Specify the folder under '../youtube' that holds the 'subtitle' directory and
# will receive the 'mp3' output. It should be the directory that
# download_subtitles created there (named after the playlist title).
FOLDER_NAME = '基于LangChain的大语言模型应用开发'  # replace this with your folder name

# Process all files: convert every .vtt in the folder's 'subtitle' directory to .mp3
process_all_files(FOLDER_NAME)