In [4]:
# %pip installs into the environment of the running kernel (plain !pip may hit a different Python).
# Pinned to the release this notebook was run with; the code below calls
# YouTubeTranscriptApi.get_transcript, which this release provides.
%pip install youtube-transcript-api==0.6.3

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Downloading youtube_transcript_api-0.6.3-py3-none-any.whl (622 kB)
   ---------------------------------------- 0.0/622.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/622.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/622.3 kB ? eta -:--:--
   ---------------------------------------- 622.3/622.3 kB 1.4 MB/s eta 0:00:00
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.3


In [2]:
import re
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi


def extract_video_id(youtube_url: str) -> str:
    """
    Extract the video ID from a YouTube URL.

    Recognised forms (the scheme is optional, as are host prefixes such as
    ``www.`` or ``m.``):

    * ``youtu.be/<id>``
    * ``youtube.com/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``, ``/v/<id>``
    * any URL carrying a ``v=<id>`` query parameter (e.g. ``/watch?v=<id>``)

    Surrounding whitespace, trailing slashes, extra path segments, query
    strings and ``#fragments`` are not included in the returned ID.

    Args:
        youtube_url: The URL (or scheme-less host/path) to inspect.

    Returns:
        The video ID as it appears in the URL.

    Raises:
        ValueError: If ``youtube_url`` is not a string or no ID can be found.
    """
    if not isinstance(youtube_url, str):
        # Keep the failure mode uniform so batch callers only need to catch ValueError.
        raise ValueError(f"Could not parse YouTube video ID from URL: {youtube_url}")

    url = youtube_url.strip()
    # urlparse only populates the host when a scheme (or a leading "//") is
    # present, so give scheme-less input such as "youtu.be/<id>" a default one.
    if url.startswith('//'):
        url = 'https:' + url
    elif not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url):
        url = 'https://' + url

    parsed_url = urlparse(url)
    host = (parsed_url.hostname or '').lower()
    segments = [segment for segment in parsed_url.path.split('/') if segment]

    candidate = ''
    if host == 'youtu.be' or host.endswith('.youtu.be'):
        # Short links carry the ID as the first path segment.
        if segments:
            candidate = segments[0]
    elif (
        host in ('youtube.com', 'youtube-nocookie.com')
        or host.endswith(('.youtube.com', '.youtube-nocookie.com'))
    ):
        # Path-style links: /shorts/<id>, /embed/<id>, /live/<id>, /v/<id>.
        if len(segments) >= 2 and segments[0] in ('shorts', 'embed', 'live', 'v'):
            candidate = segments[1]

    # Malformed links sometimes append parameters with '&' directly after the
    # ID (no '?'); cut the ID off there.
    candidate = candidate.split('&', 1)[0]
    if candidate:
        return candidate

    # Fallback: classic watch URLs (and anything else) with a "v" query parameter.
    query_params = parse_qs(parsed_url.query)
    if 'v' in query_params:
        return query_params['v'][0]

    raise ValueError(f"Could not parse YouTube video ID from URL: {youtube_url}")


def get_transcript_with_timestamps(video_id: str, languages=('en', 'en-US')) -> str:
    """
    Retrieve the transcript text with timestamps for a given YouTube video ID.

    Args:
        video_id: The YouTube video ID to fetch the transcript for.
        languages: Preferred language codes, passed to the transcript API in
            the given order. Either an iterable of codes or a single code
            given as a plain string.

    Returns:
        One line per transcript entry, formatted as ``[<start>s] <text>`` with
        the start time printed to two decimals, joined by newlines. An empty
        string is returned if the transcript could not be fetched for any
        reason; the error is printed rather than raised (best-effort).
    """
    try:
        # list('en') would explode a bare string into ['e', 'n'], so wrap it instead.
        language_codes = [languages] if isinstance(languages, str) else list(languages)

        if hasattr(YouTubeTranscriptApi, 'get_transcript'):
            # Static helper available in the release this notebook was run with.
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=language_codes)
        else:
            # Releases without the static helper expose an instance-level fetch();
            # to_raw_data() yields the same list of {'text', 'start', ...} dicts.
            transcript_list = (
                YouTubeTranscriptApi().fetch(video_id, languages=language_codes).to_raw_data()
            )

        # Combine text and timestamps into a formatted string
        return "\n".join(
            f"[{entry['start']:.2f}s] {entry['text']}" for entry in transcript_list
        )
    except Exception as e:
        # Deliberately broad: one failing video must not abort a batch download.
        print(f"Error fetching transcript for video {video_id}: {e}")
        return ""


def download_transcripts_from_list(url_list, output_dir="transcripts"):
    """
    Downloads transcripts with timestamps from a list of YouTube URLs and saves them as text files.

    Each transcript is written to ``<output_dir>/<video_id>.txt`` (UTF-8).
    Characters in the video ID other than letters, digits, ``_`` and ``-`` are
    replaced by ``_`` in the file name so the file always lands inside
    ``output_dir``. Progress and per-URL failures are printed; a URL that
    cannot be parsed, has no transcript, or cannot be written is skipped and
    the remaining URLs are still processed.

    Args:
        url_list: Iterable of YouTube URLs.
        output_dir: Directory to write the transcript files to; created
            (including missing parents) if it does not exist.

    Returns:
        None.
    """
    import os

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for url in url_list:
        try:
            video_id = extract_video_id(url)
            print(f"Fetching transcript for video ID: {video_id}")

            # Get the transcript with timestamps
            transcript_text = get_transcript_with_timestamps(video_id)
            if not transcript_text:
                print(f"No transcript found or error for video ID: {video_id}")
                continue

            # The ID comes straight from the URL; strip anything (path separators,
            # "..") that could redirect the write outside output_dir.
            safe_name = re.sub(r'[^A-Za-z0-9_-]', '_', video_id)
            out_path = os.path.join(output_dir, f"{safe_name}.txt")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(transcript_text)
            print(f"Transcript saved to: {out_path}")
        except ValueError as e:
            print(e)
        except OSError as e:
            # A single unwritable file should not abort the rest of the batch.
            print(f"Could not save transcript for {url}: {e}")

if __name__ == "__main__":
    # Videos to fetch transcripts for.
    youtube_urls = [
        "https://www.youtube.com/watch?v=0JPQrRdu4Ok",
        "https://www.youtube.com/watch?v=cyrrfl0eNYc",
        # Add more URLs as needed
    ]

    # Saves one text file per video into the default output directory.
    download_transcripts_from_list(url_list=youtube_urls)


Fetching transcript for video ID: 0JPQrRdu4Ok
Transcript saved to: transcripts\0JPQrRdu4Ok.txt
Fetching transcript for video ID: cyrrfl0eNYc
Transcript saved to: transcripts\cyrrfl0eNYc.txt
