In [1]:
!pip install scrapetube aiohttp beautifulsoup4 tqdm pandas requests 



In [None]:
import asyncio
import aiohttp
import json
from tqdm.asyncio import tqdm_asyncio
import traceback
import pandas as pd
import os
import scrapetube
import requests
from bs4 import BeautifulSoup
import re
from typing import Literal

# ==== Configuration ====
# Module-level settings read by main() and the helpers below.
CHANNEL_NAME = "RickAstleyYT"  # example channel name
LOCALE = "zh_TW"                                    # desired locale for subtitles
# NOTE(review): LOCALE is not referenced by any code visible in this file;
# presumably reserved for the unimplemented get_subtitles() - confirm.
OUTPUT_PATH = f"{CHANNEL_NAME}.jsonl"  # JSONL file main() appends one record per video to
CONCURRENCY = 10  # size of the asyncio.Semaphore that bounds in-flight videos in main()
LIMIT = 1000  # passed as ``limit`` to get_channel_videos() in main()
# ========================



def get_channel_videos(
    channel_name: str,
    limit: int = None,
    sort_by: Literal['newest', 'oldest', 'popular'] = "newest",
    content_type: Literal['videos', 'shorts', 'streams'] = "videos"
) -> list[str]:
    """Return the video IDs that scrapetube lists for a channel.

    Args:
        channel_name: Channel username, forwarded to scrapetube as
            ``channel_username``.
        limit: Forwarded unchanged to scrapetube; ``None`` by default.
        sort_by: Ordering requested from scrapetube.
        content_type: Which kind of channel content to list.

    Returns:
        The ``videoId`` field of every entry scrapetube yields, in the
        order they were yielded.
    """
    listing = scrapetube.get_channel(
        channel_username=channel_name,
        limit=limit,
        sort_by=sort_by,
        content_type=content_type,
    )
    collected = []
    for entry in listing:
        collected.append(entry['videoId'])
    return collected

def get_youtube_title(video_id: str, timeout: float = 10.0) -> str:
    """Return the title of a YouTube video.

    The title is read from scrapetube's video metadata first. If that lookup
    fails for any reason, the watch page is downloaded and the first
    ``"title":"..."`` value found in its markup is used instead.

    Args:
        video_id: ID used to build the ``watch?v=`` URL.
        timeout: Seconds the fallback HTTP request may take before
            ``requests`` gives up. Without a timeout the request could block
            indefinitely.

    Returns:
        The title text, with literal ``\\n`` sequences turned into newlines
        when it comes from the fallback path.

    Raises:
        IndexError: If the fallback page contains no recognizable title
            (same exception type as before, now with a descriptive message).
        requests.RequestException: If the fallback download fails or times out.
    """
    try:
        return scrapetube.get_video(video_id)['title']['runs'][0]['text']
    except Exception:
        # Deliberately broad: any scrapetube failure (network error, missing
        # key, changed response shape) falls through to the page scrape below.
        pass

    url = f'https://www.youtube.com/watch?v={video_id}'
    response = requests.get(url, timeout=timeout)
    html_text = str(BeautifulSoup(response.content, 'html.parser'))

    # First `"title":"..."` value, ending at the next `","` separator.
    match = re.search(r'(?<=\"title\":\").*?(?=\",\")', html_text)
    if match is None:
        raise IndexError(f"no title found in watch page for video {video_id}")
    return match.group(0).replace('\\n', '\n')


def get_upload_date(video_id, timeout: float = 10.0):
    """Return the upload-date text of a YouTube video, or ``"unknown"``.

    This is a best-effort lookup that never raises. The date is read from
    scrapetube's video metadata (``dateText.simpleText``) first; if that
    fails, the watch page is downloaded and searched for a ``dateText``
    value.

    Args:
        video_id: ID used to build the ``watch?v=`` URL.
        timeout: Seconds the fallback HTTP request may take before
            ``requests`` gives up.

    Returns:
        The date text as found, or the string ``"unknown"`` when neither
        source yields one.
    """
    try:
        return scrapetube.get_video(video_id)['dateText']['simpleText']
    except Exception:
        # Deliberately broad: any scrapetube failure falls back to scraping.
        pass

    # The metadata path above reads dateText as an object holding
    # "simpleText", so look for that nested form first; the flat
    # `"dateText":"..."` form is kept as a second attempt.
    patterns = (
        r'"dateText":\{"simpleText":"(.*?)"\}',
        r'"dateText":"(.*?)","',
    )
    try:
        url = f'https://www.youtube.com/watch?v={video_id}'
        response = requests.get(url, timeout=timeout)
        html_text = str(BeautifulSoup(response.content, 'html.parser'))
        for pattern in patterns:
            match = re.search(pattern, html_text)
            if match:
                return match.group(1).replace('\\n', '\n')
    except Exception:
        # Best effort by design: a missing date must not fail the video.
        return "unknown"
    return "unknown"


def get_subtitles(video_id: str) -> str:
    """Return the subtitle text for a single video.

    Placeholder: fetching is not implemented yet, so the result is always
    an empty string, whatever ``video_id`` is.
    """
    # TODO: implement subtitle fetching
    transcript = ""
    return transcript


async def process_video(session: aiohttp.ClientSession, video_id: str, max_retries=5, retry_delay=2):
    """Fetch title, upload date and subtitles for one video, retrying on failure.

    The three lookup helpers are synchronous, so each one is run in a worker
    thread via ``asyncio.to_thread``. Calling them directly from this
    coroutine would block the event loop and make the videos run one after
    another, regardless of how many tasks the caller allows in flight.

    Args:
        session: Shared aiohttp session. Not used by this function; kept so
            the call signature stays unchanged.
        video_id: ID of the video to process.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        A dict with keys ``video_id``, ``title``, ``upload_date`` and
        ``transcript``, or ``None`` if no attempt succeeded.
    """
    for attempt in range(1, max_retries + 1):
        try:
            title = await asyncio.to_thread(get_youtube_title, video_id)
            upload_date = await asyncio.to_thread(get_upload_date, video_id)
            subtitles = await asyncio.to_thread(get_subtitles, video_id)
            print(f"‚úÖ {video_id}: got transcript ({len(subtitles)} chars)")
            return {"video_id": video_id, "title": title, "upload_date": upload_date, "transcript": subtitles}

        except Exception as e:
            print(f"‚ùå {video_id} failed ({e}), attempt {attempt}/{max_retries}")
            if "No available subtitles for this video" in str(e):
                # Do not retry if there are no subtitles
                break
            traceback.print_exc()

        if attempt < max_retries:
            await asyncio.sleep(retry_delay)  # wait before retrying

    print(f"‚ö†Ô∏è {video_id} failed after {max_retries} attempts")
    return None


def _load_existing_ids(path: str) -> set:
    """Collect the ``video_id`` of every well-formed JSON line in *path*."""
    found = set()
    skipped = 0
    with open(path, encoding="utf-8") as existing:
        for raw_line in existing:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                found.add(json.loads(raw_line)["video_id"])
            except (ValueError, KeyError, TypeError):
                # A truncated or foreign line must not discard the IDs that
                # are readable; count it and keep going.
                skipped += 1
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {path}.")
    return found


async def main():
    """Process every not-yet-saved video of the channel into the JSONL file.

    Lists the channel's videos, skips IDs already present in
    ``OUTPUT_PATH``, and appends one JSON record per successfully processed
    video. At most ``CONCURRENCY`` videos are in flight at once.

    Existing records are parsed line by line, so a single malformed line
    no longer makes the whole file unreadable. Previously that reset the
    known-ID set and caused every video to be fetched and appended again.
    """
    videos = get_channel_videos(CHANNEL_NAME, limit=LIMIT)
    semaphore = asyncio.Semaphore(CONCURRENCY)

    existing_ids = set()
    if os.path.exists(OUTPUT_PATH) and os.path.getsize(OUTPUT_PATH) > 0:
        try:
            existing_ids = _load_existing_ids(OUTPUT_PATH)
            print(f"üìÑ Loaded {len(existing_ids)} existing entries.")
        except (OSError, UnicodeError) as e:
            print(f"‚ö†Ô∏è Could not read existing JSONL ({e}), starting fresh.")
            existing_ids = set()

    pending = [v for v in videos if v not in existing_ids]

    async with aiohttp.ClientSession() as session:
        # Open file synchronously (cannot use async with)
        with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
            async def sem_task(video_id):
                async with semaphore:
                    result = await process_video(session, video_id)
                    if result:
                        # Serialize first so a failure cannot leave a partial
                        # line, then flush so finished records survive an
                        # interrupted run.
                        record = json.dumps(result, ensure_ascii=False)
                        f.write(record + "\n")
                        f.flush()

            await tqdm_asyncio.gather(*(sem_task(v) for v in pending), desc="Processing videos")

    print(f"‚úÖ All data saved to {OUTPUT_PATH}")


if __name__ == "__main__":
    # Top-level ``await`` only works where an event loop is already running,
    # e.g. a Jupyter/IPython cell (this file looks like a notebook export).
    # In a plain ``python`` script this line is a SyntaxError.
    await main()

Processing videos:   0%|          | 0/886 [00:00<?, ?it/s]

‚úÖ w8bplcgbgBQ: got transcript (0 chars)
‚úÖ 3eqnd1jdGiM: got transcript (0 chars)
‚úÖ awbMggTc9_8: got transcript (0 chars)
‚úÖ htcj7Ywjgv4: got transcript (0 chars)
‚úÖ -DWyLAp9g74: got transcript (0 chars)
‚úÖ d4dAM5mLy_k: got transcript (0 chars)
‚úÖ mBfCd4pKVC4: got transcript (0 chars)
‚úÖ Zum0C3p4-xg: got transcript (0 chars)
‚úÖ bb8BsHDGlSw: got transcript (0 chars)
‚úÖ vG8RAqY_qrs: got transcript (0 chars)
‚úÖ YqJbt4wQvfY: got transcript (0 chars)
‚úÖ KGiQezajMjI: got transcript (0 chars)
‚úÖ 07foUoE9LBE: got transcript (0 chars)
‚úÖ RdAlGFg_qw0: got transcript (0 chars)
‚úÖ 7LLyxEOKccs: got transcript (0 chars)
‚úÖ Zfz1WmA-tzQ: got transcript (0 chars)
‚úÖ sMbkc-8vMjo: got transcript (0 chars)
‚úÖ AGFecR6IV_Y: got transcript (0 chars)
‚úÖ 3l-B01irMgA: got transcript (0 chars)
‚úÖ NHgEarptI2o: got transcript (0 chars)
‚úÖ SfiQhRakfBE: got transcript (0 chars)
‚úÖ ArfN8U1ugng: got transcript (0 chars)
‚úÖ XzWBG-5tod0: got transcript (0 chars)
‚úÖ c1ZxyWr9GBc: got transcript (0