YouTube Data API v3 docs:
- https://developers.google.com/youtube/v3/quickstart/python
- https://developers.google.com/resources/api-libraries/documentation/youtube/v3/python/latest/youtube_v3.search.html
- https://developers.google.com/youtube/v3/getting-started#quota 

In [2]:
import os
import sys
import json
import logging
import yaml
from typing import Dict, List, Set
from googleapiclient.discovery import build

# Configure logging once and create a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Upper bound on how many unprocessed articles are picked up per run
BATCH_SIZE = 5

# All project paths hang off ROOT_DIR, which is resolved as two directories
# above the working directory the notebook is started from.
current_dir = os.getcwd()
ROOT_DIR = os.path.abspath(os.path.join(current_dir, "../.."))
ARTICLES_PATH = os.path.join(ROOT_DIR, "src/content/articles")
OUTPUT_PATH = os.path.join(ROOT_DIR, "src/data/youtube-video-list.json")
CONFIG_PATH = os.path.join(ROOT_DIR, "src/scripts/components-generator/config.py")

# Echo every resolved location so a wrong working directory is easy to spot
for _label, _location in (
    ("Current directory", current_dir),
    ("Root directory", ROOT_DIR),
    ("Articles path", ARTICLES_PATH),
    ("Output path", OUTPUT_PATH),
    ("Config path", CONFIG_PATH),
):
    logger.info(f"{_label}: {_location}")

def setup_youtube_client(api_key: str):
    """Create a YouTube Data API v3 client authenticated with ``api_key``.

    Args:
        api_key: Developer key handed to ``build`` as ``developerKey``.

    Returns:
        Whatever ``build('youtube', 'v3', ...)`` returns.

    Raises:
        Exception: Any error raised by ``build`` is logged and re-raised
            unchanged.
    """
    try:
        client = build('youtube', 'v3', developerKey=api_key)
    except Exception as e:
        logger.error(f"Failed to initialize YouTube client: {e}")
        raise
    return client

def search_videos(youtube_client, title: str, max_results: int = 5) -> List[Dict]:
    """Search YouTube for captioned videos related to a concept.

    Args:
        youtube_client: Object exposing ``search().list(...).execute()``,
            such as the client returned by ``setup_youtube_client``.
        title: Concept name; it is interpolated into the search query.
        max_results: Value sent as ``maxResults``. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per usable hit.
        An empty list is returned when the request fails (the failure is
        logged) or when no usable hit comes back.
    """
    try:
        # Build search query
        search_query = f"'{title}' in AI or machine learning -tutorial -howto"

        request = youtube_client.search().list(
            q=search_query,
            part='snippet',
            maxResults=max_results,
            type='video',
            videoCaption='closedCaption',  # With captions for accessibility
            order='relevance',             # Most relevant first
            safeSearch='strict'            # Educational content
        )
        response = request.execute()

        videos = []
        for item in response.get('items', []):
            video_id = (item.get('id') or {}).get('videoId')
            video_title = (item.get('snippet') or {}).get('title')
            if not video_id or not video_title:
                # A single incomplete hit must not throw away the usable
                # ones, so it is skipped instead of raising KeyError.
                continue
            videos.append({
                'title': video_title,
                'link': f"https://www.youtube.com/watch?v={video_id}"
            })

        return videos
    except Exception as e:
        logger.error(f"Failed to search videos for {title}: {e}")
        return []

def load_existing_data() -> tuple[List[Dict], Set[str]]:
    """Read previously saved results and the slugs they cover.

    Returns:
        ``(entries, slugs)`` where ``entries`` is the parsed JSON content of
        ``OUTPUT_PATH`` and ``slugs`` holds the ``'slug'`` value of every
        entry. ``([], set())`` is returned when the file does not exist, or
        when reading, parsing or slug extraction fails (the failure is
        logged).
    """
    try:
        # Nothing saved yet: start from an empty state
        if not os.path.exists(OUTPUT_PATH):
            return [], set()

        with open(OUTPUT_PATH, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)

        slugs = set()
        for entry in entries:
            slugs.add(entry['slug'])
        return entries, slugs
    except Exception as e:
        logger.error(f"Error loading existing data: {e}")
        return [], set()

def parse_frontmatter(file_path: str) -> Dict:
    """Parse the YAML frontmatter of a markdown file.

    Args:
        file_path: Path of the markdown file to read (UTF-8).

    Returns:
        The frontmatter as a dict. An empty dict is returned when the file
        does not start with ``---``, when the frontmatter is empty or is not
        a YAML mapping, or when reading/parsing fails (the failure is
        logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        if not content.startswith('---'):
            return {}

        # The text between the opening '---' and the next '---' is the YAML
        _, fm, _ = content.split('---', 2)
        parsed = yaml.safe_load(fm)

        # safe_load yields None for an empty block and a scalar/list for
        # non-mapping YAML; callers rely on dict methods, so normalise.
        if parsed is None:
            return {}
        if not isinstance(parsed, dict):
            logger.warning(f"Frontmatter in {file_path} is not a mapping; ignoring it")
            return {}
        return parsed
    except Exception as e:
        logger.error(f"Error parsing frontmatter from {file_path}: {e}")
        return {}

def get_unprocessed_files(processed_slugs: Set[str]) -> List[str]:
    """Collect the next batch of markdown articles that still need videos.

    ``.md`` files in ``ARTICLES_PATH`` are visited in sorted filename order.
    A file is selected when its frontmatter has a slug that is not in
    ``processed_slugs``; scanning stops once ``BATCH_SIZE`` files are
    selected.

    Files without a slug are skipped with a warning. They can never be
    recorded as processed, so selecting them would make them occupy a batch
    slot on every run.

    Args:
        processed_slugs: Slugs that already have saved recommendations.

    Returns:
        Full paths of the selected files. An empty list is returned when the
        articles directory is missing or scanning fails (both are logged).
    """
    try:
        if not os.path.exists(ARTICLES_PATH):
            logger.error(f"Articles directory not found: {ARTICLES_PATH}")
            return []

        markdown_files = sorted(
            name for name in os.listdir(ARTICLES_PATH) if name.endswith('.md')
        )

        unprocessed_files = []
        for name in markdown_files:
            file_path = os.path.join(ARTICLES_PATH, name)
            try:
                frontmatter = parse_frontmatter(file_path)
                # Tolerate a parser that hands back None or a non-mapping
                slug = frontmatter.get('slug') if isinstance(frontmatter, dict) else None
                if not slug:
                    logger.warning(f"Skipping {name}: no slug in frontmatter")
                    continue
                if slug not in processed_slugs:
                    unprocessed_files.append(file_path)
                    if len(unprocessed_files) >= BATCH_SIZE:
                        break
            except Exception as e:
                logger.warning(f"Error reading file {name}: {e}")
                continue

        return unprocessed_files
    except Exception as e:
        logger.error(f"Error scanning directory: {e}")
        return []

def process_files(youtube_client, files: List[str]) -> tuple[List[Dict], bool]:
    """Look up video recommendations for each markdown file.

    Args:
        youtube_client: Client passed through to ``search_videos``.
        files: Paths of the markdown files to process.

    Returns:
        ``(results, had_errors)``. ``results`` holds one
        ``{'slug': ..., 'recommendations': [...]}`` dict per file that
        produced videos. ``had_errors`` is True when any file yielded no
        videos or raised while being processed. Files whose frontmatter
        lacks a title or slug are skipped with a warning and do not count
        as errors.
    """
    results = []
    had_errors = False

    for file_path in files:
        try:
            # Treat a missing/empty frontmatter like one without fields
            frontmatter = parse_frontmatter(file_path) or {}
            title = frontmatter.get('title', '')
            slug = frontmatter.get('slug', '')

            if not title or not slug:
                logger.warning(f"Skipping {file_path}: missing title or slug in frontmatter")
                continue

            videos = search_videos(youtube_client, title)

            # An empty list means the search failed or found nothing
            if not videos:
                had_errors = True
                logger.error(f"No videos retrieved for {slug}, skipping save")
                continue

            # Only include slug and recommendations in the result
            results.append({
                'slug': slug,
                'recommendations': videos
            })
            logger.info(f"Processed {slug}")

        except Exception as e:
            had_errors = True
            logger.error(f"Error processing file {file_path}: {e}")
            continue

    # Return both results and error status
    return results, had_errors

def save_results(all_results: List[Dict]) -> None:
    """Save results to ``OUTPUT_PATH`` as indented UTF-8 JSON.

    The data is serialised first, written to a sibling temporary file and
    then moved over the target with ``os.replace``. A failure at any step
    therefore leaves the previous output file untouched instead of
    truncated.

    Args:
        all_results: Complete list of entries to persist.

    Errors are logged and not raised.
    """
    tmp_path = f"{OUTPUT_PATH}.tmp"
    try:
        # Serialise before touching the disk so an unserialisable entry
        # cannot leave a half-written file behind.
        payload = json.dumps(all_results, indent=2, ensure_ascii=False)

        os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        os.replace(tmp_path, OUTPUT_PATH)
        logger.info(f"Results saved to {OUTPUT_PATH}")
    except Exception as e:
        logger.error(f"Error saving results: {e}")
        # Best-effort cleanup of a leftover temporary file
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def main():
    """Run one batch: load config, find new articles, fetch and save videos.

    Returns early (after logging) when the API key cannot be imported or
    when there is nothing new to process. New results are appended to the
    existing data and saved only when processing reported no errors and
    produced at least one result. Any other exception is logged and
    re-raised.
    """
    try:
        # config.py is imported by name, so its directory must be importable
        config_dir = os.path.dirname(CONFIG_PATH)
        if config_dir not in sys.path:
            sys.path.append(config_dir)

        try:
            from config import GOOGLECLOUD_API_KEY
        except ImportError:
            logger.error("Could not import GOOGLECLOUD_API_KEY from config.py")
            logger.error(f"Make sure config.py exists at: {CONFIG_PATH}")
            return

        client = setup_youtube_client(GOOGLECLOUD_API_KEY)

        # What has been saved so far, and which slugs that covers
        saved_entries, done_slugs = load_existing_data()

        pending_files = get_unprocessed_files(done_slugs)
        if not pending_files:
            logger.info("No new files to process")
            return

        fresh_entries, had_errors = process_files(client, pending_files)

        # Persist only a clean, non-empty batch
        if had_errors or not fresh_entries:
            logger.warning("Not saving results due to errors in video retrieval")
            return

        save_results(saved_entries + fresh_entries)

    except Exception as e:
        logger.error(f"An error occurred in main execution: {e}")
        raise

# Run the pipeline when this cell or script is executed directly (a notebook
# cell also runs with __name__ == "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()

INFO:__main__:Current directory: /Users/kemi/Documents/GitHub/vocab/src/scripts
INFO:__main__:Root directory: /Users/kemi/Documents/GitHub/vocab
INFO:__main__:Articles path: /Users/kemi/Documents/GitHub/vocab/src/content/articles
INFO:__main__:Output path: /Users/kemi/Documents/GitHub/vocab/src/data/youtube-video-list.json
INFO:__main__:Config path: /Users/kemi/Documents/GitHub/vocab/src/scripts/components-generator/config.py
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:__main__:Processed autoformalization
INFO:__main__:Processed autograd
INFO:__main__:Processed automaton
INFO:__main__:Processed automl-automated-machine-learning
INFO:__main__:Processed autonomous-agents
INFO:__main__:Results saved to /Users/kemi/Documents/GitHub/vocab/src/data/youtube-video-list.json
