# YouTube Audio Crawler - Google Colab Version

This notebook downloads audio from YouTube channels with specific requirements:
- Format: M4A (AAC codec)
- Sample Rate: 44000 Hz (exact)
- Channels: Mono (1 channel)
- Bitrate: 192kbps

## Instructions:
1. Run cells in order (in Colab: Runtime → Run all)
2. Add your channel URLs in the configuration cell
3. Start the download process
4. Download results from Colab files panel

## Step 1: Install Dependencies

In [None]:
# Install required packages (-q keeps pip's output quiet)
# NOTE(review): ffmpeg-python is installed here, but no visible cell imports it — confirm it is still needed
!pip install -q yt-dlp ffmpeg-python

# FFmpeg is pre-installed in Colab; show only the first line of its version banner as a sanity check
!ffmpeg -version | head -n 1

# Printed unconditionally: nothing here inspects the exit status of the two commands above
print("\n‚úÖ Dependencies installed successfully")

## Step 2: Import Libraries and Setup

In [None]:
import yt_dlp
import json
import time
import random
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse, parse_qs

# Create the downloads directory; each video gets its own sub-folder beneath it
DOWNLOADS_DIR = Path('./downloads')
DOWNLOADS_DIR.mkdir(exist_ok=True)

# Rate limiting configuration (conservative settings)
SLEEP_BETWEEN_VIDEOS = 15  # seconds to pause after each successful download
SLEEP_MIN = 10  # lower bound of the random pause taken before each download
SLEEP_MAX = 20  # upper bound of the random pause taken before each download
# yt-dlp's Python API takes `ratelimit` as a number of bytes per second.
# Suffixed strings such as "200K" are a command-line convention, so the
# value is spelled out numerically here.
RATE_LIMIT = 200 * 1024  # 200 KiB/s download speed

print("‚úÖ Setup complete")
print(f"üìÅ Downloads directory: {DOWNLOADS_DIR.absolute()}")

## Step 3: Core Download Functions

In [None]:
def extract_channel_id(channel_url):
    """Extract the channel identifier from a YouTube channel URL.

    Recognised path shapes on youtube.com (or any of its subdomains):

    * ``/channel/<id>[/...]`` -> ``<id>``
    * ``/c/<name>[/...]``     -> ``c/<name>``
    * ``/@<handle>[/...]``    -> ``@<handle>``

    Trailing tab segments such as ``/videos`` or ``/featured`` are dropped
    for every shape, so all tabs of one channel map to the same identifier.

    Args:
        channel_url: The channel URL to inspect.

    Returns:
        The extracted identifier, or ``channel_url`` unchanged when the URL
        is not a recognised YouTube channel URL.
    """
    parsed = urlparse(channel_url)

    # `hostname` is lower-cased and has any port stripped; matching on the
    # domain boundary avoids accepting look-alike hosts.
    host = parsed.hostname or ''
    if host != 'youtube.com' and not host.endswith('.youtube.com'):
        return channel_url

    segments = [segment for segment in parsed.path.split('/') if segment]
    if not segments:
        return channel_url

    head = segments[0]
    if head == 'channel' and len(segments) > 1:
        return segments[1]
    if head == 'c' and len(segments) > 1:
        # Keep the "c/" prefix: it distinguishes custom names from raw IDs.
        return f"c/{segments[1]}"
    if head.startswith('@'):
        return head

    return channel_url

def get_channel_videos(channel_url):
    """List the videos of a channel without downloading anything.

    Uses yt-dlp's flat extraction, so only the listing is fetched.

    Args:
        channel_url: URL of the channel to enumerate.

    Returns:
        A list of dicts with the keys ``video_id``, ``url`` and ``title``.
        The list is empty when extraction fails or yields no entries; the
        failure is printed rather than raised.
    """
    print(f"\nüîç Fetching videos from: {channel_url}")

    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
        'force_generic_extractor': False,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(channel_url, download=False)

            # Guard against a missing result as well as a missing key.
            if not result or 'entries' not in result:
                return []

            videos = []
            for entry in result['entries']:
                if not entry:
                    continue

                video_id = entry.get('id')
                if not video_id:
                    # Without an id the watch URL would be "...?v=None".
                    continue

                videos.append({
                    'video_id': video_id,
                    'url': f"https://www.youtube.com/watch?v={video_id}",
                    # `or` also covers a title that is present but None/empty.
                    'title': entry.get('title') or 'Unknown',
                })

            print(f"‚úÖ Found {len(videos)} videos")
            return videos

    except Exception as e:
        # Boundary handler: one unreachable channel must not stop the crawl.
        print(f"‚ùå Error fetching channel videos: {str(e)}")
        return []

def _rate_limit_bytes(limit):
    """Return *limit* as bytes per second, or None when no limit is set.

    Numbers pass through unchanged. Strings may end in K, M or G
    (powers of 1024), e.g. ``'200K'`` becomes ``204800``.
    """
    if limit is None or isinstance(limit, (int, float)):
        return limit

    text = str(limit).strip().upper()
    multipliers = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3}
    if text and text[-1] in multipliers:
        return int(float(text[:-1]) * multipliers[text[-1]])
    return int(float(text))


def download_video_audio(video_info):
    """Download one video's audio and write a JSON metadata file beside it.

    Files are placed in ``DOWNLOADS_DIR/<video_id>/``. The function sleeps
    for a random interval before the download and for a fixed interval
    after a successful one.

    Args:
        video_info: Dict with the keys ``video_id``, ``url`` and ``title``.

    Returns:
        ``(True, None)`` on success, or ``(False, error_message)`` when
        anything after the initial log lines fails.
    """
    # Single source of truth for the requested audio parameters, used both
    # for the ffmpeg arguments and for the recorded metadata.
    sample_rate = 44000
    channels = 1
    bitrate = 192

    video_id = video_info['video_id']
    video_url = video_info['url']
    # The title can be None or absent; fall back so the slice below is safe.
    title = video_info.get('title') or 'Unknown'

    print(f"\n‚¨áÔ∏è  Downloading: {title[:50]}...")
    print(f"   Video ID: {video_id}")

    try:
        # Directory creation is inside the try block so that filesystem
        # errors are reported through the return value like any other failure.
        video_dir = DOWNLOADS_DIR / video_id
        video_dir.mkdir(parents=True, exist_ok=True)

        output_path = video_dir / f"{video_id}.%(ext)s"

        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': str(output_path),
            'writeinfojson': False,
            'no_warnings': True,
            'ignoreerrors': False,
            # yt-dlp's Python API expects a number of bytes per second here.
            'ratelimit': _rate_limit_bytes(RATE_LIMIT),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'm4a',
                'preferredquality': str(bitrate),
            }],
            'postprocessor_args': [
                '-ar', str(sample_rate),  # 44kHz sample rate (exact)
                '-ac', str(channels),     # Mono channel
            ],
        }

        # Random sleep before download
        sleep_time = random.uniform(SLEEP_MIN, SLEEP_MAX)
        print(f"   ‚è≥ Sleeping {sleep_time:.1f}s before download...")
        time.sleep(sleep_time)

        # Download
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=True)

            # Save metadata
            metadata = {
                'video_id': video_id,
                'title': info.get('title'),
                'description': info.get('description'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'duration': info.get('duration'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'channel_id': info.get('channel_id'),
                'channel_url': info.get('channel_url'),
                'url': video_url,
                'audio_format': 'm4a',
                'sample_rate': sample_rate,
                'channels': channels,
                'bitrate': bitrate,
                'downloaded_at': datetime.now().isoformat()
            }

            metadata_path = video_dir / f"{video_id}.json"
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            print(f"   ‚úÖ Downloaded successfully")

            # Sleep between videos
            print(f"   ‚è≥ Sleeping {SLEEP_BETWEEN_VIDEOS}s before next video...")
            time.sleep(SLEEP_BETWEEN_VIDEOS)

            return True, None

    except Exception as e:
        # Boundary handler: report the failure and let the caller continue
        # with the next video.
        error_msg = str(e)
        print(f"   ‚ùå Error: {error_msg}")

        # Check if 403
        if '403' in error_msg or 'Forbidden' in error_msg:
            print(f"   ‚ö†Ô∏è  403 Forbidden - YouTube blocked this request")

        return False, error_msg

print("‚úÖ Functions loaded")

## Step 4: Configuration - Add Your Channel URLs Here

In [None]:
# Configure your channels here
CHANNELS = [
    "https://www.youtube.com/channel/UCLFgJS-f6UKOJ3Xz0K8Kosg",  # leon
    "https://www.youtube.com/channel/UC9nijyKbu2cQ0lrK6RyGLsw",  # Ë©©Ë©©fly
    # "https://www.youtube.com/channel/UC74T0OeGBT2bOcidVMwqqoQ",  # Ross_Liu
]

# Max videos per channel (for testing, set to None for all videos)
MAX_VIDEOS_PER_CHANNEL = 5  # Download only first 5 videos per channel for testing

print(f"üìã Configured {len(CHANNELS)} channels")
print(f"üìä Max videos per channel: {MAX_VIDEOS_PER_CHANNEL if MAX_VIDEOS_PER_CHANNEL else 'ALL'}")

## Step 5: Start Download Process

In [None]:
# Running tallies for the whole crawl, plus the wall-clock start
stats = dict(
    total_videos=0,
    successful=0,
    failed=0,
    forbidden_403=0,
    start_time=datetime.now(),
)

rule = "=" * 60
clock_format = '%Y-%m-%d %H:%M:%S'

print("üöÄ Starting download process...")
print(f"‚è∞ Start time: {stats['start_time'].strftime(clock_format)}")
print(rule)

for channel_url in CHANNELS:
    print(f"\n{rule}")
    print(f"üì∫ Processing channel: {channel_url}")
    print(f"{rule}")

    # Enumerate the channel; an empty list covers both "no videos" and "lookup failed"
    videos = get_channel_videos(channel_url)
    if not videos:
        print("‚ö†Ô∏è  No videos found or error fetching channel")
        continue

    # Apply the optional per-channel cap
    if MAX_VIDEOS_PER_CHANNEL:
        videos = videos[:MAX_VIDEOS_PER_CHANNEL]
        print(f"üìä Limited to first {len(videos)} videos")

    # Fetch the videos one by one and tally each outcome
    for position, video_info in enumerate(videos, start=1):
        print(f"\n--- Video {position}/{len(videos)} ---")
        stats['total_videos'] += 1

        ok, failure = download_video_audio(video_info)
        if ok:
            stats['successful'] += 1
            continue

        stats['failed'] += 1
        # 403s are counted separately from other failures
        if failure and ('403' in failure or 'Forbidden' in failure):
            stats['forbidden_403'] += 1

# Close out the timing figures
stats['end_time'] = datetime.now()
elapsed = stats['end_time'] - stats['start_time']
stats['duration'] = elapsed.total_seconds()

print("\n" + rule)
print("üìä FINAL STATISTICS")
print(rule)
print(f"‚è∞ Start time: {stats['start_time'].strftime(clock_format)}")
print(f"‚è∞ End time: {stats['end_time'].strftime(clock_format)}")
print(f"‚è±Ô∏è  Duration: {stats['duration']:.0f} seconds ({stats['duration']/60:.1f} minutes)")
print(f"\nüìπ Total videos attempted: {stats['total_videos']}")
print(f"‚úÖ Successful downloads: {stats['successful']}")
print(f"‚ùå Failed downloads: {stats['failed']}")
print(f"üö´ 403 Forbidden errors: {stats['forbidden_403']}")

# A success rate is only meaningful when at least one download was attempted
if stats['total_videos'] > 0:
    success_rate = 100 * stats['successful'] / stats['total_videos']
    print(f"\nüìà Success rate: {success_rate:.1f}%")

print("\n‚úÖ Process complete!")

## Step 6: View Results

In [None]:
# Walk the per-video folders and report which ones contain an audio file
print("üìÇ Downloaded files:\n")

successful_downloads = 0
failed_downloads = 0

# Only sub-directories count as per-video folders; stray files are ignored
video_folders = (entry for entry in sorted(DOWNLOADS_DIR.iterdir()) if entry.is_dir())

for folder in video_folders:
    audio_files = list(folder.glob('*.m4a'))

    # A folder without any .m4a file is reported as an incomplete download
    if not audio_files:
        failed_downloads += 1
        print(f"‚ùå {folder.name}/ (incomplete - no .m4a file)")
        continue

    successful_downloads += 1
    audio = audio_files[0]
    megabytes = audio.stat().st_size / (1024 * 1024)
    print(f"‚úÖ {folder.name}/")
    print(f"   ‚îî‚îÄ {audio.name} ({megabytes:.2f} MB)")

    # List the first JSON file in the folder, if there is one
    sidecars = list(folder.glob('*.json'))
    if sidecars:
        print(f"   ‚îî‚îÄ {sidecars[0].name}")

print(f"\nüìä Summary:")
print(f"   ‚úÖ Successful: {successful_downloads}")
print(f"   ‚ùå Failed: {failed_downloads}")

## Step 7: Download Files (Optional)

To download files from Colab:
1. Click on the folder icon 📁 in the left sidebar
2. Navigate to the `downloads/` folder
3. Right-click on any file → Download

Or use the code below to create a ZIP file:

In [None]:
# Bundle the whole downloads folder into a single timestamped ZIP archive
import shutil
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
zip_filename = 'youtube_audio_downloads_' + timestamp
archive_label = f"{zip_filename}.zip"

print(f"üì¶ Creating ZIP file: {archive_label}")
# make_archive receives the name without the extension and the folder to pack
shutil.make_archive(base_name=zip_filename, format='zip', root_dir=DOWNLOADS_DIR)
print("‚úÖ ZIP created successfully!")
print("\nüì• Download the file from the files panel on the left")
print(f"   File: {archive_label}")