In [10]:
import os
import googleapiclient.discovery
import csv
import json
from datetime import datetime

# ===== Configuration Variables =====
# The API key is taken from the YOUTUBE_API_KEY environment variable when it is
# set (and non-empty), so the real key never has to be saved inside the notebook.
# The literal below is only a placeholder fallback.
API_KEY = os.environ.get('YOUTUBE_API_KEY') or 'GOOGLE-API-KEY'

# Batch name (to distinguish multiple crawling jobs).
# It is embedded in both OUTPUT_DIR and PROGRESS_FILE below.
BATCH_NAME = 'default'  # e.g., 'january_2024', 'test_batch', 'main_collection', etc.

# List of video IDs to crawl
VIDEO_IDS = [
    '_1gluMtaUmg',
    'o69BiOqY1Ec',
    'ZnmsMg6joGo',
]

# Output directory (separated by batch); keeps its trailing slash because the
# output file name is built by plain string concatenation.
OUTPUT_DIR = f'comments/{BATCH_NAME}/'

# Progress tracking file (separated by batch)
PROGRESS_FILE = f'crawling_progress_{BATCH_NAME}.csv'


In [3]:
def save_progress(video_id, next_page_token=None, completed=False):
    """Record the crawl state of one video in the batch progress CSV.

    The whole progress table is reloaded, the entry for ``video_id`` is
    replaced, and the table is written back to ``PROGRESS_FILE``.

    Args:
        video_id: ID of the video whose state is being recorded.
        next_page_token: Page token to resume from; stored as an empty string
            when falsy.
        completed: True once every page of the video has been processed.

    Failures are reported with a printed message and are not raised, so a
    progress-file problem never aborts the crawl itself.
    """
    try:
        # Load existing data
        progress = load_progress()

        # Update current video info
        progress[video_id] = {
            'video_id': video_id,
            'next_page_token': next_page_token or '',
            'completed': completed,
            'last_updated': datetime.now().isoformat()
        }

        # Write to a sibling temp file first and swap it in afterwards: if the
        # process is interrupted mid-write, the previous progress file is left
        # intact instead of being truncated.
        tmp_path = f"{PROGRESS_FILE}.tmp"
        with open(tmp_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['video_id', 'next_page_token', 'completed', 'last_updated'])

            for data in progress.values():
                writer.writerow([
                    data['video_id'],
                    data['next_page_token'],
                    data['completed'],
                    data['last_updated']
                ])
        os.replace(tmp_path, PROGRESS_FILE)

        print(f"Progress saved: {video_id} - {'Completed' if completed else 'In Progress'}")

    except Exception as e:
        print(f"Failed to save progress: {e}")

def load_progress():
    """Load the saved crawl state for this batch from ``PROGRESS_FILE``.

    Returns:
        dict mapping video ID to a dict with the keys ``video_id``,
        ``next_page_token`` (``None`` when the stored cell is empty),
        ``completed`` (bool) and ``last_updated`` (string as stored).
        An empty dict is returned when the file does not exist or cannot
        be read; in the latter case a message is printed.

    Rows are parsed defensively: a row without a video ID is skipped and
    missing cells fall back to "not completed / no token", so one damaged
    row does not discard the state of every other video.
    """
    if not os.path.exists(PROGRESS_FILE):
        return {}

    progress = {}
    try:
        # newline='' lets the csv module do its own line-ending handling.
        with open(PROGRESS_FILE, 'r', encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                video_id = row.get('video_id')
                if not video_id:
                    continue
                progress[video_id] = {
                    'video_id': video_id,
                    'next_page_token': row.get('next_page_token') or None,
                    # The flag is stored as the text of a Python bool.
                    'completed': (row.get('completed') or '').strip().lower() == 'true',
                    'last_updated': row.get('last_updated') or ''
                }
    except Exception as e:
        print(f"Failed to load progress: {e}")
        return {}
    return progress



def print_progress_status():
    """Print one status line per configured video for the current batch.

    Each ID in ``VIDEO_IDS`` is reported as "Completed", "In Progress"
    (with the first 20 characters of its saved page token, or ``None``)
    or "Not Started", based on what ``load_progress()`` returns.
    """
    saved = load_progress()
    print(f"\n=== Batch '{BATCH_NAME}' Crawling Progress ===")

    for vid in VIDEO_IDS:
        # No entry at all: the video has never been touched.
        if vid not in saved:
            print(f"{vid}: Not Started")
            continue

        entry = saved[vid]
        if entry.get('completed', False):
            print(f"{vid}: Completed")
            continue

        # Only a short prefix of the token is shown to keep the line readable.
        token = entry.get('next_page_token')
        print(f"{vid}: In Progress (Token: {token[:20] + '...' if token else 'None'})")

    print("=" * 50)


In [4]:
def get_replies(parent_id, writer, api_key, depth, youtube=None):
    """Write every reply to one comment as CSV rows.

    Args:
        parent_id: ID of the comment whose replies are fetched; also written
            into the ``ParentID`` column of each row.
        writer: Object with a ``writerow`` method (a ``csv.writer``).
        api_key: YouTube Data API key, used only when ``youtube`` is not given.
        depth: Value written into the ``Depth`` column of each reply row.
        youtube: Optional already-built API client to reuse. When omitted a
            new client is built from ``api_key``.

    All result pages are followed via ``list_next``. Errors raised by the API
    client propagate to the caller.
    """
    if youtube is None:
        # Build the YouTube API client
        youtube = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=api_key
        )

    request = youtube.comments().list(
        part="snippet",
        parentId=parent_id,
        maxResults=100
    )

    while request:
        response = request.execute()

        for item in response.get("items", []):
            reply = item["snippet"]
            writer.writerow([
                item["id"],
                reply["authorDisplayName"],
                # The author channel may be absent; the cell is then empty.
                reply.get("authorChannelId", {}).get("value"),
                reply["textDisplay"],
                reply["likeCount"],
                reply["publishedAt"],
                depth,
                parent_id
            ])
            # No recursion into the reply itself: per the comments.list
            # reference, replies exist only under top-level comments, so a
            # lookup per reply would spend one request (and a client build)
            # each without ever returning rows.

        request = youtube.comments().list_next(request, response)


In [5]:
def get_youtube_comments(video_id, writer, api_key, next_page_token=None):
    """Crawl the comment threads of one video, page by page, saving progress.

    Args:
        video_id: ID of the video to crawl.
        writer: Object with a ``writerow`` method (a ``csv.writer``); receives
            one row per top-level comment (depth 0, no parent) and is passed
            on to ``get_replies`` for the replies.
        api_key: YouTube Data API key.
        next_page_token: Page token to start from when resuming; ``None``
            starts at the first page.

    After each page, the token of the following page is stored with
    ``save_progress``; after the last page the video is marked completed.

    Raises:
        Exception: any error raised while fetching or writing is re-raised
            after the token of the page being processed has been saved.

    Note: the token is saved per page, so if a page fails half-way its
    already-written rows are written again when that page is re-fetched on
    resume.
    """
    # Build the YouTube API client
    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key
    )

    print(f"Crawling started - Video ID: {video_id}")
    if next_page_token:
        print(f"Resuming from previous stop point - Token: {next_page_token[:20]}...")

    params = {
        "part": "snippet",
        "videoId": video_id,
        "maxResults": 100,
    }

    if next_page_token:
        params["pageToken"] = next_page_token

    request = youtube.commentThreads().list(**params)
    page_count = 0
    # Token of the page currently being fetched/processed, i.e. the point a
    # resume has to restart from if anything below fails.
    next_token = next_page_token

    try:
        while request:
            response = request.execute()
            page_count += 1

            items = response.get("items", [])
            print(f"Processing page {page_count}... ({len(items)} comments)")

            for item in items:
                thread = item["snippet"]
                top_level = thread["topLevelComment"]
                comment = top_level["snippet"]
                comment_id = top_level["id"]
                writer.writerow([
                    comment_id,
                    comment["authorDisplayName"],
                    comment.get("authorChannelId", {}).get("value"),
                    comment["textDisplay"],
                    comment["likeCount"],
                    comment["publishedAt"],
                    0,
                    None
                ])

                # Fetch replies only when the thread reports some; a lookup
                # for a thread with zero replies costs a request for nothing.
                # If the count is missing, fetch to stay on the safe side.
                if thread.get("totalReplyCount", 1) > 0:
                    get_replies(comment_id, writer, api_key, depth=1)

            # Check for next page token
            next_token = response.get("nextPageToken")
            if next_token:
                print(f"Next page token: {next_token[:20]}...")
                # Save progress (intermediate save)
                save_progress(video_id, next_token, completed=False)
            else:
                print("All pages completed!")
                # Save as completed
                save_progress(video_id, None, completed=True)
                break

            request = youtube.commentThreads().list_next(request, response)

    except Exception as e:
        print(f"Error occurred during crawling: {e}")
        # Save current token on error
        if next_token:
            save_progress(video_id, next_token, completed=False)
        raise


In [6]:
def get_video_title(video_id, api_key):
    """Look up a video's title and return it in a file-name-friendly form.

    Spaces are replaced with underscores and forward slashes with hyphens.
    When the API response contains no item for ``video_id`` the fixed
    string ``"UnknownTitle"`` is returned instead.
    """
    # Build the YouTube API client
    client = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key
    )

    response = client.videos().list(part="snippet", id=video_id).execute()

    matches = response.get("items", [])
    if not matches:
        return "UnknownTitle"

    raw_title = matches[0]["snippet"]["title"]
    return raw_title.replace(" ", "_").replace("/", "-")


In [8]:
def crawl_videos():
    """Crawl comments for every video in ``VIDEO_IDS``, resuming saved progress.

    For each video:
      * if the progress file marks it completed, it is skipped, so a re-run
        neither spends quota on it nor overwrites its finished output file
        (to force a re-crawl, remove its row from the progress file or use
        a new ``BATCH_NAME``);
      * if a page token is saved, crawling resumes from that page and rows
        are appended to the existing output file;
      * otherwise the output file is (re)created and crawled from page one.

    The output file is ``{OUTPUT_DIR}{title}_{video_id}.csv``. An error in
    one video is printed and the loop moves on to the next video.
    """
    print("YouTube Comment Crawling Started!")
    print(f"Batch name: {BATCH_NAME}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Progress file: {PROGRESS_FILE}")
    print(f"Number of videos to process: {len(VIDEO_IDS)}")

    print_progress_status()

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    success_count = 0
    skipped_count = 0
    for i, video_id in enumerate(VIDEO_IDS, 1):
        print(f"\n[{i}/{len(VIDEO_IDS)}] Processing...")

        try:
            state = load_progress().get(video_id, {})

            # Finished videos are left alone; re-crawling them would truncate
            # the completed output file ("w" mode) and repeat every request.
            if state.get('completed', False):
                print(f"Already completed, skipping: {video_id}")
                skipped_count += 1
                continue

            print(f"Starting video processing: {video_id}")

            # Get video title
            video_title = get_video_title(video_id, API_KEY)
            filename = f"{OUTPUT_DIR}{video_title}_{video_id}.csv"

            # Check if resuming
            next_page_token = state.get('next_page_token')
            if next_page_token:
                print(f"Resuming from previous stop point")
            else:
                print(f"Starting new crawling")

            # Determine file mode (append for resume, write for new)
            file_mode = "a" if next_page_token else "w"

            # A resumed run normally appends below an existing header, but if
            # the output file is missing or empty the header must be written.
            needs_header = (
                not next_page_token
                or not os.path.exists(filename)
                or os.path.getsize(filename) == 0
            )

            with open(filename, file_mode, encoding="utf-8", newline="") as f:
                writer = csv.writer(f)

                if needs_header:
                    writer.writerow(["ID", "Author", "AuthorID", "Comment", "LikeCount", "PublishedAt", "Depth", "ParentID"])

                # Start comment crawling
                get_youtube_comments(video_id, writer, API_KEY, next_page_token)

            print(f"Completed: {filename}")
            success_count += 1

        except Exception as e:
            print(f"Error occurred ({video_id}): {e}")

    failed_count = len(VIDEO_IDS) - success_count - skipped_count
    print(f"\nCrawling completed!")
    print(f"Succeeded: {success_count}, already completed: {skipped_count}, failed: {failed_count}")
    print_progress_status()


In [None]:
# ===== Usage =====

# Show which batch is configured and the saved state of each video in it
print(f"Current batch: {BATCH_NAME}")
print_progress_status()

# Start crawling. Comment this line out to only inspect progress without
# calling the API.
crawl_videos()  # Unfinished videos resume from their saved page token


In [None]:
# Print the per-video progress table for the current batch again
print_progress_status()

