In [1]:
import csv
import os
import time
from datetime import datetime

import isodate
import requests

# --- Configuration -----------------------------------------------------------
# The API key is taken from the YOUTUBE_API_KEY environment variable so a real
# credential never has to be saved inside the notebook. The original placeholder
# string is kept as the fallback when the variable is not set.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "use your own youtube api key")

# Channels to collect, identified by their YouTube channel IDs.
CHANNEL_IDS = [
    "UCJ5v_MCY6GNUBTO8-D3XoAg",   # WWE
    "UCFN4JkGP_bVhAdBsoV9xftA",   # AEW
    "UCOp8wkVqdrWbFYHjDv946QQ"    # TNA
]

# Path of the combined CSV written at the end of the run.
OUTPUT_CSV = "raw_youtube.csv"

# Filter to videos from this date onward. This is a naive datetime; it is
# compared against `publishedAt` values that are parsed without tzinfo.
START_DATE = datetime(2024, 1, 1)


def get_channel_title_and_uploads_playlist_id(channel_id):
    """Look up a channel's title, uploads playlist ID, and subscriber count.

    Queries the YouTube Data API ``channels`` endpoint for a single channel,
    requesting the ``snippet``, ``contentDetails`` and ``statistics`` parts.

    Args:
        channel_id: ID of the channel to look up.

    Returns:
        A tuple ``(channel_title, uploads_playlist_id, subscriber_count)``.
        ``subscriber_count`` is passed through as returned by the API, or
        ``""`` when the ``subscriberCount`` field is absent.

    Raises:
        Exception: if the response contains no channel item or lacks one of
            the expected fields. The message names the channel and, when the
            API returned an error object, includes its message.
        requests.RequestException: on network failure or timeout.
    """
    url = "https://www.googleapis.com/youtube/v3/channels"
    params = {
        "part": "snippet,contentDetails,statistics",
        "id": channel_id,
        "key": API_KEY
    }
    # Explicit timeout: without one, requests can wait on a stalled connection
    # indefinitely and hang the whole run.
    response = requests.get(url, params=params, timeout=30).json()
    try:
        item = response["items"][0]
        uploads_id = item["contentDetails"]["relatedPlaylists"]["uploads"]
        channel_title = item["snippet"]["title"]
        subscriber_count = item["statistics"].get("subscriberCount", "")
        return channel_title, uploads_id, subscriber_count
    except (IndexError, KeyError) as exc:
        # Surface what the API actually said (bad key, quota, unknown channel)
        # instead of a context-free failure.
        api_error = response.get("error") if isinstance(response, dict) else None
        detail = api_error.get("message", "") if isinstance(api_error, dict) else ""
        message = (
            "❌ Error retrieving uploads playlist or channel data."
            f" (channel_id={channel_id}"
        )
        if detail:
            message += f"; API error: {detail}"
        message += ")"
        raise Exception(message) from exc


def get_all_video_ids(playlist_id):
    """Retrieve all video IDs from an uploads playlist.

    Pages through the YouTube Data API ``playlistItems`` endpoint, 50 items
    per request, following ``nextPageToken`` until the API stops returning one.

    Args:
        playlist_id: ID of the playlist to enumerate.

    Returns:
        List of video ID strings in the order the API returned them.

    Raises:
        RuntimeError: if any page comes back as an API error payload. Failing
            here prevents a mid-pagination error from being mistaken for the
            end of the playlist and yielding a silently truncated list.
        requests.RequestException: on network failure or timeout.

    NOTE(review): a recorded run returned exactly 20000 IDs for one channel,
    which may indicate an API-side cap on playlist length — confirm.
    """
    video_ids = []
    url = "https://www.googleapis.com/youtube/v3/playlistItems"
    params = {
        "part": "contentDetails",
        "playlistId": playlist_id,
        "maxResults": 50,
        "key": API_KEY
    }

    while True:
        # Explicit timeout so a stalled connection cannot hang the run.
        response = requests.get(url, params=params, timeout=30).json()

        if "error" in response:
            error = response["error"]
            detail = error.get("message", error) if isinstance(error, dict) else error
            raise RuntimeError(
                f"YouTube API error while listing playlist {playlist_id}: {detail}"
            )

        for item in response.get("items", []):
            # Skip entries without a video ID rather than aborting on KeyError.
            video_id = item.get("contentDetails", {}).get("videoId")
            if video_id:
                video_ids.append(video_id)

        next_token = response.get("nextPageToken")
        if not next_token:
            break
        params["pageToken"] = next_token
        time.sleep(0.1)  # brief pause between page requests
    return video_ids

def iso_duration_to_seconds(iso_str):
    """Convert an ISO 8601 duration string to a whole number of seconds.

    Args:
        iso_str: Duration text to hand to ``isodate.parse_duration``.

    Returns:
        The duration as an ``int`` number of seconds, or ``""`` when the value
        cannot be parsed or converted (so the CSV cell is left blank).
    """
    try:
        duration = isodate.parse_duration(iso_str)
        return int(duration.total_seconds())
    except Exception:
        # Deliberately best-effort: any parsing/conversion failure yields a
        # blank value. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate so the run can be stopped.
        return ""

def _parse_published_at(pub_time_str):
    """Parse a ``publishedAt`` timestamp, with or without fractional seconds."""
    for fmt in ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S.%fZ'):
        try:
            return datetime.strptime(pub_time_str, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognized publishedAt timestamp: {pub_time_str!r}")


def _video_item_to_record(item, channel_title, channel_id, subscriber_count):
    """Flatten one ``videos`` API item into a CSV row dict, or None if it predates START_DATE."""
    snip = item.get("snippet", {})
    stats = item.get("statistics", {})
    contdet = item.get("contentDetails", {})
    topic = item.get("topicDetails", {})

    pub_time_str = snip.get("publishedAt")
    if pub_time_str:
        pub_date = _parse_published_at(pub_time_str)
        if pub_date < START_DATE:
            return None
        upload_date = pub_date.date()
    else:
        # No timestamp: the video is kept, with a blank upload date.
        upload_date = ""

    video_id = item.get("id", "")

    return {
        "channel_title": channel_title,
        "channel_id": channel_id,
        "subscriber_count": subscriber_count,
        "video_id": video_id,
        "video_url": f"https://www.youtube.com/watch?v={video_id}",
        "title": snip.get("title", ""),
        # Line breaks are flattened to spaces so each description stays on one line.
        "description": snip.get("description", "").replace('\n', ' ').replace('\r', ' '),
        "upload_date": upload_date,
        "published_time": pub_time_str,
        "view_count": stats.get("viewCount", ""),
        "like_count": stats.get("likeCount", ""),
        "comment_count": stats.get("commentCount", ""),
        "duration_seconds": iso_duration_to_seconds(contdet.get("duration", "")),
        "definition": contdet.get("definition", ""),
        "caption": contdet.get("caption", ""),
        "topic_categories": ', '.join(topic.get("topicCategories") or []),
        "tags": ", ".join(snip.get("tags") or []),
        "thumbnail_url": snip.get("thumbnails", {}).get("high", {}).get("url", ""),
        "licensed_content": contdet.get("licensedContent", "")
    }


def get_video_metadata(video_ids, channel_title, channel_id, subscriber_count):
    """Fetch metadata for a list of video IDs, plus channel info.

    Requests the ``snippet``, ``statistics``, ``contentDetails`` and
    ``topicDetails`` parts from the YouTube Data API ``videos`` endpoint in
    batches of 50 IDs, and keeps only videos published on or after
    ``START_DATE`` (videos with no ``publishedAt`` are kept as well).

    Args:
        video_ids: List of video ID strings; may be empty.
        channel_title: Channel name copied into every row.
        channel_id: Channel ID copied into every row.
        subscriber_count: Subscriber count copied into every row.

    Returns:
        List of dicts, one per retained video, keyed by the CSV column names.

    Raises:
        RuntimeError: if a batch comes back as an API error payload, so a
            failed batch is not silently dropped from the results.
        ValueError: if a ``publishedAt`` value matches neither accepted format.
        requests.RequestException: on network failure or timeout.
    """
    video_data = []
    url = "https://www.googleapis.com/youtube/v3/videos"

    for i in range(0, len(video_ids), 50):
        batch = video_ids[i:i+50]
        params = {
            "key": API_KEY,
            "id": ",".join(batch),
            "part": "snippet,statistics,contentDetails,topicDetails"
        }
        # Explicit timeout so a stalled connection cannot hang the run.
        response = requests.get(url, params=params, timeout=30).json()

        if "error" in response:
            error = response["error"]
            detail = error.get("message", error) if isinstance(error, dict) else error
            raise RuntimeError(
                f"YouTube API error while fetching video metadata: {detail}"
            )

        for item in response.get("items", []):
            record = _video_item_to_record(
                item, channel_title, channel_id, subscriber_count
            )
            if record is not None:
                video_data.append(record)

        time.sleep(0.1)  # brief pause between batch requests
    return video_data

def save_to_csv(records, filename):
    """Write the collected video rows to a UTF-8 CSV file and print a summary.

    Args:
        records: Sequence of row dicts keyed by the column names listed below.
        filename: Path of the CSV file to create or overwrite.
    """
    # Column order of the output file.
    columns = [
        "channel_title",
        "channel_id",
        "subscriber_count",
        "video_id",
        "video_url",
        "title",
        "description",
        "upload_date",
        "published_time",
        "view_count",
        "like_count",
        "comment_count",
        "duration_seconds",
        "definition",
        "caption",
        "topic_categories",
        "tags",
        "thumbnail_url",
        "licensed_content",
    ]
    # newline="" leaves line-ending handling to the csv module.
    with open(filename, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
        csv_writer.writeheader()
        for row in records:
            csv_writer.writerow(row)
    print(f"✅ Saved {len(records)} videos to {filename}")

def main():
    """Collect video metadata for every configured channel and save one combined CSV.

    For each ID in ``CHANNEL_IDS``: resolve the channel's title, uploads
    playlist and subscriber count, list the playlist's video IDs, fetch the
    per-video metadata, and print progress along the way. All rows are then
    written to ``OUTPUT_CSV`` in a single call.
    """
    combined_rows = []
    for cid in CHANNEL_IDS:
        print(f"\n🔎 Processing Channel ID: {cid}")
        title, uploads_playlist, subscribers = get_channel_title_and_uploads_playlist_id(cid)
        print(f"📺 Found channel: {title} | Subscribers: {subscribers}")

        print("🎯 Retrieving video IDs...")
        ids = get_all_video_ids(uploads_playlist)
        print(f"🎬 Found {len(ids)} total videos. Filtering from {START_DATE.date()}...")

        channel_rows = get_video_metadata(ids, title, cid, subscribers)
        print(f"✅ {len(channel_rows)} videos after date filter.")
        combined_rows += channel_rows

    print("\n💾 Saving combined data to CSV...")
    save_to_csv(combined_rows, OUTPUT_CSV)

# Entry point: run the full collection only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()



🔎 Processing Channel ID: UCJ5v_MCY6GNUBTO8-D3XoAg
📺 Found channel: WWE | Subscribers: 110000000
🎯 Retrieving video IDs...
🎬 Found 20000 total videos. Filtering from 2024-01-01...
✅ 13671 videos after date filter.

🔎 Processing Channel ID: UCFN4JkGP_bVhAdBsoV9xftA
📺 Found channel: All Elite Wrestling | Subscribers: 4560000
🎯 Retrieving video IDs...
🎬 Found 9508 total videos. Filtering from 2024-01-01...
✅ 3690 videos after date filter.

🔎 Processing Channel ID: UCOp8wkVqdrWbFYHjDv946QQ
📺 Found channel: TNA Wrestling | Subscribers: 5180000
🎯 Retrieving video IDs...
🎬 Found 4956 total videos. Filtering from 2024-01-01...
✅ 1351 videos after date filter.

💾 Saving combined data to CSV...
✅ Saved 18712 videos to raw_youtube.csv


In [None]:
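# NOTE(review): this looks like the planned column list for a cleaned/derived
# dataset — confirm. Kept as comments so the cell does not raise NameError
# when the notebook is run top to bottom.
#
# calendar_date
# calendar_day_name
# calendar_month_name
# calendar_quarter
# channel_id
# channel_name
# video_id
# video_name
# video_description
# video_url
# video_thumbnail_url
# video_published_timestamp_utc
# video_published_hour_utc
# video_duration_seconds
# video_views
# video_likes
# video_comments
# video_engagement_rate
# is_short
# is_full_match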
