In [3]:
import os
import random
import warnings
from collections import defaultdict

import pandas as pd
import requests


# The YouTube Data API key is read from the environment so the credential is
# never stored in the notebook itself. Set it before starting the kernel, e.g.
#   export YOUTUBE_API_KEY="..."
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# to everyone who can read the notebook and should be revoked and replaced.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not API_KEY:
    warnings.warn(
        "YOUTUBE_API_KEY is not set; requests to the YouTube Data API will fail."
    )

# Keyword passed to the YouTube search endpoint.
SEARCH_QUERY = "gq"


# Helper Functions for API Calls

def search_youtube(query, max_results=50, page_token=None, timeout=10):
    """
    Search YouTube for videos matching a keyword.

    Sends a single GET request to the YouTube Data API v3 ``search`` endpoint,
    restricted to ``type=video`` and asking for the ``snippet`` part.

    Args:
        query: Search term sent as the ``q`` parameter.
        max_results: Value sent as ``maxResults``.
        page_token: Optional token sent as ``pageToken``; left out of the
            request when falsy.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        "part": "snippet",
        "q": query,
        "type": "video",
        "maxResults": max_results,
        "key": API_KEY,
    }
    if page_token:
        params["pageToken"] = page_token
    response = requests.get(url, params=params, timeout=timeout)
    # Surface 4xx/5xx answers as exceptions instead of parsing an error body.
    response.raise_for_status()
    return response.json()

def get_video_stats(video_ids_batch, timeout=10):
    """
    Fetch statistics and snippet data for a batch of videos.

    Sends a single GET request to the YouTube Data API v3 ``videos`` endpoint
    with all ids joined by commas. Callers in this notebook pass at most 50
    ids per call.

    Args:
        video_ids_batch: Iterable of video id strings.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The list stored under ``"items"`` in the response, or an empty list
        when that key is absent or when no ids were supplied.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    ids = list(video_ids_batch)
    # Nothing to look up: skip the request rather than sending an empty id.
    if not ids:
        return []
    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        "part": "statistics,snippet",
        "id": ",".join(ids),
        "key": API_KEY,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Surface 4xx/5xx answers as exceptions instead of parsing an error body.
    response.raise_for_status()
    return response.json().get("items", [])

def get_channel_stats_batch(channel_ids_batch, timeout=10):
    """
    Fetch statistics and content details for a batch of channels.

    Sends a single GET request to the YouTube Data API v3 ``channels``
    endpoint with all ids joined by commas. Callers in this notebook pass at
    most 50 ids per call.

    Args:
        channel_ids_batch: Iterable of channel id strings.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The list stored under ``"items"`` in the response, or an empty list
        when that key is absent or when no ids were supplied.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    ids = list(channel_ids_batch)
    # Nothing to look up: skip the request rather than sending an empty id.
    if not ids:
        return []
    url = "https://www.googleapis.com/youtube/v3/channels"
    params = {
        "part": "statistics,contentDetails",
        "id": ",".join(ids),
        "key": API_KEY,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Surface 4xx/5xx answers as exceptions instead of parsing an error body.
    response.raise_for_status()
    return response.json().get("items", [])




# Run one keyword search (a single page, 50 results requested) and keep its hits.
result = search_youtube(SEARCH_QUERY, max_results=50)
videos = list(result.get("items", []))


# Look-up from each found video id to the id of the channel that owns it,
# plus the plain list of video ids in search-result order.
video_to_channel = dict(
    (hit["id"]["videoId"], hit["snippet"]["channelId"]) for hit in videos
)
video_ids = [hit["id"]["videoId"] for hit in videos]


# Fetch per-video statistics, sending the ids in slices of 50.
video_stats = []
for start in range(0, len(video_ids), 50):
    video_stats.extend(get_video_stats(video_ids[start:start + 50]))

# Reduce each API item to the handful of fields used further down.
video_details = {}
for entry in video_stats:
    video_id = entry["id"]
    counters = entry.get("statistics", {})
    view_total = int(counters.get("viewCount", 0))
    # A missing likeCount is recorded as None rather than 0.
    if "likeCount" in counters:
        like_total = int(counters["likeCount"])
    else:
        like_total = None
    video_details[video_id] = {
        "title": entry["snippet"].get("title"),
        "views": view_total,
        "likes": like_total,
        "channel_id": video_to_channel.get(video_id),
    }


# Collect the distinct channels behind the found videos.
unique_channel_ids = list(set(entry["channel_id"] for entry in video_details.values()))

# Fetch channel statistics, sending the ids in slices of 50.
channel_stats = []
for start in range(0, len(unique_channel_ids), 50):
    channel_stats.extend(get_channel_stats_batch(unique_channel_ids[start:start + 50]))

# Keep the total views, the video count and the uploads playlist id per channel.
channel_info = {}
for channel in channel_stats:
    channel_id = channel["id"]
    counters = channel.get("statistics", {})
    total_videos = int(counters.get("videoCount", 0))
    total_views = int(counters.get("viewCount", 0))
    playlists = channel.get("contentDetails", {}).get("relatedPlaylists", {})
    channel_info[channel_id] = {
        "viewCount": total_views,
        "videoCount": total_videos,
        "uploads_playlist_id": playlists.get("uploads"),
    }

# Tunable selection parameters (previously inline magic numbers).
MIN_CHANNEL_VIDEOS = 5   # a channel needs at least this many videos (inclusive)
SAMPLE_SIZE = 10         # number of videos to draw
RANDOM_SEED = 42         # change to draw a different, still repeatable, sample

# Keep only videos whose channel has at least MIN_CHANNEL_VIDEOS videos.
valid_channels = {
    cid for cid, info in channel_info.items()
    if info["videoCount"] >= MIN_CHANNEL_VIDEOS
}
filtered_video_details = {
    vid: details
    for vid, details in video_details.items()
    if details["channel_id"] in valid_channels
}

# Draw the sample from a dedicated seeded generator so that re-running the
# notebook on the same API response selects the same videos. When fewer than
# SAMPLE_SIZE videos are eligible, all of them are kept in their original order.
eligible_video_ids = list(filtered_video_details)
if len(eligible_video_ids) >= SAMPLE_SIZE:
    selected_video_ids = random.Random(RANDOM_SEED).sample(eligible_video_ids, SAMPLE_SIZE)
else:
    selected_video_ids = eligible_video_ids

selected_video_details = {vid: filtered_video_details[vid] for vid in selected_video_ids}


# Average views per video for every channel that owns a selected video,
# derived from the channel's total view count and total video count.
# Note: There is no direct aggregated like metric at the channel level.
channel_all_stats = {}
selected_channel_ids = {entry["channel_id"] for entry in selected_video_details.values()}
for channel_id in selected_channel_ids:
    info = channel_info.get(channel_id)
    # Channels without stats, or with zero videos, get no entry (avoids a division by zero).
    if info and info["videoCount"] != 0:
        channel_all_stats[channel_id] = {
            "avg_views_all": info["viewCount"] / info["videoCount"],
            "channel_total_videos": info["videoCount"],
        }


# Build one row per selected video, joining in the owning channel's summary.
base_url = "https://www.youtube.com/watch?v="
df_data = []
for video_id, video in selected_video_details.items():
    owner_id = video["channel_id"]
    # Channels without a summary yield None in both channel columns.
    channel_summary = channel_all_stats.get(owner_id, {})
    row = {
        "video_url": base_url + video_id,
        "title": video["title"],
        "views": video["views"],
        "likes": video["likes"],
        "channel_id": owner_id,
        "channel_total_videos": channel_summary.get("channel_total_videos"),
        "channel_avg_views_all": channel_summary.get("avg_views_all"),
    }
    df_data.append(row)

df = pd.DataFrame(df_data)

print(df)


                                     video_url  \
0  https://www.youtube.com/watch?v=nV7ekK30-e8   
1  https://www.youtube.com/watch?v=YcmuTkRpbG4   
2  https://www.youtube.com/watch?v=GiWRiLkrl6E   
3  https://www.youtube.com/watch?v=T4Hz34G381M   
4  https://www.youtube.com/watch?v=wPITlS9DEqU   
5  https://www.youtube.com/watch?v=T1ec0LKl7CY   
6  https://www.youtube.com/watch?v=3D1hn3jLO6Q   
7  https://www.youtube.com/watch?v=WaoL6cGOOMQ   
8  https://www.youtube.com/watch?v=ZKUcXCy9dZE   
9  https://www.youtube.com/watch?v=1fGo0zkl56g   

                                               title    views   likes  \
0       10 Things Kevin Hart Can't Live Without | GQ  5749310  154616   
1  🔴HOJE É O DIA DO LARANJINHA E A G.Q VAI TER UM...      202      25   
2           10 Things Druski Can't Live Without | GQ  1129028   26380   
3  BigXthaPlug Shows Off His $1,200,000 Jewelry C...    61853    1906   
4         GQ Disco Nights Rock Freak 1979 ((Stereo))  1643703   23338   
5  10 Thing

In [4]:
from google.colab import files

# Write the table to a CSV file in the working directory, then pass that file
# to Colab's download helper.
csv_name = 'yt_10gq.csv'
df.to_csv(csv_name)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>