In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
from collections import defaultdict

import pandas as pd
import requests

# The YouTube Data API key is read from the YOUTUBE_API_KEY environment
# variable so the credential is never stored in (or shared with) the notebook.
# NOTE(review): a real key was previously hardcoded on this line; it should be
# revoked/rotated. With the variable unset the key is an empty string and the
# API requests below are expected to be rejected with an HTTP error.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
SEARCH_QUERY = "interview"

In [None]:
def search_youtube(query, max_results=50, page_token=None, timeout=30):
    """Run one YouTube Data API v3 ``search`` request restricted to videos.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        max_results: Value sent as ``maxResults`` (requested page size).
        page_token: Optional ``pageToken`` for fetching a later page; left out
            of the request when falsy.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://www.googleapis.com/youtube/v3/search"
    params = {
        "part": "snippet",
        "q": query,
        "type": "video",
        "maxResults": max_results,
        "key": API_KEY,
    }
    if page_token:
        params["pageToken"] = page_token
    # Without an explicit timeout requests waits forever on a dead connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()


# Single search request: only the first page of results is collected.
next_page_token = None
result = search_youtube(SEARCH_QUERY, max_results=50, page_token=next_page_token)
videos = list(result.get("items", []))


# Video ids in result order, plus a video id -> channel id lookup
video_ids = [entry["id"]["videoId"] for entry in videos]
video_to_channel = dict(
    zip(video_ids, (entry["snippet"]["channelId"] for entry in videos))
)

# fetch vid stats
def get_video_stats(video_ids_batch, timeout=30):
    """Fetch ``statistics`` and ``snippet`` for a batch of video ids.

    Args:
        video_ids_batch: Sequence of video id strings; they are comma-joined
            into the ``id`` parameter of a single ``videos`` request.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook indefinitely.

    Returns:
        The list under ``items`` in the response, or ``[]`` when that key is
        absent or when the batch is empty (no request is made in that case).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # An empty batch would send id="" to the API; skip the round trip instead.
    if not video_ids_batch:
        return []

    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        "part": "statistics,snippet",
        "id": ",".join(video_ids_batch),
        "key": API_KEY,
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json().get("items", [])

# Request statistics for all collected videos, 50 ids per call
video_stats = []
for start in range(0, len(video_ids), 50):
    video_stats += get_video_stats(video_ids[start:start + 50])

# Build video id -> {title, views, likes, channel_id}
video_details = {}
for entry in video_stats:
    vid = entry["id"]
    stats = entry.get("statistics", {})
    views = int(stats.get("viewCount", 0))
    # A missing likeCount is recorded as None rather than 0
    if "likeCount" in stats:
        likes = int(stats["likeCount"])
    else:
        likes = None
    record = {
        "title": entry["snippet"].get("title"),
        "views": views,
        "likes": likes,
        "channel_id": video_to_channel.get(vid),
    }
    video_details[vid] = record

In [3]:
# Distinct channel ids across the collected videos
unique_channel_ids = list(set(info["channel_id"] for info in video_details.values()))

def get_channel_stats_batch(channel_ids_batch, timeout=30):
    """Fetch ``statistics`` and ``snippet`` for a batch of channel ids.

    Args:
        channel_ids_batch: Sequence of channel id strings; they are
            comma-joined into the ``id`` parameter of a single ``channels``
            request.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook indefinitely.

    Returns:
        The list under ``items`` in the response, or ``[]`` when that key is
        absent or when the batch is empty (no request is made in that case).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # An empty batch would send id="" to the API; skip the round trip instead.
    if not channel_ids_batch:
        return []

    url = "https://www.googleapis.com/youtube/v3/channels"
    params = {
        "part": "statistics,snippet",
        "id": ",".join(channel_ids_batch),
        "key": API_KEY,
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json().get("items", [])

# Request channel data for all distinct channels, 50 ids per call
channel_stats = []
for start in range(0, len(unique_channel_ids), 50):
    channel_stats += get_channel_stats_batch(unique_channel_ids[start:start + 50])

# Channel id -> total uploaded videos (0 when videoCount is missing)
channel_total_videos = {
    channel["id"]: int(channel.get("statistics", {}).get("videoCount", 0))
    for channel in channel_stats
}

# Keep only videos whose channel has uploaded at least 5 videos
valid_channels = {
    cid for cid in channel_total_videos if channel_total_videos[cid] >= 5
}
filtered_video_details = dict(
    (vid, details)
    for vid, details in video_details.items()
    if details["channel_id"] in valid_channels
)

# Pick 10 videos at random, or take all of them when fewer than 10 remain
candidate_ids = list(filtered_video_details)
selected_video_ids = (
    random.sample(candidate_ids, 10) if len(candidate_ids) >= 10 else candidate_ids
)

selected_video_details = {vid: filtered_video_details[vid] for vid in selected_video_ids}

# Per-channel averages over the sampled videos.
# Group the selected videos by the channel that published them.
channel_video_data = defaultdict(list)
for vid, details in selected_video_details.items():
    channel_video_data[details["channel_id"]].append(details)

channel_averages = {}
for channel_id, vids in channel_video_data.items():
    # count is always >= 1: a channel only gets an entry when a video is appended.
    count = len(vids)
    total_views = sum(video["views"] for video in vids)
    # Average likes only over videos whose like count is known. Dividing by
    # `count` would let videos with likes=None drag the average down, and a
    # genuine total of 0 likes must yield 0.0 rather than None.
    known_likes = [video["likes"] for video in vids if video["likes"] is not None]
    avg_likes = sum(known_likes) / len(known_likes) if known_likes else None
    channel_averages[channel_id] = {
        "avg_views": total_views / count,
        "avg_likes": avg_likes,
        "num_videos_sample": count
    }

                                     video_url  \
0  https://www.youtube.com/watch?v=dxOyNqKKwr0   
1  https://www.youtube.com/watch?v=b92PfcokiYI   
2  https://www.youtube.com/watch?v=2CM24AT68Bc   
3  https://www.youtube.com/watch?v=-OtTOYp7oLg   
4  https://www.youtube.com/watch?v=vMwz5doRUQI   
5  https://www.youtube.com/watch?v=zLnX1SQfgJI   
6  https://www.youtube.com/watch?v=OfLc1PJIKIk   
7  https://www.youtube.com/watch?v=WFUiewIkjvg   
8  https://www.youtube.com/watch?v=8smGAJJl79k   
9  https://www.youtube.com/watch?v=wG1xwUlVTyM   

                                               title     views    likes  \
0  Job interviews of Kalyug 🤦🏻😭 | Kuldeep Singhan...  12438706   684130   
1  LIL Durk Realest Interview Ever. Last Intervie...   1112508    13810   
2  Playboi Carti's DJ Swamp Izzo Tells His Life S...      9800      303   
3  DJU Baby Splat Interview:  What he saw in Big ...    217009     3912   
4  EBK Jaaybo on His Brother’s Influence 🕊️ via: ...    396504    20045   

In [4]:
# Assemble one row per selected video, joined with its channel-level figures
base_url = "https://www.youtube.com/watch?v="
df_data = []
for vid, details in selected_video_details.items():
    cid = details["channel_id"]
    # Empty dict when the channel has no computed averages, so lookups give None
    channel_summary = channel_averages.get(cid, {})
    row = {
        "video_url": f"{base_url}{vid}",
        "title": details["title"],
        "views": details["views"],
        "likes": details["likes"],
        "channel_id": cid,
        "channel_total_videos": channel_total_videos.get(cid),
        "channel_avg_views_sample": channel_summary.get("avg_views"),
        "channel_avg_likes_sample": channel_summary.get("avg_likes"),
        "channel_num_videos_sample": channel_summary.get("num_videos_sample"),
    }
    df_data.append(row)

df = pd.DataFrame(df_data)

#df.to_csv('/content/drive/MyDrive/MLiAS/Data/interview_10yt.csv', index=False)