# Data Collection

## Channel Metadata

In [1]:
import os
import warnings
from datetime import datetime, timedelta

import pandas as pd
from googleapiclient.discovery import build

In [2]:
# Read the YouTube Data API key from the environment rather than storing it
# in the notebook, where it is exposed to everyone who can open the file.
# A key that has already been saved in a shared notebook should be revoked
# and replaced in the Google Cloud console.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    warnings.warn(
        "YOUTUBE_API_KEY is not set; requests to the YouTube Data API are "
        "expected to fail until a key is provided."
    )

In [3]:
# ID of the YouTube channel whose data this notebook collects.
channel_id = "UC7cs8q-gJRlGwj4A8OmCmXg"

In [4]:
# Client for version 3 of the YouTube API, authenticated with the developer key.
youtube = build(
    serviceName="youtube",
    version="v3",
    developerKey=api_key,
)

In [5]:
def get_channel_metadata(youtube, channel_id):
    """Fetch descriptive metadata and headline statistics for one channel.

    Args:
        youtube: YouTube Data API v3 client, as returned by ``build``.
        channel_id: ID of the channel to look up.

    Returns:
        dict with the keys ``Channel_name``, ``Subscribers``, ``Views``,
        ``PublishedAt``, ``Country``, ``Total_videos`` and ``playlist_id``
        (the channel's ``relatedPlaylists.uploads`` value). Values are passed
        through exactly as the API returns them. ``Country`` is ``"N/A"`` and
        the three counters are ``None`` when the response omits them.

    Raises:
        ValueError: If the response contains no channel for ``channel_id``.
    """
    response = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=channel_id,
    ).execute()

    # An unknown ID yields a response without usable items; fail with a
    # message that names the ID instead of a bare KeyError/IndexError.
    items = response.get("items") or []
    if not items:
        raise ValueError(f"No channel found for channel_id {channel_id!r}")

    item = items[0]
    snippet = item["snippet"]
    # Counters are read with .get() so that a statistic missing from the
    # response does not abort the whole lookup.
    statistics = item.get("statistics", {})

    return {
        "Channel_name": snippet["title"],
        "Subscribers": statistics.get("subscriberCount"),
        "Views": statistics.get("viewCount"),
        "PublishedAt": snippet["publishedAt"],
        "Country": snippet.get("country", "N/A"),
        "Total_videos": statistics.get("videoCount"),
        "playlist_id": item["contentDetails"]["relatedPlaylists"]["uploads"],
    }

In [6]:
# Wrap the metadata dict in a list so that it becomes a single-row DataFrame.
channel_metadata = get_channel_metadata(youtube=youtube, channel_id=channel_id)
channel_df = pd.DataFrame(data=[channel_metadata])

In [7]:
# Display the single-row channel metadata table.
channel_df

Unnamed: 0,Channel_name,Subscribers,Views,PublishedAt,Country,Total_videos,playlist_id
0,Alex The Analyst,1130000,53403017,2020-01-08T05:04:24.970712Z,US,386,UU7cs8q-gJRlGwj4A8OmCmXg


In [8]:
# Persist the channel metadata; the DataFrame index is not written.
channel_df.to_csv(path_or_buf="metadata.csv", index=False)

## Video Data

In [9]:
pip install isodate

Collecting isodate
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate
Successfully installed isodate-0.7.2


In [10]:
from datetime import datetime, timedelta
import isodate

In [11]:
def get_all_videos_with_stats(youtube, playlist_id):
    """Collect every video of a playlist together with its statistics and duration.

    The playlist is walked page by page (50 entries per request). Statistics
    and durations for a page are fetched with a single ``videos().list`` call
    that receives all IDs of that page as a comma-separated list, rather than
    one request per video.

    Args:
        youtube: YouTube Data API v3 client, as returned by ``build``.
        playlist_id: ID of the playlist to walk.

    Returns:
        pandas.DataFrame with one row per video, in playlist order, and the
        columns ``video_id``, ``video_title``, ``published_at`` (the playlist
        item's ``publishedAt`` value), ``view_count``, ``like_count``,
        ``comment_count`` and ``duration_seconds``. Counters missing from the
        response are reported as 0. Playlist entries for which
        ``videos().list`` returns no data are skipped. The columns are present
        even when the playlist yields no rows.
    """
    columns = [
        "video_id",
        "video_title",
        "published_at",
        "view_count",
        "like_count",
        "comment_count",
        "duration_seconds",
    ]
    videos = []
    next_page_token = None

    while True:
        response = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token,
        ).execute()

        page_items = response.get("items", [])
        page_video_ids = [
            item["snippet"]["resourceId"]["videoId"] for item in page_items
        ]

        # One batched lookup per page; results are keyed by ID because the
        # rows below are emitted in playlist order, not response order.
        details_by_id = {}
        if page_video_ids:
            stats_response = youtube.videos().list(
                part="statistics,contentDetails",
                id=",".join(page_video_ids),
            ).execute()
            details_by_id = {
                video["id"]: video for video in stats_response.get("items", [])
            }

        for item in page_items:
            snippet = item["snippet"]
            video_id = snippet["resourceId"]["videoId"]

            video_data = details_by_id.get(video_id)
            if video_data is None:
                # The playlist still lists the entry, but no video data came
                # back for it; skip it instead of failing the whole crawl.
                continue

            stats = video_data.get("statistics", {})
            duration_iso = video_data["contentDetails"]["duration"]
            duration_seconds = isodate.parse_duration(duration_iso).total_seconds()

            videos.append({
                "video_id": video_id,
                "video_title": snippet["title"],
                "published_at": snippet["publishedAt"],
                "view_count": int(stats.get("viewCount", 0)),
                "like_count": int(stats.get("likeCount", 0)),
                "comment_count": int(stats.get("commentCount", 0)),
                "duration_seconds": duration_seconds,
            })

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return pd.DataFrame(videos, columns=columns)

In [12]:
# The playlist ID is taken from the first row of the channel table.
playlist_id = channel_df["playlist_id"].iat[0]
df_videos = get_all_videos_with_stats(youtube=youtube, playlist_id=playlist_id)

In [13]:
# Preview the first five collected videos.
df_videos.head(n=5)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds
0,kk5zEOQzTmQ,Data Visualization and Presentation in R | R f...,2025-08-19T12:01:22Z,1860,61,5,1264.0
1,yhlqKsYpzgE,Alex The Analyst Q/A Livestream | Come Ask Me ...,2025-08-18T01:01:35Z,0,8,0,0.0
2,TP2OJuZhbIQ,Things I Learned as a Data Analyst p1,2025-08-15T11:46:04Z,5823,203,12,38.0
3,Mi8st3hyMH8,Alex The Analyst Q/A Livestream | Come Ask Me ...,2025-08-14T14:18:24Z,2777,109,8,3967.0
4,vAKs1-EEJ38,How to Remove Duplicates in an R Dataframe | R...,2025-08-12T12:01:49Z,1752,59,2,488.0


In [14]:
# Persist the per-video table; the DataFrame index is not written.
df_videos.to_csv(path_or_buf="videos.csv", index=False)

In [15]:
from googleapiclient.errors import HttpError
import pandas as pd
import time

def get_video_comments(youtube, video_id, max_comments=100, max_retries=3, retry_delay=5):
    """
    Fetch up to `max_comments` top-level comments for a given video_id.

    Args:
        youtube: YouTube Data API v3 client, as returned by ``build``.
        video_id: ID of the video whose comment threads are read.
        max_comments: Upper bound on the number of comments returned. Values
            of 0 or less return an empty list without calling the API.
        max_retries: How many times a failed request is retried in a row
            before the video is abandoned.
        retry_delay: Seconds to wait before each retry.

    Returns:
        A list of dictionaries with the keys ``video_id`` and ``comment``.
        Whatever was collected before an abandoned request is still returned.

    Requests answered with HTTP 403 or 404 are not retried; the video is
    skipped with a message.
    """
    comments = []
    if max_comments <= 0:
        return comments

    next_page_token = None
    failed_attempts = 0

    while len(comments) < max_comments:
        remaining = max_comments - len(comments)
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                # Never ask for more than is still needed (100 per page at most).
                maxResults=min(100, remaining),
                pageToken=next_page_token,
                textFormat="plainText",
            ).execute()
        except HttpError as e:
            if e.resp.status in [403, 404]:
                # NOTE(review): a 403 can presumably also mean an exhausted API
                # quota, which would be reported as "comments disabled" here
                # too. Confirm and distinguish if silent gaps are unacceptable.
                print(f"Skipping video {video_id} (comments disabled or unavailable)")
                break

            print(f"Error for video {video_id}: {e}")
            failed_attempts += 1
            if failed_attempts > max_retries:
                # Bound the retries so a persistent error cannot hang the run.
                print(f"Giving up on video {video_id} after {max_retries} retries")
                break
            time.sleep(retry_delay)  # wait a bit before retrying
            continue

        failed_attempts = 0  # only consecutive failures count towards the limit

        # Slice defensively in case a page holds more items than requested.
        for item in response.get("items", [])[:remaining]:
            comment_text = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append({
                "video_id": video_id,
                "comment": comment_text
            })

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return comments

# Gather up to 100 top-level comments for every collected video.
all_comments = []
total_videos = len(df_videos)

# Enumerate the ID column directly: the progress counter then does not depend
# on df_videos having a default 0..n-1 index, and no per-row Series is built.
for position, vid in enumerate(df_videos["video_id"], start=1):
    print(f"Fetching comments for video {vid} ({position}/{total_videos})")
    video_comments = get_video_comments(youtube, vid, max_comments=100)
    all_comments.extend(video_comments)

# Convert to DataFrame; naming the columns keeps the CSV header intact even
# when no comments were collected at all.
df_comments = pd.DataFrame(all_comments, columns=["video_id", "comment"])

# Save to CSV
df_comments.to_csv("video_comments.csv", index=False)
print("Comments extraction completed!")



Fetching comments for video kk5zEOQzTmQ (1/387)
Fetching comments for video yhlqKsYpzgE (2/387)
Skipping video yhlqKsYpzgE (comments disabled or unavailable)
Fetching comments for video TP2OJuZhbIQ (3/387)
Fetching comments for video Mi8st3hyMH8 (4/387)
Fetching comments for video vAKs1-EEJ38 (5/387)
Fetching comments for video -Z2KjOUcvTA (6/387)
Fetching comments for video v5jasYkmwZI (7/387)
Fetching comments for video kqhJff8ruKw (8/387)
Fetching comments for video QR8kxf925IQ (9/387)
Fetching comments for video klKWFqsqFeI (10/387)
Fetching comments for video XIWe4vCtHcY (11/387)
Fetching comments for video PqCp1f0RIaU (12/387)
Fetching comments for video 0B-lz9VQkYs (13/387)
Fetching comments for video _YG8b1g-QJA (14/387)
Fetching comments for video UT_hTGge_wc (15/387)
Fetching comments for video A42T_b0gwF4 (16/387)
Fetching comments for video 8NfozskpBtY (17/387)
Fetching comments for video lpK71OwmG0s (18/387)
Fetching comments for video QC1Jased-PA (19/387)
Fetching comment