In [1]:
import os
from typing import Optional, List, Dict
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import html

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import nltk
from nltk.corpus import stopwords

def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Used in this notebook as a no-op replacement that is assigned to
    ``warnings.warn``.
    """
    return None

# Globally silence warnings for the rest of the session.
import warnings
# Monkeypatch: every later call to warnings.warn(...) now hits the no-op `warn`.
# NOTE(review): this also hides deprecation/runtime warnings from every library
# loaded afterwards — confirm that is intended before reusing this notebook.
warnings.warn = warn
# Also install an "ignore" filter for warnings raised through the normal machinery.
warnings.filterwarnings('ignore')

# Fungsi Scraping

In [None]:
def _comment_row(video_id, comment_id, parent_id, snippet, kind):
    """Build one output row from a comment ``snippet``; return None if its text is blank."""
    # `or ""` guards against a present-but-null textDisplay value.
    text = html.unescape(snippet.get("textDisplay") or "").strip()
    if not text:
        return None
    return {
        "video_id": video_id,
        "comment_id": comment_id,
        "parent_id": parent_id,
        "publishedAt": snippet.get("publishedAt"),
        "author": snippet.get("authorDisplayName"),
        "text": text,
        "likeCount": snippet.get("likeCount", 0),
        "type": kind,
    }


def video_comments(
    video_id: str,
    api_key: Optional[str] = None,
    max_pages: Optional[int] = None,
    max_comments: Optional[int] = None,
    verbose: bool = True
) -> pd.DataFrame:
    """Download the comment threads of a YouTube video into a DataFrame.

    Pages through ``commentThreads().list`` (100 threads per page) and emits
    one row per top-level comment (``type == "main"``) and one row per reply
    embedded in the thread (``type == "reply"``, ``parent_id`` set to the
    top-level comment id). Comments whose text is blank are skipped.

    NOTE(review): the replies embedded in a comment thread may be only a
    subset of all replies; fetching every reply would need a separate
    ``comments().list`` call per thread — confirm against the API reference.

    Args:
        video_id: Id of the video whose comments are fetched.
        api_key: API key; falls back to the ``YOUTUBE_API_KEY`` env var.
        max_pages: Stop after this many result pages (falsy = no limit).
        max_comments: Return at most this many rows (falsy = no limit).
        verbose: Print progress messages.

    Returns:
        DataFrame with columns ``video_id, comment_id, parent_id, publishedAt,
        author, text, likeCount, type``. The columns are present even when no
        comment was collected.

    Raises:
        ValueError: If no API key is given and the env var is unset.

    An ``HttpError`` raised while fetching is reported with ``print`` and the
    rows collected so far are returned (best-effort behaviour).
    """
    api_key = api_key or os.getenv("YOUTUBE_API_KEY")
    if not api_key:
        raise ValueError("❌ API key tidak ditemukan. Set env var YOUTUBE_API_KEY atau kirim via parameter.")

    # Fixed schema so an empty result still has the expected columns.
    columns = [
        "video_id", "comment_id", "parent_id", "publishedAt",
        "author", "text", "likeCount", "type",
    ]

    youtube = build("youtube", "v3", developerKey=api_key)
    comments: List[Dict] = []
    page_count = 0

    request = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=video_id,
        maxResults=100,
        textFormat="plainText",
    )

    try:
        # list_next() yields None once there is no further page.
        while request is not None:
            response = request.execute()
            page_count += 1
            if verbose:
                print(f"[INFO] Halaman ke-{page_count} berhasil diambil.")

            for item in response.get("items", []):
                top_level = item["snippet"]["topLevelComment"]
                top_comment_id = top_level["id"]

                rows = [_comment_row(video_id, top_comment_id, None, top_level["snippet"], "main")]
                rows.extend(
                    _comment_row(video_id, reply["id"], top_comment_id, reply["snippet"], "reply")
                    for reply in item.get("replies", {}).get("comments", [])
                )
                comments.extend(row for row in rows if row is not None)

                if max_comments and len(comments) >= max_comments:
                    if verbose:
                        print(f"[INFO] Batas {max_comments} komentar tercapai.")
                    break

            if max_pages and page_count >= max_pages:
                if verbose:
                    print(f"[INFO] Batas {max_pages} halaman tercapai.")
                break

            if max_comments and len(comments) >= max_comments:
                break

            request = youtube.commentThreads().list_next(request, response)

    except HttpError as e:
        print(f"[ERROR] Gagal mengambil komentar: {e}")

    # A whole thread (top comment + replies) is appended before the limit is
    # checked, so trim the overshoot to honour max_comments exactly.
    if max_comments and max_comments > 0:
        del comments[max_comments:]

    return pd.DataFrame(comments, columns=columns)


In [3]:
if __name__ == "__main__":
    # Video whose comments are scraped.
    video_id = "SzXMacu80o8"
    # SECURITY: never commit an API key in the notebook. The key is read from
    # the YOUTUBE_API_KEY environment variable; video_comments raises
    # ValueError if it is missing. Any key previously committed here should be
    # revoked and regenerated.
    api_key = os.getenv("YOUTUBE_API_KEY")
    df = video_comments(video_id, api_key=api_key, max_comments=57000)

[INFO] Halaman ke-1 berhasil diambil.
[INFO] Halaman ke-2 berhasil diambil.
[INFO] Halaman ke-3 berhasil diambil.
[INFO] Halaman ke-4 berhasil diambil.
[INFO] Halaman ke-5 berhasil diambil.
[INFO] Halaman ke-6 berhasil diambil.
[INFO] Halaman ke-7 berhasil diambil.
[INFO] Halaman ke-8 berhasil diambil.
[INFO] Halaman ke-9 berhasil diambil.
[INFO] Halaman ke-10 berhasil diambil.
[INFO] Halaman ke-11 berhasil diambil.
[INFO] Halaman ke-12 berhasil diambil.
[INFO] Halaman ke-13 berhasil diambil.
[INFO] Halaman ke-14 berhasil diambil.
[INFO] Halaman ke-15 berhasil diambil.
[INFO] Halaman ke-16 berhasil diambil.
[INFO] Halaman ke-17 berhasil diambil.
[INFO] Halaman ke-18 berhasil diambil.
[INFO] Halaman ke-19 berhasil diambil.
[INFO] Halaman ke-20 berhasil diambil.
[INFO] Halaman ke-21 berhasil diambil.
[INFO] Halaman ke-22 berhasil diambil.
[INFO] Halaman ke-23 berhasil diambil.
[INFO] Halaman ke-24 berhasil diambil.
[INFO] Halaman ke-25 berhasil diambil.
[INFO] Halaman ke-26 berhasil diam

In [4]:
# Display the scraped comments (rich DataFrame preview as the cell's last expression).
df

Unnamed: 0,video_id,comment_id,parent_id,publishedAt,author,text,likeCount,type
0,SzXMacu80o8,UgwtjWGbp-W5rRC7XFh4AaABAg,,2025-07-23T14:04:38Z,@gilangramadhan2568,halo mas fufufafa😅,2,main
1,SzXMacu80o8,Ugx1aL4EL74ON5JtMvZ4AaABAg,,2025-07-23T08:05:32Z,@zzkyy,Itu yang like pasti pendukung nya 😂😂,0,main
2,SzXMacu80o8,UgzMTHngVJKpQDXJTpx4AaABAg,,2025-07-22T21:35:25Z,@pecintarasulullahhrsdanfpi7857,jangan lupakan kasus fufufafa,1,main
3,SzXMacu80o8,UgyGDOBG4vn-Rpa_HTJ4AaABAg,,2025-07-22T14:10:45Z,@WahyuNyonyo,Jelasinnya yang panjang dong,0,main
4,SzXMacu80o8,Ugz8eHWr87npc9BsBgF4AaABAg,,2025-07-22T12:04:40Z,@VernoeAldee,Apsih,0,main
...,...,...,...,...,...,...,...,...
48791,SzXMacu80o8,UgzZVZOY_NxJHC6LRgp4AaABAg,,2025-04-19T01:18:00Z,@praburevolusiofc,🇮🇩🇮🇩🇮🇩,1,main
48792,SzXMacu80o8,UgzFvvO6MlPnemTUFbZ4AaABAg,,2025-04-19T01:13:42Z,@evarianas9689,Sehat dan sukses selalu Pak Wapres Gibran.. Su...,3,main
48793,SzXMacu80o8,UgxGusRnAqasGvPMFXx4AaABAg,,2025-04-19T01:12:38Z,@ivana7,Mantap 👏 siap Pak Wapres 🫡,1,main
48794,SzXMacu80o8,Ugwr2XyJFJVp6KurgPF4AaABAg,,2025-04-19T01:05:46Z,@MasAgus_007,Mantap 👍👍,1,main


In [5]:
# Keep only the columns used downstream and persist them to CSV so later
# cells can start from the file instead of re-running the scraper.
export_columns = ['publishedAt', 'author', 'text', 'likeCount']
df_asli = pd.DataFrame(data=df, columns=export_columns)
df_asli.to_csv('gibran_yt_bonusdemografi.csv', index=False)

# Pembersihan Data

In [2]:
# Reload the scraped comments from the CSV file so the cleaning steps do not
# depend on the scraper having been run in this session.
df_scrap = pd.read_csv('gibran_yt_bonusdemografi.csv')
# Show the loaded frame.
df_scrap

Unnamed: 0,publishedAt,author,text,likeCount
0,2025-07-23T14:04:38Z,@gilangramadhan2568,halo mas fufufafa😅,2
1,2025-07-23T08:05:32Z,@zzkyy,Itu yang like pasti pendukung nya 😂😂,0
2,2025-07-22T21:35:25Z,@pecintarasulullahhrsdanfpi7857,jangan lupakan kasus fufufafa,1
3,2025-07-22T14:10:45Z,@WahyuNyonyo,Jelasinnya yang panjang dong,0
4,2025-07-22T12:04:40Z,@VernoeAldee,Apsih,0
...,...,...,...,...
48791,2025-04-19T01:18:00Z,@praburevolusiofc,🇮🇩🇮🇩🇮🇩,1
48792,2025-04-19T01:13:42Z,@evarianas9689,Sehat dan sukses selalu Pak Wapres Gibran.. Su...,3
48793,2025-04-19T01:12:38Z,@ivana7,Mantap 👏 siap Pak Wapres 🫡,1
48794,2025-04-19T01:05:46Z,@MasAgus_007,Mantap 👍👍,1


In [3]:
# Build the modelling frame: drop exact duplicate rows and rows with any
# missing value (both evaluated across all loaded columns), then keep only
# the comment text under the name 'komentar'.
df_model = (
    df_scrap
    .drop_duplicates()
    .dropna()
    .drop(columns=['publishedAt', 'author', 'likeCount'])
    .rename(columns={'text': 'komentar'})
)
df_model

Unnamed: 0,komentar
0,halo mas fufufafa😅
1,Itu yang like pasti pendukung nya 😂😂
2,jangan lupakan kasus fufufafa
3,Jelasinnya yang panjang dong
4,Apsih
...,...
48791,🇮🇩🇮🇩🇮🇩
48792,Sehat dan sukses selalu Pak Wapres Gibran.. Su...
48793,Mantap 👏 siap Pak Wapres 🫡
48794,Mantap 👍👍


In [None]:
import re
import string
import emoji
import unicodedata

def clean_text_id(text):
    """Normalise one comment into lowercase ASCII text without noise.

    Steps, in order:
      1. remove emoji;
      2. NFKD-normalise and drop every non-ASCII character;
      3. lowercase;
      4. remove URLs (``http...``, ``www....``) and ``@mentions``;
      5. remove ``<...>`` tag-like spans;
      6. remove ASCII punctuation;
      7. collapse repeated laughter tokens (``wkwk`` -> ``wk``,
         ``haha`` -> ``ha``, ``hehe`` -> ``he``);
      8. collapse whitespace runs and trim.

    Normalisation runs before lowercasing and punctuation stripping because
    NFKD can itself produce uppercase ASCII letters (stylised letters such as
    "𝐌" -> "M") and ASCII punctuation ("…" -> "..."); doing it afterwards
    would leave those in the output.

    Args:
        text: Raw comment. ``None``/NaN yields ``''``; other non-string
            values are converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values (None / float NaN) have nothing to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = emoji.replace_emoji(text, replace='')
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = text.lower()
    # The dot after "www" is escaped so only real "www." prefixes match.
    text = re.sub(r'http\S+|www\.\S+|@\w+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    for token in ('wk', 'ha', 'he'):
        text = re.sub(f'({token})+', token, text)
    return re.sub(r'\s+', ' ', text).strip()

In [5]:
# Add the cleaned version of every comment as a new column (mutates df_model in place).
df_model['komentar_clean'] = df_model['komentar'].apply(clean_text_id)
# Show raw and cleaned text side by side for the first rows as a sanity check.
df_model[['komentar', 'komentar_clean']].head()

Unnamed: 0,komentar,komentar_clean
0,halo mas fufufafa😅,halo mas fufufafa
1,Itu yang like pasti pendukung nya 😂😂,itu yang like pasti pendukung nya
2,jangan lupakan kasus fufufafa,jangan lupakan kasus fufufafa
3,Jelasinnya yang panjang dong,jelasinnya yang panjang dong
4,Apsih,apsih


In [6]:
# Keep only the cleaned text for modelling.
# NOTE(review): this cell rebinds df_model from itself, so re-running it without
# re-running the cells above raises KeyError ('komentar' is already gone).
# NOTE(review): cleaning can leave empty strings (e.g. emoji-only comments);
# they are not removed here — confirm whether downstream steps filter them.
df_model = df_model.drop(columns=['komentar'])
df_model

Unnamed: 0,komentar_clean
0,halo mas fufufafa
1,itu yang like pasti pendukung nya
2,jangan lupakan kasus fufufafa
3,jelasinnya yang panjang dong
4,apsih
...,...
48791,
48792,sehat dan sukses selalu pak wapres gibran suks...
48793,mantap siap pak wapres
48794,mantap
