In [None]:
# End-to-end pipeline for YouTube comment crawling, preprocessing, topic modeling (BERTopic), and sentiment/emotion analysis.
!pip install youtube-comment-downloader bertopic transformers nltk
import nltk
nltk.download("stopwords")

In [1]:
# --- Crawl YouTube Comments ---
from itertools import islice
from youtube_comment_downloader import YoutubeCommentDownloader, SORT_BY_POPULAR
import json

def crawl_comments(url: str, output_file: str = "ENDEVR.json", limit: "int | None" = None):
    """Download YouTube comments and save them in JSON Lines format.

    Args:
        url: URL of the YouTube video whose comments are fetched.
        output_file: Destination path. The file is overwritten and receives
            one JSON document per line, encoded as UTF-8.
        limit: Maximum number of comments to keep, taken in the order the
            downloader yields them (requested with ``SORT_BY_POPULAR``).
            ``None`` means no cap; ``0`` produces an empty file.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    # Validate before any network access so a bad argument fails fast.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be None or a non-negative integer, got {limit!r}")

    downloader = YoutubeCommentDownloader()
    comments = downloader.get_comments_from_url(url, sort_by=SORT_BY_POPULAR)

    # Compare against None explicitly: a truthiness check would treat
    # limit=0 as "no limit" and download every comment.
    if limit is not None:
        comments = islice(comments, limit)

    # Materialize before opening the output so that a failed download does
    # not truncate a previously written file.
    comments_list = list(comments)

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(comment, ensure_ascii=False) + "\n"
            for comment in comments_list
        )

    print(f"✅ Saved {len(comments_list)} comments to '{output_file}' (JSON Lines format)")

# Alternative video URL, kept disabled (presumably the source behind the
# "ENDEVR.json" default of crawl_comments; confirm before re-enabling).
# URL = "https://www.youtube.com/watch?v=LTduwK0-sGI"
URL = "https://www.youtube.com/watch?v=5udOx8-QxtE"
# Live download with no comment limit; overwrites GUARDIAN.json on every run.
crawl_comments(URL, output_file="GUARDIAN.json")

✅ Saved 128 comments to 'guardian.json' (JSON Lines format)


In [5]:
# --- Clean and Preprocess Comments ---
import pandas as pd
import re, string

def clean_text(txt: str) -> str:
    """Normalize a raw comment string.

    Applied in order: lowercase; delete every run that begins with "http"
    or "www" and extends to the next whitespace; delete all characters in
    ``string.punctuation``; collapse each whitespace run to one space;
    trim leading and trailing whitespace.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_links = re.sub(r"http\S+|www\S+", "", txt.lower())
    without_punctuation = without_links.translate(punctuation_table)
    return re.sub(r"\s+", " ", without_punctuation).strip()

def preprocess(input_file: str = "GUARDIAN.json", output_file: str = "GUARDIAN_clean.csv"):
    """Load, clean, and save preprocessed comments.

    Args:
        input_file: JSON Lines file with one comment object per line. Blank
            lines are ignored; lines that are not valid JSON objects are
            skipped and counted.
        output_file: CSV path that receives the resulting frame (no index).

    Returns:
        DataFrame of the parsed records whose "text" value is present, with
        "text" cast to ``str`` and an added "clean" column holding
        ``clean_text(text)``. If no usable record exists the frame is empty
        but still has the "text" and "clean" columns.
    """
    # Local import so this cell does not depend on the crawl cell having run.
    import json

    records = []
    skipped = 0
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort parsing: skip the bad line but keep a count,
                # instead of a bare ``except`` that hides every error.
                skipped += 1
                continue
            # Valid JSON that is not an object (e.g. a bare number) cannot
            # become a row and would break DataFrame construction.
            if isinstance(record, dict):
                records.append(record)
            else:
                skipped += 1

    if skipped:
        print(f"Skipped {skipped} malformed line(s) in '{input_file}'")

    df = pd.DataFrame(records)
    # Guarantee the column exists so empty or text-less input produces an
    # empty result rather than a KeyError in dropna.
    if "text" not in df.columns:
        df["text"] = pd.Series(dtype="object")

    # Chained assign() builds new frames, so no column is written into the
    # intermediate result of dropna.
    df = (
        df.dropna(subset=["text"])
        .assign(text=lambda frame: frame["text"].astype(str))
        .assign(clean=lambda frame: frame["text"].apply(clean_text))
    )

    df.to_csv(output_file, index=False)
    print(f"✅ Preprocessed {len(df)} comments and saved to '{output_file}'")
    return df

# Build the cleaned frame from the crawled file (this also writes
# GUARDIAN_clean.csv, the default output_file) and preview the first rows.
df = preprocess("GUARDIAN.json")
df.head()

✅ Preprocessed 128 comments and saved to 'GUARDIAN_clean.csv'


Unnamed: 0,cid,text,time,author,channel,votes,replies,photo,heart,reply,time_parsed,clean
0,Ugw2APWQ4_TqgbM4Xlh4AaABAg,this is guaranteed to keep people stuck in the...,vor 1 Jahr,@ArjunaJackson,UCKQs8X3QxiRWlS8XbXTYTFA,258,6.0,https://yt3.ggpht.com/usI7nJIo6lj0prOOD036PVnz...,False,False,1731407000.0,this is guaranteed to keep people stuck in the...
1,UgzlVq6zVImVxnyQ5ZN4AaABAg,"No, it could make the grieving process worse a...",vor 1 Jahr,@skylark1237,UCANRu8T_HKO96XcA4glmqZA,188,2.0,https://yt3.ggpht.com/ytc/AIdro_n8LDP5zMqUT9S1...,False,False,1731407000.0,no it could make the grieving process worse an...
2,UgyinvHQnO3yMyw8bdB4AaABAg,We are living in a black mirror episode.,vor 9 Monaten,@58rocKsErt1,UCHp8UCCFeHCG5t_mOWRStGQ,32,,https://yt3.ggpht.com/ytc/AIdro_nWb1yghKJ8-Sce...,False,False,1739356000.0,we are living in a black mirror episode
3,UgwoImAwQT4ljgcNgl94AaABAg,This may not be healthy,vor 1 Jahr,@averageamericangirl6819,UC2aTKfLcNgjmQXKkDws10Uw,90,1.0,https://yt3.ggpht.com/ytc/AIdro_law9lQDSwoio5B...,False,False,1731407000.0,this may not be healthy
4,UgwnN6IZI0DsWrR9zs54AaABAg,"If AI ended grief, that wouldn't speak well of...",vor 1 Jahr,@jps0117,UC4xTJlctOrJqqce3Cfthl_g,71,2.0,https://yt3.ggpht.com/ytc/AIdro_n-095R21h9D80v...,False,False,1731407000.0,if ai ended grief that wouldnt speak well of w...


In [10]:
# --- Topic Modeling with BERTopic ---
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

def run_bertopic(df, output_file: str = "GUARDIAN_btopic.csv"):
    """Run BERTopic and save topic assignments.

    Args:
        df: DataFrame with a "clean" column of preprocessed comment strings.
            The frame passed in is not modified.
        output_file: CSV path that receives a copy of ``df`` with an added
            "topic" column (no index).

    Returns:
        Tuple ``(topic_model, topic_info, df_with_topics)``: the fitted
        model, the frame returned by ``get_topic_info()``, and a copy of
        ``df`` carrying the per-row topic assignment.
    """
    stop_words = stopwords.words("english")
    vectorizer = CountVectorizer(stop_words=stop_words)

    topic_model = BERTopic(
        vectorizer_model=vectorizer,
        language="english",
        verbose=True,
        calculate_probabilities=False
    )

    # Hand BERTopic a plain list of strings (its documented input type)
    # rather than a Series, whose index may have gaps after dropna.
    documents = df["clean"].tolist()
    topics, _ = topic_model.fit_transform(documents)

    # assign() returns a new frame, so the caller's DataFrame is untouched.
    df = df.assign(topic=topics)
    df.to_csv(output_file, index=False)

    topic_info = topic_model.get_topic_info()
    print("\nTop 10 Topics:\n")
    print(topic_info.head(10))

    print(f"✅ BERTopic results saved to '{output_file}'")
    return topic_model, topic_info, df

topic_model, topic_info, df = run_bertopic(df)

# Preview sample comments per topic. head(11) leaves room for up to ten real
# topics plus the outlier row, which is skipped below.
for i in topic_info.head(11)["Topic"]:
    if i == -1:  # -1 means outliers
        continue
    print(f"\n--- Topic {i}: {topic_model.get_topic(i)} ---")
    # Single .loc selection instead of chained boolean + column indexing.
    sample_comments = df.loc[df["topic"] == i, "text"].head(6).tolist()
    for c in sample_comments:
        # Only mark the comment with an ellipsis when it was actually cut
        # at 500 characters; complete comments are printed as-is.
        suffix = " ..." if len(c) > 500 else ""
        print(f"• {c[:500]}{suffix}\n")

2025-11-12 11:39:06,443 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|█████████████████████| 4/4 [00:05<00:00,  1.44s/it]
2025-11-12 11:39:14,474 - BERTopic - Embedding - Completed ✓
2025-11-12 11:39:14,476 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-12 11:39:18,847 - BERTopic - Dimensionality - Completed ✓
2025-11-12 11:39:18,849 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-12 11:39:18,864 - BERTopic - Cluster - Completed ✓
2025-11-12 11:39:18,871 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-12 11:39:18,897 - BERTopic - Representation - Completed ✓



Top 10 Topics:

   Topic  Count                        Name  \
0     -1      8    -1_loads_934_crew_actual   
1      0     76     0_grief_would_ai_people   
2      1     30  1_back_exactly_yeah_thanks   
3      2     14  2_black_mirror_episode_one   

                                      Representation  \
0  [loads, 934, crew, actual, recreation, red, dw...   
1  [grief, would, ai, people, think, like, let, l...   
2  [back, exactly, yeah, thanks, right, well, awf...   
3  [black, mirror, episode, one, ended, current, ...   

                                 Representative_Docs  
0  [charlie brooker is an actual visionary, if yo...  
1  [spot on finally someone gets that its humans ...  
2  [be right back, richardduplessis1090 for the b...  
3  [black mirror, its black mirror, this is a bla...  
✅ BERTopic results saved to 'GUARDIAN_btopic.csv'

--- Topic 0: [('grief', np.float64(0.05572500279519491)), ('would', np.float64(0.041150210854549264)), ('ai', np.float64(0.041150210854549

In [7]:
# --- Sentiment and Emotion Analysis ---
from transformers import pipeline

# Hugging Face model identifiers; either can be swapped for another
# compatible checkpoint from the Hub.
MODEL_SENT = "cardiffnlp/twitter-roberta-base-sentiment-latest"
MODEL_EMO = "cardiffnlp/twitter-roberta-large-emotion-latest"

# Display names keyed by the class id written as a string ("0", "1", ...).
id2label_sent = dict(zip("012", ("Negative", "Neutral", "Positive")))
id2label_emo = {
    str(class_id): emotion
    for class_id, emotion in enumerate((
        "anger", "anticipation", "disgust", "fear",
        "joy", "love", "optimism", "pessimism",
        "sadness", "surprise", "trust",
    ))
}

def get_prediction(text, pipe, label_map):
    """Run model prediction and map label IDs to names.

    Args:
        text: Input string; only its first 256 characters are passed on.
        pipe: Callable that returns a sequence whose first item is a mapping
            with "label" and "score" entries.
        label_map: Dict from label id string (the label with any "LABEL_"
            prefix removed) to a display name.

    Returns:
        ``(label_name, score)`` with ``score`` as a float. If the label id
        is not in ``label_map`` the raw label is returned unchanged. On any
        failure the result is ``(None, None)`` and a ``RuntimeWarning`` is
        emitted so the failure is visible.
    """
    import warnings

    try:
        result = pipe(text[:256])[0]
        label_id = result["label"].replace("LABEL_", "")
        label_name = label_map.get(label_id, result["label"])
        return label_name, float(result["score"])
    except Exception as exc:
        # Stay best-effort (one bad row must not abort the whole run), but
        # report the problem instead of hiding it behind None values.
        warnings.warn(
            f"Prediction failed ({type(exc).__name__}: {exc}); returning (None, None)",
            RuntimeWarning,
            stacklevel=2,
        )
        return None, None

def analyze_sentiment(df, output_sent="GUARDIAN_sent.csv", output_emo="GUARDIAN_emo.csv"):
    """Perform sentiment and emotion classification.

    Args:
        df: DataFrame with a "clean" column of comment strings. The frame
            passed in is not modified.
        output_sent: CSV path written after the sentiment stage; it holds
            the input columns plus "sent_label" and "sent_score".
        output_emo: CSV path written after the emotion stage; it also holds
            "emo_label" and "emo_score".

    Returns:
        A copy of ``df`` with the four prediction columns added. Rows whose
        prediction failed carry missing values in the affected columns.
    """
    result = df.copy()

    # Both stages share one code path: (column prefix, model, label map,
    # output path, name used in the status message).
    stages = (
        ("sent", MODEL_SENT, id2label_sent, output_sent, "Sentiment"),
        ("emo", MODEL_EMO, id2label_emo, output_emo, "Emotion"),
    )

    for prefix, model_name, label_map, output_path, title in stages:
        pipe = pipeline("sentiment-analysis", model=model_name)
        predictions = [
            get_prediction(text, pipe, label_map) for text in result["clean"]
        ]
        # Plain lists instead of one pd.Series per row: this also works for
        # an empty frame and lets pandas store the scores as numbers.
        result[f"{prefix}_label"] = [label for label, _ in predictions]
        result[f"{prefix}_score"] = [score for _, score in predictions]
        result.to_csv(output_path, index=False)
        print(f"✅ {title} predictions saved to '{output_path}'")

    return result

# Rebind df to the frame carrying the sentiment/emotion columns; with the
# default arguments this also writes GUARDIAN_sent.csv and GUARDIAN_emo.csv.
df = analyze_sentiment(df)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


✅ Sentiment predictions saved to 'GUARDIAN_sent.csv'


Device set to use cpu


✅ Emotion predictions saved to 'GUARDIAN_emo.csv'
