# 01 — EDA: YouTube Comments

Explore the raw comments collected by `src/data/ingest.py`.

**Run this after:** `python -m src.data.ingest` (or `--max 20` for a quick test)

**Goal:** Understand the data before training — distributions, text quality, channel breakdown, engagement patterns.

In [31]:
import json
import glob
from pathlib import Path

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The notebook is run from a sub-directory, so the project root is one level up.
ROOT = Path("..")
RAW_DIR = ROOT / "data" / "raw"

# Load all JSONL files: one JSON record per non-blank line.
# Files are visited in sorted name order so the row order is reproducible.
records = []
for path in sorted(RAW_DIR.glob("comments_*.jsonl")):
    with path.open(encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        records.extend(json.loads(line) for line in stripped_lines if line)

# Without this guard an empty/missing raw directory yields a column-less
# DataFrame and the timestamp parsing below fails with an opaque KeyError.
if not records:
    raise FileNotFoundError(
        f"No comments found in {RAW_DIR.resolve()} (expected comments_*.jsonl). "
        "Run `python -m src.data.ingest` first."
    )

df = pd.DataFrame(records)

# Parse timestamps into timezone-aware (UTC) datetimes
df["published_at"] = pd.to_datetime(df["published_at"], utc=True)
df["collected_at"] = pd.to_datetime(df["collected_at"], utc=True)

print(f"Total comments loaded: {len(df):,}")
print(f"Columns: {list(df.columns)}")

Total comments loaded: 143,782
Columns: ['content_id', 'platform', 'comment_id', 'video_id', 'parent_id', 'text_raw', 'like_count', 'reply_count', 'author_hash', 'published_at', 'collected_at', 'label_toxicity', 'label_hate_racism', 'label_harassment', 'model_version', 'video_title', 'channel_id', 'channel_name', 'channel_category']


## 1. Basic Stats

In [32]:
# Overall dimensions of the loaded frame.
n_rows, n_cols = df.shape
print("=== Shape ===")
print(f"  Rows: {n_rows:,}  |  Columns: {n_cols}")

# Per-column missing-value counts; only columns that actually contain nulls are listed.
print("\n=== Null counts ===")
nulls = df.isna().sum()
if nulls.any():
    print(nulls[nulls > 0].to_string())
else:
    print("  No nulls in key fields")

# Rows without a parent_id are reported as top-level, rows with one as replies.
print("\n=== Top-level vs replies ===")
has_parent = df["parent_id"].notna()
top_level = (~has_parent).sum()
replies = has_parent.sum()
print(f"  Top-level: {top_level:,} ({top_level / n_rows * 100:.1f}%)")
print(f"  Replies:   {replies:,} ({replies / n_rows * 100:.1f}%)")

# Span of publication timestamps and of the collection run(s).
print("\n=== Date range ===")
published, collected = df["published_at"], df["collected_at"]
print(f"  Earliest published: {published.min()}")
print(f"  Latest published:   {published.max()}")
print(f"  Collected:          {collected.min().date()} – {collected.max().date()}")

=== Shape ===
  Rows: 143,782  |  Columns: 19

=== Null counts ===
parent_id            119574
label_toxicity       143782
label_hate_racism    143782
label_harassment     143782
model_version        143782

=== Top-level vs replies ===
  Top-level: 119,574 (83.2%)
  Replies:   24,208 (16.8%)

=== Date range ===
  Earliest published: 2024-05-03 19:35:51+00:00
  Latest published:   2026-02-19 18:38:41+00:00
  Collected:          2026-02-19 – 2026-02-19


## 2. Comments per Channel

In [33]:
# Number of comments per (channel, category) pair, largest first.
channel_counts = (
    df.groupby(["channel_name", "channel_category"])
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

# One bar per channel, coloured by its category, with the count printed on each bar.
axis_labels = {
    "channel_name": "Channel",
    "count": "# Comments",
    "channel_category": "Category",
}
fig = px.bar(
    channel_counts,
    x="channel_name",
    y="count",
    color="channel_category",
    title="Comments Collected per Channel",
    labels=axis_labels,
    text_auto=True,
)
fig.update_layout(xaxis_tickangle=-30)
fig.show()

# Same numbers as plain text for quick copy/paste.
print(channel_counts.to_string(index=False))

           channel_name channel_category  count
                MrBeast    entertainment  28531
               Fox News             news  21272
               The Hill             news  19662
The Late Show (Colbert)    entertainment  19326
      Jimmy Kimmel Live    entertainment  15414
              PewDiePie           gaming  10706
                    CNN             news   8867
               BBC News             news   8655
               ABC News             news   6017
               NBC News             news   3045
                  MSNBC             news   1261
         MrBeast Gaming           gaming   1012
                   ESPN           sports     14


## 3. Text Length Distribution

In [34]:
# Derive per-comment size features: characters and whitespace-separated words.
raw_text = df["text_raw"].str
df["text_len"] = raw_text.len()
df["word_count"] = raw_text.split().str.len()

# Side-by-side histograms, one subplot per size feature.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Character Length", "Word Count"))
histogram_specs = [
    ("text_len", "Chars", "steelblue"),
    ("word_count", "Words", "darkorange"),
]
for position, (column, trace_name, color) in enumerate(histogram_specs, start=1):
    fig.add_trace(
        go.Histogram(x=df[column], nbinsx=50, name=trace_name, marker_color=color),
        row=1, col=position
    )
fig.update_layout(title="Text Length Distribution", showlegend=False)
fig.show()

# Summary statistics for both features, rounded to one decimal.
size_summary = df[["text_len", "word_count"]].describe().round(1)
print(size_summary.to_string())

       text_len  word_count
count  143782.0    143782.0
mean      100.4        18.0
std       209.0        35.6
min         0.0         0.0
25%        24.0         4.0
50%        53.0        10.0
75%       110.0        20.0
max      9941.0      1871.0


## 4. Engagement: Likes & Replies

In [35]:
# Side-by-side histograms of like and reply counts.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Like Count Distribution", "Reply Count Distribution"))

fig.add_trace(
    go.Histogram(x=df["like_count"], nbinsx=40, name="Likes", marker_color="mediumseagreen"),
    row=1, col=1
)
fig.add_trace(
    go.Histogram(x=df["reply_count"], nbinsx=20, name="Replies", marker_color="mediumpurple"),
    row=1, col=2
)
# Engagement is extremely heavy-tailed (most comments have 0-1 likes while a
# few have hundreds of thousands), so with a linear count axis everything
# except the first bin is invisible. A log-scaled count axis keeps the tail readable.
fig.update_yaxes(type="log")
fig.update_layout(title="Engagement Distributions", showlegend=False)
fig.show()

# Summary statistics, rounded to one decimal.
print("--- Likes ---")
print(df["like_count"].describe().round(1).to_string())
print("\n--- Replies ---")
print(df["reply_count"].describe().round(1).to_string())

--- Likes ---
count    143782.0
mean         23.9
std        1845.2
min           0.0
25%           0.0
50%           0.0
75%           1.0
max      586349.0

--- Replies ---
count    143782.0
mean          0.6
std          17.0
min           0.0
25%           0.0
50%           0.0
75%           0.0
max        1006.0


## 5. Top Words (Quick Frequency Check)

In [36]:
import re
from collections import Counter

# Common English function words excluded from the word-frequency count.
# Written as whitespace-separated text and split into a set for O(1) lookups.
STOPWORDS = set(
    """
    the a an and or but is in on at to
    for of with this that it he she they
    we you i be are was were have has had
    do does did not no so if as from by
    about will would could should just like get
    what who how when there their them his her
    my me more all up out its than can
    """.split()
)

def tokenize(text):
    """Return the lowercase words of three or more ASCII letters in *text*.

    The text is lowercased, then every run of at least three ``a-z`` letters
    delimited by word boundaries is extracted, in order of appearance. Runs
    that touch a digit or underscore are not matched, because no word
    boundary exists between those characters.

    Non-string input (e.g. ``None`` or ``NaN`` for a missing comment) yields
    an empty list. Stringifying such values would otherwise inject spurious
    tokens like "none" or "nan" into the frequency count.
    """
    if not isinstance(text, str):
        return []
    return re.findall(r"\b[a-z]{3,}\b", text.lower())

# Collect every non-stopword token across all comments, in comment order.
all_words = []
for comment in df["text_raw"]:
    all_words.extend(token for token in tokenize(comment) if token not in STOPWORDS)

# Keep the 30 most frequent tokens together with their counts.
top_words = Counter(all_words).most_common(30)
words_df = pd.DataFrame(top_words, columns=["word", "count"])

# Horizontal bar chart; "total ascending" puts the most frequent word at the top.
fig = px.bar(
    words_df,
    x="count",
    y="word",
    orientation="h",
    title="Top 30 Words (stopwords removed)",
    labels={"count": "Frequency", "word": ""},
    color="count",
    color_continuous_scale="Blues",
)
fig.update_layout(yaxis=dict(categoryorder="total ascending"), showlegend=False)
fig.show()

## 6. Comments Over Time

In [37]:
# Bucket each comment by its publication date (time of day dropped).
df["published_date"] = df["published_at"].dt.date

# Comments per (day, channel) pair.
daily = (
    df.groupby(["published_date", "channel_name"])
    .size()
    .reset_index(name="count")
)

# One line per channel, with a marker on every day that has data.
time_axis_labels = {
    "published_date": "Date",
    "count": "# Comments",
    "channel_name": "Channel",
}
fig = px.line(
    daily,
    x="published_date",
    y="count",
    color="channel_name",
    title="Comment Volume Over Time by Channel",
    labels=time_axis_labels,
    markers=True,
)
fig.show()

## 7. Sample Comments — Manual Review

Read a random sample to eyeball data quality before training.

In [38]:
# Widen text columns so most comments are readable without truncation.
pd.set_option("display.max_colwidth", 120)

# Reproducible random sample (fixed seed) of up to 10 comments for eyeballing.
review_columns = ["channel_name", "text_raw", "like_count", "reply_count"]
sample_size = min(10, len(df))
sample = df[review_columns].sample(n=sample_size, random_state=42)
sample

Unnamed: 0,channel_name,text_raw,like_count,reply_count
30228,Fox News,Every village has one,2,0
51511,The Hill,what would you do if air is not free anymore?,0,0
128120,MrBeast,From your experience 🎉,0,0
65321,NBC News,There is a thinking called parent control for the kids that don’t go where they’re not supposed to,1,0
91750,The Late Show (Colbert),Love. Art. Beauty. Friends. Nature. \nin no particular order 😎,1,0
123771,MrBeast,Del Palmar al mundo Carlitos!,192,3
77887,The Late Show (Colbert),Robots? We're fucked.,0,0
72600,ABC News,​@captwillard3346 thanks capt. I should have watched it all the way through.,0,0
28771,Fox News,Sad and crazy,2,0
10576,The Hill,The solution that is privacy focused that you alluded to aka TPM is a terrible solution. Firmware on it isn't open s...,0,0


## 8. High-Engagement Comments (potential signal for toxicity)

High-reply comments often contain controversial content — useful to prioritize for labeling.

In [39]:
# The ten comments with the highest reply counts, restricted to the review columns.
engagement_columns = ["channel_name", "text_raw", "like_count", "reply_count"]
high_engagement = df[engagement_columns].nlargest(10, "reply_count")
high_engagement

Unnamed: 0,channel_name,text_raw,like_count,reply_count
1945,MrBeast Gaming,"Go get Feastables Caramel hearts for someone you love this Valentine's Day! Available at Walmart, Kroger, and Albert...",13732,1006
115520,MrBeast,Subscribe so Justin will be in our next video,25176,1002
109503,MrBeast,What should we have ages 1-100 do next?,32487,1001
121041,MrBeast,"Go try my meat at Walmart, Target, Kroger, or 7Eleven!",14197,1001
2154,MrBeast Gaming,25 had too much fun with this,36876,1000
110009,MrBeast,bad day to skip class,80804,1000
110509,MrBeast,Oops I just found out that this guy’s famous,103024,1000
111011,MrBeast,"If you could have one piece of tech from this video, which one would you pick?",44189,1000
111512,MrBeast,How did I not see him do that?,20401,1000
112012,MrBeast,Subscribe to have your name added\r\n\r\n\nNO PURCHASE NECESSARY TO ENTER OR WIN. Takes place from 12:00 p.m. ET on ...,49251,1000


## 9. Data Quality Summary

Key things to verify before moving to preprocessing.

In [40]:
# Banner.
rule = "=" * 50
print(rule)
print("DATA QUALITY SUMMARY")
print(rule)

# Quality counters. word_count is the column added in the text-length cell.
total = len(df)
empty_text = df["text_raw"].str.strip().eq("").sum()
very_short  = df["word_count"].lt(3).sum()
very_long   = df["word_count"].gt(200).sum()
duplicates  = df["content_id"].duplicated().sum()
with_replies = df["reply_count"].gt(0).sum()

print(f"  Total comments:       {total:,}")
print(f"  Unique comments:      {df['content_id'].nunique():,}")
print(f"  Duplicate IDs:        {duplicates}")
print(f"  Empty text:           {empty_text}")
print(f"  Very short (<3 words):{very_short} ({very_short / total * 100:.1f}%)")
print(f"  Very long (>200 words):{very_long} ({very_long / total * 100:.1f}%)")
print(f"  Channels covered:     {df['channel_name'].nunique()}")
print(f"  Unique videos:        {df['video_id'].nunique()}")
print(f"  Has replies:          {with_replies:,} comments")
print()

# How many rows already carry a value in each label column.
toxicity_labeled = df["label_toxicity"].notna().sum()
hate_racism_labeled = df["label_hate_racism"].notna().sum()
harassment_labeled = df["label_harassment"].notna().sum()
print("Labels (expect all None until model runs):")
print(f"  label_toxicity:       {toxicity_labeled} labeled")
print(f"  label_hate_racism:    {hate_racism_labeled} labeled")
print(f"  label_harassment:     {harassment_labeled} labeled")
print()

# Any short or empty comment means preprocessing has filtering to do.
needs_filtering = very_short > 0 or empty_text > 0
if not needs_filtering:
    print("✅ Data looks clean. Ready for preprocessing.")
else:
    print("⚠️  Filter short/empty comments in preprocessing step.")

DATA QUALITY SUMMARY
  Total comments:       143,782
  Unique comments:      143,782
  Duplicate IDs:        0
  Empty text:           34
  Very short (<3 words):20753 (14.4%)
  Very long (>200 words):626 (0.4%)
  Channels covered:     13
  Unique videos:        507
  Has replies:          11,683 comments

Labels (expect all None until model runs):
  label_toxicity:       0 labeled
  label_hate_racism:    0 labeled
  label_harassment:     0 labeled

⚠️  Filter short/empty comments in preprocessing step.


## Next Steps

1. **Run full ingest:** `python -m src.data.ingest` → collect at least 10,000 comments (M1 complete — the run analysed above already loaded 143,782)
2. **Notebook 02:** Preprocessing — clean text, detect language, filter short comments, save to `data/processed/`
3. **Notebook 03:** Train baseline model on public Kaggle datasets
4. **Notebook 04:** Run scoring on collected comments + evaluate uncertainty