# YouTube comment fetcher

In [None]:
!pip install azure-eventhub google-api-python-client


Collecting azure-eventhub
  Downloading azure_eventhub-5.15.0-py3-none-any.whl.metadata (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.1/73.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting azure-core>=1.27.0 (from azure-eventhub)
  Downloading azure_core-1.34.0-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading azure_eventhub-5.15.0-py3-none-any.whl (327 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.8/327.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading azure_core-1.34.0-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.4/207.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: azure-core, azure-eventhub
Succe

In [None]:
import json
import os
import time

from azure.eventhub import EventHubProducerClient, EventData
from googleapiclient.discovery import build
from transformers import pipeline

In [None]:
# Read the API key from the YOUTUBE_API_KEY environment variable so the secret
# is not stored in the notebook; the literal fallback is only a redacted
# placeholder and must never be replaced with a real key in a shared copy.
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY', '**********')
VIDEO_ID = 'fsQgc9pCyDU'
# YouTube Data API v3 client used by the comment-fetching cell below.
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

In [None]:
# Connection string comes from the EVENT_HUB_CONNECTION_STR environment
# variable so the secret is not stored in the notebook; the literal fallback is
# only a redacted placeholder.
EVENT_HUB_CONNECTION_STR = os.environ.get('EVENT_HUB_CONNECTION_STR', '*******')
EVENT_HUB_NAME = 'comments-hf'

In [None]:
# Module-level Event Hub client built from the configuration constants above.
# NOTE(review): send_to_event_hub constructs (and closes) its own client, so
# this instance does not appear to be used by the cells visible here — confirm
# before relying on it, and close it when done.
producer = EventHubProducerClient.from_connection_string(
    EVENT_HUB_CONNECTION_STR, eventhub_name=EVENT_HUB_NAME
)

In [None]:
# Name the checkpoint and revision explicitly (the same ones the un-pinned
# default resolved to) so results stay reproducible if the library's default
# sentiment model ever changes.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def get_all_youtube_comments(video_id, max_total=1000):
    """
    Collect top-level comments for `video_id`, following pagination.

    Pages of up to 100 comment threads are requested through the module-level
    `youtube` client until `max_total` comments have been gathered or the
    response no longer carries a `nextPageToken`.

    Returns a list of dicts: [{"text": comment_text}, ...]
    """
    collected = []
    page_token = None
    has_more_pages = True

    while has_more_pages and len(collected) < max_total:
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=100,
            pageToken=page_token,
            textFormat='plainText',
        )
        page = request.execute()

        for thread in page['items']:
            top_level_snippet = thread['snippet']['topLevelComment']['snippet']
            collected.append({"text": top_level_snippet['textDisplay']})
            # Stop mid-page once the requested number of comments is reached.
            if len(collected) >= max_total:
                break

        page_token = page.get('nextPageToken')
        has_more_pages = bool(page_token)

    print(f"✅ Collected {len(collected)} comments")
    return collected


In [None]:
def huggingface_sentiment(text):
    """
    Classify `text` with the module-level `sentiment_model` pipeline.

    The input is first cut to its first 512 characters. Because a character
    limit does not bound the number of tokens (512 punctuation marks or CJK
    characters tokenize to more tokens than the model accepts once special
    tokens are added), `truncation=True` is also passed so the tokenizer caps
    the sequence at the model's maximum length instead of raising.

    Returns the `label` field of the first prediction.
    """
    result = sentiment_model(text[:512], truncation=True)[0]
    return result['label']

In [None]:
def rule_based_sentiment(text):
    """
    Label `text` by keyword lookup.

    The text is lower-cased and searched for each keyword as a substring (so a
    keyword embedded in a longer word also counts). Positive keywords are
    checked before negative ones; the first group with a hit decides the
    label, and "Neutral" is returned when neither group matches.
    """
    lowered = text.lower()
    keyword_groups = (
        ("Positive", ("good", "love", "great", "amazing", "awesome")),
        ("Negative", ("bad", "hate", "terrible", "worst", "awful")),
    )

    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "Neutral"

In [None]:
def _send_batch_and_report(producer, batch, event_count, batch_number):
    """Send `batch`, print the outcome, and return how many events were delivered."""
    if event_count == 0:
        # Nothing was queued (e.g. every comment was skipped), so skip the send.
        return 0
    try:
        producer.send_batch(batch)
    except Exception as e:
        print(f"❌ Error sending batch {batch_number}: {e}")
        return 0
    print(f"✅ Sent batch {batch_number} with {event_count} comments")
    time.sleep(0.2)  # Optional: slight delay to avoid timeouts
    return event_count


def send_to_event_hub(comments, batch_size=50):
    """
    Annotate comments with sentiment labels and publish them to Event Hub.

    Each comment dict is updated in place with two keys, `hf_sentiment` (from
    `huggingface_sentiment`) and `sql_sentiment` (from `rule_based_sentiment`),
    serialized as JSON and sent in groups of `batch_size`.

    When `batch.add` raises ValueError the current batch is flushed and the
    event is retried in a fresh batch, so a comment is only skipped if it does
    not fit even into an empty batch. Send failures are reported and do not
    stop the remaining batches.

    Args:
        comments: list of dicts, each with a "text" key.
        batch_size: number of comments grouped per batch; must be >= 1.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    producer = EventHubProducerClient.from_connection_string(
        conn_str=EVENT_HUB_CONNECTION_STR,
        eventhub_name=EVENT_HUB_NAME
    )

    sent_total = 0
    skipped_total = 0

    with producer:
        for start in range(0, len(comments), batch_size):
            batch_number = start // batch_size + 1
            stop = min(start + batch_size, len(comments))
            batch = producer.create_batch()
            queued = 0

            for index in range(start, stop):
                comment = comments[index]
                text = comment["text"]
                comment["hf_sentiment"] = huggingface_sentiment(text)
                comment["sql_sentiment"] = rule_based_sentiment(text)
                payload = json.dumps(comment)

                try:
                    batch.add(EventData(payload))
                except ValueError:
                    # The batch has no room left: flush what is queued and
                    # retry this event in an empty batch instead of dropping it.
                    sent_total += _send_batch_and_report(producer, batch, queued, batch_number)
                    batch = producer.create_batch()
                    queued = 0
                    try:
                        batch.add(EventData(payload))
                    except ValueError:
                        # Still rejected by an empty batch: the event itself is too large.
                        print(f"⚠️ Skipping large comment at index {index}")
                        skipped_total += 1
                        continue
                queued += 1

            sent_total += _send_batch_and_report(producer, batch, queued, batch_number)

    print(f"✅ Sent total {sent_total} comments to Event Hub")
    failed_total = len(comments) - sent_total - skipped_total
    if skipped_total or failed_total:
        print(f"⚠️ {skipped_total} comments skipped as too large, {failed_total} not delivered due to send errors")



In [None]:
if __name__ == "__main__":
    # End-to-end run: fetch up to MAX_COMMENTS comments for the video, then
    # score them and publish them to Event Hub.
    VIDEO_ID = "fsQgc9pCyDU"
    MAX_COMMENTS = 13000

    comments = get_all_youtube_comments(video_id=VIDEO_ID, max_total=MAX_COMMENTS)
    send_to_event_hub(comments=comments)

✅ Collected 5355 comments
✅ Sent batch 1 with 50 comments
✅ Sent batch 2 with 50 comments
✅ Sent batch 3 with 50 comments
✅ Sent batch 4 with 50 comments
✅ Sent batch 5 with 50 comments
✅ Sent batch 6 with 50 comments
✅ Sent batch 7 with 50 comments
✅ Sent batch 8 with 50 comments
✅ Sent batch 9 with 50 comments
✅ Sent batch 10 with 50 comments
✅ Sent batch 11 with 50 comments
✅ Sent batch 12 with 50 comments
✅ Sent batch 13 with 50 comments
✅ Sent batch 14 with 50 comments
✅ Sent batch 15 with 50 comments
✅ Sent batch 16 with 50 comments
✅ Sent batch 17 with 50 comments
✅ Sent batch 18 with 50 comments
✅ Sent batch 19 with 50 comments
✅ Sent batch 20 with 50 comments
✅ Sent batch 21 with 50 comments
✅ Sent batch 22 with 50 comments
✅ Sent batch 23 with 50 comments
✅ Sent batch 24 with 50 comments
✅ Sent batch 25 with 50 comments
✅ Sent batch 26 with 50 comments
✅ Sent batch 27 with 50 comments
✅ Sent batch 28 with 50 comments
✅ Sent batch 29 with 50 comments
✅ Sent batch 30 with 50 co

In [None]:
!pip install azure-storage-blob


Collecting azure-storage-blob
  Downloading azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.0/407.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, azure-storage-blob
Successfully installed azure-storage-blob-12.25.1 isodate-0.7.2


# Retrieving sentiment results from Blob Storage and exporting them to CSV


In [None]:
from azure.storage.blob import BlobServiceClient
import json
import pandas as pd
import os

# Storage account connection string, read from the
# AZURE_STORAGE_CONNECTION_STRING environment variable so the secret is not
# stored in the notebook; the literal fallback is only a redacted placeholder.
conn_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "************")

# The container name where Stream Analytics outputs blobs
container_name = "sentimentoutput"

blob_service_client = BlobServiceClient.from_connection_string(conn_str)
container_client = blob_service_client.get_container_client(container_name)

# List all blobs
blob_list = list(container_client.list_blobs())
print(f"✅ Found {len(blob_list)} blobs")

all_data = []

for blob in blob_list:
    blob_client = container_client.get_blob_client(blob)
    blob_data = blob_client.download_blob().readall()
    blob_text = blob_data.decode('utf-8')

    # Parse each line (JSONL). Blank lines (including an entirely empty blob)
    # are skipped rather than reported as parse failures, and warnings carry
    # the line number so a bad record can be located.
    for line_number, line in enumerate(blob_text.split('\n'), start=1):
        if not line.strip():
            continue
        try:
            all_data.append(json.loads(line))
        except json.JSONDecodeError:
            print(f"⚠️ Could not parse line {line_number} in {blob.name}")

# Convert to DataFrame
df = pd.DataFrame(all_data)
print(f"✅ Parsed {len(df)} records")

# Save to CSV
output_csv = "youtube_sentiment_data.csv"
df.to_csv(output_csv, index=False)
print(f"📁 Saved CSV to {os.path.abspath(output_csv)}")


✅ Found 1 blobs
✅ Parsed 5355 records
📁 Saved CSV to /content/youtube_sentiment_data.csv
