In [1]:
pip install google-api-python-client vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [2]:
from googleapiclient.discovery import build
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from googleapiclient.errors import HttpError
import re

api_key = 'AIzaSyD8Yt4bQT44sI1Q-_VaBHXZ_hp6mLQZ44A' 
youtube = build('youtube', 'v3', developerKey=api_key)

def get_comments(video_id):
    comments = []
    next_page_token = None
    
    while True:
        request = youtube.commentThreads().list(
            part="snippet,replies",
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=100
        )
        try:
            response = request.execute()
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            return []
        
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textOriginal']
            comments.append(comment)
            
            if 'replies' in item:
                for reply in item['replies']['comments']:
                    reply_text = reply['snippet']['textOriginal']
                    comments.append(reply_text)
        
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
    
    return comments

def analyze_sentiment(comments):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = {'positive': [], 'neutral': [], 'negative': []}
    
    for comment in comments:
        sentiment = analyzer.polarity_scores(comment)
        if sentiment['compound'] >= 0.05:
            sentiment_scores['positive'].append(comment)
        elif sentiment['compound'] <= -0.05:
            sentiment_scores['negative'].append(comment)
        else:
            sentiment_scores['neutral'].append(comment)
    
    return sentiment_scores

def extract_video_id(url):
    regex = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|live\/|shorts\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
    match = re.search(regex, url)
    if match:
        return match.group(1)
    else:
        raise ValueError("Invalid YouTube URL")

def display_comments(sentiment_scores):
    while True:
        print("\nChoose an option:")
        print("a) Read positive comments")
        print("b) Read neutral comments")
        print("c) Read negative comments")
        print("d) Analyze another video")
        print("e) Exit")
        choice = input("Enter your choice: ").strip().lower()

        if choice == 'a':
            print("\nPositive Comments:")
            for comment in sentiment_scores['positive']:
                print(comment)
        elif choice == 'b':
            print("\nNeutral Comments:")
            for comment in sentiment_scores['neutral']:
                print(comment)
        elif choice == 'c':
            print("\nNegative Comments:")
            for comment in sentiment_scores['negative']:
                print(comment)
        elif choice == 'd':
            main(input("Enter a YouTube URL: "))
        elif choice == 'e':
            break
        else:
            print("Invalid choice. Please try again.")

def main(video_url):
    video_id = extract_video_id(video_url)
    print(f"Extracted video ID: {video_id}")
    comments = get_comments(video_id)
    if comments:
        sentiment_scores = analyze_sentiment(comments)
        total_comments = sum(len(v) for v in sentiment_scores.values())
        print(f"Total comments analyzed: {total_comments}")
        print(f"Positive comments: {len(sentiment_scores['positive'])} ({(len(sentiment_scores['positive'])/total_comments)*100:.2f}%)")
        print(f"Neutral comments: {len(sentiment_scores['neutral'])} ({(len(sentiment_scores['neutral'])/total_comments)*100:.2f}%)")
        print(f"Negative comments: {len(sentiment_scores['negative'])} ({(len(sentiment_scores['negative'])/total_comments)*100:.2f}%)")
        
        display_comments(sentiment_scores)
    else:
        print("No comments found for the video.")

# Run the sentiment analysis
video_url = input('Enter a YouTube URL: ')
main(video_url)


Enter a YouTube URL:  https://youtu.be/x3M4dyq6c2s?feature=shared


Extracted video ID: x3M4dyq6c2s
Total comments analyzed: 1079
Positive comments: 753 (69.79%)
Neutral comments: 271 (25.12%)
Negative comments: 55 (5.10%)

Choose an option:
a) Read positive comments
b) Read neutral comments
c) Read negative comments
d) Analyze another video
e) Exit


Enter your choice:  e


In [7]:
import re
import emoji
import time
import pandas as pd
import numpy as np
from collections import Counter
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
from tqdm import tqdm

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

# YouTube API key
api_key = 'AIzaSyD8Yt4bQT44sI1Q-_VaBHXZ_hp6mLQZ44A'
youtube = build('youtube', 'v3', developerKey=api_key)

# Hardcoded YouTube video URL
video_url = "https://youtu.be/x3M4dyq6c2s?feature=shared"

# Function to extract video ID from URL
def extract_video_id(url):
    regex = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|live\/|shorts\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
    match = re.search(regex, url)
    if match:
        return match.group(1)
    else:
        raise ValueError("Invalid YouTube URL")

# Function to fetch comments and replies from a YouTube video
def get_comments(video_id):
    comments = []
    next_page_token = None

    print("Fetching comments and replies...")
    while True:
        request = youtube.commentThreads().list(
            part="snippet,replies",
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=100
        )
        try:
            response = request.execute()
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            return pd.DataFrame()
        
        for item in response['items']:
            # Top-level comment
            comment = item['snippet']['topLevelComment']['snippet']
            comment_id = item['snippet']['topLevelComment']['id']
            comments.append({
                'comment_id': comment_id,
                'text': comment['textOriginal'],
                'author': comment.get('authorDisplayName', ''),
                'published_at': comment.get('publishedAt', ''),
                'like_count': comment.get('likeCount', 0),
                'reply_to': None  # Top-level comment
            })
            
            # Replies to the top-level comment
            if 'replies' in item:
                for reply in item['replies']['comments']:
                    reply_snippet = reply['snippet']
                    reply_id = reply['id']
                    comments.append({
                        'comment_id': reply_id,
                        'text': reply_snippet['textOriginal'],
                        'author': reply_snippet.get('authorDisplayName', ''),
                        'published_at': reply_snippet.get('publishedAt', ''),
                        'like_count': reply_snippet.get('likeCount', 0),
                        'reply_to': comment_id  # Reply to top-level comment
                    })
        
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        
    print(f"Total comments and replies fetched: {len(comments)}")
    return pd.DataFrame(comments)

# Function to perform sentiment analysis
def analyze_sentiment(df):
    analyzer = SentimentIntensityAnalyzer()
    sentiments = []
    print("Analyzing sentiments...")
    for text in tqdm(df['text']):
        # VADER Sentiment
        vs = analyzer.polarity_scores(text)
        # TextBlob Sentiment
        tb = TextBlob(text).sentiment
        sentiments.append({
            'vader_neg': vs['neg'],
            'vader_neu': vs['neu'],
            'vader_pos': vs['pos'],
            'vader_compound': vs['compound'],
            'textblob_polarity': tb.polarity,
            'textblob_subjectivity': tb.subjectivity
        })
    sentiment_df = pd.DataFrame(sentiments)
    return pd.concat([df.reset_index(drop=True), sentiment_df], axis=1)

# Function to classify sentiment labels
def classify_sentiment(row):
    if row['vader_compound'] >= 0.05:
        return 'Positive'
    elif row['vader_compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Function to extract emojis
def extract_emojis(text):
    return [char for char in text if char in emoji.EMOJI_DATA]

# Function to perform topic modeling
def topic_modeling(texts, num_topics=5):
    print("Performing topic modeling...")
    stop_words = set(stopwords.words('english'))
    processed_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        processed_texts.append(tokens)
    dictionary = corpora.Dictionary(processed_texts)
    corpus = [dictionary.doc2bow(text) for text in processed_texts]
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    topics = lda_model.print_topics(num_words=5)
    return topics

# Function to analyze emotions using NRC Emotion Lexicon
def emotion_analysis(texts):
    print("Analyzing emotions...")
    nltk.download('averaged_perceptron_tagger')
    from nltk.corpus import wordnet as wn
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    emotions_list = []
    # Load NRC Emotion Lexicon
    emotion_df = pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep='\t', header=None, names=['word', 'emotion', 'association'])
    emotion_df = emotion_df[emotion_df['association'] == 1]
    emotion_dict = emotion_df.groupby('word')['emotion'].apply(list).to_dict()
    for text in tqdm(texts):
        tokens = word_tokenize(text.lower())
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha()]
        emotion_counter = Counter()
        for token in tokens:
            if token in emotion_dict:
                emotion_counter.update(emotion_dict[token])
        emotions_list.append(emotion_counter)
    return emotions_list

# Visualization functions
def plot_sentiment_distribution(df):
    sentiment_counts = df['sentiment_label'].value_counts()
    plt.figure(figsize=(8, 6))
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=['green', 'blue', 'red'])
    plt.title('Sentiment Distribution')
    plt.xlabel('Sentiment')
    plt.ylabel('Number of Comments')
    plt.show()

def plot_word_cloud(texts, title):
    text = ' '.join(texts)
    wordcloud = WordCloud(width=1200, height=800, background_color='white').generate(text)
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

def plot_emotion_heatmap(emotion_data):
    emotion_df = pd.DataFrame(emotion_data).fillna(0)
    emotion_sums = emotion_df.sum().sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=emotion_sums.index, y=emotion_sums.values)
    plt.title('Overall Emotion Distribution')
    plt.xlabel('Emotion')
    plt.ylabel('Frequency')
    plt.show()

def plot_sentiment_over_time(df):
    df['published_at'] = pd.to_datetime(df['published_at'])
    df = df.sort_values('published_at')
    df['vader_compound_smooth'] = df['vader_compound'].rolling(window=10, min_periods=1).mean()
    plt.figure(figsize=(15, 6))
    plt.plot(df['published_at'], df['vader_compound_smooth'], color='orange')
    plt.title('Sentiment Over Time')
    plt.xlabel('Time')
    plt.ylabel('Smoothed Compound Sentiment Score')
    plt.show()

def plot_emoji_distribution(emojis):
    if not emojis:
        print("No emojis found.")
        return
    emoji_counts = Counter(emojis)
    most_common_emojis = emoji_counts.most_common(10)
    emojis, counts = zip(*most_common_emojis)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(emojis), y=list(counts))
    plt.title('Top 10 Emojis Used')
    plt.xlabel('Emoji')
    plt.ylabel('Count')
    plt.show()

# Main function
def main():
    try:
        video_id = extract_video_id(video_url)
        print(f"Extracted video ID: {video_id}")
        comments_df = get_comments(video_id)
        
        if not comments_df.empty:
            # Sentiment Analysis
            comments_df = analyze_sentiment(comments_df)
            comments_df['sentiment_label'] = comments_df.apply(classify_sentiment, axis=1)
            
            # Emotion Analysis
            emotion_data = emotion_analysis(comments_df['text'])
            
            # Topic Modeling
            topics = topic_modeling(comments_df['text'])
            print("Identified Topics:")
            for idx, topic in enumerate(topics):
                print(f"Topic {idx+1}: {topic}")
            
            # Emoji Analysis
            comments_df['emojis'] = comments_df['text'].apply(extract_emojis)
            all_emojis = sum(comments_df['emojis'], [])
            
            # Word Frequency Analysis
            stop_words = set(stopwords.words('english'))
            all_words = []
            for text in comments_df['text']:
                tokens = word_tokenize(text.lower())
                tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
                all_words.extend(tokens)
            word_freq = Counter(all_words)
            most_common_words = word_freq.most_common(20)
            print("Most Common Words:")
            for word, freq in most_common_words:
                print(f"{word}: {freq}")
            
            # Display Results
            total_comments = len(comments_df)
            sentiment_counts = comments_df['sentiment_label'].value_counts()
            print(f"Total comments analyzed: {total_comments}")
            for sentiment in ['Positive', 'Neutral', 'Negative']:
                count = sentiment_counts.get(sentiment, 0)
                percentage = (count / total_comments) * 100
                print(f"{sentiment} comments: {count} ({percentage:.2f}%)")
            print(f"Total emojis found: {len(all_emojis)}")
            
            # Visualizations
            plot_sentiment_distribution(comments_df)
            plot_word_cloud(comments_df['text'], 'Word Cloud of All Comments')
            plot_emotion_heatmap(emotion_data)
            plot_sentiment_over_time(comments_df)
            plot_emoji_distribution(all_emojis)
            
            # Additional Analysis
            # You can add more visualizations and analyses as needed
            
        else:
            print("No comments found for the video.")
    except ValueError as e:
        print(e)

# Run the analysis
main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Extracted video ID: x3M4dyq6c2s
Fetching comments and replies...
Total comments and replies fetched: 1079
Analyzing sentiments...


100%|████████████████████████████████████████████████████████████████████████████| 1079/1079 [00:00<00:00, 1471.14it/s]
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...


Analyzing emotions...


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...


FileNotFoundError: [Errno 2] No such file or directory: 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

In [6]:
pip install google-api-python-client vaderSentiment textblob wordcloud matplotlib seaborn emoji pandas numpy nltk gensim tqdm




  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
moviepy 2.1.1 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.



Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)
Collecting scipy>=1.7.0 (from gensim)
  Downloading scipy-1.10.1-cp311-cp311-win_amd64.whl.metadata (58 kB)
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/59.0 kB ? eta -:--:--
     ------------- -------------------------- 20.5/59.0 kB ? eta -:--:--
     -------------------------- ----------- 41.0/59.0 kB 495.5 kB/s eta 0:00:01
     -------------------------- ----------- 41.0/59.0 kB 495.5 kB/s eta 0:00:01
     -------------------------- ----