In [20]:
import os
import re
import googleapiclient.discovery
from googleapiclient.errors import HttpError
import pandas as pd

# Step 1: Fetch YouTube comments via the YouTube Data API
def get_youtube_comments(video_id, api_key, max_comments=500):
    """Fetch up to ``max_comments`` top-level comments for a YouTube video.

    Pages through the YouTube Data API v3 ``commentThreads.list`` endpoint
    and records the top-level comment of every returned thread.

    Args:
        video_id: ID of the video whose comment threads are listed.
        api_key: API key handed to the client as ``developerKey``.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of dicts with the keys ``"CommentId"``, ``"VideoId"``,
        ``"Text"`` and ``"Likes"``, at most ``max_comments`` long. If the
        API raises ``HttpError`` the error is printed and whatever has been
        collected so far is returned (possibly an empty list).
    """
    # NOTE(review): this flag only matters for OAuthlib flows, while the
    # client below authenticates with an API key, so it is probably not
    # needed. Kept so the function's process-wide side effect is unchanged.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    # Create a YouTube API client
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    comments = []
    next_page_token = None

    while len(comments) < max_comments:
        # The API accepts at most 100 results per page; never ask for more
        # than is still needed to reach max_comments.
        remaining = max_comments - len(comments)
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=min(100, remaining),
                pageToken=next_page_token,
                # The default format is HTML, which leaves tags and entities
                # (<br>, &#39;, ...) in textDisplay; request plain text so
                # the stored comment is what the user actually wrote.
                textFormat="plainText",
            ).execute()
        except HttpError as e:
            # Best effort: report the failure and keep what was fetched.
            print(f"An error occurred: {e}")
            break

        for item in response.get('items', []):
            snippet = item['snippet']['topLevelComment']['snippet']
            comments.append({
                "CommentId": item['id'],
                "VideoId": video_id,
                "Text": snippet['textDisplay'],
                "Likes": snippet['likeCount'],
            })

        # No token means the last page has been reached.
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return comments[:max_comments]

# Step 2: Preprocess the Comments
def preprocess_comments(comments):
    """Normalise the ``"Text"`` field of every comment in place.

    For each record, anything starting with ``http`` up to the next
    whitespace is dropped, every character that is not an ASCII letter,
    a digit or whitespace is removed, and the result is lower-cased.

    Args:
        comments: List of dicts that each carry a ``"Text"`` string.

    Returns:
        The same list object, with its dicts modified in place.
    """
    for entry in comments:
        # Strip links first so their punctuation-free remains do not
        # survive the character filter below.
        without_urls = re.sub(r"http\S+", "", entry["Text"])
        letters_digits_spaces = re.sub(r"[^A-Za-z0-9\s]+", "", without_urls)
        entry["Text"] = letters_digits_spaces.lower()
    return comments

# Step 3: Save comments to CSV
def save_comments_to_csv(comments, filename):
    """Write comments to a CSV file, ordered by like count, highest first.

    Args:
        comments: Records accepted by ``pandas.DataFrame`` (for example a
            list of dicts); they must provide a ``"Likes"`` column.
        filename: Destination path passed to ``DataFrame.to_csv``.

    If there are no rows to write, a notice is printed and no file is
    created. Without this guard the frame would have no ``"Likes"`` column
    and sorting would raise ``KeyError``.
    """
    df = pd.DataFrame(comments)

    if df.empty:
        print(f"No comments to save; {filename} was not written")
        return

    df = df.sort_values(by='Likes', ascending=False)

    # index=False keeps the DataFrame's row labels out of the file.
    df.to_csv(filename, index=False)
    print(f"Comments saved to {filename}")

# Script entry point: fetch comments for one video, clean their text and
# write them to a CSV file.
if __name__ == "__main__":
    # API key and video ID
    API_KEY = "YOur api key here."  # placeholder; substitute a real API key before running
    VIDEO_ID = "dQw4w9WgXcQ"  # video whose comments are fetched
    FILENAME = "C:\\Users\\Home\\Desktop\\comments\\sentiment_dataset.csv"  # output CSV (absolute Windows path)

    # Get comments from YouTube
    comments = get_youtube_comments(VIDEO_ID, API_KEY, max_comments=500)
    print(f"Fetched {len(comments)} comments")

    # Preprocess the comments
    # preprocess_comments edits the dicts in place and returns the same list,
    # so `comments` and `preprocessed_comments` refer to the same object.
    preprocessed_comments = preprocess_comments(comments)

    # Write the cleaned comments to FILENAME, sorted by like count.
    save_comments_to_csv(preprocessed_comments, FILENAME)


Fetched 500 comments
Comments saved to youtube_comments_sorted.csv
