# Individual Project
YouTube comment sentiment analysis

Questions:
1. In general, do YouTube comments lean towards a more positive or negative sentiment?
2. Which genre of YouTube videos tends to attract the most positive feedback in the comments section?
3. What is the sentiment towards controversial topics on YouTube?

Approach:
1. Gather data using the YouTube API and write it out to a CSV or other dataset file (the columns are going to be index, title, genre, controversial (flag), and the comment itself)
2. Create an AI sentiment analysis model
3. Train model
4. Use model to answer questions

Other:
1. Only top-level YouTube comments will be collected; replies to comments are not included
2. Comments will be limited to at most 50,000 per video

In [2]:
from googleapiclient.discovery import build
import csv
import pandas as pd
import langid

In [2]:
# YouTube Data API key, passed to build() as developerKey when fetching
# comments. Left blank here -- supply a key before running the fetch cells.
API_KEY = ''

In [3]:
#gathering comments
def get_youtube_comments(video_id, max_results=100, max_comments=50000):
    """Collect the top-level comments of a YouTube video.

    Pages through the ``commentThreads.list`` endpoint and gathers the
    ``textDisplay`` of each thread's top-level comment; replies are not
    fetched.  Collection is best-effort: if a request fails, the error is
    printed and whatever was gathered so far is returned.

    Args:
        video_id: ID of the video whose comments are requested.
        max_results: Page size per request.  Clamped to 1-100, the range
            the API accepts for ``maxResults``.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of comment strings (possibly empty).
    """
    video_comments = []
    if max_comments <= 0:
        return video_comments

    # An out-of-range page size makes the API reject the request, which
    # would otherwise surface only as an empty result.
    page_size = max(1, min(max_results, 100))
    youtube = build("youtube", "v3", developerKey=API_KEY)
    request_args = {"part": "snippet", "videoId": video_id, "maxResults": page_size}

    while True:
        try:
            video_response = youtube.commentThreads().list(**request_args).execute()
        except Exception as e:
            # Keep what was collected (e.g. comments disabled, quota
            # exhausted), but do not hide the reason.
            print(f"Error fetching comments for video {video_id}: {e}")
            break

        # NOTE(review): textDisplay is HTML-formatted unless the request
        # sets textFormat="plainText" -- worth considering before the text
        # is fed to language detection / sentiment analysis.
        for item in video_response.get('items', []):
            video_comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
            if len(video_comments) >= max_comments:
                return video_comments

        # No token means this was the last page.
        next_page_token = video_response.get('nextPageToken')
        if not next_page_token:
            break
        request_args["pageToken"] = next_page_token

    return video_comments

In [5]:
def to_CSV(name, comments):
    """Append comments to a CSV file, writing a header if the file is empty.

    Each comment becomes an ``Index, Comments`` row, where the index is the
    comment's position within *comments* (it restarts at 0 on every call).
    Failures are printed rather than raised.

    Args:
        name: Path of the CSV file to append to (created if missing).
        comments: Iterable of comment strings.
    """
    try:
        # Write UTF-8 explicitly: comments routinely contain emoji and
        # non-Latin text that the platform default encoding may not be able
        # to encode, and pandas.read_csv (used later in this file) decodes
        # UTF-8 by default.
        with open(name, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # A new or empty file still needs its header row.
            if file.tell() == 0:
                writer.writerow(['Index', 'Comments'])

            writer.writerows(enumerate(comments))

        print(f"Data successfully written to {name}")

    except Exception as e:
        print(f"Error writing to CSV file: {e}")

In [3]:
def clean_csv(input_file, output_file):
    """Write a copy of a comments CSV that keeps only English comments.

    Reads *input_file* (which must have ``Index`` and ``Comments`` columns),
    keeps the rows whose comment is classified as English with a probability
    above 0.5 and whose ``Index`` is not null, and writes the ``Index`` and
    ``Comments`` columns of those rows to *output_file*.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Path of the cleaned CSV to write.
    """
    df = pd.read_csv(input_file)

    # langid.classify() reports an unnormalized log-probability score by
    # default, so comparing it with 0.5 is not a confidence threshold.  An
    # identifier built with norm_probs=True returns a probability in [0, 1].
    identifier = langid.langid.LanguageIdentifier.from_modelstring(
        langid.langid.model, norm_probs=True
    )

    def is_english(comment):
        # Empty cells come back from read_csv as NaN (a float), not text.
        if not isinstance(comment, str):
            return False
        try:
            lang, confidence = identifier.classify(comment)
        except Exception:
            # Best-effort filter: an unclassifiable comment is simply dropped.
            return False
        return lang == 'en' and confidence > 0.5

    # astype(bool) keeps the mask boolean even when the frame has no rows.
    english_mask = df['Comments'].apply(is_english).astype(bool)
    df = df[english_mask & df['Index'].notnull()]

    # Save the English comments only.
    df[['Index', 'Comments']].to_csv(output_file, index=False)

Get top 5 videos by category

In [8]:
# YouTube video IDs to pull comments from, grouped by category.
# NOTE(review): how these videos were chosen is not recorded here.
# top_gaming holds only four IDs, unlike the five in each top_5_* list.
top_gaming = ['hI1MMVt7xEo', 'urHuO7Zbhhw', 'fPPGz5Qxw8A', 'BJPc49z57bU']
top_5_vlogs = ['HhM0BYCHL00', 'briN_W5yzYY','zU95-wX0xJo', '84WIaK3bl_s', 'WxfZkMm3wcg']
top_5_music = ['JGwWNGJdvx8', 'RgKAFK5djSk', 'OPf0YbXqDm0', '9bZkp7q19f0', '09R8_2nJtjg']
top_5_beauty = ['ex33wtqnNz8', 'OO3NO29L50U', 'mGs4CjeJiJQ', 'VD47yv2NfMw', 'Fz-DTp2iewQ']
top_5_reaction = ['-XKdCBZEWeU', 'KFChuq6piZ8', 'dYh6R4Jhxoo', 'KyNJD8Ewcjk', 'm5m8NoPRkRw']
# No controversial-topic videos have been listed yet.
top_controversial = []

In [None]:
# Fetch the comments of each gaming video and append them to one raw CSV.
# NOTE: to_CSV appends, so re-running this cell adds the same rows again.
for videoid in top_gaming:
    comments = get_youtube_comments(videoid)
    to_CSV('data/RawCSV/gaming_comments.csv',comments)

In [None]:
# Fetch the comments of each vlog video and append them to one raw CSV.
# NOTE: to_CSV appends, so re-running this cell adds the same rows again.
for videoid in top_5_vlogs:
    comments = get_youtube_comments(videoid)
    to_CSV('data/RawCSV/vlog_comments.csv',comments)

In [None]:
# Fetch the comments of each music video and append them to one raw CSV.
# NOTE: to_CSV appends, so re-running this cell adds the same rows again.
for videoid in top_5_music:
    comments = get_youtube_comments(videoid)
    to_CSV('data/RawCSV/music_comments.csv',comments)

In [None]:
# Fetch the comments of each beauty video and append them to one raw CSV.
# NOTE: to_CSV appends, so re-running this cell adds the same rows again.
for videoid in top_5_beauty:
    comments = get_youtube_comments(videoid)
    to_CSV('data/RawCSV/beauty_comments.csv',comments)

In [None]:
# Fetch the comments of each reaction video and append them to one raw CSV.
# NOTE: to_CSV appends, so re-running this cell adds the same rows again.
for videoid in top_5_reaction:
    comments = get_youtube_comments(videoid)
    to_CSV('data/RawCSV/reaction_comments.csv',comments)

In [4]:
# Filter each category's raw CSV down to its English comments and write the
# result to the matching file under data/CleanCSV.
clean_csv('data/RawCSV/gaming_comments.csv', 'data/CleanCSV/gaming_comments_clean.csv')
clean_csv('data/RawCSV/vlog_comments.csv', 'data/CleanCSV/vlog_comments_clean.csv')
clean_csv('data/RawCSV/music_comments.csv', 'data/CleanCSV/music_comments_clean.csv')
clean_csv('data/RawCSV/beauty_comments.csv', 'data/CleanCSV/beauty_comments_clean.csv')
clean_csv('data/RawCSV/reaction_comments.csv', 'data/CleanCSV/reaction_comments_clean.csv')