## Import and Initialize the model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch
import pandas as pd



roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(roberta)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(roberta).to(device)



  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Analyze Text and output results as a Dataframe

In [11]:
from tqdm import tqdm 

# Initialize a dictionary to store overall sentiment counts
def AnalyzeSentiment(comments):
    results = []
    overall_sentiments = {'negative': int(0), 'neutral': int(
        0), 'positive': int(0)}  # Initializing with integer values

    for i, comment in tqdm(enumerate(comments), total=len(comments), desc="Progress..." ):
        
        tokens = tokenizer.tokenize(comment['text'])
        if len(tokens) > 514: 
            continue
        encoded_text = tokenizer(
            comment['text'], return_tensors='pt').to(device)
        
        # print(f"Encoded text length: {len(encoded_text[1])}")
        output = model(**encoded_text)
        scores = output.logits[0].cpu().detach().numpy()
        scores = softmax(scores)
        max_index = scores.argmax()
        label = ['negative', 'neutral', 'positive'][max_index]

        # Update overall sentiment counts
        overall_sentiments[label] += 1

        result = {
            'text': comment['text'],  # Adjust to 'text' key
            'published_at': comment['published_at'],
            'like_count': int(comment['like_count']),  # Convert to int
            'negative': float(scores[0]),  # Convert to float
            'neutral': float(scores[1]),  # Convert to float
            'positive': float(scores[2]),  # Convert to float
            'overall_sentiment': label
        }
        results.append(result)

    # Append modified overall sentiment counts to results
    results.append(overall_sentiments)
    return results


## Using the Youtube Data API to fetch comments from a Video

In [18]:
import os
import googleapiclient.discovery

import googleapiclient.errors

from dotenv import load_dotenv

load_dotenv()

api_service_name = "youtube"
api_version = "v3"
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")


youtube = googleapiclient.discovery.build(

    api_service_name, api_version, developerKey=YOUTUBE_API_KEY

)


# fetch comments from a youtube video
def fetchAllComments(video_id, pageToken=None):
    items = []
    # maxCount = 5000

    while True:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            pageToken=pageToken
        )

        response = request.execute()
        items.extend(response['items'])
        # if len(items) >= maxCount:
        #     break
        if 'nextPageToken' in response:
            pageToken = response['nextPageToken']
        else:
            break

           # output as df
    comments = []

    for item in items:
        comment = item['snippet']['topLevelComment']['snippet']
        comment_info = {
            # comment['authorDisplayName'],
            'published_at': comment['publishedAt'],
            # comment['updatedAt'],
            'like_count': comment['likeCount'],
            'text': comment['textDisplay']
        }
        comments.append(comment_info)

    return comments


#full link https://www.youtube.com/watch?v=4_UDm-nCjeA

video_id = "Oy8zSYKkczI"

items = fetchAllComments(video_id)

## Output as DataFrame

In [9]:
comments = []

for item in items:
    comment = item['snippet']['topLevelComment']['snippet']
    comments.append([
        comment['authorDisplayName'],
        comment['publishedAt'],
        comment['updatedAt'],
        comment['likeCount'],
        comment['textDisplay']
    ])

df = pd.DataFrame(comments, columns=['author', 'published_at', 'updated_at', 'like_count', 'text'])

display(df)


Unnamed: 0,author,published_at,updated_at,like_count,text
0,@Threelittlepeople1,2024-05-30T19:32:56Z,2024-05-30T19:32:56Z,0,Handy boy for game boy by STD is wild.
1,@mohwe1007,2024-05-12T01:50:55Z,2024-05-12T01:50:55Z,1,The Game Boy<br><br>The birthplace of Pokémon
2,@DashIceland,2024-05-10T06:57:00Z,2024-05-10T06:57:00Z,0,Great video
3,@Youtube-Censorship-Police,2024-05-08T07:37:33Z,2024-05-08T07:37:33Z,1,everybody who doesn&#39;t instantly recognize ...
4,@Isomnophilia,2024-05-06T21:58:15Z,2024-05-06T21:58:15Z,0,A sealed Gameboy... Was it refurb or a truly s...
...,...,...,...,...,...
12562,@AliAkbar-zz4uk,2019-04-19T16:00:29Z,2019-04-19T16:00:29Z,0,Great
12563,@briankerosene22,2019-04-19T16:00:28Z,2019-04-19T16:00:28Z,0,Nice
12564,@nicholasfantini8273,2019-04-19T16:00:24Z,2019-04-19T16:00:24Z,0,First
12565,@suraj9519,2019-04-19T16:00:23Z,2019-04-19T16:00:23Z,1,Hey


## Analyze youtube comments

In [19]:
# print(df['text'])
youtubeAnalysisResults = AnalyzeSentiment(comments=items)


Progress...: 100%|██████████| 12567/12567 [05:14<00:00, 39.99it/s]


In [21]:
display(pd.DataFrame(youtubeAnalysisResults))

Unnamed: 0,text,published_at,like_count,negative,neutral,positive,overall_sentiment
0,Handy boy for game boy by STD is wild.,2024-05-30T19:32:56Z,0.0,0.334921,0.574093,0.090986,neutral
1,The Game Boy<br><br>The birthplace of Pokémon,2024-05-12T01:50:55Z,1.0,0.007495,0.713094,0.279411,neutral
2,Great video,2024-05-10T06:57:00Z,0.0,0.009886,0.047417,0.942697,positive
3,everybody who doesn&#39;t instantly recognize ...,2024-05-08T07:37:33Z,1.0,0.883647,0.105454,0.010900,negative
4,A sealed Gameboy... Was it refurb or a truly s...,2024-05-06T21:58:15Z,0.0,0.046080,0.892708,0.061212,neutral
...,...,...,...,...,...,...,...
12552,Nice,2019-04-19T16:00:28Z,0.0,0.040874,0.185068,0.774059,positive
12553,First,2019-04-19T16:00:24Z,0.0,0.089204,0.480304,0.430492,neutral
12554,Hey,2019-04-19T16:00:23Z,1.0,0.063890,0.488614,0.447495,neutral
12555,hi,2019-04-19T16:00:22Z,3.0,0.110424,0.472392,0.417184,neutral


In [24]:
print(youtubeAnalysisResults[-1])

{'negative': 2113, 'neutral': 4184, 'positive': 6259}
