### Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import torch
from lib_file import lib_path
from urllib.parse import urlparse
import googleapiclient
from langdetect import detect
from IPython import display

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, pipeline, AdamW, get_linear_schedule_with_warmup
display.clear_output()

### Model Loading

In [2]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [4]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
tokens = pipeline("sentiment-analysis")
display.clear_output()

model = BERTClassifier(bert_model_name, num_classes)
PATH = "models/BERTClassification_model.pt"
model.load_state_dict(torch.load(PATH, map_location=device))
display.clear_output()

model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [6]:
class_labels = ['NEGATIVE', 'POSITIVE']

In [7]:
api_key = "AIzaSyB1uysE-Mn5kUml8hxafw9_TVQICQPfoMI"
youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=api_key)

In [8]:
def get_video_id_from_url(url):
    parsed_url = urlparse(url)
    if parsed_url.netloc == 'www.youtube.com' or parsed_url.netloc == 'youtube.com':
        query_params = parsed_url.query
        query_params = query_params.split('&')
        for param in query_params:
            key_value = param.split('=')
            if key_value[0] == 'v':
                return key_value[1]
    elif parsed_url.netloc == 'youtu.be':
        path_segments = parsed_url.path.split('/')
        if len(path_segments) > 1:
            return path_segments[1]

    return None

In [9]:
def retrieve_all_comments(video_id):
    comments = []
    next_page_token = None

    while True:
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=100,
            order='time',
            pageToken=next_page_token
        ).execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)

        if 'nextPageToken' in response:
            next_page_token = response['nextPageToken']
        else:
            break

    return comments

In [10]:
def is_english_sentence(sentence):
    try:
        lang = detect(sentence)
        if lang == 'en':
            return True
        else:
            return False
    except:
        return False

In [11]:
def text_cleaning(text):
    text = text.lower()
    text = BeautifulSoup(text, "html.parser")
    text = text.get_text()
    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # Emojis
                            u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # Transport & map symbols
                            u"\U0001F700-\U0001F77F"  # Alphabetic presentation forms
                            u"\U0001F780-\U0001F7FF"  # Geometric shapes
                            u"\U0001F800-\U0001F8FF"  # Miscellaneous symbols"
                            u"\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
                            u"\U0001FA00-\U0001FA6F"  # Extended-A
                            u"\U0001FA70-\U0001FAFF"  # Extended-B
                            u"\U0001F004-\U0001F0CF"  # Mahjong tiles
                            u"\U0001F170-\U0001F251"  # Enclosed characters
                            u"\U00020000-\U0002F73F"  # Chinese, Japanese, and Korean characters
                            u"\U000E0000-\U000E007F"  # Tags
                            "]+", flags=re.UNICODE)
    text = re.sub(emoji_pattern, '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

---

In [12]:
video_url = input("Paste youtube's appropriate link...\n")

Paste youtube's appropriate link...
https://youtu.be/Y9i3OIMitRQ?si=obsDOYg7IRrR9aLf


In [13]:
video_id = get_video_id_from_url(video_url)
if video_id is not None:
    print(f"The video ID, {video_id}, has been detected.")
else:
    print("The video ID could not be retrieved from the URL. Please try a different URL.")

The video ID, Y9i3OIMitRQ, has been detected.


In [14]:
all_comments = retrieve_all_comments(video_id)
del(all_comments[0])
all_comments[:10]

['A World of fakeness. Soon well make ai to read on our behalf and reply amd make decisions if reasonablefor us. But the person who first sent to email didn&#39;t even read what they sent nor did the receiver.',
 'Sir i need work from home job my qualifications post graduation 😊',
 'We must not miss use the technology to <a href="http://bad.it/">bad.it</a> has to be shown good on ways to the people.',
 'As the A. I. &#39;S are going to occupy more than 2.4 billion jobs from all Human beings from all the Nations of this World within these 2 years, i.e., 2023 and 2024 Years. <br><br>As the AI is harmful for human beings!<br>They will wage a war against  human beings with the help of Anti Christ on the year 2025, as according to the Biblical Standards, which was written on the book of Revelation.<br><br>From,<br>Apostle. Rev. Dr. P. Timothy John (Professor of Eschatology &amp; Paranormal Activity Expert).',
 'It&#39;s a great sign that ai is gonna rule the world 🌎 in a matter of time...<b

In [15]:
english_comments = []
for sent in tqdm(all_comments):
    result = is_english_sentence(sent)
    if result:
        english_comments.append(sent)
    else:
        continue

100%|████████████████████████████████████████████████████████████████████████████████| 505/505 [00:10<00:00, 48.34it/s]


In [16]:
english_comments[:10]

['A World of fakeness. Soon well make ai to read on our behalf and reply amd make decisions if reasonablefor us. But the person who first sent to email didn&#39;t even read what they sent nor did the receiver.',
 'Sir i need work from home job my qualifications post graduation 😊',
 'We must not miss use the technology to <a href="http://bad.it/">bad.it</a> has to be shown good on ways to the people.',
 'As the A. I. &#39;S are going to occupy more than 2.4 billion jobs from all Human beings from all the Nations of this World within these 2 years, i.e., 2023 and 2024 Years. <br><br>As the AI is harmful for human beings!<br>They will wage a war against  human beings with the help of Anti Christ on the year 2025, as according to the Biblical Standards, which was written on the book of Revelation.<br><br>From,<br>Apostle. Rev. Dr. P. Timothy John (Professor of Eschatology &amp; Paranormal Activity Expert).',
 'It&#39;s a great sign that ai is gonna rule the world 🌎 in a matter of time...<b

In [17]:
cleaned_sentences = []
for sentence in tqdm(english_comments):
    cleaned_text = text_cleaning(sentence)
    cleaned_sentences.append(cleaned_text)

df = pd.DataFrame(data={"Text": cleaned_sentences})
df['length']=df['Text'].apply(lambda x: len(x.split()))
df = df.loc[(df['length'] <= 500) & (df['length'] >= 2)]
df = df.drop(labels='length', axis=1)
df.head(10)

100%|██████████████████████████████████████████████████████████████████████████████| 441/441 [00:00<00:00, 4793.50it/s]


Unnamed: 0,Text
0,a world of fakeness soon well make ai to read ...
1,sir i need work from home job my qualification...
2,we must not miss use the technology to badit h...
3,as the a i s are going to occupy more than 24 ...
4,its a great sign that ai is gonna rule the wor...
5,immersive is not ai it5 is taking weather goo...
6,the only reason big companies get bussiness co...
7,common average public does not know it unless ...
8,two major contributions of google has been inf...
9,tech has many uses and still not expoored full...


In [24]:
max_length = 128

results = []
for text in tqdm(df['Text'].values.tolist()):
    encoding = tokenizer(text, return_tensors='pt', 
                         max_length=max_length, 
                         padding='max_length', 
                         truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask),tokens(text)
        results.append(outputs[-1][0]['label'])

100%|████████████████████████████████████████████████████████████████████████████████| 437/437 [09:07<00:00,  1.25s/it]


In [25]:
df['sentiment'] = results
df.head(10)

Unnamed: 0,Text,sentiment
0,a world of fakeness soon well make ai to read ...,NEGATIVE
1,sir i need work from home job my qualification...,NEGATIVE
2,we must not miss use the technology to badit h...,POSITIVE
3,as the a i s are going to occupy more than 24 ...,NEGATIVE
4,its a great sign that ai is gonna rule the wor...,POSITIVE
5,immersive is not ai it5 is taking weather goo...,POSITIVE
6,the only reason big companies get bussiness co...,POSITIVE
7,common average public does not know it unless ...,NEGATIVE
8,two major contributions of google has been inf...,NEGATIVE
9,tech has many uses and still not expoored full...,NEGATIVE


In [26]:
print(f"Total useful english comments: {df.shape[0]}")
print(f"Posititive comments: {df.loc[df['sentiment']=='POSITIVE'].shape[0]}")
print(f"Negative comments: {df.loc[df['sentiment']=='NEGATIVE'].shape[0]}")

Total useful english comments: 437
Posititive comments: 133
Negative comments: 304


---