[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farheenfab/AppliedText_CW/blob/main/CW1-generate_dataset.ipynb)


# F20AA Coursework 1


In [1]:
import googleapiclient.discovery
import googleapiclient.errors
import numpy as np
import pandas as pd
import glob
import nltk 
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from langdetect import detect
import shutil
import random
from textblob import TextBlob
from tabulate import tabulate
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.datasets import load_files

nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/fayazbadubhai/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fayazbadubhai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1) Data Collection:


In [2]:
# Define the api service name, version and developer key for the api call.
api_service_name = "youtube"
api_version = "v3"
# AIzaSyC8VqY2cYxX7jOouIF076rpM8lvT1ZBJu4
# AIzaSyAWj_uzrhZL18X32S_P79pT1wnSYGpuA4k
DEVELOPER_KEY = "AIzaSyC8VqY2cYxX7jOouIF076rpM8lvT1ZBJu4"

Reference

https://developers.google.com/youtube/v3/docs/search/list#parameters

https://developers.google.com/youtube/v3/docs/comments/list


We have created a class called api_handler which contains functions such as `get_video_details()`, `get_videos()`, `get_video_df()`, `get_comments()`, `get_comment_replies()`, `get_comments_df()`, `create_video_df_from_search()`, `create_video_df()`. These functions help us by either manually retrieving the videos and comments or by automatically curating the videos and comments using the product given to the function.


In [3]:
class api_handler:
    def __init__(self, api_service_name, api_version, developer_key):
        self.client = googleapiclient.discovery.build(api_service_name,
                                                    api_version,
                                                    developerKey=developer_key)
        
    # Search for videos details given id
    def get_video_details(self, videoId, part="snippet"):
        request = self.client.videos().list(
            part=part,
            id=videoId
        )
        response = request.execute()

        if 'items' in response:
            video_details = response['items'][0]
            snippet=video_details['snippet']
            snippet['videoId']=videoId
            snippet['id']=videoId
            snippet['publishTime']=video_details.get('snippet', {}).get('publishedAt', {})
            snippet['thumbnails']=video_details.get('snippet', {}).get('thumbnails', {}).get('default', {}).get('url', '')
            return snippet

        return None

    # Search for videos given query
    def get_videos(self,query,maxResults=5,part="snippet"):
        request = self.client.search().list(
            part=part,
            maxResults=maxResults,
            # higher view count is likely to be more relevent 
            order="viewCount",
            q=query,  
            # american region videos 
            regionCode="US",
            # english videos
            relevanceLanguage="en",
            type="video"
        )
        response = request.execute()
        return response
    
    # Format Response from get_videos to dataframe
    def get_video_df(response):
        items=[]
        for item in response['items']:
            snippet=item.get('snippet', {})
            items+=[{
                'title':snippet.get('title', ''),
                'videoId':item.get('id', {}).get('videoId', ''),
                'channelTitle':snippet.get('channelTitle', ''),
                'publishTime':snippet.get('publishTime', ''),
                'description':snippet.get('description', ''),
                'thumbnails':snippet.get('thumbnails', {}).get('default', {}).get('url', '')
                }]
        df=pd.DataFrame(items)
        return df
    
    # Get comments from video
    def get_comments(self,videoId,part="snippet",maxResults=100,maxResultsDepth=100):
        all_comments = []
        f = 0
        nextPageToken = None
        while maxResults > 0:
            request = self.client.commentThreads().list(
                part=part,
                videoId=videoId,
                maxResults=min(maxResults, 100),
                order='relevance',
                moderationStatus='published',
                textFormat='plainText',
                pageToken=nextPageToken
            )
            response = request.execute()
            nextPageToken = response.get('nextPageToken')
            if 'items' in response:
                all_comments+=[response]
                for item in response['items']:
                    # extract the comment ID to get replies
                    comment_id = item.get('snippet',{}).get('topLevelComment',{}).get('id','')
                    if item.get('snippet',{}).get('totalReplyCount',0)>2:
                        if f == 0:
                            print('getting replies:',item.get('snippet',{}).get('totalReplyCount',0))
                            f = 1
                        replies = self.get_comment_replies(comment_id, maxResults=maxResultsDepth)
                        all_comments += replies

            maxResults -= min(maxResults, 100)
            if nextPageToken is None:
                break;    
        return all_comments
    
    # Get replies from comment 
    def get_comment_replies(self, commentId, part="snippet", maxResults=100):
        all_comments = []
        nextPageToken = None
        while maxResults > 0 and (nextPageToken != None or len(all_comments)==0):

            request = self.client.comments().list(
                part=part,
                parentId=commentId,
                maxResults=min(maxResults, 100),
                textFormat='plainText',
                pageToken=nextPageToken
            )

            response = request.execute()
            nextPageToken = response.get('nextPageToken')

            if 'items' in response and len(response['items'])>0:
                for item in response['items']:
                    modified_response = {
                        'items': [
                            {
                                'id':item.get('id'),
                                'snippet': {
                                    'topLevelComment': {
                                        'snippet': item.get('snippet','')
                                    }
                                }
                            }
                        ]
                    }
                    all_comments += [modified_response]
            maxResults -= min(maxResults, 100)
            if nextPageToken is None:
                break;    
        return all_comments

    # Format response from get_comments to dataframe
    def get_comments_df(response, video,product):
        comments = []
        for pages in response:
            for item in pages['items']:
                comment = item.get('snippet', {}).get('topLevelComment', {}).get('snippet', {})
                comments.append([
                        product,
                        video.get('title', ''),
                        video.get('videoId', ''),
                        video.get('channelTitle', ''),
                        video.get('publishTime', ''),
                        video.get('description', ''),
                        video.get('thumbnails', ''),
                        item.get('id', ''),  
                        comment.get('parentId', ''),  
                        comment.get('authorDisplayName', '')[1:],  
                        comment.get('publishedAt', ''),
                        comment.get('updatedAt', ''),
                        comment.get('likeCount', ''),
                        comment.get('textDisplay', '')
                    ])

        df = pd.DataFrame(comments,
            columns=['product', 'v_title', 'v_videoId',
                    'v_channelTitle', 'v_publishTime',
                    'v_description', 'v_thumbnail',
                    'c_id','c_parentId',
                    'c_author', 'c_published_at',
                    'c_updated_at', 'c_like_count',
                    'c_text'])
        
        return df
    
    # Search for videos related to products iteratively
    # Collect comments from each video and place it into an array
    def create_video_df_from_search(self, products,
                                    number_of_videos_per_product=5,
                                    number_of_comments_per_video=100
                                    ,number_of_replies_per_comment=100):
        multiple_video_comments = pd.DataFrame()
        for product in products:
            # get 25 first videos with the highest viewer counts 
            response = self.get_videos(query=product, maxResults=number_of_videos_per_product)
            # Convert results to df
            videos_df = api_handler.get_video_df(response)
            # For each video get a maximum of 100 comments
            # and place comments into an array
            for _, video in videos_df.iterrows():
                try:
                    response = self.get_comments(video['videoId'], maxResults=number_of_comments_per_video,maxResultsDepth=number_of_replies_per_comment)
                    comments_df = api_handler.get_comments_df(response, video, product)
                except:
                    # Function fails as the API returns 403 if the channel has comments disabled
                    # place an empty entry instead it can be deleted later
                    comments_df = pd.DataFrame(np.zeros((1, 14)),
                                                columns=['product', 'v_title', 'v_videoId',
                                                        'v_channelTitle', 'v_publishTime',
                                                        'v_description', 'v_thumbnail',
                                                        'c_id','c_parentId',
                                                        'c_author', 'c_published_at',
                                                        'c_updated_at', 'c_like_count',
                                                        'c_text'])
                    print('Unable to retrieve comments:', video.get('title', ''))
                multiple_video_comments = pd.concat([multiple_video_comments, comments_df], ignore_index=True)
        return multiple_video_comments
        
    # alternative method by explicitely specifying videos
    def create_video_df(self,products,videos,number_of_comments_per_video=100,number_of_replies_per_comment=100):
        count=0
        multiple_video_comments = pd.DataFrame()
        for product in products:
            for video in videos[count]:
                response = self.get_comments(video,maxResults=number_of_comments_per_video,maxResultsDepth=number_of_replies_per_comment) 
                video=self.get_video_details(video)
                comments_df = api_handler.get_comments_df(response, video, product)
                multiple_video_comments = pd.concat([multiple_video_comments, comments_df], ignore_index=True)
            count+=1
        return multiple_video_comments

We have chosen the Korean Drama called Squid Game to perform the sentiment analysis on. We specify the product in the products list, create a `api_handler` class object, use the `create_video_df_from_search()` function to automatically curate comments using the YouTube api call, and get a pandas Dataframe in return containing details about the videos and the comments.


In [4]:
products=["Squid Game Korean Drama (2021)"]

youtube=api_handler(api_service_name, api_version, DEVELOPER_KEY)

In [5]:
multiple_video_comments=youtube.create_video_df_from_search(products,number_of_videos_per_product=20,number_of_comments_per_video=1000,number_of_replies_per_comment=100)
multiple_video_comments

getting replies: 749
getting replies: 521
getting replies: 16
getting replies: 62
getting replies: 64
getting replies: 129
getting replies: 14
getting replies: 504
getting replies: 101
getting replies: 350
getting replies: 16
getting replies: 5
getting replies: 3
getting replies: 318
getting replies: 25
getting replies: 230
getting replies: 390
getting replies: 16
getting replies: 154
getting replies: 461


Unnamed: 0,product,v_title,v_videoId,v_channelTitle,v_publishTime,v_description,v_thumbnail,c_id,c_parentId,c_author,c_published_at,c_updated_at,c_like_count,c_text
0,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgzH8vliQSJKHQMGZjx4AaABAg,,MrBeast,2021-11-24T21:02:45Z,2021-11-24T21:02:45Z,1008503,"Like I said in the video, subscribe if you hav..."
1,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgwDhFNTCbfck5apuUJ4AaABAg,,DoodleChaos,2021-11-24T22:07:54Z,2021-11-24T22:07:54Z,514583,"Huge props to the set designers, everything wa..."
2,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgzVlS_nKI4aXISU_ep4AaABAg,,mukul_editz,2023-12-30T01:55:59Z,2023-12-30T01:55:59Z,424,Your videos are so interesting ❤
3,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgxykcUWbPcLhlL-Gy14AaABAg,,SpamR1_2013,2023-11-27T00:57:21Z,2023-11-27T00:57:21Z,1705,that guy who sacrificed himself on purpose for...
4,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,Ugxu5B8dQ9-mZpfW-UV4AaABAg,,user-cs9zv3gh1k,2024-01-30T20:17:02Z,2024-01-30T20:17:02Z,265,This version of the game is pretty much what t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42495,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TCfbJV8URO,Ugyg_jtreiARzVQq6TB4AaABAg,allure_ism,2021-10-07T18:11:30Z,2021-10-07T18:11:30Z,6,Nobody would pick paper or rock (I mean it dep...
42496,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TFObZDOfQq,Ugyg_jtreiARzVQq6TB4AaABAg,gabrieleDATass,2021-10-08T19:31:58Z,2021-10-08T19:31:58Z,1,How do you kill someone with paper lmao
42497,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TFPKHqbzEu,Ugyg_jtreiARzVQq6TB4AaABAg,snowrider9018,2021-10-08T19:38:13Z,2021-10-08T19:38:13Z,2,"@@gabrieleDATass sand-paper them to death, idk..."
42498,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TFkpMvsZvz,Ugyg_jtreiARzVQq6TB4AaABAg,Son9,2021-10-08T22:54:50Z,2021-10-08T22:54:50Z,0,@@gabrieleDATass Shove it down their throat/ev...


## 2) Data Analysis, Selection and Labeling:


Taken from:

https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python


In [6]:
# Function to remove emojis : As emojis do not provide any helpful information they should be removed from the text strings.
def remove_emojis(data):
    if isinstance(data, str):
        # Remove html tags
        data = BeautifulSoup(data, "html.parser").get_text()
        # Remove emote, etc
        emoj = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642" 
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
                        "]+", re.UNICODE)
        # english_words = re.compile(r'\b[a-zA-Z]+\b')

        return re.sub(emoj, '', data)
    return 

In [7]:
# Drop any row containing NA values.
multiple_video_comments.dropna(subset=['c_text'],inplace=True)

In [8]:
# Remove emojis from the text to be analyzed
multiple_video_comments['c_text']=multiple_video_comments['c_text'].apply(remove_emojis)

df_length_before = len(multiple_video_comments)
print("DataFrame Length Before:", df_length_before)

# Drop duplicates
multiple_video_comments.drop_duplicates(inplace=True)
multiple_video_comments.dropna(subset=['c_text'],inplace=True)
# Drop rows with empty or text length <= 2 comments
multiple_video_comments = multiple_video_comments[multiple_video_comments['c_text'].apply(lambda x: len(x) > 2)]

df_length_after = len(multiple_video_comments)
print("DataFrame Length After:", df_length_after)

multiple_video_comments

  data = BeautifulSoup(data, "html.parser").get_text()
  data = BeautifulSoup(data, "html.parser").get_text()


DataFrame Length Before: 42500
DataFrame Length After: 38469


Unnamed: 0,product,v_title,v_videoId,v_channelTitle,v_publishTime,v_description,v_thumbnail,c_id,c_parentId,c_author,c_published_at,c_updated_at,c_like_count,c_text
0,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgzH8vliQSJKHQMGZjx4AaABAg,,MrBeast,2021-11-24T21:02:45Z,2021-11-24T21:02:45Z,1008503,"Like I said in the video, subscribe if you hav..."
1,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgwDhFNTCbfck5apuUJ4AaABAg,,DoodleChaos,2021-11-24T22:07:54Z,2021-11-24T22:07:54Z,514583,"Huge props to the set designers, everything wa..."
2,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgzVlS_nKI4aXISU_ep4AaABAg,,mukul_editz,2023-12-30T01:55:59Z,2023-12-30T01:55:59Z,424,Your videos are so interesting
3,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,UgxykcUWbPcLhlL-Gy14AaABAg,,SpamR1_2013,2023-11-27T00:57:21Z,2023-11-27T00:57:21Z,1705,that guy who sacrificed himself on purpose for...
4,Squid Game Korean Drama (2021),"$456,000 Squid Game In Real Life!",0e3GPea1Tyg,MrBeast,2021-11-24T21:00:01Z,MAKE SURE YOU WATCH UNTIL GLASS BRIDGE IT'S IN...,https://i.ytimg.com/vi/0e3GPea1Tyg/default.jpg,Ugxu5B8dQ9-mZpfW-UV4AaABAg,,user-cs9zv3gh1k,2024-01-30T20:17:02Z,2024-01-30T20:17:02Z,265,This version of the game is pretty much what t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42495,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TCfbJV8URO,Ugyg_jtreiARzVQq6TB4AaABAg,allure_ism,2021-10-07T18:11:30Z,2021-10-07T18:11:30Z,6,Nobody would pick paper or rock (I mean it dep...
42496,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TFObZDOfQq,Ugyg_jtreiARzVQq6TB4AaABAg,gabrieleDATass,2021-10-08T19:31:58Z,2021-10-08T19:31:58Z,1,How do you kill someone with paper lmao
42497,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TFPKHqbzEu,Ugyg_jtreiARzVQq6TB4AaABAg,snowrider9018,2021-10-08T19:38:13Z,2021-10-08T19:38:13Z,2,"@@gabrieleDATass sand-paper them to death, idk..."
42498,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugyg_jtreiARzVQq6TB4AaABAg.9T8TlcU-mJu9TFkpMvsZvz,Ugyg_jtreiARzVQq6TB4AaABAg,Son9,2021-10-08T22:54:50Z,2021-10-08T22:54:50Z,0,@@gabrieleDATass Shove it down their throat/ev...


Reference

https://stackoverflow.com/questions/40375366/pandas-to-csv-checking-for-overwrite


In [9]:
class preprocessing:
    def __init__(self):
        # Define keywords related to the TV show
        self.tv_show_keywords = ['Squid Game', 'Gi-hun', 'Sang-woo', 'Player', 'Red light, green light', 'Honeycomb',
                            'Tug of war', 'Marbles', 'Front man', 'VIPs', 'Doll', 'Coffin', 'Square', 'Triangle', 
                            'Circle', 'Death game', 'death', 'Survival game', 'Money', 'prize', 'Il-nam', 'Hwang Jun-ho'
                            'director', 'Cho Sang-woo', 'Masked man', 'Childhood', 'game', 'Pink soldier', 'Betrayal',
                            'Seong Gi-hun', 'Survival', 'Games', 'Competition', 'Squid', 'Masks', 'ali']
        # Setting threshold value for validating the relevance of the comment
        self.threshold = 1

    # Tokenize text and remove stop words
    def preprocess_text(self, text):
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return filtered_tokens

    # Matching function to check relevance of the comments
    def match_keywords(self, tokens):
        return [token for token in tokens if token in self.tv_show_keywords]

    # Scoring function to calculate how many tokens matched
    def calculate_score(self, tokens):
        return len(tokens)

    # Validate function to validate the relevance based on threshold
    def validate_relevance(self, score):
        return score >= self.threshold

    def filter_comments(self, df):
        c = 0
        comments = []
        irrelevant_keywords = ['HYVE', 'crypto', 'promotion', 'ad', 'spam', 'advertisement', 'spoiler', 'leak', 'promo', 
                               'off-topic', 'clickbait', 'self-promotion', '0:', '1:', '2:', '3:', '4:', '5:', '6:', 
                               '7:', '8:', '9:', '10:', '11:', '12:', '13:', '14:', '15:', '\r', '\n', '\t', '@', 'subscribe', 'like', 'comment']
        for index, row in df.iterrows():
            try:
                if detect(row['c_text']) == 'en' and not any(keyword in row['c_text'] for keyword in irrelevant_keywords):
                    comments.append(row)
                    c += 1
            except Exception as e:  # Catch any exception
                pass
        print("Number of Filtered Comments: ", c)
        new_df = pd.DataFrame(comments, 
                    columns=['product', 'v_title', 'v_videoId',
                        'v_channelTitle', 'v_publishTime',
                        'v_description', 'v_thumbnail',
                        'c_id','c_parentId',
                        'c_author', 'c_published_at',
                        'c_updated_at', 'c_like_count',
                        'c_text'])  # Create a new DataFrame from the list of rows
        new_df = new_df.sort_values(by = ['c_like_count'], ascending = False)
        new_df.drop_duplicates(inplace=True)
        new_df = new_df[:4000]
        return new_df

    def preprocess(self, df):
        c = 0
        comments = []
        for index, row in df.iterrows():
            processed_text = self.preprocess_text(row['c_text'])
            matched_keywords = self.match_keywords(processed_text)
            score = self.calculate_score(matched_keywords)
            is_relevant = self.validate_relevance(score)
            if is_relevant == 1:
                comments.append(row)
                c += 1

        new_df = pd.DataFrame(comments, 
                    columns=['product', 'v_title', 'v_videoId',
                        'v_channelTitle', 'v_publishTime',
                        'v_description', 'v_thumbnail',
                        'c_id','c_parentId',
                        'c_author', 'c_published_at',
                        'c_updated_at', 'c_like_count',
                        'c_text'])
        print("Number of Processed Comments: ", c)
        new_df = self.filter_comments(new_df)
        return new_df

In [10]:
p = preprocessing()
new_df = p.preprocess(multiple_video_comments)

Number of Processed Comments:  3388
Number of Filtered Comments:  1368


In [11]:
new_df

Unnamed: 0,product,v_title,v_videoId,v_channelTitle,v_publishTime,v_description,v_thumbnail,c_id,c_parentId,c_author,c_published_at,c_updated_at,c_like_count,c_text
39726,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,UgxlkxXiNXjyjJcOcel4AaABAg,,esophagus3319,2021-09-25T02:58:15Z,2021-09-25T02:58:15Z,21223,the way her real personality is totally differ...
39732,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,Ugy5Vw4Hlzj-ZHpMgjN4AaABAg,,Jk-bp4nu,2021-09-25T18:37:02Z,2021-09-25T18:37:02Z,17518,"Even if Squid Game was Ho Yeon's first drama, ..."
39710,Squid Game Korean Drama (2021),Cast of Squid Game ditches tracksuits for suit...,o4EF1NG_xks,Netflix K-Content,2021-09-25T01:00:08Z,The stars of SQUID GAME are faced with yet ano...,https://i.ytimg.com/vi/o4EF1NG_xks/default.jpg,UgyMnV8PjtgP5-d2RfF4AaABAg,,brooke4608,2021-10-09T00:22:59Z,2021-10-09T00:22:59Z,14598,It’s so nice seeing them play a game that won’...
10395,Squid Game Korean Drama (2021),Squid Game (Behind The Scenes) #Shorts,4vb085gEgPc,Behind The Scenes,2022-03-20T16:43:54Z,This video gives you a chance to look BEHIND T...,https://i.ytimg.com/vi/4vb085gEgPc/default.jpg,UgxEeZvLDwVE2jcneuJ4AaABAg,,_Taylo_,2022-06-09T00:13:48Z,2022-06-09T00:13:48Z,10273,Oh so the camera-man plays squid game too?
8402,Squid Game Korean Drama (2021),SQUID GAME | RED LIGHT GREEN LIGHT SCENE,sH4Y450PSVM,memebappe,2021-10-14T18:52:37Z,BUY THE PERFECT CHRISTMAS GIFT : https://am...,https://i.ytimg.com/vi/sH4Y450PSVM/default.jpg,UgyXDA0Vdld5b5SMG2Z4AaABAg,,silver-eyedfox7713,2022-01-16T01:30:47Z,2022-01-16T01:30:47Z,8217,This scene is the perfect introduction to how ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28757,Squid Game Korean Drama (2021),Sugar Honeycomb No Blood - Squid Game 2,qE3TiUVd1Qc,PopMov,2021-10-07T18:44:36Z,Dont miss our new FRONTMAN song! See the music...,https://i.ytimg.com/vi/qE3TiUVd1Qc/default.jpg,UgzZK8YNQZA2rszlRjl4AaABAg,,veerkavyashow,2021-11-16T16:27:03Z,2021-11-16T16:27:03Z,0,Why did she participate squid game with Lightner
28775,Squid Game Korean Drama (2021),Sugar Honeycomb No Blood - Squid Game 2,qE3TiUVd1Qc,PopMov,2021-10-07T18:44:36Z,Dont miss our new FRONTMAN song! See the music...,https://i.ytimg.com/vi/qE3TiUVd1Qc/default.jpg,UgycJw6sG0oQb2k_eNp4AaABAg,,rhondageorge8283,2022-04-19T02:16:32Z,2022-04-19T02:16:32Z,0,You know I just got game
28784,Squid Game Korean Drama (2021),Sugar Honeycomb No Blood - Squid Game 2,qE3TiUVd1Qc,PopMov,2021-10-07T18:44:36Z,Dont miss our new FRONTMAN song! See the music...,https://i.ytimg.com/vi/qE3TiUVd1Qc/default.jpg,UgzTJ1CP2EMymQK67R54AaABAg,,RobloxCool605,2022-08-16T13:43:23Z,2022-08-16T13:43:36Z,0,SQUID GAME BUT IM IN INDONESIA I AM INDONESIAN...
28822,Squid Game Korean Drama (2021),Sugar Honeycomb No Blood - Squid Game 2,qE3TiUVd1Qc,PopMov,2021-10-07T18:44:36Z,Dont miss our new FRONTMAN song! See the music...,https://i.ytimg.com/vi/qE3TiUVd1Qc/default.jpg,UgyTPzDzcKVlMzMMVux4AaABAg,,umaymaumayrshowitko2,2023-02-27T10:30:48Z,2023-02-27T10:30:48Z,0,Squid Game is put 456


### Labelling comments using Sentiment Lexicon - VADER


In [12]:
sentiment_lexicon = SentimentIntensityAnalyzer()

def get_sentiment_score(c_text):
    sentiment_Score = sentiment_lexicon.polarity_scores(c_text)
    return sentiment_Score['compound']

def check_sentiment(sentiment_score):
    if sentiment_score > 0.00:
        return "Positive"
    elif sentiment_score < 0.00:
        return "Negative"
    elif sentiment_score == 0:
        return "Neutral "

new_df['sentiment_score'] = new_df['c_text'].apply(get_sentiment_score)
new_df['sentiment'] = new_df['sentiment_score'].apply(check_sentiment)

### Top Comments for each polarity


In [13]:
def select_top_comments(df, top_n=10):
    top_comments = []
    grouped = df.groupby('sentiment')

    # iterate over each polarity group
    for sentiment, group in grouped:
        # sort comments by sentiment score pick top 10
        top_group_comments = group.sort_values(by='sentiment_score', ascending=False).head(top_n)[['sentiment_score', 'sentiment', 'c_text']].values.tolist()
        top_comments.extend([(sentiment_score, sentiment, comment) for sentiment_score, sentiment, comment in top_group_comments])

    return top_comments

top_comments = select_top_comments(new_df, top_n=10)

# # top 10 comments for each polarity
# for sentiment_score, sentiment, comment in top_comments:
#     print(f"Sentiment: {sentiment}, Sentiment Score: {sentiment_score}, Comment: {comment}")

# making it pretty~~~
headers = ["Sentiment", "Sentiment Score", "Comment"]
print(tabulate(top_comments, headers=headers))

  Sentiment  Sentiment Score    Comment
-----------  -----------------  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    -0.0164  Negative           Hmmm so many people died in the game if I would be in their place I would know how to play it and then play the game .papa people .
    -0.0258  Negative           This scene is the perfect introdu

### Sentiment Lexicon using TextBlob


In [14]:
def text_blob_sentiment_score(text):
    return TextBlob(text).sentiment.polarity

def texblob_check_sentiment(score):
    if score == 0:
        return 'Neutral'
    elif score < 0.00:
        return 'Negative'
    elif score > 0.00:
        return 'Positive'

new_df['textblob_score'] = new_df['c_text'].apply(text_blob_sentiment_score)
new_df['textblob_sentiment'] = new_df['textblob_score'].apply(texblob_check_sentiment)

def textblob_select_top_comments(df, top_n=10):
    top_comments = []
    grouped = df.groupby('textblob_sentiment')

    for sentiment, group in grouped:
        top_group_comments = group.sort_values(by='textblob_score', ascending=False).head(top_n)[['textblob_score', 'textblob_sentiment', 'c_text']].values.tolist()
        top_comments.extend([(sentiment_score, sentiment, comment) for sentiment_score, sentiment, comment in top_group_comments])

    return top_comments

textblob_top_comments = select_top_comments(new_df, top_n=10)

headers = ["Sentiment", "Sentiment Score", "Comment"]
print(tabulate(textblob_top_comments, headers=headers))


  Sentiment  Sentiment Score    Comment
-----------  -----------------  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    -0.0164  Negative           Hmmm so many people died in the game if I would be in their place I would know how to play it and then play the game .papa people .
    -0.0258  Negative           This scene is the perfect introdu

In [15]:
folders = ['positive', 'negative', 'neutral']
for folder in folders:
    if not os.path.exists(folder):
        os.makedirs(folder)

def col_to_txt(row):
    sentiment = row['sentiment']  
    c_text = row['c_text']
    file_name = f"{sentiment}_{row.c_id}.txt"  
    folder = f"{sentiment.strip()}"  
    file_path = os.path.join(folder, file_name)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(c_text)

new_df.apply(col_to_txt, axis=1)

39726    None
39732    None
39710    None
10395    None
8402     None
         ... 
28757    None
28775    None
28784    None
28822    None
22909    None
Length: 1368, dtype: object

In [16]:
filename = 'final_comments_df.csv'
files_present = glob.glob(filename)
# will only write to disk if file doesnt exist
if not files_present:
    new_df.to_csv(filename, index=False)
    new_df
else:
    print (f'File Already Exists. Delete {filename}' )

In [17]:
import os
import shutil
import random

# Define the root directory containing the positive, negative, and neutral folders
root_dir = ''

# Define the directories for train and test sets
train_dir = 'data/train'
test_dir = 'data/test'
try:
    shutil.rmtree(os.path.join(root_dir, train_dir))
    shutil.rmtree(os.path.join(root_dir, test_dir))
except:
    pass
# Define the ratio for train-test split
split_ratio = 0.8

# Iterate through each sentiment folder
for sentiment in ['positive', 'negative', 'neutral']:
    # Get the list of file paths in the current sentiment folder
    files = os.listdir(os.path.join(root_dir, sentiment))
    # Shuffle the file paths
    random.shuffle(files)
    # Calculate the split index based on the split ratio
    split_index = int(len(files) * split_ratio)
    # Split the files into train and test sets
    train_files = files[:split_index]
    test_files = files[split_index:]
    
    # Move train files to train directory
    for file in train_files:
        src = os.path.join(root_dir, sentiment, file)
        dst = os.path.join(train_dir, sentiment, file)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        # Check if the file already exists in the destination directory
        if not os.path.exists(dst):
            shutil.move(src, dst)
    
    # Move test files to test directory
    for file in test_files:
        src = os.path.join(root_dir, sentiment, file)
        dst = os.path.join(test_dir, sentiment, file)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        # Check if the file already exists in the destination directory
        if not os.path.exists(dst):
            shutil.move(src, dst)

# Remove the sentiment folders
try:
    shutil.rmtree('negative')
    shutil.rmtree('neutral')
    shutil.rmtree('positive')
except:
    pass

## 3) Text Analytics Pipeline:


In [18]:
train_dir = 'data/train'
test_dir = 'data/test'

In [19]:
reviews_train = load_files(train_dir)
reviews_test = load_files(test_dir)

In [20]:
text_train, y_train = reviews_train.data, reviews_train.target
text_test, y_test = reviews_test.data, reviews_test.target
print(text_train[1],y_train[1])
print(text_train[2],y_train[2])
print(text_train[3],y_train[3])
# 0 -> negative, 1 -> neutral, 2 -> positive

b'To be honest Ali is the most underrated Actor ' 2
b'I hope squid  game is not real' 2
b'Money Heist, DARK, Squid Game are my top 3 among non-English series. Kinda wish for a Russian one.' 2


In [21]:
type(reviews_train)

sklearn.utils._bunch.Bunch

In [22]:
vect = CountVectorizer()
vect = CountVectorizer().fit(reviews_train)
X_train = vect.transform(reviews_train)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>


In [23]:
product_tokens = ['drama', 'film', 'cinema', 'actor', 'actress', 'director', 'plot','scene', 'genre', 
                  'subtitles', 'k-drama', 'kdrama', 'k-movie', 'television', 'episode', 'screenplay', 
                  'script', 'cinematography', 'soundtrack', 'OST', 'character', 'plot twist', 'review', 
                  'ratings', 'premiere', 'streaming', 'watchlist', 'subbed', 'dubbed', 'sequel', 'game', 
                  'song', 'season', 'trailer', 'casting', 'fanbase', 'recommendation', 'viewer', 'critic', 
                  'Korean', 'entertainment', 'watched', 'show', 'squid', 'watch', 'watching', 'acting', 
                  'netflix', 'show', 'end', 'squid game', 'gi-hun', 'Sang-woo', 'gi hun', 'sang woo', 'Player', 
                  'Red light', 'green light', 'Honeycomb', 'Tug of war', 'Marbles', 'Front man', 'VIPs', 'Doll', 
                  'Coffin', 'Square', 'Triangle', 'Circle', 'Death game', 'death', 'Survival game', 'Money', 
                  'prize', 'Il-nam', 'il nam', 'Hwang Jun-ho', 'hwang jun ho', 'director', 'Cho Sang-woo', 
                  'Masked man', 'Childhood', 'game', 'Pink soldier', 'Betrayal', 'Seong Gi-hun', 'Survival', 
                  'Games', 'Competition', 'Squid', 'Masks', 'ali']

In [24]:
def remove_punctuation(text):
    # Define the pattern to match punctuation
    punctuation_pattern = r'[^\w\s]'
    # Replace punctuation with an empty string
    text_without_punctuation = re.sub(punctuation_pattern, '', text)
    return text_without_punctuation

# Text Processing
def preprocess_text(text):
    tokens = word_tokenize(text)
    # stopwords punctuation etc
    stemmer = nltk.stem.SnowballStemmer('english')
    # stemmer = PorterStemmer()
    # split into tokens
    tokens = word_tokenize(text)
    # removes stopwords and numbers and stems from tokens makes sure its all lowercase too
    tokens = [stemmer.stem(remove_punctuation(token)) for token in tokens if token.isalnum() and token.lower() not in product_tokens]
    return ' '.join(tokens)

In [25]:
text_clf = Pipeline([
    ('preprocess', 
    TfidfVectorizer(
                    encoding="utf-8",
                    strip_accents='ascii',
                    lowercase=True,
                    preprocessor=preprocess_text,
                    # tokenizer=,
                    # analyzer=,
                    stop_words='english',
                    norm='l2',
                    ngram_range=(1, 1),
                    max_df=0.09,
                    min_df=0.003,
                    max_features=500,
                    binary=True,
                    use_idf=True,
                    smooth_idf=True,
                    sublinear_tf=True
                    )
    # CountVectorizer(preprocessor=preprocess_text,ngram_range=(1, 1))
     ), 
    ('classifier', LogisticRegression())
])

text_clf.fit(text_train, y_train)
y_pred = text_clf.predict(text_test)
print(classification_report(y_pred, y_test))



              precision    recall  f1-score   support

           0       0.38      0.67      0.48        36
           1       0.71      0.66      0.69        83
           2       0.88      0.76      0.82       156

    accuracy                           0.72       275
   macro avg       0.66      0.70      0.66       275
weighted avg       0.77      0.72      0.73       275



### --- Run only till here and check coz the grid search would take long so better to adjust by looking at this only ---


In [None]:
text_clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

parameters = {
    'vectorizer': [TfidfVectorizer()],
    'classifier': [
        MultinomialNB(),
        SVC(),
        LogisticRegression()
    ],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__preprocessor': [preprocess_text],
    'vectorizer__encoding': ['utf-8'],
    'vectorizer__binary': [False, True],
    'vectorizer__lowercase': [False, True],
    'vectorizer__encoding': ["utf-8"],
    'vectorizer__strip_accents': ['ascii'],
    'vectorizer__stop_words': ['english'],
    'vectorizer__norm': ['l2','l1'],
    'vectorizer__max_df': [0.1,0.09,0.08,0.07],
    'vectorizer__min_df': [0.004,0.003,0.002],
    # 'vectorizer__max_features': [500],
    'vectorizer__use_idf': [True,False],
    'vectorizer__smooth_idf': [True],
    # 'vectorizer__sublinear_tf': [True,False]
}

grid_search = GridSearchCV(text_clf, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(text_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(text_test)

print(classification_report(y_test, y_pred))

In [None]:
results = grid_search.cv_results_
 
scores = results['mean_test_score']

params = results['params']

top_models_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:10]
 
for i in top_models_indices:
    print("Model {}: Mean Test Score - {:.4f}, Parameters - {}".format(i+1, scores[i], params[i]))

In [None]:
pipeline_ct = Pipeline([
    ('vectorizer', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

parameters_ct = {
    'vectorizer': [CountVectorizer()],
    'classifier': [
        MultinomialNB(),
        SVC(),
        LogisticRegression()
    ],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__preprocessor': [preprocess_text],
    'vectorizer__encoding': ['utf-8'],
    'vectorizer__binary': [False, True],
    'vectorizer__lowercase': [False, True],
    'vectorizer__encoding': ["utf-8"],
    'vectorizer__strip_accents': ['ascii'],
    'vectorizer__stop_words': ['english'],
    'vectorizer__max_df': [0.1,0.09,0.08,0.07],
    'vectorizer__min_df': [0.004,0.003,0.002],
    'tfidf__norm': ['l2','l1'],
    # 'vectorizer__max_features': [500],
    'tfidf__use_idf': [True,False],
    'tfidf__smooth_idf': [True],
    # 'vectorizer__sublinear_tf': [True,False]
}

# Perform GridSearchCV
grid_search_ct = GridSearchCV(pipeline_ct, parameters_ct, scoring='accuracy', cv=10, n_jobs=-1)
grid_search_ct.fit(text_train, y_train)

print("Best Parameters: ", grid_search_ct.best_params_)
best_model_ct = grid_search_ct.best_estimator_
y_pred_ct = best_model_ct.predict(text_test)

print(classification_report(y_test, y_pred_ct))

In [None]:
results_ct = grid_search_ct.cv_results_
 
scores_ct = results_ct['mean_test_score']

params_ct = results_ct['params']

top_models_indices_ct = sorted(range(len(scores_ct)), key=lambda i: scores_ct[i], reverse=True)[:10]
 
for i in top_models_indices_ct:
    print("Model {}: Mean Test Score - {:.4f}, Parameters - {}".format(i+1, scores_ct[i], params_ct[i]))

## 4) Visualization and Insights:


## 5) Discussion and conclusion from experiments:
