[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/farheenfab/AppliedText_CW/blob/main/CW1-generate_dataset.ipynb)


In [9]:
import glob
import os

import googleapiclient.discovery
import googleapiclient.errors
import numpy as np
import pandas as pd

# Name and version of the Google API that the client is built for.
api_service_name = "youtube"
api_version = "v3"

# The API key is read from the environment so it is never stored in the notebook.
# Set it before running, e.g. `%env YOUTUBE_API_KEY=...` or
# `os.environ["YOUTUBE_API_KEY"] = getpass.getpass()`.
# NOTE(review): a key was previously committed in this cell; it should be treated
# as compromised and revoked in the Google Cloud console.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not DEVELOPER_KEY:
    print("YOUTUBE_API_KEY is not set; set it before creating the API client.")

References

https://developers.google.com/youtube/v3/docs/search/list#parameters

https://developers.google.com/youtube/v3/docs/comments/list

In [10]:
class api_handler:
    """Small wrapper around a YouTube Data API client for collecting video comments.

    The instance holds one API client (``self.client``). The ``get_*`` methods
    issue requests and return raw response dicts; ``get_video_df`` and
    ``get_comments_df`` turn those responses into pandas DataFrames; the
    ``create_video_df*`` methods tie both together for a list of products.
    """

    # Column order shared by every comments DataFrame produced by this class.
    COMMENT_COLUMNS = ['product', 'v_title', 'v_videoId',
                       'v_channelTitle', 'v_publishTime',
                       'v_description', 'v_thumbnail',
                       'c_id', 'c_parentId',
                       'c_author', 'c_published_at',
                       'c_updated_at', 'c_like_count',
                       'c_text']

    def __init__(self, api_service_name, api_version, developer_key):
        """Build the API client for the given service name, version and developer key."""
        self.client = googleapiclient.discovery.build(api_service_name,
                                                      api_version,
                                                      developerKey=developer_key)

    # Search for videos details given id
    def get_video_details(self, videoId, part="snippet"):
        """Fetch the snippet of a single video.

        Args:
            videoId: id of the video to look up.
            part: resource part requested from ``videos().list``.

        Returns:
            The video's snippet dict, extended with ``videoId``, ``id``,
            ``publishTime`` (copied from ``publishedAt``) and ``thumbnails``
            flattened to the default thumbnail URL, or ``None`` when the
            response contains no items.
        """
        request = self.client.videos().list(
            part=part,
            id=videoId
        )
        response = request.execute()

        # An 'items' key holding an empty list must not be indexed with [0].
        items = response.get('items')
        if not items:
            return None

        snippet = items[0].get('snippet', {})
        snippet['videoId'] = videoId
        snippet['id'] = videoId
        snippet['publishTime'] = snippet.get('publishedAt', '')
        # Replace the nested thumbnails dict with just the default thumbnail URL,
        # matching the shape produced by get_video_df.
        snippet['thumbnails'] = snippet.get('thumbnails', {}).get('default', {}).get('url', '')
        return snippet

    # Search for videos given query
    def get_videos(self, query, maxResults=5, part="snippet"):
        """Search for videos matching ``query`` and return the raw search response.

        The search is ordered by view count and restricted to ``type="video"``,
        ``regionCode="US"`` and ``relevanceLanguage="en"``.
        """
        request = self.client.search().list(
            part=part,
            maxResults=maxResults,
            # higher view count is likely to be more relevant
            order="viewCount",
            q=query,
            # american region videos
            regionCode="US",
            # english videos
            relevanceLanguage="en",
            type="video"
        )
        return request.execute()

    # Format Response from get_videos to dataframe
    @staticmethod
    def get_video_df(response):
        """Convert a ``get_videos`` response into a DataFrame with one row per video.

        Columns: title, videoId, channelTitle, publishTime, description and
        thumbnails (default thumbnail URL). Missing fields become ``''``.
        """
        rows = []
        for item in response['items']:
            snippet = item.get('snippet', {})
            rows.append({
                'title': snippet.get('title', ''),
                'videoId': item.get('id', {}).get('videoId', ''),
                'channelTitle': snippet.get('channelTitle', ''),
                'publishTime': snippet.get('publishTime', ''),
                'description': snippet.get('description', ''),
                'thumbnails': snippet.get('thumbnails', {}).get('default', {}).get('url', '')
            })
        return pd.DataFrame(rows)

    # Get comments from video
    def get_comments(self, videoId, part="snippet", maxResults=100, maxResultsDepth=100):
        """Collect comment threads (and replies of busy threads) for one video.

        Args:
            videoId: id of the video whose comments are fetched.
            part: resource part requested from ``commentThreads().list``.
            maxResults: overall budget of top-level threads; requested in pages
                of at most 100.
            maxResultsDepth: ``maxResults`` passed to ``get_comment_replies``
                for each thread whose replies are fetched.

        Returns:
            A list of page-shaped dicts (each with an ``'items'`` list): every
            raw thread page, followed by the reply pseudo-pages of its threads.
        """
        all_comments = []
        nextPageToken = None
        remaining = maxResults
        while remaining > 0:
            page_size = min(remaining, 100)
            request = self.client.commentThreads().list(
                part=part,
                videoId=videoId,
                maxResults=page_size,
                order='relevance',
                moderationStatus='published',
                textFormat='plainText',
                pageToken=nextPageToken
            )
            response = request.execute()
            nextPageToken = response.get('nextPageToken')
            if 'items' in response:
                all_comments.append(response)
                for item in response['items']:
                    thread = item.get('snippet', {})
                    reply_count = thread.get('totalReplyCount', 0)
                    # Replies are only fetched for threads with more than two replies.
                    # NOTE(review): the reason for this threshold is not documented.
                    if reply_count > 2:
                        # the top-level comment id is the parent id of its replies
                        comment_id = thread.get('topLevelComment', {}).get('id', '')
                        print('getting replies:', reply_count)
                        all_comments += self.get_comment_replies(comment_id, maxResults=maxResultsDepth)

            # The budget is reduced by the requested page size, not by the number
            # of items actually returned.
            remaining -= page_size
            if nextPageToken is None:
                break
        return all_comments

    # Get replies from comment
    def get_comment_replies(self, commentId, part="snippet", maxResults=100):
        """Collect the replies to one top-level comment.

        Each reply is wrapped in a one-item page dict shaped like a comment
        thread (``items[0].snippet.topLevelComment.snippet``) so that
        ``get_comments_df`` can process threads and replies the same way.

        Args:
            commentId: id of the parent (top-level) comment.
            part: resource part requested from ``comments().list``.
            maxResults: overall budget of replies; requested in pages of at most 100.

        Returns:
            A list with one wrapped page dict per reply.
        """
        all_comments = []
        nextPageToken = None
        remaining = maxResults
        while remaining > 0:
            page_size = min(remaining, 100)
            request = self.client.comments().list(
                part=part,
                parentId=commentId,
                maxResults=page_size,
                textFormat='plainText',
                pageToken=nextPageToken
            )
            response = request.execute()
            nextPageToken = response.get('nextPageToken')

            for item in response.get('items', []):
                all_comments.append({
                    'items': [
                        {
                            'id': item.get('id'),
                            'snippet': {
                                'topLevelComment': {
                                    # default to a dict so get_comments_df can call .get on it
                                    'snippet': item.get('snippet', {})
                                }
                            }
                        }
                    ]
                })

            remaining -= page_size
            if nextPageToken is None:
                break
        return all_comments

    # Format response from get_comments to dataframe
    @staticmethod
    def get_comments_df(response, video, product):
        """Flatten ``get_comments`` output into a DataFrame with one row per comment.

        Args:
            response: list of page dicts as returned by ``get_comments``.
            video: mapping with the video's title, videoId, channelTitle,
                publishTime, description and thumbnails (anything with ``.get``).
            product: label stored in the ``product`` column of every row.

        Returns:
            A DataFrame with the columns listed in ``COMMENT_COLUMNS``.
        """
        comments = []
        for page in response:
            for item in page['items']:
                comment = item.get('snippet', {}).get('topLevelComment', {}).get('snippet', {})
                # Drop the leading '@' of a handle, but leave names without one intact.
                author = comment.get('authorDisplayName') or ''
                if author.startswith('@'):
                    author = author[1:]
                comments.append([
                    product,
                    video.get('title', ''),
                    video.get('videoId', ''),
                    video.get('channelTitle', ''),
                    video.get('publishTime', ''),
                    video.get('description', ''),
                    video.get('thumbnails', ''),
                    item.get('id', ''),
                    comment.get('parentId', ''),
                    author,
                    comment.get('publishedAt', ''),
                    comment.get('updatedAt', ''),
                    comment.get('likeCount', ''),
                    comment.get('textDisplay', '')
                ])

        return pd.DataFrame(comments, columns=api_handler.COMMENT_COLUMNS)

    # Search for videos related to products iteratively
    # Collect comments from each video and place it into an array
    def create_video_df_from_search(self, products,
                                    number_of_videos_per_product=5,
                                    number_of_comments_per_video=100,
                                    number_of_replies_per_comment=100):
        """Search videos for each product and gather their comments in one DataFrame.

        Args:
            products: iterable of search queries; each is also the row label.
            number_of_videos_per_product: ``maxResults`` of the video search.
            number_of_comments_per_video: thread budget passed to ``get_comments``.
            number_of_replies_per_comment: reply budget passed to ``get_comments``.

        Returns:
            The concatenated comments of all videos (``COMMENT_COLUMNS`` layout),
            or an empty DataFrame when nothing was collected. A video whose
            comments cannot be retrieved contributes one all-zero placeholder row.
        """
        frames = []
        for product in products:
            # search for the videos with the highest view counts for this product
            response = self.get_videos(query=product, maxResults=number_of_videos_per_product)
            videos_df = api_handler.get_video_df(response)
            for _, video in videos_df.iterrows():
                try:
                    response = self.get_comments(video['videoId'],
                                                 maxResults=number_of_comments_per_video,
                                                 maxResultsDepth=number_of_replies_per_comment)
                    comments_df = api_handler.get_comments_df(response, video, product)
                except Exception as exc:
                    # Best effort: the API returns 403 if the channel has comments
                    # disabled. Keep going and insert a placeholder row that can be
                    # deleted later. KeyboardInterrupt is deliberately not caught.
                    comments_df = pd.DataFrame(np.zeros((1, len(api_handler.COMMENT_COLUMNS))),
                                               columns=api_handler.COMMENT_COLUMNS)
                    print('Unable to retrieve comments:', video.get('title', ''), '-', exc)
                frames.append(comments_df)
        # Concatenate once at the end instead of re-copying the result per video.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    # alternative method by explicitly specifying videos
    def create_video_df(self, products, videos, number_of_comments_per_video=100, number_of_replies_per_comment=100):
        """Gather comments for explicitly listed videos.

        Args:
            products: iterable of product labels.
            videos: sequence of video-id lists; ``videos[i]`` belongs to the
                i-th product.
            number_of_comments_per_video: thread budget passed to ``get_comments``.
            number_of_replies_per_comment: reply budget passed to ``get_comments``.

        Returns:
            The concatenated comments of all videos (``COMMENT_COLUMNS`` layout),
            or an empty DataFrame when nothing was collected.
        """
        frames = []
        for index, product in enumerate(products):
            for video_id in videos[index]:
                response = self.get_comments(video_id,
                                             maxResults=number_of_comments_per_video,
                                             maxResultsDepth=number_of_replies_per_comment)
                # Fall back to the bare id when the video's details are unavailable.
                details = self.get_video_details(video_id) or {'videoId': video_id}
                frames.append(api_handler.get_comments_df(response, details, product))
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

In [11]:
# Search terms; each one is also stored as the `product` label of its comments.
products = [
    "Love Alarm Season 1",
    "Love Alarm Season 2",
]

# Hand-picked video ids, one inner list per product: videos[i] holds the videos
# whose comments are collected for products[i], so keep both lists in the same order.
videos = [
    ['LhCQ7lHEjU8', 'Yh7PNUGxihU', '8sXTfzaLmiQ'],
    ['c2xta7hcvXI', 'mkrrKGo1VEs', 'CL0wU3ss2uw', 'jPKm6kc9j5A', 'g0Oj4A2rslY'],
]

# Client used by the cells below for all API requests.
youtube = api_handler(api_service_name, api_version, DEVELOPER_KEY)


In [12]:
# Build the dataset from search results: 10 videos per product, with a budget of
# 2000 comment threads per video and 1000 replies per expanded thread.
multiple_video_comments = youtube.create_video_df_from_search(
    products,
    number_of_videos_per_product=10,
    number_of_comments_per_video=2000,
    number_of_replies_per_comment=1000,
)
# Alternative that uses the hand-picked `videos` list instead of a search:
# multiple_video_comments=youtube.create_video_df(products,videos,number_of_comments_per_video=20000,number_of_replies_per_comment=20000)
multiple_video_comments

getting replies: 18
getting replies: 24
getting replies: 18
getting replies: 5
getting replies: 32
getting replies: 21
getting replies: 18
getting replies: 6
getting replies: 19
getting replies: 15
getting replies: 35
getting replies: 4
getting replies: 20
getting replies: 76
getting replies: 22
getting replies: 501
getting replies: 413
getting replies: 6
getting replies: 12
getting replies: 60
getting replies: 96
getting replies: 12
getting replies: 378
getting replies: 167
getting replies: 129
getting replies: 18
getting replies: 14
getting replies: 42
getting replies: 61
getting replies: 117
getting replies: 9
getting replies: 16
getting replies: 10
getting replies: 35
getting replies: 73
getting replies: 35
getting replies: 14
getting replies: 12
getting replies: 29
getting replies: 146
getting replies: 55
getting replies: 7
getting replies: 22
getting replies: 78
getting replies: 31
getting replies: 219
getting replies: 39
getting replies: 14
getting replies: 4
getting replies: 88

Unnamed: 0,product,v_title,v_videoId,v_channelTitle,v_publishTime,v_description,v_thumbnail,c_id,c_parentId,c_author,c_published_at,c_updated_at,c_like_count,c_text
0,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgymI4uhPmf2rU9nmst4AaABAg,,saikibu,2021-03-12T15:16:47Z,2021-03-12T15:16:47Z,725.0,"Vengo a llorar. Sunho, lo hiciste bien :)."
1,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,Ugxlv0IjyeU9y9CIL-J4AaABAg,,maicolsebastianvegaquiroga8164,2022-07-20T13:59:59Z,2022-07-20T13:59:59Z,1716.0,Esta serie nos demuestra que existen 2 amores ...
2,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgyuMtKWSg1g41IVWLV4AaABAg,,cielojuneth,2022-01-29T06:21:55Z,2022-01-29T06:21:55Z,1467.0,"Pinshi drama, ya es 2022 y sigo aún sigo llora..."
3,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgymkUO5XCYBkcDNWKF4AaABAg,,mirtanormadavid7287,2023-01-26T00:52:46Z,2023-01-26T00:52:46Z,350.0,2023 y aún lloro a esta serie 😭 Sun Oh se feli...
4,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgxvgMKvQqz4BXO4qtF4AaABAg,,anabelencofficial,2021-11-18T17:01:26Z,2021-11-18T17:01:26Z,689.0,Esta serie fue el claro ejemplo de que aveces ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42330,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,UgxHL-bICq7TcTtfqDV4AaABAg.9gYDgjcDMlT9gZYrBMk5IO,UgxHL-bICq7TcTtfqDV4AaABAg,myweirdlife5279,2022-09-29T14:45:49Z,2022-09-29T14:45:49Z,0.0,@7 💜 Army love alarm
42331,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,Ugz4konhLiToR_QwJ554AaABAg.9zQbTrdxL949zRGUfKSWbd,Ugz4konhLiToR_QwJ554AaABAg,cherryblossom7949,2024-01-11T15:27:53Z,2024-01-11T15:27:53Z,0.0,Same 🥲
42332,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,Ugz4konhLiToR_QwJ554AaABAg.9zQbTrdxL949zkD_sNdX_g,Ugz4konhLiToR_QwJ554AaABAg,niankochwan1233,2024-01-19T09:27:20Z,2024-01-19T09:27:20Z,0.0,Why? What's the story
42333,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,Ugz4konhLiToR_QwJ554AaABAg.9zQbTrdxL94A-KwX3gIXtu,Ugz4konhLiToR_QwJ554AaABAg,Safire919,2024-02-03T01:00:40Z,2024-02-03T01:00:40Z,0.0,"So technically JoJo, and sunho start dating, b..."


Emoji-removal regex taken from:

https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python

In [13]:
import re
from bs4 import BeautifulSoup

# Character class of emoji/pictograph ranges, compiled once instead of on every call.
# NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are very broad; the
# first one also spans CJK and Hangul code points, so text in those scripts is
# removed as well. Confirm that this is intended before narrowing the pattern.
_EMOJI_PATTERN = re.compile("["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"  # gender symbols
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+", re.UNICODE)


def remove_emojis(data):
    """Strip HTML markup and emoji-like characters from a piece of text.

    Args:
        data: the text to clean; any non-string value (None, NaN, numbers)
            is treated as "no text".

    Returns:
        The cleaned string, or ``""`` for non-string input so that callers can
        safely take ``len()`` of the result.
    """
    if not isinstance(data, str):
        return ""
    # Remove html tags
    text = BeautifulSoup(data, "html.parser").get_text()
    # Remove emotes, etc.
    return _EMOJI_PATTERN.sub('', text)

In [23]:
# Drop rows without comment text. Reassigning (instead of inplace=True) keeps the
# cell safe to re-run after multiple_video_comments has been rebound to a filtered
# selection, where an in-place drop can raise SettingWithCopyWarning.
multiple_video_comments = multiple_video_comments.dropna(subset=['c_text'])

In [24]:
# Remove HTML and emotes from the text to be analyzed (c_text = comment text).
multiple_video_comments = multiple_video_comments.assign(
    c_text=multiple_video_comments['c_text'].apply(remove_emojis)
)

df_length_before = len(multiple_video_comments)
print("DataFrame Length Before:", df_length_before)

# Drop exact duplicate rows.
multiple_video_comments = multiple_video_comments.drop_duplicates()

# Keep only rows whose cleaned text is longer than 2 characters. The isinstance
# check drops rows whose text is missing or not a string instead of raising
# TypeError from len().
has_text = multiple_video_comments['c_text'].apply(lambda x: isinstance(x, str) and len(x) > 2)
multiple_video_comments = multiple_video_comments[has_text]

df_length_after = len(multiple_video_comments)
print("DataFrame Length After:", df_length_after)

multiple_video_comments

  data = BeautifulSoup(data, "html.parser").get_text()
  data = BeautifulSoup(data, "html.parser").get_text()


DataFrame Length Before: 42333
DataFrame Length After: 40536


Unnamed: 0,product,v_title,v_videoId,v_channelTitle,v_publishTime,v_description,v_thumbnail,c_id,c_parentId,c_author,c_published_at,c_updated_at,c_like_count,c_text
0,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgymI4uhPmf2rU9nmst4AaABAg,,saikibu,2021-03-12T15:16:47Z,2021-03-12T15:16:47Z,725.0,"Vengo a llorar. Sunho, lo hiciste bien :)."
1,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,Ugxlv0IjyeU9y9CIL-J4AaABAg,,maicolsebastianvegaquiroga8164,2022-07-20T13:59:59Z,2022-07-20T13:59:59Z,1716.0,Esta serie nos demuestra que existen 2 amores ...
2,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgyuMtKWSg1g41IVWLV4AaABAg,,cielojuneth,2022-01-29T06:21:55Z,2022-01-29T06:21:55Z,1467.0,"Pinshi drama, ya es 2022 y sigo aún sigo llora..."
3,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgymkUO5XCYBkcDNWKF4AaABAg,,mirtanormadavid7287,2023-01-26T00:52:46Z,2023-01-26T00:52:46Z,350.0,2023 y aún lloro a esta serie Sun Oh se feliz...
4,Love Alarm Season 1,"TEARLINER, HAEJIN - Blooming Story (Love Alarm...",1xLrbvjCx1k,nvillera,2019-10-01T01:32:48Z,"gracias por ver ¡! si tengo algún error, puede...",https://i.ytimg.com/vi/1xLrbvjCx1k/default.jpg,UgxvgMKvQqz4BXO4qtF4AaABAg,,anabelencofficial,2021-11-18T17:01:26Z,2021-11-18T17:01:26Z,689.0,Esta serie fue el claro ejemplo de que aveces ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42330,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,UgxHL-bICq7TcTtfqDV4AaABAg.9gYDgjcDMlT9gZYrBMk5IO,UgxHL-bICq7TcTtfqDV4AaABAg,myweirdlife5279,2022-09-29T14:45:49Z,2022-09-29T14:45:49Z,0.0,@7 Army love alarm
42331,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,Ugz4konhLiToR_QwJ554AaABAg.9zQbTrdxL949zRGUfKSWbd,Ugz4konhLiToR_QwJ554AaABAg,cherryblossom7949,2024-01-11T15:27:53Z,2024-01-11T15:27:53Z,0.0,Same
42332,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,Ugz4konhLiToR_QwJ554AaABAg.9zQbTrdxL949zkD_sNdX_g,Ugz4konhLiToR_QwJ554AaABAg,niankochwan1233,2024-01-19T09:27:20Z,2024-01-19T09:27:20Z,0.0,Why? What's the story
42333,Love Alarm Season 2,wait for the Twist..🤣#kdrama #shorts #lovealarm,591hS-MP3NM,CherryBlossom,2022-09-24T10:09:50Z,cherryblossom.,https://i.ytimg.com/vi/591hS-MP3NM/default.jpg,Ugz4konhLiToR_QwJ554AaABAg.9zQbTrdxL94A-KwX3gIXtu,Ugz4konhLiToR_QwJ554AaABAg,Safire919,2024-02-03T01:00:40Z,2024-02-03T01:00:40Z,0.0,"So technically JoJo, and sunho start dating, b..."


Reference

https://stackoverflow.com/questions/40375366/pandas-to-csv-checking-for-overwrite

Create CSV

In [25]:
import glob

filename = 'final_comments_df2.csv'
# Plain existence check: unlike glob.glob, it does not interpret characters such
# as '*', '?' or '[' in the file name as a pattern.
files_present = os.path.exists(filename)
# will only write to disk if the file doesn't exist, so an earlier export is never overwritten
if not files_present:
    multiple_video_comments.to_csv(filename, index=False)
else:
    print (f'File Already Exists. Delete {filename}' )