In [81]:
import glob
import os

import googleapiclient.discovery
import googleapiclient.errors
import numpy as np
import pandas as pd

api_service_name = "youtube"
api_version = "v3"
# The API key is a credential and must not be stored in the notebook: anyone who
# can read the file (or its version history) can use it. It is read from the
# environment instead, e.g. `export YOUTUBE_API_KEY=...` before starting Jupyter.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and revoked/rotated.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    print("YOUTUBE_API_KEY is not set; YouTube API requests will not be authenticated.")

Reference

https://developers.google.com/youtube/v3/docs/search/list#parameters

https://developers.google.com/youtube/v3/docs/comments/list

In [82]:
class api_handler:
    """Small wrapper around a YouTube Data API client for collecting video comments.

    The instance holds one `googleapiclient` client (``self.client``). The
    ``get_*`` methods return raw API responses; the ``*_df`` helpers turn those
    responses into pandas DataFrames.
    """

    # Column order shared by every comments DataFrame produced by this class.
    COMMENT_COLUMNS = ['product', 'v_title', 'v_videoId',
                       'v_channelTitle', 'v_publishTime',
                       'v_description', 'v_thumbnail',
                       'c_author', 'c_published_at',
                       'c_updated_at', 'c_like_count', 'c_text']

    def __init__(self, api_service_name, api_version, developer_key):
        """Build the API client.

        Args:
            api_service_name: Service name passed to ``googleapiclient.discovery.build``.
            api_version: API version passed to ``build``.
            developer_key: API key passed to ``build`` as ``developerKey``.
        """
        self.client = googleapiclient.discovery.build(api_service_name,
                                                      api_version,
                                                      developerKey=developer_key)

    def get_video_details(self, videoId, part="snippet"):
        """Look up one video by id and return its snippet as a flat dict.

        Args:
            videoId: Id of the video to look up.
            part: Resource part(s) requested from ``videos().list``.

        Returns:
            The video's ``snippet`` dict with ``videoId``, ``id``, ``publishTime``
            and ``thumbnails`` (default thumbnail URL) filled in, or ``None``
            when the response contains no items.
        """
        request = self.client.videos().list(
            part=part,
            id=videoId
        )
        response = request.execute()

        # 'items' may be present but empty (e.g. an id the API cannot resolve);
        # indexing [0] on it would raise IndexError.
        items = response.get('items')
        if not items:
            return None

        snippet = items[0]['snippet']
        snippet['videoId'] = videoId
        snippet['id'] = videoId
        # Mirror the key names produced by get_video_df so get_comments_df can
        # consume either source.
        snippet['publishTime'] = snippet.get('publishedAt', '')
        snippet['thumbnails'] = snippet.get('thumbnails', {}).get('default', {}).get('url', '')
        return snippet

    def get_videos(self, query, maxResults=5, part="snippet"):
        """Search for videos matching ``query`` and return the raw API response.

        Args:
            query: Search string passed as ``q``.
            maxResults: Maximum number of results requested.
            part: Resource part(s) requested from ``search().list``.

        Returns:
            The response of ``search().list(...).execute()``.
        """
        request = self.client.search().list(
            part=part,
            maxResults=maxResults,
            # higher view count is likely to be more relevant
            order="viewCount",
            q=query,
            # American region videos
            regionCode="US",
            # English videos
            relevanceLanguage="en",
            type="video"
        )
        return request.execute()

    @staticmethod
    def get_video_df(response):
        """Convert a ``get_videos`` response into a DataFrame, one row per video.

        Args:
            response: Mapping with an ``'items'`` list, as returned by ``get_videos``.

        Returns:
            DataFrame with columns title, videoId, channelTitle, publishTime,
            description and thumbnails (default thumbnail URL). Missing fields
            become empty strings.
        """
        rows = []
        for item in response['items']:
            snippet = item.get('snippet', {})
            rows.append({
                'title': snippet.get('title', ''),
                'videoId': item.get('id', {}).get('videoId', ''),
                'channelTitle': snippet.get('channelTitle', ''),
                'publishTime': snippet.get('publishTime', ''),
                'description': snippet.get('description', ''),
                'thumbnails': snippet.get('thumbnails', {}).get('default', {}).get('url', '')
            })
        return pd.DataFrame(rows)

    def get_comments(self, videoId, part="snippet", maxResults=100):
        """Fetch comment threads for a video, following pagination.

        Args:
            videoId: Id of the video whose comment threads are requested.
            part: Resource part(s) requested from ``commentThreads().list``.
            maxResults: Upper bound on the number of threads requested in total;
                each request asks for at most 100.

        Returns:
            List of raw page responses (only pages that contain an ``'items'`` key).
        """
        pages = []
        remaining = maxResults
        page_token = None
        while remaining > 0:
            page_size = min(remaining, 100)
            request = self.client.commentThreads().list(
                part=part,
                videoId=videoId,
                maxResults=page_size,
                order='relevance',
                moderationStatus='published',
                textFormat='plainText',
                pageToken=page_token
            )
            response = request.execute()

            if 'items' in response:
                pages.append(response)
            remaining -= page_size

            # No token means the API has no further page to offer.
            page_token = response.get('nextPageToken')
            if page_token is None:
                break
        return pages

    @staticmethod
    def get_comments_df(response, video, product):
        """Flatten ``get_comments`` pages into a DataFrame, one row per top-level comment.

        Args:
            response: List of page responses as returned by ``get_comments``.
            video: Mapping-like object (dict or DataFrame row) providing the video
                fields title, videoId, channelTitle, publishTime, description and
                thumbnails via ``.get``. ``None`` is treated as "no metadata".
            product: Label stored in the ``product`` column of every row.

        Returns:
            DataFrame with the columns listed in ``COMMENT_COLUMNS``.
        """
        if video is None:
            video = {}

        rows = []
        for page in response:
            for item in page['items']:
                comment = item.get('snippet', {}).get('topLevelComment', {}).get('snippet', {})
                author = comment.get('authorDisplayName', '')
                # Drop the leading '@' of handle-style names only; slicing [1:]
                # unconditionally would eat the first letter of plain names.
                if author.startswith('@'):
                    author = author[1:]
                rows.append([
                    product,
                    video.get('title', ''),
                    video.get('videoId', ''),
                    video.get('channelTitle', ''),
                    video.get('publishTime', ''),
                    video.get('description', ''),
                    video.get('thumbnails', ''),
                    author,
                    comment.get('publishedAt', ''),
                    comment.get('updatedAt', ''),
                    comment.get('likeCount', ''),
                    comment.get('textDisplay', '')
                ])

        return pd.DataFrame(rows, columns=api_handler.COMMENT_COLUMNS)

    def create_video_df_from_search(self, products,
                                    number_of_videos_per_product=5,
                                    number_of_comments_per_video=100):
        """Search videos for each product and collect their comments into one DataFrame.

        Args:
            products: Iterable of search queries; each is also used as the
                ``product`` label of the resulting rows.
            number_of_videos_per_product: Number of search results requested per product.
            number_of_comments_per_video: Upper bound on comment threads requested per video.

        Returns:
            DataFrame with the columns in ``COMMENT_COLUMNS``; an empty DataFrame
            when nothing was collected. Videos whose comments could not be
            retrieved contribute a single all-zero placeholder row.
        """
        frames = []
        for product in products:
            # Videos are ordered by view count (see get_videos).
            response = self.get_videos(query=product, maxResults=number_of_videos_per_product)
            videos_df = self.get_video_df(response)
            for _, video in videos_df.iterrows():
                try:
                    pages = self.get_comments(video['videoId'], maxResults=number_of_comments_per_video)
                    comments_df = self.get_comments_df(pages, video, product)
                except Exception as exc:
                    # Per the original author's note, the API answers 403 when a
                    # channel has comments disabled. Keep a placeholder row so the
                    # video is still visible in the result; it can be dropped later.
                    # `Exception` (not a bare except) lets Ctrl-C still interrupt.
                    comments_df = pd.DataFrame(np.zeros((1, len(self.COMMENT_COLUMNS))),
                                               columns=self.COMMENT_COLUMNS)
                    print('Unable to retrieve comments:', video.get('title', ''), f'({exc})')
                frames.append(comments_df)
        # Concatenate once at the end instead of growing a DataFrame per video.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    def create_video_df(self, products, videos, number_of_comments_per_video=100):
        """Collect comments for explicitly listed videos.

        Args:
            products: Sequence of product labels.
            videos: Sequence parallel to ``products``; ``videos[i]`` is the
                iterable of video ids belonging to ``products[i]``.
            number_of_comments_per_video: Upper bound on comment threads requested per video.

        Returns:
            DataFrame with the columns in ``COMMENT_COLUMNS``; an empty DataFrame
            when nothing was collected.
        """
        frames = []
        for index, product in enumerate(products):
            for video_id in videos[index]:
                pages = self.get_comments(video_id, maxResults=number_of_comments_per_video)
                details = self.get_video_details(video_id)
                if details is None:
                    # Metadata lookup found nothing; keep the comments and at
                    # least record which video they belong to.
                    details = {'videoId': video_id}
                frames.append(self.get_comments_df(pages, details, product))
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [83]:
# Map each product to the YouTube video ids whose comments are collected for it.
# Keeping both in one mapping means the two lists below can never drift out of
# index alignment when an entry is added or removed.
videos_by_product = {
    "Love Alarm Season 1": ['LhCQ7lHEjU8', 'Yh7PNUGxihU', '8sXTfzaLmiQ'],
    "Love Alarm Season 2": ['c2xta7hcvXI', 'mkrrKGo1VEs', 'CL0wU3ss2uw',
                            'jPKm6kc9j5A', 'g0Oj4A2rslY'],
}

# Parallel lists: videos[i] holds the video ids for products[i].
products = list(videos_by_product)
videos = list(videos_by_product.values())

# Build the API wrapper instance used by the data-collection cell below.
youtube = api_handler(
    api_service_name=api_service_name,
    api_version=api_version,
    developer_key=DEVELOPER_KEY,
)


In [84]:
# products=["Love Alarm",
#          "Squid Game"]

# # Careful when adding videos: the index must match between products and videos,
# # i.e. videos[0] holds the video ids used to get comments for products[0], and so on.
# videos=[['g0Oj4A2rslY','jPKm6kc9j5A','CL0wU3ss2uw','mkrrKGo1VEs','c2xta7hcvXI','8sXTfzaLmiQ','LhCQ7lHEjU8'],
#         ['O61C8zc8Znk','Dm3mCuKrkgE','oqxAJKy0ii4','LguovynN950','MV-pCUiUNI0','TyeuJgSaX5A','MnpaXmOXdaY','pwqicf--P18']]

# youtube=api_handler(api_service_name, api_version, DEVELOPER_KEY)

In [85]:
# Alternative: discover the videos via search instead of listing them explicitly.
# multiple_video_comments=youtube.create_video_df_from_search(products)

# The per-video cap is set very high so that paging is expected to end when the
# API stops returning a next-page token rather than when the cap is reached.
multiple_video_comments = youtube.create_video_df(
    products,
    videos,
    number_of_comments_per_video=1_000_000,
)

Taken from:

https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python

In [86]:
import re
from bs4 import BeautifulSoup

# Compiled once at definition time instead of on every call.
# NOTE(review): the ranges \U000024C2-\U0001F251 and \U00010000-\U0010ffff are very
# broad; together they also cover CJK, Hangul and kana, so non-Latin comment text
# is removed along with emoji. Left unchanged here; confirm this is intended.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # chinese char
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642" 
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # dingbats
    u"\u3030"
                "]+", re.UNICODE)


def remove_emojis(data):
    """Strip HTML markup and emoji/symbol characters from a piece of text.

    Args:
        data: The text to clean. Anything that is not a ``str`` (e.g. ``None``,
            ``NaN`` or a numeric placeholder) is treated as having no text.

    Returns:
        The cleaned string. Non-string input yields ``''`` rather than ``None``
        so that callers can safely take ``len()`` of the result.
    """
    if not isinstance(data, str):
        return ''

    # Remove html tags
    text = BeautifulSoup(data, "html.parser").get_text()
    # Remove emoji and the other symbol ranges listed in _EMOJI_PATTERN
    return _EMOJI_PATTERN.sub('', text)

In [87]:
# remove emotes from the text to be analyzed c_text = comment text
# Strip HTML remnants and emoji from the comment text (c_text) before analysis.
multiple_video_comments['c_text'] = multiple_video_comments['c_text'].apply(remove_emojis)

df_length_before = len(multiple_video_comments)
print("DataFrame Length Before:", df_length_before)

# Drop exact duplicate rows (reassigned rather than mutated with inplace=True).
multiple_video_comments = multiple_video_comments.drop_duplicates()

# Keep only comments longer than 2 characters. `.str.len()` gives NaN for
# missing or non-string entries, so those rows are filtered out instead of
# raising TypeError the way `apply(len)` does on None.
has_enough_text = multiple_video_comments['c_text'].str.len() > 2
multiple_video_comments = multiple_video_comments[has_enough_text]

df_length_after = len(multiple_video_comments)
print("DataFrame Length After:", df_length_after)

multiple_video_comments

  data = BeautifulSoup(data, "html.parser").get_text()


DataFrame Length Before: 4345
DataFrame Length After: 4305


Unnamed: 0,product,v_title,v_videoId,v_channelTitle,v_publishTime,v_description,v_thumbnail,c_author,c_published_at,c_updated_at,c_like_count,c_text
0,Love Alarm Season 1,Love Alarm | Official Teaser | Netflix [ENG SUB],LhCQ7lHEjU8,Netflix K-Content,2019-07-29T00:00:00Z,"🔔 “The moment he rang my Love Alarm, I felt th...",https://i.ytimg.com/vi/LhCQ7lHEjU8/default.jpg,angelinaho3925,2019-07-29T00:53:46Z,2019-07-29T00:53:46Z,4637,Bro if I got this app it would have zero peopl...
1,Love Alarm Season 1,Love Alarm | Official Teaser | Netflix [ENG SUB],LhCQ7lHEjU8,Netflix K-Content,2019-07-29T00:00:00Z,"🔔 “The moment he rang my Love Alarm, I felt th...",https://i.ytimg.com/vi/LhCQ7lHEjU8/default.jpg,howlymoly914,2019-07-29T00:23:17Z,2019-07-29T00:23:17Z,2155,All I can say is *Finally*\nKim so hyun is her...
2,Love Alarm Season 1,Love Alarm | Official Teaser | Netflix [ENG SUB],LhCQ7lHEjU8,Netflix K-Content,2019-07-29T00:00:00Z,"🔔 “The moment he rang my Love Alarm, I felt th...",https://i.ytimg.com/vi/LhCQ7lHEjU8/default.jpg,clairerominez1318,2019-08-01T11:22:44Z,2019-08-01T11:22:44Z,378,Another highschool drama with Kim So Hyun! \n\...
3,Love Alarm Season 1,Love Alarm | Official Teaser | Netflix [ENG SUB],LhCQ7lHEjU8,Netflix K-Content,2019-07-29T00:00:00Z,"🔔 “The moment he rang my Love Alarm, I felt th...",https://i.ytimg.com/vi/LhCQ7lHEjU8/default.jpg,panitia_rimaajengnurraihan9346,2019-07-29T00:41:51Z,2019-07-29T00:41:51Z,1527,I really miss her acting. The trailer look goo...
4,Love Alarm Season 1,Love Alarm | Official Teaser | Netflix [ENG SUB],LhCQ7lHEjU8,Netflix K-Content,2019-07-29T00:00:00Z,"🔔 “The moment he rang my Love Alarm, I felt th...",https://i.ytimg.com/vi/LhCQ7lHEjU8/default.jpg,keanna2060,2019-07-29T10:52:00Z,2019-07-29T10:52:54Z,319,Finally! Kim So Hyun is back with a school-ro...
...,...,...,...,...,...,...,...,...,...,...,...,...
4340,Love Alarm Season 2,Love Alarm Season 2 | Official Trailer | Netfl...,g0Oj4A2rslY,Netflix K-Content,2021-02-26T00:00:19Z,The updated Love Alarm app v2.0 now shows you ...,https://i.ytimg.com/vi/g0Oj4A2rslY/default.jpg,yoshhikko6735,2021-03-26T10:58:55Z,2021-03-26T10:58:55Z,1,WHY DOES EVERYONE LIKES SUN OH FOR KIM JOJO HA...
4341,Love Alarm Season 2,Love Alarm Season 2 | Official Trailer | Netfl...,g0Oj4A2rslY,Netflix K-Content,2021-02-26T00:00:19Z,The updated Love Alarm app v2.0 now shows you ...,https://i.ytimg.com/vi/g0Oj4A2rslY/default.jpg,ZahraDias,2021-03-20T07:18:07Z,2021-03-20T07:18:07Z,1,i dont understand why people like sun oh so mu...
4342,Love Alarm Season 2,Love Alarm Season 2 | Official Trailer | Netfl...,g0Oj4A2rslY,Netflix K-Content,2021-02-26T00:00:19Z,The updated Love Alarm app v2.0 now shows you ...,https://i.ytimg.com/vi/g0Oj4A2rslY/default.jpg,seojun8209,2021-03-20T06:57:54Z,2021-03-20T06:57:54Z,8,Y'all have no right to complain about the endg...
4343,Love Alarm Season 2,Love Alarm Season 2 | Official Trailer | Netfl...,g0Oj4A2rslY,Netflix K-Content,2021-02-26T00:00:19Z,The updated Love Alarm app v2.0 now shows you ...,https://i.ytimg.com/vi/g0Oj4A2rslY/default.jpg,rose-gl4io,2021-03-23T01:20:46Z,2021-03-23T01:24:11Z,2,To anyone who hasn't watched this already\n\n*...


Reference

https://stackoverflow.com/questions/40375366/pandas-to-csv-checking-for-overwrite

Create CSV

In [88]:
import glob

filename = 'final_comments_df.csv'
# Check the literal path: glob.glob would interpret characters such as * ? [ ]
# in the name as wildcards, which is not what an existence check wants.
files_present = os.path.exists(filename)
# Only write to disk if the file doesn't exist, so an earlier export is never
# silently overwritten.
if not files_present:
    multiple_video_comments.to_csv(filename, index=False)
else:
    print (f'File Already Exists. Delete {filename}' )