In [8]:
#pip install google-api-python-client
#pip install google-auth google-auth-oauthlib google-auth-httplib2
#pip install demoji
#pip install pandas
#pip install langdetect

In [1]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
import pandas as pd
import demoji
from langdetect import detect
import re
import os

### Restrict Access and set YouTube Parameters

In [2]:
CLIENT_SECRETS_FILE = "client_secret.json"

SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

### Build the service and get the access token

In [3]:
def get_authenticated_service():
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
    credentials = flow.run_console()
    return build(API_SERVICE_NAME, API_VERSION, credentials = credentials)

os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
service = get_authenticated_service()


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=124038159274-bpvr7vijqn3bf9psatbu7f801qrdem3c.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=k1TMxG9UASdQIQ8jKMLuLiWvgnXCmf&prompt=consent&access_type=offline
Enter the authorization code: 4/1AY0e-g5AraOK250_SY3htbUXQoxSoLIhtDwnjRCbMCE8N8wyIuNyyCAsO0U


### Perform YouTube Search on query

In [4]:
query = "FULL Microsoft Xbox E3 2019 Press Conference"

query_results = service.search().list(part = 'snippet', q= query,
                                     order = 'relevance',
                                     type = 'video',
                                     relevanceLanguage = 'en',
                                     safeSearch = 'moderate',).execute()

In [5]:
query_results['items']

[{'kind': 'youtube#searchResult',
  'etag': 'nhdVA3fAQ9GfUge5Z5XNDRAHRfM',
  'id': {'kind': 'youtube#video', 'videoId': '73kSvsQ_kkA'},
  'snippet': {'publishedAt': '2019-06-09T21:48:12Z',
   'channelId': 'UCbu2SsF-Or3Rsn3NxqODImw',
   'title': 'FULL Microsoft Xbox E3 2019 Press Conference',
   'description': 'Tune in to catch Phil Spencer and the Microsoft team talk about all their latest game reveals, trailers, hardware reveals, and more at E3 2019. For even more ...',
   'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/73kSvsQ_kkA/default.jpg',
     'width': 120,
     'height': 90},
    'medium': {'url': 'https://i.ytimg.com/vi/73kSvsQ_kkA/mqdefault.jpg',
     'width': 320,
     'height': 180},
    'high': {'url': 'https://i.ytimg.com/vi/73kSvsQ_kkA/hqdefault.jpg',
     'width': 480,
     'height': 360}},
   'channelTitle': 'GameSpot',
   'liveBroadcastContent': 'none',
   'publishTime': '2019-06-09T21:48:12Z'}},
 {'kind': 'youtube#searchResult',
  'etag': 'mTlllIGdm3sYo7Q6

### Extract Video Details- videoId, channel Id, title, description

In [6]:
video_id = []
channel = []
video_title = []
video_desc = []
for item in query_results['items']:
    video_id.append(item['id']['videoId'])
    channel.append(item['snippet']['channelTitle'])
    video_title.append(item['snippet']['title'])
    video_desc.append(item['snippet']['description'])
    
# we only wants to work with one video so selecting only first element

video_id = video_id[0]
channel = channel[0]
video_title = video_title[0]
video_desc = video_desc[0]


### Extract Video details - comments 

In [7]:
video_id_pop = []
channel_pop = []
video_title_pop = []
video_desc_pop = []
comments_pop = []
comment_id_pop = []
reply_count_pop = []
like_count_pop = []

comments_temp = []
comment_id_temp = []
reply_count_temp = []
like_count_temp = []

In [8]:
nextPage_token = None

while 1:
    response = service.commentThreads().list(
        part  = 'snippet',
        videoId = video_id,
        maxResults = 100,
        order = 'relevance',
        textFormat = 'plainText',
        pageToken = nextPage_token
        ).execute()

    
    nextPage_token = response.get('nextPageToken')
    for item in response['items']:
        comments_temp.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
        comment_id_temp.append(item['snippet']['topLevelComment']['id'])
        reply_count_temp.append(item['snippet']['totalReplyCount'])
        like_count_temp.append(item['snippet']['topLevelComment']['snippet']['likeCount'])
        comments_pop.extend(comments_temp)
        comment_id_pop.extend(comment_id_temp)
        reply_count_pop.extend(reply_count_temp)
        like_count_pop.extend(like_count_temp)
        
        video_id_pop.extend([video_id]*len(comments_temp))
        channel_pop.extend([channel]*len(comments_temp))
        video_title_pop.extend([video_title]*len(comments_temp))
        video_desc_pop.extend([video_desc]*len(comments_temp))
        
    if nextPage_token is None:
        break

In [9]:
output_dict = {
        'Channel': channel_pop,
        'Video Title': video_title_pop,
        'Video Description': video_desc_pop,
        'Video ID': video_id_pop,
        'Comment': comments_pop,
        'Comment ID': comment_id_pop,
        'Replies': reply_count_pop,
        'Likes': like_count_pop
        
    }
    
output_df = pd.DataFrame(output_dict, columns = output_dict.keys())
    

In [10]:
output_df.head()

Unnamed: 0,Channel,Video Title,Video Description,Video ID,Comment,Comment ID,Replies,Likes
0,GameSpot,FULL Microsoft Xbox E3 2019 Press Conference,Tune in to catch Phil Spencer and the Microsof...,73kSvsQ_kkA,1:17 Outer Worlds (Gameplay Trailer )\n5:08 Bl...,UgyolGA3btmyXsv1yPF4AaABAg,45,1186
1,GameSpot,FULL Microsoft Xbox E3 2019 Press Conference,Tune in to catch Phil Spencer and the Microsof...,73kSvsQ_kkA,1:17 Outer Worlds (Gameplay Trailer )\n5:08 Bl...,UgyolGA3btmyXsv1yPF4AaABAg,45,1186
2,GameSpot,FULL Microsoft Xbox E3 2019 Press Conference,Tune in to catch Phil Spencer and the Microsof...,73kSvsQ_kkA,John Wick survives all the way to 2077. Legit.,UgyI1VT_X38sdK5WxH94AaABAg,13,1599
3,GameSpot,FULL Microsoft Xbox E3 2019 Press Conference,Tune in to catch Phil Spencer and the Microsof...,73kSvsQ_kkA,1:17 Outer Worlds (Gameplay Trailer )\n5:08 Bl...,UgyolGA3btmyXsv1yPF4AaABAg,45,1186
4,GameSpot,FULL Microsoft Xbox E3 2019 Press Conference,Tune in to catch Phil Spencer and the Microsof...,73kSvsQ_kkA,John Wick survives all the way to 2077. Legit.,UgyI1VT_X38sdK5WxH94AaABAg,13,1599


In [11]:
output_df.shape

(397386, 8)

In [12]:
#checking duplicates
duplicates = output_df[output_df.duplicated('Comment')]
duplicates.shape[0]

396501

In [13]:
#remove duplicate
df = output_df.drop_duplicates(subset=['Comment'])
df.shape

(885, 8)

In [14]:
# Saving data to csv
df.to_csv('data.csv', index = False)