# Initial Data Scrape Workbook

***

### We ideally want to scrape a dataset consisting of
- video thumbnails
- titles
- views
- parent channel

### YouTube API
https://developers.google.com/youtube/v3/getting-started#quota

Get credentials by going to https://console.developers.google.com/apis/dashboard?project=red-means-go
- Look for "YouTube Data API v3" in the library tab and make sure it's enabled.
- Select Credentials and get an api key

Daily limit of 10,000 "units" worth of requests.
- Different operations have different quota costs, so we need to be careful about what data we request.

We can fetch data more efficiently by requesting gzip-compressed responses, which the API supports.

In [None]:
# Run once
!pip install --upgrade google-api-python-client
!pip install --upgrade google-auth-oauthlib google-auth-httplib2
!pip install --upgrade google-api-core

### Desired scraping code
- config files to identify what categories of videos to scrape
- what level of popularity to lower bound our videos to
    - what measurement works for this? subscription to yearly average view count in relation to videos uploaded count?
- possible inversion config option to instead opt for getting the least popular videos(?)
- output to data/out/
    - /thumbs -- a folder full of thumbnails with identifying labels (possibly gzip compressed?)
    - videos.csv -- a .csv containing metadata on the videos that correspond to the thumbnails in the above folder.

### Possible search parameters
- Safesearch
    - none
    - moderate
    - strict

***

# Code

## Imports

In [5]:
import os
import json
import pandas as pd
import time
from PIL import Image
import requests
from io import BytesIO
import numpy as np

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

## Data Setup and Global Vars

In [6]:
# OAuth scope list. NOTE(review): nothing visible in this notebook reads it;
# the client below authenticates with an API key instead.
scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
# Read the API key from a JSON file two directories above the working
# directory; the file must contain an "api_key" entry.
with open('../../api_key.json') as json_file:
    cred = json.load(json_file)
api_key = cred['api_key']
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"
# Module-level API client used by the request helpers defined below.
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key)

## Library Functions

In [7]:
def request_playlist_videos(playlist_id, num_results, page_token=None):
    """Fetch one page of items from a playlist.

    Args:
        playlist_id: ID of the playlist to list.
        num_results: Value passed as ``maxResults`` for the page.
        page_token: Token of the page to fetch; when falsy, no ``pageToken``
            is sent and the first page is returned.

    Returns:
        The executed ``playlistItems().list`` response.
    """
    list_args = {
        "part": "snippet",
        "maxResults": num_results,
        "playlistId": playlist_id,
    }
    # Only send pageToken when the caller supplied one.
    if page_token:
        list_args["pageToken"] = page_token
    return youtube.playlistItems().list(**list_args).execute()

def request_topic_id(q_term):
    """Return the channel ID of the most relevant channel for a search term.

    Args:
        q_term: Query string passed to the search endpoint.

    Returns:
        The ``channelId`` from the snippet of the first search result.

    Raises:
        IndexError: If the search returns no items.
    """
    search_args = dict(
        part="snippet",
        q=q_term,
        maxResults="1",
        type="channel",
        order="relevance",
    )
    search_response = youtube.search().list(**search_args).execute()
    top_hit = search_response['items'][0]
    return top_hit['snippet']['channelId']

def request_recent_playlist_id(game_topic_channel):
    """Find the playlist behind a channel's "Recent Videos" section.

    Args:
        game_topic_channel: ID of the channel whose sections are listed.

    Returns:
        The first playlist ID of the section titled "Recent Videos", or
        ``None`` when the channel has no such section. If several sections
        carry that title, the last one listed wins.
    """
    request = youtube.channelSections().list(
        part="snippet,contentDetails",
        channelId=game_topic_channel,
    )
    response = request.execute()
    recent_playlist = None
    for section in response['items']:
        try:
            if section['snippet']['localized']['title'] == "Recent Videos":
                recent_playlist = section['contentDetails']['playlists'][0]
        except (KeyError, IndexError, TypeError):
            # Sections without a localized title or without any playlists
            # are skipped. Only lookup failures are caught, so unrelated
            # errors (and KeyboardInterrupt) still propagate.
            continue
    return recent_playlist

def generate_dataset(q_term, num_channels, videos_per_channel):
    """Run the full scrape pipeline for one search term.

    Steps: look up the channel matching ``q_term + " topic"``, find its
    "Recent Videos" playlist, collect video IDs from it, resolve each video's
    uploading channel, then collect that channel's uploads mentioning
    ``q_term``.

    Args:
        q_term: Search term (e.g. a game title).
        num_channels: Number of video IDs to collect from the recent playlist.
        videos_per_channel: Accepted but not used by this function's body.

    Returns:
        The list of result dicts built by ``generate_result_dics``.
    """
    topic_channel = request_topic_id(q_term + " topic")
    playlist_id = request_recent_playlist_id(topic_channel)
    first_page = request_playlist_videos(playlist_id, 5)
    seed_videos = get_video_ids(playlist_id, first_page, num_channels)
    uploaders = get_parent_channels(seed_videos)
    videos_by_channel = populate_channel_game_videos(q_term, uploaders)
    return generate_result_dics(seed_videos, uploaders, videos_by_channel)

def get_video_ids(playlist_id, playlist_details, num_vids):
    """Collect up to ``num_vids`` video IDs from a playlist, paging as needed.

    Args:
        playlist_id: ID of the playlist, used to request follow-up pages.
        playlist_details: Already-fetched first page of the playlist.
        num_vids: Maximum number of video IDs to return.

    Returns:
        A list of video IDs in playlist order. It is shorter than
        ``num_vids`` when the playlist runs out of pages first, and empty
        when ``num_vids`` is zero or negative.
    """
    recent_video_ids = []
    cur_page = playlist_details
    while len(recent_video_ids) < num_vids:
        for vid_data in cur_page['items']:
            recent_video_ids.append(vid_data['snippet']['resourceId']['videoId'])
            if len(recent_video_ids) >= num_vids:
                return recent_video_ids
        # The last page carries no nextPageToken; stop instead of raising.
        next_token = cur_page.get('nextPageToken')
        if not next_token:
            break
        # Request another page only when more IDs are still needed, so no
        # quota is spent on a page that would be thrown away.
        cur_page = request_playlist_videos(playlist_id, 5, next_token)
    return recent_video_ids

def get_parent_channels(video_ids):
    """Look up the uploading channel of each video.

    Args:
        video_ids: Iterable of video IDs.

    Returns:
        A list of channel IDs, one per input video and in the same order.
        Each lookup is a separate ``request_sparse_video_details`` call.
    """
    return [
        request_sparse_video_details(vid)['items'][0]['snippet']['channelId']
        for vid in video_ids
    ]

def request_sparse_video_details(video_id):
    """Fetch only the ``snippet`` part of a single video.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The executed ``videos().list`` response.
    """
    # Reuse the module-level client, as the other request helpers do,
    # instead of building a new client on every call.
    # Note that this uses youtube.videos instead of youtube.search.
    request = youtube.videos().list(
        part="snippet",
        id=video_id,
    )
    return request.execute()

def get_channel_game_videos(game, parent):
    """Return a channel's recent uploads whose title or description mention a game.

    Only the first page of the uploads playlist (requested with a page size
    of 50) is examined.

    Args:
        game: Search term; matched case-insensitively as a substring.
        parent: ID of the channel whose uploads are scanned.

    Returns:
        A list of video IDs, in playlist order, whose title or description
        contains ``game``.
    """
    request = youtube.channels().list(
        part="snippet,contentDetails",
        id=parent,
    )
    response = request.execute()
    uploads_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    uploads_details = request_playlist_videos(uploads_id, 50)
    # Titles and descriptions are lower-cased below, so the search term must
    # be lower-cased too or any capital letter in it prevents every match.
    needle = game.lower()
    game_vids = []
    for vid_data in uploads_details['items']:
        snippet = vid_data['snippet']
        vid_title = snippet['title'].lower()
        vid_desc = snippet['description'].lower()
        if needle in vid_title or needle in vid_desc:
            game_vids.append(snippet['resourceId']['videoId'])
    return game_vids

def populate_channel_game_videos(game, parents):
    """Map each distinct channel to its videos mentioning ``game``.

    Args:
        game: Search term forwarded to ``get_channel_game_videos``.
        parents: Iterable of channel IDs; duplicates are looked up only once.

    Returns:
        A dict from channel ID to the list returned by
        ``get_channel_game_videos`` for that channel.
    """
    videos_by_channel = {}
    for channel_id in parents:
        # A channel that was already fetched is not requested again.
        if channel_id in videos_by_channel:
            continue
        videos_by_channel[channel_id] = get_channel_game_videos(game, channel_id)
    return videos_by_channel

def generate_result_dics(videos, parents, channel_videos):
    """Assemble one result record per entry of ``parents``.

    Args:
        videos: Sequence of video IDs, indexed in step with ``parents``.
        parents: Sequence of channel IDs; its length sets the number of records.
        channel_videos: Mapping from channel ID to that channel's video list.

    Returns:
        A list of dicts with the keys ``video_id``, ``position``,
        ``channel_id`` and ``channel_videos``.
    """
    return [
        {
            "video_id": videos[idx],
            "position": idx,
            "channel_id": channel_id,
            "channel_videos": channel_videos[channel_id],
        }
        for idx, channel_id in enumerate(parents)
    ]

## Full Run

In [8]:
# Build the result list for "fortnite"; this issues live YouTube API requests.
fortnite = generate_dataset("fortnite", 5, 5)

In [9]:
# Build the result list for "apex legends"; this issues live YouTube API requests.
apex = generate_dataset("apex legends", 5, 5)

***

# Old Functions / Code that is no longer in use

In [None]:
# def general_search(q_term, num_results, res_type, channel_id, order_type, page_token=None):
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
    
#     if page_token:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type=res_type,
#             channelId=channel_id,
#             order=order_type,
#             pageToken=page_token
#         )
#     else:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type=res_type,
#             channelId=channel_id,
#             order=order_type
#         )
#     response = request.execute()
#     return response

# def request_channels(q_term, num_results,order_type):
#     # all arguments must be strings, returns json object with list of channels
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
    
#     # Search parameters
#     request = youtube.search().list(
#         part="snippet",
#         q=q_term,
#         maxResults=num_results,
#         type="channel",
#         order=order_type
#     )
#     response = request.execute()

#     return response

# def request_channel_videos(channel_id, num_results, order_type, page_token):
#     # all arguments must be strings, returns json object with list of videos from the given channel
    
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
#     if page_token:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type="video",
#             channelId=channel_id,
#             order=order_type,
#             pageToken=page_token
#         )
#     else:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type="video",
#             channelId=channel_id,
#             order=order_type
#         )
#     response = request.execute()
#     return response

# def request_video_details(video_id):
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
#     # note that this uses youtube.videos instead of youtube.search
#     request = youtube.videos().list(
#         part="snippet,contentDetails,statistics",
#         id=video_id
#     )
#     response = request.execute()
#     return response

# def get_vid_stats(vid):
#     channel_id = vid['snippet']['channelId']
#     channel_title = vid['snippet']['channelTitle']
#     try:
#         thumbnail_link = vid['snippet']['thumbnails']['maxres']['url']
#     except:
#         thumbnail_link = vid['snippet']['thumbnails']['high']['url']
#     title = vid['snippet']['title']
#     date = vid['snippet']['publishedAt']
#     views = vid['statistics']['viewCount']
#     likes = vid['statistics']['likeCount']
#     dislikes = vid['statistics']['dislikeCount']
#     comments = vid['statistics']['commentCount']
#     stats = [channel_id, channel_title, thumbnail_link, title, date, views, likes, dislikes, comments]
#     return stats

# # full pipeline for scraping and generating dataframe
# def main_pipeline():
#     metadata = []
#     # get initial search results (usually going to be a list of channels)
#     out = request_channels("gaming","5","relevance")
#     data = out['items']
#     # get channel ids from search results
#     channel_ids = []
#     for channel in data:
#         cur_channel_id = channel['snippet']['channelId']
#         channel_ids.append(cur_channel_id)
#         # get channel videos from the current channel id (we can also choose to handpick channels / videos)
#         videos = request_channel_videos(cur_channel_id,"5","date",None)
#         video_ids = []
#         for video in videos['items']:
#             cur_id = video['id']['videoId']
#             video_ids.append(cur_id)
#             # use youtube videos api to get metadata about a single video, by video id
#             cur_vid = request_video_details(cur_id)['items'][0]
#             row = get_vid_stats(cur_vid)
#             metadata.append(row)
#         time.sleep(1)

#     # create a dataframe from the gathered metadata
#     columns = ['channelId','channelTitle','thumbnailLink',
#                'videoTitle','Date','Views',
#                'Likes','Dislikes','Comments']
#     df = pd.DataFrame(metadata,columns=columns)
#     return df

# def get_channel_id(q_term):
#     # rough channel conversion function (may not work all the time)
    
#     init_search = request_channels(q_term,"1","relevance")
#     chan_id = init_search['items'][0]['snippet']['channelId']
#     return chan_id

# def generate_video_data(video_list):
#     # generates video data for each video in given list and returns a formatted df of video statistics
#     channel_data = []
#     for vid in video_list['items']:
#         cur_id = vid['id']['videoId']
#         cur_vid = request_video_details(cur_id)['items'][0]
#         row = get_vid_stats(cur_vid)
#         channel_data.append(row)
#     columns = ['channelId','channelTitle','thumbnailLink',
#                'videoTitle','Date','Views',
#                'Likes','Dislikes','Comments']
#     df = pd.DataFrame(channel_data,columns=columns)
#     df['Views'] = df['Views'].astype(int)
#     df['Likes'] = df['Likes'].astype(int)
#     df['Dislikes'] = df['Dislikes'].astype(int)
#     df['Comments'] = df['Comments'].astype(int)
#     return df

# def generate_thumbnails(thumbnail_list):
#     # just makes a request call to the thumbnail link
#     imgs = []
#     for link in thumbnail_list:
#         response = requests.get(link)
#         img = Image.open(BytesIO(response.content))
#         imgs.append(img)
#     return imgs

# def evaluate_videos(df):
#     # evaluated based on z score of views and likes currently, very rough approx
#     avg_views = df['Views'].mean()
#     avg_likes = df['Likes'].mean()
#     avg_dislikes = df['Dislikes'].mean()
#     avg_comments = df['Comments'].mean()
    
#     std_views = np.std(df['Views'])
#     std_likes = np.std(df['Likes'])
#     std_dislikes = np.std(df['Dislikes'])
#     std_comments = np.std(df['Comments'])
    
#     z_views = df['Views'].apply(lambda x: (x - avg_views) / std_views)
#     z_likes = df['Likes'].apply(lambda x: (x - avg_likes) / std_likes)
#     z_dislikes = df['Dislikes'].apply(lambda x: (x - avg_dislikes) / std_dislikes)
#     z_comments = df['Comments'].apply(lambda x: (x - avg_comments) / std_comments)
    
#     good_videos = []
#     bad_videos = []
#     for i in range(len(df)):
#         if z_views[i] > .3 and z_likes[i] > .3:
#             good_videos.append(i)
#         if z_views[i] < -.5 and z_likes[i] < -.5:
#             bad_videos.append(i)

#     return good_videos, bad_videos

# def analyze_channel(keyword):
#     channel_id = get_channel_id(keyword)
#     channel_videos = request_channel_videos(channel_id, "30", "date", None)
#     video_data_df = generate_video_data(channel_videos)
#     video_thumbnails = generate_thumbnails(video_data_df['thumbnailLink'].values)
#     good_videos, bad_videos = evaluate_videos(video_data_df)
#     return good_videos, bad_videos

In [None]:
# FULL RUN
# Run this if you want to scrape data and generate a dataset. BE CAREFUL: THIS USES MANY API CALLS.
# meta_df = main_pipeline()

In [None]:
# # get initial search data (usually a list of channels)
# out = request_channels("gaming","5","relevance")
# data = out['items']

In [None]:
# # get channel ids from search results
# channel_ids = []
# for channel in data:
#     cur_channel_id = channel['snippet']['channelId']
#     channel_ids.append(cur_channel_id)

In [None]:
# # get channel videos from a specific channel id
# cur_channel_id = channel_ids[1] # arbitrary channel id for demo purposes
# videos = request_channel_videos(cur_channel_id,"5","date",None)
# video_ids = []
# for video in videos['items']:
#     video_ids.append(video['id']['videoId'])

In [None]:
# metadata = []
# # use youtube videos api to get metadata about a single video, by video id
# for cur_id in video_ids:
#     cur_vid = request_video_details(cur_id)['items'][0]
#     row = get_vid_stats(cur_vid)
#     metadata.append(row)    

In [None]:
# # create a dataframe from the gathered metadata
# columns = ['channelId','channelTitle','thumbnailLink','videoTitle','Date','Views','Likes','Dislikes','Comments']
# df = pd.DataFrame(metadata,columns=columns)

In [None]:
# # handpicked youtubers to analyze (channel ids)
# pokeaimMD = 'UCbXuFrNGSKcmZY_5DwYz4Ew'
# pewdiepie = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
# ninja = 'UCAW-NpUFkMyCNrvRSSGIvDQ'
# abdallah = 'UCsDtTzkvGxxw95C4IOfZ7dw'
# ytdan = 'UCXgNU9GtLPiE2dAYDwIQO6Q'
# # possible idea is to just use most relevant to "gaming" search term

# Full pipeline analysis

In [None]:
# good_videos, bad_videos = analyze_channel("ninja")
# good_videos, bad_videos

# Individual Example (Step-by-Step)

In [None]:
# ninja_videos = request_channel_videos(ninja, "30", "date", None) # first 30 videos
# next_page = ninja_videos['nextPageToken']
# ninja_videos2 = request_channel_videos(ninja, "30", "date",next_page) # next 30 videos

In [None]:
# ninja_videos = generate_video_data(ninja_videos)
# ninja_thumbnails = generate_thumbnails(ninja_videos['thumbnailLink'].values)
# good_videos, bad_videos = evaluate_videos(ninja_videos)

In [None]:
# # check to see that output of full pipeline is similar to step by step process
# good_videos, bad_videos

In [None]:
# # analysis of next 30 ninja videos
# more_ninja_videos = generate_video_data(ninja_videos2)
# more_ninja_thumbnails = generate_thumbnails(more_ninja_videos['thumbnailLink'].values)
# good_videos2, bad_videos2 = evaluate_videos(more_ninja_videos)

# Actually Displaying the good/bad thumbnails

In [None]:
# # good thumbnails
# for i in good_videos:
#     display(ninja_thumbnails[i])

In [None]:
# # more good videos
# for i in good_videos2:
#     display(more_ninja_thumbnails[i])

In [None]:
# # bad videos
# for i in bad_videos:
#     display(ninja_thumbnails[i])

In [None]:
# # more bad videos
# for i in bad_videos2:
#     display(more_ninja_thumbnails[i])