# Initial Data Scrape Workbook

***

### We ideally want to scrape a dataset consisting of
- video thumbnails
- titles
- views
- parent channel

### YouTube API
https://developers.google.com/youtube/v3/getting-started#quota

Get credentials by going to https://console.developers.google.com/apis/dashboard?project=red-means-go
- Look for "YouTube Data API v3" in the library tab and make sure it's enabled.
- Select "Credentials" and create an API key.

Daily limit of 10,000 "units" worth of requests.
- Different operations have different cost values, need to be careful what data we request.

We can fetch data more efficiently by requesting gzip-compressed responses, which the API offers.

In [None]:
# Run once
# IPython shell commands: install/upgrade the Google API client and auth
# packages that the imports further down rely on.
!pip install --upgrade google-api-python-client
!pip install --upgrade google-auth-oauthlib google-auth-httplib2
!pip install --upgrade google-api-core

### Desired scraping code
- config files to identify what categories of videos to scrape
- what level of popularity to lower bound our videos to
    - which metric works for this? e.g. the ratio of subscriber count to yearly average view count, relative to the number of videos uploaded?
- possible inversion config option to instead opt for getting the least popular videos(?)
- output to data/out/
    - /thumbs -- a folder full of thumbnails with identifying labels (possibly gzip compressed?)
    - videos.csv -- a .csv containing metadata on the videos that correspond to the thumbnails in the above folder.

### Possible search parameters
- Safesearch
    - none
    - moderate
    - strict

***

# Code

## Imports

In [None]:
import os
import json
import pandas as pd
import time
from PIL import Image
import requests
from io import BytesIO
import numpy as np
from datetime import datetime
import dateutil.relativedelta

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

## Data Setup and Global Vars

In [None]:
# OAuth scope string; not referenced by any other code visible in this notebook.
scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
# The API key is read from a JSON file two directories up; the file must
# contain an "api_key" entry.
with open('../../api_key.json') as json_file:
    cred = json.load(json_file)
api_key = cred['api_key']
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"
# Shared API client: the library functions below read this module-level
# `youtube` object instead of receiving a client as an argument.
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=api_key)

## Library Functions

In [None]:
def full_run_search_result(q_term, num_search_results, videos_per_channel):
    """Scrape search results for ``q_term`` plus the uploads of each result's channel.

    Pipeline:
      1. ``iterate_search_results`` collects up to ``num_search_results`` video
         ids together with the id of each video's parent channel.
      2. ``populate_channel_game_videos`` maps every distinct parent channel id
         to a list of video ids taken from that channel
         (``videos_per_channel`` is forwarded as the per-channel amount).
      3. ``generate_result_dics`` merges both into one dictionary per search
         result (video id, position, channel id, channel videos).

    Returns the list of dictionaries built in step 3.
    """
    divider = "------------------"

    found_videos, found_parents = iterate_search_results(q_term, num_search_results)
    uploads_by_channel = populate_channel_game_videos(
        q_term, found_parents, videos_per_channel)

    print(divider)
    print("unique channels gathered: ", len(uploads_by_channel))
    print(divider)

    print(divider)
    print("Aggregating Results...")
    results = generate_result_dics(found_videos, found_parents, uploads_by_channel)
    print("Data Successfully scraped!")
    print(divider)
    return results

def full_run_topic_channel(q_term, num_recent_videos, videos_per_channel):
    """Scrape via the topic channel found by searching for ``q_term + " topic"``.

    Pipeline:
      1. ``request_topic_id`` returns the channel id of the top channel hit.
      2. ``request_recent_playlist_id`` returns the playlist id of that
         channel's "Recent Videos" section.
      3. ``get_video_ids`` reads ``num_recent_videos`` video ids from it.
      4. ``get_parent_channels`` looks up the owning channel of each video.
      5. ``populate_channel_game_videos`` maps each distinct owning channel to
         a list of its video ids (``videos_per_channel`` is forwarded).
      6. ``generate_result_dics`` merges everything into one dictionary per
         playlist video (video id, position, channel id, channel videos).

    Returns the list of dictionaries built in step 6.
    """
    channel_id = request_topic_id(q_term + " topic")
    recent_playlist = request_recent_playlist_id(channel_id)

    recent_videos = get_video_ids(recent_playlist, num_recent_videos)
    owners = get_parent_channels(recent_videos)

    uploads_by_channel = populate_channel_game_videos(
        q_term, owners, videos_per_channel)
    return generate_result_dics(recent_videos, owners, uploads_by_channel)

def generate_dataset(q_term, num_recent_videos, videos_per_channel):
    """Run ``full_run_search_result`` for ``q_term`` and print the elapsed time.

    ``num_recent_videos`` is passed on as the number of search results to
    collect. Returns whatever ``full_run_search_result`` returns.
    """
    started_at = datetime.now()
    dataset = full_run_search_result(q_term, num_recent_videos, videos_per_channel)
    elapsed = datetime.now() - started_at
    print("time elapsed:", elapsed)
    return dataset


def generate_result_dics(videos, parents, channel_videos):
    """Build one result dictionary per entry of ``videos``.

    ``videos`` and ``parents`` are parallel sequences: ``parents[i]`` is the
    channel id belonging to ``videos[i]``. ``channel_videos`` maps a channel id
    to that channel's list of video ids.

    Each returned dictionary holds the keys ``video_id``, ``position`` (the
    index within ``videos``), ``channel_id`` and ``channel_videos``.
    """
    return [
        {
            "video_id": video,
            "position": rank,
            "channel_id": parents[rank],
            "channel_videos": channel_videos[parents[rank]],
        }
        for rank, video in enumerate(videos)
    ]


def get_channel_game_videos(game, parent, num_vids):
    """Return up to ``num_vids`` video ids from a channel's uploads playlist.

    Args:
        game: Not used by this function; kept so existing callers keep working.
        parent: Id of the channel whose uploads playlist is read.
        num_vids: Maximum number of video ids to collect.

    Returns:
        A list of video id strings in playlist order. The list is shorter than
        ``num_vids`` when the playlist has fewer videos, when a follow-up page
        request fails, or when the channel lookup returns no items.
    """
    if num_vids <= 0:
        # Nothing requested: avoid spending API quota.
        return []

    response = youtube.channels().list(
        part="snippet,contentDetails",
        id=parent,
    ).execute()
    channels = response.get('items') or []
    if not channels:
        print("No channel data returned for", parent)
        return []
    uploads_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']

    # A single page holds at most 50 playlist items.
    page_size = min(num_vids, 50)
    game_vids = []
    page = request_playlist_videos(uploads_id, page_size)
    while True:
        for vid_data in page.get('items', []):
            if len(game_vids) >= num_vids:
                break
            game_vids.append(vid_data['snippet']['resourceId']['videoId'])

        # A missing nextPageToken is the normal end of the playlist, not an error.
        next_token = page.get('nextPageToken')
        if len(game_vids) >= num_vids or not next_token:
            break

        time.sleep(3)  # trying to not overload the api
        try:
            page = request_playlist_videos(uploads_id, page_size, next_token)
        except googleapiclient.errors.HttpError as err:
            # Best effort: keep what was collected so one failing page does
            # not discard the whole channel.
            print("Stopped paging uploads for channel", parent, "-", err)
            break
    return game_vids


def get_parent_channels(video_ids):
    """Return the owning channel id for every id in ``video_ids``, in order.

    One ``request_sparse_video_details`` lookup is made per video; the channel
    id is read from the first item of each response.
    """
    return [
        request_sparse_video_details(video)['items'][0]['snippet']['channelId']
        for video in video_ids
    ]


def get_video_ids(playlist_id, num_vids):
    """Return up to ``num_vids`` video ids from a playlist, in playlist order.

    Args:
        playlist_id: Id of the playlist to read.
        num_vids: Maximum number of video ids to collect.

    Returns:
        A list of video id strings. It is shorter than ``num_vids`` when the
        playlist does not contain that many items.
    """
    if num_vids <= 0:
        # Nothing requested: avoid spending API quota.
        return []

    # A single page holds at most 50 playlist items.
    page_size = min(num_vids, 50)
    recent_video_ids = []
    page = request_playlist_videos(playlist_id, page_size)
    while True:
        for vid_data in page.get('items', []):
            if len(recent_video_ids) >= num_vids:
                break
            recent_video_ids.append(vid_data['snippet']['resourceId']['videoId'])

        # The last page carries no nextPageToken; stop there instead of
        # raising KeyError.
        next_token = page.get('nextPageToken')
        if len(recent_video_ids) >= num_vids or not next_token:
            break

        time.sleep(3)  # pause between page requests
        page = request_playlist_videos(playlist_id, page_size, next_token)
    return recent_video_ids

def iterate_search_results(q_term, num_results):
    """Collect video ids and parent channel ids from search results.

    Pages through ``search_result(q_term, ...)`` until ``num_results`` videos
    were gathered or no further page is available, printing progress after
    every page.

    Args:
        q_term: Search query term.
        num_results: Maximum number of search results to collect.

    Returns:
        A ``(video_ids, parent_ids)`` tuple of two parallel lists:
        ``parent_ids[i]`` is the channel id of ``video_ids[i]``. Both lists
        may be shorter than ``num_results`` when the search runs out of pages
        or a follow-up page request fails.
    """
    print("------------------")
    print("Starting iteration of search results...")
    video_ids = []
    parent_ids = []

    if num_results > 0:
        # A single search page holds at most 50 results.
        page_size = min(num_results, 50)
        page = search_result(q_term, page_size)
        while True:
            for vid_data in page.get('items', []):
                if len(video_ids) >= num_results:
                    break
                video_ids.append(vid_data['id']['videoId'])
                parent_ids.append(vid_data['snippet']['channelId'])
            print("Current results retrieved:", len(video_ids), 100*len(video_ids)/num_results, "%")

            if len(video_ids) >= num_results:
                break
            # A missing nextPageToken means the search has no more pages.
            next_token = page.get('nextPageToken')
            if not next_token:
                print("No new pages. Returning current video ids and parent ids")
                break
            try:
                page = search_result(q_term, page_size, next_token)
            except googleapiclient.errors.HttpError as err:
                # Best effort: keep the results gathered so far, but say why
                # the iteration stopped instead of hiding the failure.
                print("Search request failed. Returning current video ids and parent ids:", err)
                break

    print("Done iterating search results!")
    print("------------------")
    return video_ids, parent_ids

def populate_channel_game_videos(game, parents, num_vids):
    """Map every distinct channel id in ``parents`` to a list of its video ids.

    ``parents`` may contain duplicates; ``get_channel_game_videos`` is called
    only the first time a channel id is seen. Progress is printed on every
    fifth entry of ``parents`` (duplicates included).

    Returns a dictionary ``{channel_id: [video_id, ...]}``.
    """
    divider = "------------------"
    total = len(parents)

    print(divider)
    print("Starting retrieval of channel videos for", total, "channels...")
    uploads_by_channel = {}
    for processed, channel_id in enumerate(parents):
        if processed % 5 == 0:
            print("Channels completed: " + str(processed), 100*processed/total, "%")
        if channel_id in uploads_by_channel:
            continue
        uploads_by_channel[channel_id] = get_channel_game_videos(game, channel_id, num_vids)
    print("Done Retrieving Channel Videos!")
    print(divider)
    return uploads_by_channel

def request_playlist_videos(playlist_id, num_results, page_token=None):
    """Request one page of a playlist's items (``snippet`` part) and return the response.

    ``num_results`` is sent as ``maxResults``. ``page_token`` is only included
    in the request when it is truthy.
    """
    params = {
        "part": "snippet",
        "maxResults": num_results,
        "playlistId": playlist_id,
    }
    if page_token:
        params["pageToken"] = page_token
    return youtube.playlistItems().list(**params).execute()


def request_recent_playlist_id(game_topic_channel):
    """Return the playlist id behind a channel's "Recent Videos" section.

    Args:
        game_topic_channel: Id of the channel whose sections are listed.

    Returns:
        The first playlist id of the section whose localized title is
        "Recent Videos" (the last such section wins if there are several),
        or ``None`` when no section matches.
    """
    response = youtube.channelSections().list(
        part="snippet,contentDetails",
        channelId=game_topic_channel,
    ).execute()

    recent_playlist = None
    for section in response.get('items', []):
        # Not every section carries a localized title or a playlist list, so
        # read the fields defensively rather than catching every exception.
        title = section.get('snippet', {}).get('localized', {}).get('title')
        if title != "Recent Videos":
            continue
        playlists = section.get('contentDetails', {}).get('playlists') or []
        if playlists:
            recent_playlist = playlists[0]
    return recent_playlist


def request_sparse_video_details(video_id):
    """Fetch the ``snippet`` part for ``video_id`` and return the raw response.

    Args:
        video_id: Id passed as the ``id`` parameter of the videos endpoint.

    Returns:
        The response dictionary of the request.
    """
    # Use the module-level `youtube` client like the other request helpers.
    # This function runs once per video, so constructing a fresh client on
    # every call only adds repeated setup work.
    # note that this uses youtube.videos instead of youtube.search
    request = youtube.videos().list(
        part="snippet",
        id=video_id,
    )
    return request.execute()


def request_topic_id(q_term):
    """Return the channel id of the most relevant channel found for ``q_term``.

    A single channel-type search result is requested, ordered by relevance;
    the channel id is read from that first result.
    """
    search_params = {
        "part": "snippet",
        "q": q_term,
        "maxResults": "1",
        "type": "channel",
        "order": "relevance",
    }
    response = youtube.search().list(**search_params).execute()
    top_hit = response['items'][0]
    return top_hit['snippet']['channelId']

def save_to_json(data, date, dir_path, fname, overwrite=False):
    """Write ``data`` and its scrape date to ``dir_path/fname`` as JSON.

    The file content is ``{"date_scraped": date, "data": data}``.

    Args:
        data: JSON-serializable scrape result.
        date: Value stored under ``date_scraped``.
        dir_path: Output directory, with or without a trailing separator;
            created (including parents) when missing.
        fname: Output file name.
        overwrite: Allow replacing an existing file.

    Raises:
        ValueError: The target file exists and ``overwrite`` is false.
    """
    # os.path.join keeps the file inside dir_path even without a trailing slash.
    out_path = os.path.join(dir_path, fname)
    if os.path.exists(out_path) and not overwrite:
        raise ValueError("Attempting to overwrite existing data. If you want to proceed pass overwrite=True")

    # Serialize before opening the file so a serialization error cannot leave
    # a truncated file behind (or wipe an existing one when overwriting).
    payload = json.dumps({"date_scraped": date, "data": data})

    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(out_path, 'w', encoding="utf-8") as outfile:
        outfile.write(payload)
            
def search_result(q_term, max_results, page_token=None):
    """Request one page of video search results published within the last month.

    Sleeps three seconds before every request to go easy on the API.

    Args:
        q_term: Search query term.
        max_results: Sent as ``maxResults`` for the page.
        page_token: Token of the page to fetch; omitted from the request when
            falsy, which requests the first page.

    Returns:
        The response dictionary of the search request.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    # The trailing "Z" marks the timestamp as UTC, so the cutoff has to be
    # computed from UTC time rather than from naive local time.
    one_month_ago = datetime.now(timezone.utc) - dateutil.relativedelta.relativedelta(months=1)
    published_after = one_month_ago.strftime("%Y-%m-%dT%H:%M:%SZ")

    time.sleep(3)
    params = {
        "part": "snippet",
        "q": q_term,
        "maxResults": max_results,
        "type": "video",
        "order": "relevance",
        "publishedAfter": published_after,
    }
    if page_token:
        params["pageToken"] = page_token
    return youtube.search().list(**params).execute()

## Full Run

In [247]:
# Scrape configuration shared by the run cells below.
number_of_search_results = 200  # passed to generate_dataset as the number of search results
videos_per_channel = 100  # passed to generate_dataset as the per-channel video count
# Local date formatted as month_day_year (two digits each); part of the output file name.
date = time.strftime("%m_%d_%y",time.localtime())
fname = "scrape_" + date + ".json"
# Output directory handed to save_to_json.
dir_path = "../../data/out/fortnite/"

In [None]:
# Scrape the "fortnite" search results; this issues live API requests.
fortnite = generate_dataset("fortnite", number_of_search_results, videos_per_channel)

In [None]:
# checking the amount of videos for each channel
# (one line is printed per scraped search result, so a channel that owns
# several results shows up several times)
# NOTE(review): prev_api is not read anywhere in the visible notebook;
# presumably a manual note of API quota usage — confirm or remove.
prev_api = 683
for channel in fortnite:
    print(len(channel['channel_videos']))

In [248]:
# saving dataset to json
# (save_to_json raises ValueError if the file already exists, since
# overwrite is left at its default of False)
save_to_json(fortnite,date,dir_path,fname)

In [None]:
# Scrape the "apex legends" search results with the same settings; the
# result is only held in memory by this cell.
apex = generate_dataset("apex legends", number_of_search_results, videos_per_channel)

***

# Old Functions / Code that is no longer in use

In [None]:
# def general_search(q_term, num_results, res_type, channel_id, order_type, page_token=None):
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
    
#     if page_token:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type=res_type,
#             channelId=channel_id,
#             order=order_type,
#             pageToken=page_token
#         )
#     else:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type=res_type,
#             channelId=channel_id,
#             order=order_type
#         )
#     response = request.execute()
#     return response

# def request_channels(q_term, num_results,order_type):
#     # all arguments must be strings, returns json object with list of channels
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
    
#     # Search parameters
#     request = youtube.search().list(
#         part="snippet",
#         q=q_term,
#         maxResults=num_results,
#         type="channel",
#         order=order_type
#     )
#     response = request.execute()

#     return response

# def request_channel_videos(channel_id, num_results, order_type, page_token):
#     # all arguments must be strings, returns json object with list of videos from the given channel
    
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
#     if page_token:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type="video",
#             channelId=channel_id,
#             order=order_type,
#             pageToken=page_token
#         )
#     else:
#         request = youtube.search().list(
#             part="snippet",
#             maxResults=num_results,
#             type="video",
#             channelId=channel_id,
#             order=order_type
#         )
#     response = request.execute()
#     return response

# def request_video_details(video_id):
#     youtube = googleapiclient.discovery.build(
#         api_service_name, api_version, developerKey=api_key)
#     # note that this uses youtube.videos instead of youtube.search
#     request = youtube.videos().list(
#         part="snippet,contentDetails,statistics",
#         id=video_id
#     )
#     response = request.execute()
#     return response

# def get_vid_stats(vid):
#     channel_id = vid['snippet']['channelId']
#     channel_title = vid['snippet']['channelTitle']
#     try:
#         thumbnail_link = vid['snippet']['thumbnails']['maxres']['url']
#     except:
#         thumbnail_link = vid['snippet']['thumbnails']['high']['url']
#     title = vid['snippet']['title']
#     date = vid['snippet']['publishedAt']
#     views = vid['statistics']['viewCount']
#     likes = vid['statistics']['likeCount']
#     dislikes = vid['statistics']['dislikeCount']
#     comments = vid['statistics']['commentCount']
#     stats = [channel_id, channel_title, thumbnail_link, title, date, views, likes, dislikes, comments]
#     return stats

# # full pipeline for scraping and generating dataframe
# def main_pipeline():
#     metadata = []
#     # get initial search results (usually going to be a list of channels)
#     out = request_channels("gaming","5","relevance")
#     data = out['items']
#     # get channel ids from search results
#     channel_ids = []
#     for channel in data:
#         cur_channel_id = channel['snippet']['channelId']
#         channel_ids.append(cur_channel_id)
#         # get channel videos from the current channel id (we can also choose to handpick channels / videos)
#         videos = request_channel_videos(cur_channel_id,"5","date",None)
#         video_ids = []
#         for video in videos['items']:
#             cur_id = video['id']['videoId']
#             video_ids.append(cur_id)
#             # use youtube videos api to get metadata about a single video, by video id
#             cur_vid = request_video_details(cur_id)['items'][0]
#             row = get_vid_stats(cur_vid)
#             metadata.append(row)
#         time.sleep(1)

#     # create a dataframe from the gathered metadata
#     columns = ['channelId','channelTitle','thumbnailLink',
#                'videoTitle','Date','Views',
#                'Likes','Dislikes','Comments']
#     df = pd.DataFrame(metadata,columns=columns)
#     return df

# def get_channel_id(q_term):
#     # rough channel conversion function (may not work all the time)
    
#     init_search = request_channels(q_term,"1","relevance")
#     chan_id = init_search['items'][0]['snippet']['channelId']
#     return chan_id

# def generate_video_data(video_list):
#     # generates video data for each video in given list and returns a formatted df of video statistics
#     channel_data = []
#     for vid in video_list['items']:
#         cur_id = vid['id']['videoId']
#         cur_vid = request_video_details(cur_id)['items'][0]
#         row = get_vid_stats(cur_vid)
#         channel_data.append(row)
#     columns = ['channelId','channelTitle','thumbnailLink',
#                'videoTitle','Date','Views',
#                'Likes','Dislikes','Comments']
#     df = pd.DataFrame(channel_data,columns=columns)
#     df['Views'] = df['Views'].astype(int)
#     df['Likes'] = df['Likes'].astype(int)
#     df['Dislikes'] = df['Dislikes'].astype(int)
#     df['Comments'] = df['Comments'].astype(int)
#     return df

# def generate_thumbnails(thumbnail_list):
#     # just makes a request call to the thumbnail link
#     imgs = []
#     for link in thumbnail_list:
#         response = requests.get(link)
#         img = Image.open(BytesIO(response.content))
#         imgs.append(img)
#     return imgs

# def evaluate_videos(df):
#     # evaluated based on z score of views and likes currently, very rough approx
#     avg_views = df['Views'].mean()
#     avg_likes = df['Likes'].mean()
#     avg_dislikes = df['Dislikes'].mean()
#     avg_comments = df['Comments'].mean()
    
#     std_views = np.std(df['Views'])
#     std_likes = np.std(df['Likes'])
#     std_dislikes = np.std(df['Dislikes'])
#     std_comments = np.std(df['Comments'])
    
#     z_views = df['Views'].apply(lambda x: (x - avg_views) / std_views)
#     z_likes = df['Likes'].apply(lambda x: (x - avg_likes) / std_likes)
#     z_dislikes = df['Dislikes'].apply(lambda x: (x - avg_dislikes) / std_dislikes)
#     z_comments = df['Comments'].apply(lambda x: (x - avg_comments) / std_comments)
    
#     good_videos = []
#     bad_videos = []
#     for i in range(len(df)):
#         if z_views[i] > .3 and z_likes[i] > .3:
#             good_videos.append(i)
#         if z_views[i] < -.5 and z_likes[i] < -.5:
#             bad_videos.append(i)

#     return good_videos, bad_videos

# def analyze_channel(keyword):
#     channel_id = get_channel_id(keyword)
#     channel_videos = request_channel_videos(channel_id, "30", "date", None)
#     video_data_df = generate_video_data(channel_videos)
#     video_thumbnails = generate_thumbnails(video_data_df['thumbnailLink'].values)
#     good_videos, bad_videos = evaluate_videos(video_data_df)
#     return good_videos, bad_videos

In [None]:
# FULL RUN
# run this if you want to scrape data and generate a dataset BE CAREFUL THIS USES MANY API CALLS
# meta_df = main_pipeline()

In [None]:
# # get initial search data (usually a list of channels)
# out = request_channels("gaming","5","relevance")
# data = out['items']

In [None]:
# # get channel ids from search results
# channel_ids = []
# for channel in data:
#     cur_channel_id = channel['snippet']['channelId']
#     channel_ids.append(cur_channel_id)

In [None]:
# # get channel videos from a specific channel id
# cur_channel_id = channel_ids[1] # arbitrary channel id for demo purposes
# videos = request_channel_videos(cur_channel_id,"5","date",None)
# video_ids = []
# for video in videos['items']:
#     video_ids.append(video['id']['videoId'])

In [None]:
# metadata = []
# # use youtube videos api to get metadata about a single video, by video id
# for cur_id in video_ids:
#     cur_vid = request_video_details(cur_id)['items'][0]
#     row = get_vid_stats(cur_vid)
#     metadata.append(row)    

In [None]:
# # create a dataframe from the gathered metadata
# columns = ['channelId','channelTitle','thumbnailLink','videoTitle','Date','Views','Likes','Dislikes','Comments']
# df = pd.DataFrame(metadata,columns=columns)

In [None]:
# # handpicked youtubers to analyze (channel ids)
# pokeaimMD = 'UCbXuFrNGSKcmZY_5DwYz4Ew'
# pewdiepie = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
# ninja = 'UCAW-NpUFkMyCNrvRSSGIvDQ'
# abdallah = 'UCsDtTzkvGxxw95C4IOfZ7dw'
# ytdan = 'UCXgNU9GtLPiE2dAYDwIQO6Q'
# # possible idea is to just use most relevant to "gaming" search term

# Full pipeline analysis

In [None]:
# good_videos, bad_videos = analyze_channel("ninja")
# good_videos, bad_videos

# Individual Example (Step-by-Step)

In [None]:
# ninja_videos = request_channel_videos(ninja, "30", "date", None) # first 30 videos
# next_page = ninja_videos['nextPageToken']
# ninja_videos2 = request_channel_videos(ninja, "30", "date",next_page) # next 30 videos

In [None]:
# ninja_videos = generate_video_data(ninja_videos)
# ninja_thumbnails = generate_thumbnails(ninja_videos['thumbnailLink'].values)
# good_videos, bad_videos = evaluate_videos(ninja_videos)

In [None]:
# # check to see that output of full pipeline is similar to step by step process
# good_videos, bad_videos

In [None]:
# # analysis of next 30 ninja videos
# more_ninja_videos = generate_video_data(ninja_videos2)
# more_ninja_thumbnails = generate_thumbnails(more_ninja_videos['thumbnailLink'].values)
# good_videos2, bad_videos2 = evaluate_videos(more_ninja_videos)

# Actually Displaying the good/bad thumbnails

In [None]:
# # good thumbnails
# for i in good_videos:
#     display(ninja_thumbnails[i])

In [None]:
# # more good videos
# for i in good_videos2:
#     display(more_ninja_thumbnails[i])

In [None]:
# # bad videos
# for i in bad_videos:
#     display(ninja_thumbnails[i])

In [None]:
# # more bad videos
# for i in bad_videos2:
#     display(more_ninja_thumbnails[i])