In [174]:
import json
import os.path
import time

import pandas as pd
import numpy as np

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

In [288]:
# Load the YouTube API credentials (parsed JSON) from the local secrets file
with open('client_secret.json') as json_file:
    credentials = json.load(json_file)

# Config
WORKING_COUNTRY_CODE = "CA"
# Upper bound on how many missing video ids are selected into search_targets
# for one run. The per-request batch of 50 ids is applied separately inside
# repeat_query, not through this constant.
QUERY_LIMIT = 5000

# Source dataset and query-result file for the configured country
ORIGINAL_FILE_PATH = "youtube-dataset/{}videos.csv".format(WORKING_COUNTRY_CODE)
APPENDING_FILE_PATH = "query_result/{}videos.csv".format(WORKING_COUNTRY_CODE)

In [289]:
# Source dataset for the configured country
original_df = pd.read_csv(ORIGINAL_FILE_PATH)
# Rows fetched so far; this is the same file that save_appending_progress writes
appending_df = pd.read_csv(APPENDING_FILE_PATH)

In [290]:
# Distinct ids in the source dataset vs. distinct ids already in the query results
all_unique_ids = original_df["video_id"].unique()
appened_ids = appending_df["video_id"].unique()

# Ids that still have no row in the query results
to_append_ids = np.setdiff1d(all_unique_ids, appened_ids)

# Cap the work list for this run at QUERY_LIMIT ids and show how many remain
search_targets = to_append_ids[:QUERY_LIMIT]
len(search_targets)

3976

In [291]:
# Number of rows currently in the query-result table
len(appending_df)

20451

In [299]:
# Number of distinct video ids in the source dataset
len(all_unique_ids)

24427

In [292]:
# Disabled by default: uncomment to fetch the pending ids from the YouTube API
# repeat_query(appending_df, search_targets)

In [307]:
# Manual step, left disabled: append the remaining search targets with NaN
# duration/topic_id and persist the table.
# NOTE(review): presumably meant for ids the API did not return (e.g. private
# videos, going by the helper's name) — confirm before re-enabling.
#appending_df = handle_private_video(original_df, search_targets, appending_df)
#save_appending_progress(appending_df)
appending_df

Unnamed: 0,video_id,duration,topic_id
0,--45ws7CEN0,PT3H2M28S,/m/04rlf
1,--7vNbh4UNA,PT10M36S,/m/02jjt|/m/0f2f9
2,--YgtVuvWGo,PT2M23S,/m/04rlf|/m/06j6l|/m/0gywn
3,--wOJ2VyKDI,PT17M18S,/m/098wr|/m/05qt0
4,-0CMnp02rNY,PT5M8S,/m/02jjt|/m/0f2f9
5,-0DjA_r32uQ,PT29M55S,/m/019_rr
6,-0F7AFzWXik,PT7M14S,/m/098wr|/m/05qt0
7,-0NhqVYR4UY,PT11M18S,/m/098wr|/m/05qt0
8,-0Nuw8wX3tE,PT2M47S,/m/02jjt|/m/02vxn
9,-0QvjiG4sYM,PT17M4S,/m/098wr|/m/05qt0


In [314]:
# Left-join the source dataset onto the fetched rows by video_id, then persist.
# NOTE(review): merge_two_df is defined in a later cell, so on a fresh
# top-to-bottom run this cell fails unless that cell is executed first.
# NOTE(review): the merged frame is written to APPENDING_FILE_PATH, the same
# file that is loaded as appending_df — confirm that overwriting the progress
# file with the merged table is intended.
final_df = merge_two_df(appending_df, original_df)
save_appending_progress(final_df)

Success!  CA File updated.


In [313]:
def merge_two_df(appending_df, original_df):
    """Left-join ``original_df`` onto ``appending_df`` via the ``video_id`` column.

    Every row of ``appending_df`` is kept. Columns of ``original_df`` are added
    for rows with a matching ``video_id``; a row is repeated once per match
    when ``original_df`` holds several rows for the same id.

    Returns:
        A new merged DataFrame; neither input is modified.
    """
    return appending_df.merge(right=original_df, how="left", on="video_id")

In [286]:
def repeat_query(appending_df, search_targets, batch_size=50, pause_seconds=3):
    """Fetch details for ``search_targets`` in batches and persist the combined table.

    The ids are passed to ``fetch_youtube`` as comma-joined groups of
    ``batch_size``. Each flat result list is reshaped into rows of
    (video_id, duration, topic_id); all rows are appended to ``appending_df``
    and the combined table is handed to ``save_appending_progress``.

    Args:
        appending_df: DataFrame of previously fetched rows; not modified in place.
        search_targets: Sequence of video id strings to look up.
        batch_size: Number of ids per request. Defaults to 50, the value this
            function previously hard-coded.
        pause_seconds: Seconds to sleep after each request. Defaults to 3.

    Returns:
        The combined DataFrame that was saved.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    columns = ['video_id', 'duration', 'topic_id']
    frames = [appending_df]

    try:
        # range() stops before len(search_targets), so no request is sent with
        # an empty id string. The previous `<=` bound issued one whenever the
        # number of targets was a multiple of the batch size (including zero).
        for query_head in range(0, len(search_targets), batch_size):
            query_tail = query_head + batch_size
            search_str = ",".join(search_targets[query_head:query_tail])

            query_raw_result = fetch_youtube(search_str)

            # The result is a flat list with three entries per video.
            frames.append(
                pd.DataFrame(np.reshape(query_raw_result, (-1, 3)), columns=columns)
            )

            print("One query completed... next starting point: ", query_tail)

            time.sleep(pause_seconds)
    finally:
        # Concatenate once instead of once per batch. Running this in `finally`
        # also saves the batches fetched before a failed request; the error
        # itself still propagates to the caller.
        combined = pd.concat(frames, ignore_index=True)
        save_appending_progress(combined)

    return combined

In [305]:
def handle_private_video(original_df, search_targets, appending_df):
    """Append one placeholder row per id in ``search_targets`` to ``appending_df``.

    Each placeholder row carries the id in ``video_id`` and NaN in both
    ``duration`` and ``topic_id``. ``original_df`` is accepted but not used.

    Returns:
        A new DataFrame: ``appending_df`` followed by the placeholder rows,
        with each part keeping its own index labels.
    """
    placeholder_columns = dict(
        video_id=search_targets,
        duration=np.nan,
        topic_id=np.nan,
    )
    placeholder_rows = pd.DataFrame(placeholder_columns)

    return pd.concat([appending_df, placeholder_rows])

In [123]:
def fetch_youtube(search_str):
    """Fetch content and topic details for the ids in ``search_str``.

    Calls the YouTube Data API ``videos().list`` endpoint with
    ``part="contentDetails,topicDetails"``, following ``nextPageToken`` until
    the response no longer carries one.

    Args:
        search_str: Comma-separated video ids, passed as the ``id`` parameter.

    Returns:
        A flat list built by extending with ``get_videos(items)`` for every
        page of the response.
    """
    # Disables OAuthlib's HTTPS verification; meant for local runs only.
    # *DO NOT* leave this option enabled in production.
    # NOTE(review): the client below authenticates with an API key
    # (developerKey), so this OAuthlib flag looks unnecessary here — confirm
    # before removing.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    # Build the API client from the key stored in the loaded credentials
    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=credentials['api_key'])

    collected = []
    page_token = None

    # The API pages its results: keep requesting until no further page token
    # comes back, combining the rows of every page.
    while True:
        list_kwargs = {
            "part": "contentDetails,topicDetails",
            "id": search_str,
        }
        if page_token is not None:
            # Only follow-up requests carry a page token
            list_kwargs["pageToken"] = page_token

        response = youtube.videos().list(**list_kwargs).execute()

        # Let get_videos pick the needed fields out of this page's items
        collected.extend(get_videos(response.get('items', [])))

        page_token = response.get("nextPageToken", None)
        if page_token is None:
            break

    return collected

    

In [308]:
def get_videos(items):
    """Flatten API video resources into ``[id, duration, topic_id, id, ...]``.

    Args:
        items: Iterable of video resource dicts as returned in a response's
            ``items`` list.

    Returns:
        A flat list with three entries per video: the video id, the
        ``contentDetails.duration`` string ("" when absent) and the topic ids
        joined by ``get_tags``. The topic entry is "[none]" when the video has
        no ``topicDetails`` or its ``topicDetails`` carries no ``topicIds``.
    """
    lines = []
    for video in items:
        # topicDetails may be present without a topicIds key. The previous
        # np.nan fallback was passed to the string join and raised TypeError.
        topic_ids = (video.get("topicDetails") or {}).get("topicIds")
        if topic_ids is None:
            topic_id = "[none]"
        else:
            topic_id = get_tags(topic_ids)

        # Tolerate a missing contentDetails section the same way as a
        # missing duration.
        duration = (video.get("contentDetails") or {}).get("duration", "")

        lines.extend([video['id'], duration, topic_id])

    return lines

In [125]:
def get_tags(tags_list):
    """Return the strings in ``tags_list`` joined into one pipe-delimited string."""
    delimiter = "|"
    return delimiter.join(tags_list)

In [171]:
def save_appending_progress(df):
    """Write ``df`` to ``APPENDING_FILE_PATH`` as CSV and print a confirmation.

    The file is written UTF-8 encoded and without the index column. Any
    existing file at that path is replaced.
    """
    csv_options = {"encoding": 'utf-8', "index": False}
    df.to_csv(APPENDING_FILE_PATH, **csv_options)

    print("Success! ", WORKING_COUNTRY_CODE, "File updated.")