# Connect to Google Drive

In [271]:
from google.colab import drive
gdrive_dir = "/content/gdrive"
drive.mount(gdrive_dir, force_remount=True)

Mounted at /content/gdrive


In [1]:
import os
import os.path as osp

In [273]:
os.chdir("/content/gdrive/My Drive/BT4221-Chi/")

In [274]:
try: 
    os.mkdir("/Data")
except: 
    print("The path is already created")
os.chdir("/content/gdrive/My Drive/BT4221-Chi/Data/")

The path is already created


# Install and Import libraries

In [275]:
# !pip install google-api-python-client
# !pip install pandas
# !pip install google-auth google-auth-oauthlib google-auth-httplib2
# !pip install demoji
# !pip install langdetect

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [277]:
# from googleapiclient.discovery import build
# from google_auth_oauthlib.flow import InstalledAppFlow
from collections import Mapping
from itertools import chain
from operator import add
import pandas as pd
import pickle
# import demoji
# from langdetect import detect
import traceback 
import re
import os
import logging
logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)

# Produce

## Produce csv and sav files Functions

In [278]:
# _FLAG_FIRST = object()
# df_channel = pd.DataFrame()
# df_video = pd.DataFrame()
# nextPage_token = None

In [279]:
from time import gmtime, strftime
from datetime import datetime
import pytz
def get_current_time(): 
    tz_SG = pytz.timezone('Asia/Singapore') 
    current_time = datetime.now(tz_SG).strftime("%Y-%m-%d-%H-%M-%S")
    return current_time

In [280]:
current_time = get_current_time()
model_path_channel      = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel_{current_time}"
model_path_video        = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video_{current_time}"
model_path_comment      = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_comment_{current_time}"
model_path_channel_meta = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel"
model_path_video_meta   = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video"
model_path_comment_meta = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_comment"

In [281]:
def rename_columns_df(target_df): 
    def unique_list(string):
        string = string.replace("video_snippet", "snippet").replace("video_stats", "statistics")
        l = string.split("_")
        ulist = []
        [ulist.append(x) for x in l if x not in ulist]
        new_string = '_'.join(ulist)
        return new_string
    target_df.columns = target_df.columns.to_series().apply(unique_list)
    return target_df

In [282]:
def produce(target_df, name, file_type, Index=False): 
    current_time = get_current_time()
    model_path_channel      = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel_{current_time}"
    model_path_video        = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video_{current_time}"
    model_path_comment      = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_comment_{current_time}"
    model_path_channel_meta = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel"
    model_path_video_meta   = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video"
    model_path_comment_meta = f"/content/gdrive/My Drive/BT4221-Chi/Data/Extracted_comment"
    list_cols = target_df.columns.tolist()
    list_cols.sort()
    target_df = target_df[list_cols]
    if file_type == ".csv": 
        target_df.to_csv(name + ".csv", index=Index)
        print(f"The file named {name} is created!!!")
    elif file_type == ".sav": 
        pickle.dump(target_df, open(name + ".sav", "wb"))
        print(f"The file named {name} is created!!!")

In [310]:
def combine_all_to_df(target_df, data_type, prefix, file_type): 
    res_dict = []
    for name in os.listdir(): 
        if re.match(f"^{prefix}.*{file_type}", name): 
            if ".csv"==file_type:
                print("Appending", name)
                df = pd.read_csv(f"/content/gdrive/My Drive/BT4221-Chi/Data/{name}") 
            elif ".sav"==file_type:
                print("Appending", name)
                df = pickle.load(open(f"/content/gdrive/My Drive/BT4221-Chi/Data/{name}", "rb"))
            res_dict += df.to_dict('records')
    target_df = pd.DataFrame(res_dict)
    target_df = transform(target_df, data_type)
    target_df.reset_index(drop=True, inplace=True)
    list_cols = target_df.columns.tolist()
    list_cols.sort()
    target_df = target_df[list_cols]
    return target_df

In [284]:
# Transform the DF
def transform(df, data_type): 
    df = rename_columns_df(df)
    column_dict = {
        "channel": {
            "channelID": "channelId", "snippet_items_description": "description", "snippet_items_publishedAt": "publishedAt", "snippet_regionCode": "region", 
            "statistics_items_subscriberCount": "subscriberCount", "statistics_items_videoCount": "videoCount", "statistics_items_viewCount": "viewCount"
        }, 
        "video": {
            "snippet_channelId": "channelId", "snippet_publishedAt": "publishedAt", "snippet_resourceId_videoId": "videoId", 
            "snippet_thumbnails_default_url": "thumbnails", "snippet_title": "title", "statistics_commentCount": "commentCount",
            "statistics_dislikeCount": "dislikeCount", "statistics_likeCount": "likeCount", "statistics_viewCount": "viewCount"
        }
    }
    intersect_list = [i for i in df.columns.tolist() if i in column_dict[data_type].keys()]
    print("intersect_list", intersect_list)
    column_name_subset = {key: value for key, value in column_dict[data_type].items() if key in intersect_list}
    print("column_name_subset", column_name_subset)
    df = df[list(column_name_subset.keys())]
    df = df.rename(columns=column_name_subset)
    list_cols = df.columns.tolist()
    list_cols.sort()
    df = df[list_cols]
    return df

### Production

In [314]:
def start_production(): 
    if not df_channel.empty:
        produce(df_channel, model_path_channel,'.csv')
    if not df_video.empty:
        produce(df_video, model_path_video,'.csv')
def start_production_meta(): 
    if not df_channel.empty:
        produce(df_channel_meta, model_path_channel_meta,'.csv')
    if not df_video.empty:
        produce(df_video_meta, model_path_video_meta,'.csv')

# Utility Functions

## Structure functions

In [286]:
def pretty(d, indent=0):
    for key, value in d.items():
        print('--' * indent + str(key))
        if isinstance(value, dict):
            pretty(value, indent+1)
        elif isinstance(value, list): 
            pretty(value[0], indent+1)
        else:
            print('--' * (indent+1) + str(value))

In [287]:
def flattenDict_recur(propertyName, d): 
    def flattenDict(d, join=add, lift=lambda x:x):
        results = []
        def visit(subdict, results, partialKey):
            for k,v in subdict.items():
                newKey = lift(k) if partialKey==_FLAG_FIRST else join(partialKey,lift(k))
                if isinstance(v,Mapping):
                    visit(v, results, newKey)
                elif isinstance(v,list):
                    visit(v[0], results, newKey)
                else:
                    results.append((newKey,v))
        visit(d, results, _FLAG_FIRST)
        return results
    global _FLAG_FIRST
    _FLAG_FIRST = object()
    res = dict(flattenDict(d, join=lambda a,b: a + '_' + b))
    return {f"{propertyName}_{key}": val for key, val in res.items()}

## Video + Channel functions

### Create Youtube API Instance

In [288]:
# CLIENT_SECRETS_FILE = "client_secret_channel.json"

In [289]:
# SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
# API_SERVICE_NAME = 'youtube'
# API_VERSION = 'v3'

In [290]:
# def get_authenticated_service():
#     flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRETS_FILE, SCOPES)
#     credentials = flow.run_console()
#     return build(API_SERVICE_NAME, API_VERSION, credentials = credentials)

In [291]:
# os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
# youtube = get_authenticated_service()

In [292]:
current_api = "AIzaSyAf4Xxogy15RLZ_0dunuQy5T2YlT9FCoqU"
youTubeApiKey= current_api
youtube=build('youtube','v3',developerKey=youTubeApiKey)

### Video Functions

In [293]:
def get_videos(UploadId): 
    # get all video IDs
    allVideos = []
    global nextPage_token
    while 1:
        try: 
            res = youtube.playlistItems().list(playlistId = UploadId,maxResults = 50,part = 'snippet',pageToken = nextPage_token).execute()
            allVideos += res['items']
            nextPage_token = res.get('nextPageToken')
            if nextPage_token is  None:
                break
        except: 
            break
    print("The last page token is: ", nextPage_token, " and at UploadID: ", UploadId)
    nextPage_token = None
    return allVideos

In [294]:
def get_video_info(allVideos):
    stats = []
    video_ids = list(map(lambda x:x['snippet']['resourceId']['videoId'], allVideos))
    part_len = 40
    for i in range(0, len(video_ids), part_len):
        try: 
            end_point = i+part_len if (i+part_len) < len(video_ids) else len(video_ids)
            res_stat = (youtube).videos().list(id=','.join(video_ids[i:end_point]),part='statistics').execute()
            stats += res_stat['items']
        except: 
            break
    return stats

In [295]:
def get_video_suggestions(allVideos):
    stats = []
    video_ids = list(map(lambda x:x['snippet']['resourceId']['videoId'], allVideos))
    part_len = 40
    for i in range(0, len(video_ids), part_len):
        try: 
            end_point = i+part_len if (i+part_len) < len(video_ids) else len(video_ids)
            res_suggestion = (youtube).videos().list(id=','.join(video_ids[i:end_point]),part='suggestions').execute()
            stats += res_suggestion['items']
        except: 
            break
    return stats

In [296]:
def combine_list(dict_list, list_len): 
    res = list()
    for idx in range(0, list_len): 
        tmp_dict = dict()
        for i in dict_list.keys(): 
            if len(dict_list[i]) > 0: 
                tmp_dict.update(flattenDict_recur(i, dict_list[i][idx]))
        res.append(tmp_dict)
    return res

### Channel functions

In [297]:
def run_channel(channelCategory, channelID):
    def flattenDict_recur(propertyName, d): 
        def flattenDict(d, join=add, lift=lambda x:x):
            results = []
            def visit(subdict, results, partialKey):
                for k,v in subdict.items():
                    newKey = lift(k) if partialKey==_FLAG_FIRST else join(partialKey,lift(k))
                    if isinstance(v,Mapping):
                        visit(v, results, newKey)
                    elif isinstance(v,list):
                        visit(v[0], results, newKey)
                    else:
                        results.append((newKey,v))
            visit(d, results, _FLAG_FIRST)
            return results
        global _FLAG_FIRST
        _FLAG_FIRST = object()
        res = dict(flattenDict(d, join=lambda a,b: a + '_' + b))
        return {f"{propertyName}_{key}": val for key, val in res.items()}

    global df_channel, df_video, nextPage_token
    channel_meta_info = {"category": channelCategory, "channelID": channelID}
    snippets          = youtube.search().list(part="snippet", type="channel", channelId=channelID).execute()
    stats             = youtube.channels().list(part="statistics", id = channelID).execute()
    content           = youtube.channels().list(id = channelID, part = 'contentDetails').execute()
    # status            = youtube.channels().list(id = channelID, part = 'status').execute()
    # channelSections   = youtube.channelSections().list(channelId = channelID, part = 'snippet').execute()

    channel_meta_info.update(flattenDict_recur("snippet", snippets))
    channel_meta_info.update(flattenDict_recur("statistics", stats))
    # channel_meta_info.update(flattenDict_recur("contentDetails", content))
    # channel_meta_info.update(flattenDict_recur("status", status))
    # channel_meta_info.update(flattenDict_recur("channelSection", channelSections))

    df_channel = df_channel.append(pd.DataFrame.from_dict([channel_meta_info]))
    flattented_Dict = flattenDict_recur("contentDetails", content)
    UploadID = flattented_Dict["contentDetails_items_contentDetails_relatedPlaylists_uploads"]
    nextPage_token   = None

    video_meta_info  = []
    video_snippet    = get_videos(UploadID)
    video_stats      = get_video_info(video_snippet)
    video_suggestion = get_video_suggestions(video_snippet)
    video_all        = combine_list({"snippet": video_snippet, "statistics": video_stats, "suggestions": video_suggestion}, len(video_snippet))
    for item in video_all: 
        video_meta_info_item = {"uploadId": UploadID, "channelID": channelID}
        video_meta_info_item.update(item)
        video_meta_info.append(video_meta_info_item)
    df_video = df_video.append(pd.DataFrame(video_meta_info))
    start_production()

### Running program

In [298]:
def start_program(file_name):
    global current_api
    df = pd.read_csv(file_name)
    for index, row in tqdm(df.iterrows()):
        try: 
            print(row['Category'], '---', row['Channel_Name'], '---', row['Channel_ID'])
            run_channel(row['Category'], row['Channel_ID'])
        except:
            print("The connection is broken down, trying new api")
            traceback.print_exc() 
            break
    # if not df_channel.empty: 
    #     df_channel_meta = pd.DataFrame()
    #     df_channel_meta = combine_all_to_df(df_channel_meta, "Extracted_channel_", '.csv')
    # if not df_video.empty: 
    #     df_video_meta = pd.DataFrame()
    #     df_video_meta = combine_all_to_df(df_video_meta, "Extracted_video_", '.csv')

In [299]:
_FLAG_FIRST = object()
df_channel = pd.DataFrame()
df_video = pd.DataFrame()
nextPage_token = None
start_program("https://raw.githubusercontent.com/nguyenngoclinhchi/Analytic_Steps/master/Extract-YouTube-ChannelData/Top10EachChannels.csv")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

NONPROFIT & ACTIVISM --- Jace Norman --- UCBP4B896svWOcWdRp8UjH2Q
The last page token is:  None  and at UploadID:  UUBP4B896svWOcWdRp8UjH2Q
The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel_2020-10-09-13-39-47 is created!!!
The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video_2020-10-09-13-39-47 is created!!!
NONPROFIT & ACTIVISM --- MotivationGrid --- UCB7BryuXaMe1pUMznYAq4Jg
The last page token is:  None  and at UploadID:  UUB7BryuXaMe1pUMznYAq4Jg
The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel_2020-10-09-13-39-47 is created!!!
The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video_2020-10-09-13-39-47 is created!!!
NONPROFIT & ACTIVISM --- Jeff & Alyssa --- UCc4yillQaNo6a-iG2PYbbrA
The last page token is:  None  and at UploadID:  UUc4yillQaNo6a-iG2PYbbrA
The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel_2020-10-09-13-39-47 is created!!!
The file named /content/gdrive/My Drive

Traceback (most recent call last):
  File "<ipython-input-298-89745dd38449>", line 7, in start_program
    run_channel(row['Category'], row['Channel_ID'])
  File "<ipython-input-297-a4a90d003582>", line 23, in run_channel
    snippets          = youtube.search().list(part="snippet", type="channel", channelId=channelID).execute()
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/http.py", line 898, in execute
    raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 403 when requesting https://www.googleapis.com/youtube/v3/search?part=snippet&type=channel&channelId=UCeY0bbntWzzVIaj2z3QigXg&key=AIzaSyAf4Xxogy15RLZ_0dunuQy5T2YlT9FCoqU&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.">


In [300]:
start_production()

The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_channel_2020-10-09-13-39-47 is created!!!
The file named /content/gdrive/My Drive/BT4221-Chi/Data/Extracted_video_2020-10-09-13-39-47 is created!!!


In [311]:
if not df_channel.empty: 
    df_channel_meta = pd.DataFrame()
    df_channel_meta = combine_all_to_df(df_channel_meta, "channel", "Extracted_channel_", '.csv')
if not df_video.empty: 
    df_video_meta = pd.DataFrame()
    df_video_meta = combine_all_to_df(df_video_meta, "video", "Extracted_video_", '.csv')
start_production_meta()

Appending Extracted_channel_2020-10-09-13-39-47.csv
intersect_list ['channelID', 'snippet_items_description', 'snippet_items_publishedAt', 'snippet_regionCode', 'statistics_items_subscriberCount', 'statistics_items_videoCount', 'statistics_items_viewCount']
column_name_subset {'channelID': 'channelId', 'snippet_items_description': 'description', 'snippet_items_publishedAt': 'publishedAt', 'snippet_regionCode': 'region', 'statistics_items_subscriberCount': 'subscriberCount', 'statistics_items_videoCount': 'videoCount', 'statistics_items_viewCount': 'viewCount'}
Appending Extracted_video_2020-10-09-13-39-47.csv


  if self.run_code(code, result):


intersect_list ['snippet_channelId', 'snippet_publishedAt', 'snippet_resourceId_videoId', 'snippet_thumbnails_default_url', 'snippet_title', 'statistics_commentCount', 'statistics_dislikeCount', 'statistics_likeCount', 'statistics_viewCount']
column_name_subset {'snippet_channelId': 'channelId', 'snippet_publishedAt': 'publishedAt', 'snippet_resourceId_videoId': 'videoId', 'snippet_thumbnails_default_url': 'thumbnails', 'snippet_title': 'title', 'statistics_commentCount': 'commentCount', 'statistics_dislikeCount': 'dislikeCount', 'statistics_likeCount': 'likeCount', 'statistics_viewCount': 'viewCount'}
