In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import sys
import os.path as op
from tqdm.notebook import tqdm
#pip install --upgrade google-api-python-client
#pip install --upgrade google-auth-oauthlib google-auth-httplib2

import requests as req
#from bs4 import BeautifulSoup as bs

# Put the parent directory first on the import path so project modules can be imported from this notebook.
sys.path.insert(0, '..') # project folder
# Raw inputs are read from ../data/raw; derived files go to the sibling 'derivatives' folder.
path_data = op.join('..', 'data', 'raw')
path_deriv = op.join(path_data, '..', 'derivatives')
path_metadata = op.join(path_data, "yt_metadata_en.jsonl.gz")
# Template for the numbered education CSVs; fill in the file index with path_edu.format(i).
path_edu = op.join(path_deriv, "Education_videos_{}.csv")

# The API key is read from a local config.json (first entry of its 'api_key' column) instead of being hard-coded.
# NOTE(review): presumably config.json stores 'api_key' as a list/record so that read_json can build a frame - confirm.
API_KEY = pd.read_json(op.join('.','config.json'))['api_key'][0]

In [3]:
def extract_channels_edu(verbose = False, n_files = 8, path_template = None):
    """Collect the channel ids found in the numbered education CSV files.

    Each file is read with its first column as index and the distinct values of
    its 'channel_id' column are appended to the result. De-duplication happens
    per file only, so a channel present in several files appears once per file.

    Parameters
    ----------
    verbose : bool
        When True, print the per-file and total channel counts.
    n_files : int
        Number of files to read; indices 0 .. n_files - 1 are substituted into
        the template (defaults to the 8 files used by this notebook).
    path_template : str or None
        Path containing one '{}' placeholder for the file index. When None, the
        notebook-level `path_edu` template is used.

    Returns
    -------
    list
        Channel ids in file order, then in order of first appearance per file.
    """
    if path_template is None:
        # Resolve the default lazily so the global is only required when no
        # explicit template is supplied.
        path_template = path_edu

    channels = []
    for i in range(n_files):
        if verbose:
            print(f'Processing file : path_edu_{i}', end = '')
        edu = pd.read_csv(path_template.format(i), index_col=0)
        ch = list(pd.unique(edu['channel_id']))
        if verbose:
            print(f"  --> Found {len(ch)} channels")
        channels.extend(ch)

    if verbose:
        print('Total number of channels :' , len(channels))
    return channels

# Build the list of channel ids from the education CSVs, printing per-file and total counts.
channels = extract_channels_edu(verbose = True)

Processing file : path_edu_0  --> Found 3412 channels
Processing file : path_edu_1  --> Found 3039 channels
Processing file : path_edu_2  --> Found 3069 channels
Processing file : path_edu_3  --> Found 3036 channels
Processing file : path_edu_4  --> Found 3384 channels
Processing file : path_edu_5  --> Found 3282 channels
Processing file : path_edu_6  --> Found 3150 channels
Processing file : path_edu_7  --> Found 3224 channels
Total number of channels : 25596


In [None]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

def _parse_channel_countries(response, channel_ids):
    """Map every requested channel id to a country using a channels.list response."""
    items = response.get('items')
    if not items:
        # The response lists no channel at all, so none of the requested ids
        # resolves any more; flag all of them instead of only the first one.
        return {ch: 'deleted' for ch in channel_ids}

    # 'Redo' marks ids that were requested but are absent from the response.
    countries = {ch: 'Redo' for ch in channel_ids}
    for item in items:
        channel_id = item.get('id')
        if channel_id not in countries:
            # The item carries an id that was not requested; the requested id
            # keeps its 'Redo' flag.
            continue
        snippet = item.get('snippet')
        # Read the id from the current item, so an item without a snippet can
        # never overwrite the entry of a previously processed channel.
        countries[channel_id] = snippet.get('country') if snippet else None
    return countries


def youtube_country_scraper(api_key, channel_ids, verbose = False, redo = False, youtube = None):
    """Look up the country of a batch of channels through the YouTube Data API.

    Parameters
    ----------
    api_key : str
        Developer key used to build the API client when `youtube` is None.
    channel_ids : iterable of str
        Channel ids sent in a single channels.list request (the notebook
        batches them by 50, the per-request maximum it cites).
    verbose : bool
        When True, print the raw response and the resulting mapping.
    redo : bool
        Unused; kept so existing calls remain valid.
    youtube : object or None
        Optional pre-built API client. Passing one avoids rebuilding the client
        for every batch.

    Returns
    -------
    dict
        channel id -> country code, None (no snippet or no country in it),
        'Redo' (id missing from a non-empty response) or 'deleted' (the
        response contained no channel). Empty input returns {} without calling
        the API.

    Raises
    ------
    googleapiclient.errors.HttpError
        Propagated unchanged from the request (e.g. when the quota is exceeded).
    """
    channel_ids = list(channel_ids)
    if not channel_ids:
        return {}

    if youtube is None:
        # Disable OAuthlib's HTTPS verification when running locally. *DO NOT* leave this option enabled in production.
        # NOTE(review): the client is built with a developer key only, so this OAuth switch is probably unnecessary - confirm before removing.
        os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
        youtube = build('youtube', 'v3', developerKey = api_key)

    request = youtube.channels().list(
        part = 'snippet',
        id = ",".join(channel_ids)
    )
    items = request.execute()
    countries = _parse_channel_countries(items, channel_ids)

    if verbose:
        print(items)
        print(countries)
    return countries

In [None]:
# One row per extracted channel id; 'empty' is the placeholder for a country
# that has not been looked up yet.
countries = pd.DataFrame({'channel_id': channels, 'country': 'empty'})

In [3]:
# Reuse the results saved by a previous run when the cache file exists.
if op.isfile(op.join(path_deriv, 'countries.csv')):
    # Treat only empty fields as missing: with pandas' default NA markers the
    # ISO country code 'NA' (Namibia) would silently be parsed as NaN.
    countries = pd.read_csv(op.join(path_deriv, 'countries.csv'), index_col = 0,
                            keep_default_na = False, na_values = [''])

In [None]:
start = 0
end = len(countries) # max 10k per day - improved since now we do batches so can run all of them in one go
batch_size = 50 # max youtube allows per request

try:
    # range() already yields the final, shorter batch (the slice below is capped
    # at `end`), so no separate remainder request is needed. tqdm takes its
    # total from the range length, which also counts that last partial batch.
    for i in tqdm(range(start, end, batch_size), desc = 'Country extraction'):
        chs = channels[i:min(i + batch_size, end)]
        nations = youtube_country_scraper(API_KEY, chs, verbose = False)

        # Number the progress line with the channel's own position, not the batch start.
        for j, ch in enumerate(chs, start = i):
            countries.loc[countries.channel_id == ch, 'country'] = nations[ch]
            tqdm.write('Channel {} : {} - {}  '.format(j, ch, nations[ch]), end="\r")

except HttpError as e:
    # Only an exhausted quota (403) is an expected stop; any other API error is
    # re-raised instead of being silently ignored.
    if e.resp.status != 403:
        raise
    print('Quota exceeded, saving extracted countries')
finally:
    # Always persist what was collected, in the same layout (index in the first
    # column) that the cache-loading cell reads back with index_col=0.
    countries.to_csv(op.join(path_deriv, 'countries.csv'))

tqdm.write('Done!                                        ')

In [None]:
# Second pass: channels still flagged 'Redo' were requested in a batch but did
# not come back under the same id, so each one is queried again on its own.
# Optional step, left disabled: also re-check the channels marked 'deleted'.
#countries.loc[countries.country == 'deleted', 'country'] = 'Redo'
countries_redo = countries.loc[countries['country'] == 'Redo']
for i, ch in tqdm(
    enumerate(countries_redo['channel_id']),
    total = len(countries_redo),
    desc = 'Countries redo',
):
    # Single-id request; the result replaces the flag on every row of that channel.
    nations = youtube_country_scraper(API_KEY, [ch], verbose = False)
    countries.loc[countries['channel_id'] == ch, 'country'] = nations[ch]
    tqdm.write('Channel {} : {} - {}  '.format(i, ch, nations[ch]), end="\r")

Countries redo:   0%|          | 0/1250 [00:00<?, ?it/s]

Channel 1249 : UCrxytzQWXftMcG-2bLs2Ubg - deleted  

In [60]:
# Persist the table after the redo pass (index written as the first column).
countries.to_csv(op.join(path_deriv, 'countries.csv'))

In [61]:
# None removes the cap on displayed rows so the whole tally is shown.
# NOTE(review): the original note said "11 is default"; pandas documents 60 for display.max_rows - confirm.
pd.set_option("display.max_rows", None) #11 is default
# dropna=False keeps channels without a country as their own row in the tally.
countries.country.value_counts(dropna=False)

country
US         11632
None        3622
IN          2452
GB          1873
deleted     1250
CA          1092
AU           551
PK           249
DE           231
PH           151
NL           127
SE           110
NZ            97
BD            97
ID            93
FR            92
ES            88
IE            79
IT            78
SG            76
JP            68
CH            65
ZA            63
RO            56
AE            56
NP            55
MY            53
PL            53
DK            50
KR            47
RU            42
NO            40
BR            38
BE            38
TH            37
AT            37
PT            35
UA            34
FI            32
NG            31
HK            30
MX            29
IL            29
VN            27
KE            26
GR            25
CZ            23
HR            21
HU            21
LK            20
RS            20
SI            19
SA            17
TR            17
SK            16
KH            15
TW            13
GH            13
CO    

In [243]:
# Write the country table to the derivatives folder again (same file as the earlier saves).
countries.to_csv(op.join(path_deriv, 'countries.csv'))

## Other explored methods (legacy)

In [None]:
# Legacy experiment, kept disabled inside a string literal: read a channel's country from its Socialblade page.
# Re-enabling it also requires the BeautifulSoup import (`bs`), which is commented out in the imports cell.
"""url = r'https://socialblade.com/youtube/c/simonegiertz'
r = req.get(url)
print('Response status code: {0}\n'.format(r.status_code))
soup = bs(r.text, 'html.parser')
country = soup.find('span', {'id': 'youtube-stats-header-country'}).text
print(country)"""

In [None]:
# Legacy experiment (disabled inside a string literal): the same country lookup against a
# Web Archive snapshot of Socialblade - judged not practical.
"""url = r'https://web.archive.org/web/20161218062757/https://socialblade.com/youtube/user/leafyishere/monthly'
r = req.get(url)
print('Response status code: {0}\n'.format(r.status_code))
soup = bs(r.text, 'html.parser')
country = soup.find('span', {'id': 'youtube-stats-header-country'}).text
print(country)"""

Response status code: 200

US
