In [149]:
import requests, sys, time, os, argparse

In [150]:
# Keys read directly from each category's "snippet" dict; every entry here
# becomes one CSV column and needs no special processing
snippet_features = [
    "title",
]

In [151]:
# Characters stripped from every value before it is written out, because they
# would break the hand-built CSV rows (line breaks and double quotes)
unsafe_characters = [
    "\n",
    '"',
]

In [152]:
# Column names written as the first row of each CSV file. The order is
# hardcoded and has to match the order in which the data rows are assembled.
header = [
    "category_id",
    "category_title",
    "category_code",
    "date",
]

In [153]:
def setup(api_path, code_path):
    with open(api_path, 'r') as file:
        api_key = file.readline()

    with open(code_path) as file:
        country_codes = [x.rstrip() for x in file]

    return api_key, country_codes

In [154]:
def prepare_feature(feature):
    """Return ``feature`` as a double-quoted string safe for a CSV cell.

    The value is converted to ``str``, every character listed in the
    module-level ``unsafe_characters`` is deleted from it, and the result is
    wrapped in double quotes.
    """
    cleaned = str(feature)
    for unsafe in unsafe_characters:
        cleaned = cleaned.replace(unsafe, "")
    return '"' + cleaned + '"'

In [155]:
# File whose first line holds the API key
api_path = "./api_key.txt"
# File listing one country code per line
code_path = "./country_codes.txt"
# Directory the per-country CSV files are written into
output_dir = "output/"

In [156]:
# Read the API key and the country codes from the configured files; both names
# are used as module-level globals by the functions defined below
api_key, country_codes = setup(api_path, code_path)

In [157]:
def api_request(country_code):
    """Fetch the video-category listing for one region as parsed JSON.

    Sends a GET request to the ``videoCategories`` endpoint with
    ``part=snippet``, the given region code, and the module-level ``api_key``.

    Args:
        country_code: Region code sent as the ``regionCode`` query parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        SystemExit: If the server answers with HTTP 429.
        requests.exceptions.RequestException: On connection problems or if the
            server does not respond within the timeout.
    """
    # Let requests build the query string so the region code and key are
    # URL-encoded instead of being pasted into the URL verbatim.
    query = {"part": "snippet", "regionCode": country_code, "key": api_key}
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videoCategories",
        params=query,
        timeout=30,
    )
    if response.status_code == 429:
        print("Temp-Banned due to excess requests, please wait and continue later")
        sys.exit()
    return response.json()

In [158]:
def get_categories(items, country_code):
    """Turn category items from the API into comma-joined CSV row strings.

    Args:
        items: Iterable of category dicts; each must have an ``'id'`` key and
            a ``'snippet'`` dict.
        country_code: Region code the items were fetched for. It is inserted
            into the row as-is (not quoted or cleaned).

    Returns:
        A list with one string per item, laid out as
        ``id, <snippet_features...>, country_code, date``. Every value except
        the country code is passed through ``prepare_feature``.

    Raises:
        KeyError: If an item lacks ``'id'`` or ``'snippet'``.
    """
    # Compute the date stamp once: it is the same for every row, and doing it
    # per item could give rows different dates if the loop crossed midnight.
    # The format is two-digit year, then day, then month.
    date = prepare_feature(time.strftime("%y.%d.%m"))

    lines = []
    for category in items:
        category_id = prepare_feature(category['id'])

        # The snippet sub-dict holds the descriptive fields of the category
        snippet = category['snippet']

        # Features that sit one level deep in the snippet and need no special
        # processing; a missing key becomes an empty quoted value
        features = [prepare_feature(snippet.get(feature, "")) for feature in snippet_features]

        # Compile everything into one consistently ordered line
        line = [category_id] + features + [country_code] + [date]
        lines.append(",".join(line))

    return lines

In [159]:
def get_pages(country_code):
    """Request the category listing for ``country_code`` and return its CSV rows.

    The response's ``'items'`` list (empty if the key is absent) is handed to
    ``get_categories``; the resulting row strings are returned as a new list.
    """
    # One response holds the category entries for the requested region
    response_json = api_request(country_code)

    # A response without 'items' simply yields no rows
    category_items = response_json.get('items', [])

    return list(get_categories(category_items, country_code))

In [160]:
def write_to_file(country_code, country_data):
    """Write the rows for one country to a dated CSV file in ``output_dir``.

    The file is named ``<yy.dd.mm>_<country_code>_categories.csv`` and is
    overwritten if it already exists. ``output_dir`` is created when missing.

    Args:
        country_code: Region code used in the progress message and file name.
        country_data: Iterable of row strings; each is written followed by a
            newline.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    print(f"Writing {country_code} category to file...")

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    file_name = f"{time.strftime('%y.%d.%m')}_{country_code}_categories.csv"
    # os.path.join avoids a doubled separator when output_dir ends with "/"
    output_path = os.path.join(output_dir, file_name)

    # The file is only written, never read back, so plain "w" is sufficient
    with open(output_path, "w", encoding='utf-8') as file:
        file.writelines(f"{row}\n" for row in country_data)


def get_data():
    """Fetch and save the category table for every configured country code.

    For each entry in the module-level ``country_codes``, builds a list made
    of the comma-joined ``header`` followed by the rows from ``get_pages``,
    and passes it to ``write_to_file``.
    """
    header_row = ",".join(header)
    for code in country_codes:
        rows = [header_row]
        rows.extend(get_pages(code))
        write_to_file(code, rows)

In [161]:
# Run the scrape: one request and one CSV file per configured country code
get_data()

Writing US category to file...
