# Golden Globe Awards - Get URL List

In [71]:
import sys
import os
import requests
import re
from bs4 import BeautifulSoup
from http import HTTPStatus
from urllib.parse import \
    urlparse, urljoin
import json

In [72]:
# Site root, and the category page that is scraped for its category dropdown.
GG_AWARDS_TOP_URL = "https://www.goldenglobes.com"
GG_AWARDS_START_URL = urljoin(GG_AWARDS_TOP_URL, "/winners-nominees/best-motion-picture-drama")

# Directory holding locally cached data.  exist_ok=True creates it only when
# missing, without the race of a separate "is it there?" check.
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# JSON file in which the scraped category URLs are cached.
gg_awards_urls_file = os.path.join(data_dir, 'golden_globe_awards_urls.json')

In [73]:
def _gg_get_all_urls():
    """Scrape the award-category URLs from the Golden Globes site.

    Downloads ``GG_AWARDS_START_URL``, locates the ``dropdown-menu`` list on
    that page and collects the links whose text mentions an actor/actress,
    director or screenplay.  Each link's ``href`` gets ``/all-years``
    appended and is resolved against ``GG_AWARDS_TOP_URL``.

    Returns:
        A non-empty list of URL strings.

    Raises:
        AssertionError: if the response status is not HTTP 200, the dropdown
            menu is not found, or no link matches.  The checks are raised
            explicitly (rather than via ``assert``) so they still run under
            ``python -O``; the exception type is kept for existing callers.
    """
    response = requests.get(GG_AWARDS_START_URL, timeout=60)
    if response.status_code != HTTPStatus.OK:
        raise AssertionError(
            "GET {} returned HTTP status {}".format(
                GG_AWARDS_START_URL, response.status_code))

    soup = BeautifulSoup(response.text, "lxml")
    pull_down = soup.find("ul", attrs={"class": "dropdown-menu"})
    if pull_down is None:
        raise AssertionError(
            "no <ul class='dropdown-menu'> found at {}".format(GG_AWARDS_START_URL))

    # Attributes an anchor must carry to be considered a category link.
    attrs = dict(typeof="skos:Concept", property="rdfs:label skos:prefLabel", datatype="")
    # A link is kept when its text matches any one of these patterns.
    text_re = [
        re.compile(r'(?i)(actor|actress)'),
        re.compile(r'(?i)director'),
        re.compile(r'(?i)screenplay')
    ]
    # `string=` is the current name of the keyword formerly spelled `text=`.
    urls = [
        urljoin(GG_AWARDS_TOP_URL, e['href'] + '/all-years')
        for e in pull_down.find_all('a', attrs=attrs, string=text_re)
    ]
    if not urls:
        raise AssertionError(
            "no matching category links found at {}".format(GG_AWARDS_START_URL))
    return urls

def gg_get_all_urls(force_refresh=False):
    """Return the award-category URLs, backed by an on-disk JSON cache.

    Args:
        force_refresh: when true, scrape the site again even if the cache
            file ``gg_awards_urls_file`` already exists.

    Returns:
        The list of URLs: freshly scraped (and saved to the cache file) when
        a refresh is forced or no cache file exists, otherwise loaded from
        the cache file.
    """
    if force_refresh or not os.path.isfile(gg_awards_urls_file):
        urls = _gg_get_all_urls()
        # Write to a sibling temp file and rename it into place so that an
        # interrupted write cannot leave a truncated cache file behind.
        tmp_file = gg_awards_urls_file + '.tmp'
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(urls, f, indent=2)
        os.replace(tmp_file, gg_awards_urls_file)
        return urls

    with open(gg_awards_urls_file, encoding='utf-8') as f:
        return json.load(f)

In [76]:
# Show the category URLs (the site is scraped first if no cache file exists yet).
gg_get_all_urls()

['https://www.goldenglobes.com/winners-nominees/best-performance-actress-motion-picture-drama/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-performance-actor-motion-picture-drama/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-performance-actress-motion-picture-musical-or-comedy/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-performance-actor-motion-picture-musical-or-comedy/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-performance-actress-supporting-role-any-motion-picture/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-performance-actor-supporting-role-any-motion-picture/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-director-motion-picture/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-screenplay-motion-picture/all-years',
 'https://www.goldenglobes.com/winners-nominees/best-performance-actress-limited-series-or-motion-picture-made-television/all-years',
 'ht