# Golden Globes Scraper

In [1]:
import sys
import os
import json
import re
import pprint
import multiprocessing as mp
import time
import csv

In [2]:
from bs4 import BeautifulSoup
from web_fetcher import get_cache

In [3]:
# Directory holding both the input URL list and the generated CSV.
data_dir = 'data'
gg_urls_file = os.path.join(data_dir, 'golden_globe_awards_urls.json')
# JSON text is UTF-8 by specification; pin the encoding so the read does not
# depend on the platform's locale default.
with open(gg_urls_file, encoding='utf-8') as f:
    gg_urls = json.load(f)

In [4]:
def normalize_title(title):
    """Trim a title and move a trailing ", The" article to the front.

    For example ``"Godfather, The"`` becomes ``"The Godfather"``. Titles that
    do not end with exactly ``", The"`` (case-sensitive) are returned with
    only their surrounding whitespace removed.
    """
    article_suffix = ', The'
    cleaned = title.strip()
    if not cleaned.endswith(article_suffix):
        return cleaned
    # Drop the suffix and re-attach the article as a prefix.
    return 'The ' + cleaned[:-len(article_suffix)]

def normalize_name(name):
    """Return the nominee name with leading and trailing whitespace removed."""
    trimmed = name.strip()
    return trimmed

def normalize_win_nom(win_nom):
    """Reduce a winner/nominee label to a single upper-case character.

    The label is stripped and upper-cased, and its first character is
    returned (e.g. ``" Winner "`` -> ``"W"``, ``"nominee"`` -> ``"N"``).

    Raises:
        IndexError: if the label is empty after stripping.
    """
    label = win_nom.strip().upper()
    return label[0]

def extract_year(text):
    """Return the digit characters of ``text``, concatenated in order.

    Every non-digit character is removed, so ``"Winners 2019"`` yields
    ``"2019"``. Note that *all* digits are kept: text containing more than
    one number (e.g. ``"75th 2018"``) yields them run together (``"752018"``).
    Returns an empty string when ``text`` contains no digits.
    """
    # Raw string: a bare '\d' in a normal literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'\D', '', text)

def get_one_year_win_nom(win_nom_year_node):
    """Collect ``(name, title, win_nom)`` tuples for one year's group.

    The group is the grandparent of ``win_nom_year_node``. Each
    "nomination-is-winner" div inside it supplies the win/nominee flag, and
    the following "nominee-title" sibling div supplies the links: the first
    non-empty link text becomes the name, the next one the title. Either may
    remain ``None`` when there are not enough links.

    Raises:
        AttributeError: if an expected child or sibling div is missing
            (the lookup returns ``None``).
    """
    status_class = "views-field views-field-field-nomination-is-winner"
    nominee_class = "views-field views-field-nominee-title"

    container = win_nom_year_node.parent.parent

    results = []
    for status_node in container.find_all('div', class_=status_class):
        status_text = status_node.find('div', class_="field-content").text
        status = normalize_win_nom(status_text)

        nominee_node = status_node.find_next_sibling('div', class_=nominee_class)
        name = title = None
        for link in nominee_node.find_all('a'):
            if name and title:
                # Both fields are filled; remaining links are ignored.
                break
            if not name:
                name = normalize_name(link.text)
            else:
                title = normalize_title(link.text)
        results.append((name, title, status))
    return results

def normalize_category(category):
    """Strip the "Winners & Nominees" banner text from a category heading.

    The banner is matched case-insensitively, with optional whitespace
    around the ampersand, and every occurrence (plus trailing whitespace) is
    removed. The remainder is returned with surrounding whitespace trimmed.
    """
    without_banner = re.sub(r'(?i)Winners\s*\&\s*Nominees\s*', '', category)
    return without_banner.strip()

def get_category(node):
    """Return the normalized award category named in the page heading.

    Walks from ``node`` to the content region, then to the
    winners/nominees category pane, then to its ``h1.pane-title`` and
    passes the stripped heading text through ``normalize_category``.

    Raises:
        AttributeError: if any of the expected elements is not found
            (the lookup returns ``None``).
    """
    content_region = node.find('div', class_="region region-content")
    category_pane = content_region.find(
        'div',
        class_="panel-pane pane-views-panes pane-winners-nominees-category",
    )
    heading = category_pane.find('h1', class_="pane-title")
    heading_text = heading.text.strip()
    return normalize_category(heading_text)

def get_all_win_nom(node):
    """Return every award row found on a category page.

    Each row is a tuple ``(year, category, name, title, win_nom)``: the
    category comes from ``get_category``, the year from the text of each link
    whose ``href`` matches ``/winners-nominees/<4 digits>``, and the remaining
    fields from ``get_one_year_win_nom`` for that link.

    Raises:
        ValueError: if a matching year link's text contains no digits
            (``int('')``).
    """
    category = get_category(node)
    # Raw string: '\d' in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    year_link_pattern = re.compile(r'/winners-nominees/\d{4}')

    year_win_nom_list = []
    for win_nom_year_node in node.find_all('a', href=year_link_pattern):
        year = int(extract_year(win_nom_year_node.text))
        year_win_nom_list.extend(
            (year, category) + win_nom
            for win_nom in get_one_year_win_nom(win_nom_year_node)
        )
    return year_win_nom_list

def rt_scrape(url):
    """Fetch one category page and return its award rows.

    The page is obtained via ``get_cache(url)`` and parsed with the "lxml"
    parser. Returns the list produced by ``get_all_win_nom``, or ``None``
    when ``get_cache`` yields a falsy result (no page available).
    """
    html = get_cache(url)
    if html:
        return get_all_win_nom(BeautifulSoup(html, "lxml"))
    return None


In [15]:
# Accumulate award rows from every category URL.
gg_awards = []
for url in gg_urls:
    # rt_scrape returns None when no page is available for a URL; treat that
    # as "no rows" instead of letting list.extend(None) raise TypeError.
    gg_awards.extend(rt_scrape(url) or [])

In [20]:
# Order rows: newest year first, then category, winners before nominees
# (False sorts before True), then nominee name and additional info.
# Name/title can be None when a nominee entry has fewer than two links;
# map None to '' so tied rows never compare None against str (TypeError).
gg_awards.sort(
    key=lambda k: (
        (-k[0], k[1], k[-1] == 'N')
        + tuple(field or '' for field in k[2:-1])
    )
)

In [21]:
# Write the sorted award rows to data/golden_globe_awards.csv.
csvfile = os.path.join(data_dir, 'golden_globe_awards.csv')
# newline='' is required by the csv module so it controls line endings itself
# (otherwise blank rows appear on Windows); UTF-8 keeps non-ASCII nominee
# names writable regardless of the platform's default encoding.
with open(csvfile, 'w', newline='', encoding='utf-8') as f:
    csv_w = csv.writer(f)
    csv_w.writerow(['Year', 'Category', 'Nominee', 'Additional Info', 'Won?'])
    csv_w.writerows(gg_awards)