In [44]:
import httpx
import json
import time
import pandas as pd
import os

## Data Extraction

Extract the nominations data from the official Golden Globes website, one page at a time, and store each page as a JSON file in `./golden_globes/`.

In [3]:
def get_golden_globes_page(page_number: int):
    """Fetch a single page of nominations from the Golden Globes endpoint.

    Args:
        page_number: Value sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body when the server replies with HTTP 200,
        otherwise ``None``.
    """
    url = f"https://goldenglobes.com/wp-json/awdb/v1/nominations/?page={page_number}"
    response = httpx.get(url)
    if response.status_code != 200:
        return None
    return response.json()

In [6]:
def get_pages_count():
    """Read the total number of pages advertised by the nominations endpoint.

    Returns:
        The ``x-wp-totalpages`` response header converted to ``int`` when
        the request succeeds with HTTP 200 and the header is present and
        non-empty; ``None`` in every other case.
    """
    response = httpx.get("https://goldenglobes.com/wp-json/awdb/v1/nominations/")
    if response.status_code != 200:
        return None
    total_pages = response.headers.get("x-wp-totalpages")
    return int(total_pages) if total_pages else None

In [12]:
def store_data(file_name: str, data: dict):
    """Write ``data`` as JSON to ``./golden_globes/<file_name>.json``.

    The target directory is created if it does not exist yet, so the
    function also works on a fresh checkout where nothing has been
    downloaded before. An existing file with the same name is overwritten.

    Args:
        file_name: Base name of the output file, without the ``.json``
            extension.
        data: Object handed to ``json.dump``; anything JSON-serializable is
            accepted.

    Raises:
        TypeError: If ``data`` cannot be serialized by ``json.dump``.
        OSError: If the directory or the file cannot be created.
    """
    # Without this, open() fails with FileNotFoundError on the first run.
    os.makedirs("./golden_globes", exist_ok=True)
    with open(f"./golden_globes/{file_name}.json", "w") as f:
        json.dump(data, f)

In [16]:
pages = get_pages_count()
if pages is None:
    # get_pages_count() returns None when the request fails or the page-count
    # header is missing; stop with a clear message instead of the opaque
    # TypeError that `pages + 1` would raise.
    raise RuntimeError("Could not determine the number of pages to download")

# NOTE(review): pages 0..pages (inclusive) are requested. Whether the endpoint
# treats page 0 as valid is not visible here — confirm; the processing step
# skips the first stored file, which may be related.
for page in range(pages + 1):
    page_data = get_golden_globes_page(page)
    # A failed request yields None, which is still written (as JSON null) so
    # that the set of files on disk stays the same as before.
    store_data(str(page), page_data)
    # Pause between requests.
    time.sleep(1)

## Data Processing

Process the raw JSON files and transform them into a pandas DataFrame, which is then saved as a CSV file.

In [57]:
def create_df():
    """Build an empty DataFrame carrying the columns of the awards dataset.

    Returns:
        A ``pandas.DataFrame`` with no rows and the columns
        ``nominee_type``, ``year``, ``winner``, ``award`` and ``title``.
    """
    column_names = ["nominee_type", "year", "winner", "award", "title"]
    return pd.DataFrame(columns=column_names)

def process_item(item: dict, idx_error: bool = False):
    """Flatten one raw nomination item into a record for the dataset.

    Args:
        item: Raw nomination mapping; the keys ``nominee_type``, ``year``,
            ``winner``, ``award``, ``title`` and ``nominees`` are read.
        idx_error: When true, the title is taken from the item itself
            instead of from the first entry of ``item["nominees"]``.

    Returns:
        A dict with the keys ``nominee_type``, ``year`` (converted with
        ``int``), ``winner`` (coerced with ``bool``), ``award`` and ``title``.

    Raises:
        IndexError: If ``idx_error`` is false and ``nominees`` is empty.
        TypeError: If ``year`` is missing, or if ``idx_error`` is false and
            ``nominees`` is missing.
    """
    # The year is converted before the title is looked up, so a bad year is
    # reported ahead of a missing nominee.
    record = {
        "nominee_type": item.get("nominee_type"),
        "year": int(item.get("year")),
        "winner": bool(item.get("winner")),
        "award": item.get("award"),
    }

    if idx_error:
        record["title"] = item.get("title")
    else:
        first_nominee = item.get("nominees")[0]
        record["title"] = first_nominee.get("title")

    return record

# Collect plain dict records first and build the DataFrame once at the end;
# concatenating one row at a time copies the whole frame on every iteration.
records = []

# NOTE(review): `[1:]` drops the first directory entry in lexicographic order
# (presumably "0.json" or a hidden file) — confirm which entry is meant to be
# skipped. Lexicographic order also means "10.json" sorts before "2.json".
files = sorted(os.listdir("./golden_globes/"))[1:]
for file in files:
    full_path = "./golden_globes/" + file
    with open(full_path, "r") as f:
        data = json.load(f)
    for item in data:
        try:
            processed_item = process_item(item)
        except IndexError:
            # Empty "nominees" list: fall back to the item's own title.
            processed_item = process_item(item, idx_error=True)
        except Exception as e:
            # Report the offending item and carry on with the remaining ones.
            print(str(e), " ----> ", item)
            continue
        records.append(processed_item)

# Reuse the column layout from create_df() so that a run without any records
# still produces a frame (and CSV header) with the expected columns.
df = pd.DataFrame(records, columns=create_df().columns)

df.to_csv("Golden_Globes_Awards_Dataset.csv")