# Oscars.com Scraper and Data Saving: Stage 1

Data Collection, Integration and Preprocessing

Hochschule Luzern, 2024

Master's in Applied Information and Data Science

Dominik Bacher Suarez

The following code gathers the data from https://www.oscars.org/oscars/ceremonies/

The data on each each, contains the awards given, the movie titles for the nominees and the winners

In [1]:
import random
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import json

pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)

## Helper Functions

In [2]:
def get_random_user_agent():
    """ Return a random user agent from the list of user agents.
    
    Returns:
        str: A random user agent.
    """
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/76.0.4017.123"
    ]
    return random.choice(USER_AGENTS)

# User agents obtained from https://iproyal.com/blog/user-agents-for-scraping/

In [3]:
def fetch_url_content(url, session=None):
    """ Fetch the content of each URL. If the status code is 429, meaning we are rate limited, sleep for 10 seconds.

    Returns:
        str: The content of the URL.
        str: The error message if the request fails.
    """
    
    # Get a random user agent
    headers = {'User-Agent': get_random_user_agent()}

    try:
        if not session:
            # Create a new session
            session = requests.Session()
        response = session.get(url, headers=headers, timeout=15)
        # Raise an HTTPError for bad status codes
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Get the body content
        body_content = soup.find('body')
        content, error = str(body_content), None
        return content, error

    except requests.exceptions.HTTPError as he:
        # Log the error for the HTTP error
        if response.status_code != 404:
            print(f"Error for {url}: {he}")
        if response.status_code == 429:
            print(f"Rate limited for {url}: {he} \nSleeping for 10 seconds.")
            sleep(10)
        content, error = None, str(he)
        return content, error

    except requests.exceptions.RequestException as error:
        # Log the error for the request exception
        print(f"RequestException for {url}: {error}")
        content, error = None, str(error)
        return content, error

In [4]:
def fetch_all_data(BASE_URL, range_of_url_ids, session=None):
    """ Fetch all the data from the range of URL IDs.

    Returns:
        dict: The raw content of the fetched data.
        dict: The missing content from the fetched data.

    """
    raw_content = {}
    missing_content = {}

    try:
        print("Scraping initialized...")
        # Iterate over the range of URL IDs
        for e, url_id in enumerate(range_of_url_ids):
            # Format the URL with the iteration ID
            url = BASE_URL.format(url_id)
            content, error = fetch_url_content(url, session)
            if content:
                raw_content[url_id] = str(content)
            else:
                # If no content is found, add the URL ID to the missing content
                raw_content[url_id] = None
                print(f"No content found for url_id: {url_id}")
            if error:
                # If an error is found, add the URL ID to the missing content
                missing_content[url_id] = error
            #print(f"Processed {url_id}")

            # Print success to keep the user knowing it's working
            if (e+1) % 5 == 0:
                print(f"Processed {e+1} pages")
            # Sleep for a random time between 0.05 and 0.2 seconds, making it look more human-like
            sleep(random.uniform(0.2, 0.9))

    except KeyboardInterrupt:
        print("\nFetching interrupted by user.")
        print(f"Last URL ID scraped: {url_id}")
    return raw_content, missing_content

## Scraping Process

In [5]:
# Fetch the data from the main URL
BASE_URL = "https://www.oscars.org/oscars/ceremonies/{}"
raw_content, missing_content = fetch_all_data(BASE_URL, range(2009, 2024+1))

Scraping initialized...
Processed 5 pages
Processed 10 pages
Processed 15 pages


In [6]:
print(f"Number of raw content: {len(raw_content)}")
print(f"Number of missing content: {len(missing_content)}")

Number of raw content: 16
Number of missing content: 0


In [7]:
# Preview all the raw content
raw_content

{2009: '<body class="layout-no-sidebars page-node-51456 path-node node--type-award-ceremony">\n<a class="visually-hidden-focusable" href="#main-content">\n      Skip to main content\n    </a>\n<noscript><iframe height="0" src="https://www.googletagmanager.com/ns.html?id=GTM-W3KXNCL" style="display:none;visibility:hidden" width="0"></iframe></noscript>\n<div class="dialog-off-canvas-main-canvas" data-off-canvas-main-canvas="">\n<div id="page-wrapper">\n<div id="site-page">\n<header aria-label="Site header" class="site-header" id="header" role="banner">\n<nav id="topBarMessage">\n<section class="region region-top-bar-msg">\n<div class="block block-oscars-org-core block-top-bar-msg-block" id="block-oscars-org-topbarmessageblock">\n<div class="content">\n<div class="topbarmessage-icon wrapper" id="topBarMessageClose">\n<svg class="icon icon-arrow-up" height="70" viewbox="0 0 48 48" width="70" xmlns="http://www.w3.org/2000/svg">\n<path d="M0 0h48v48H0z" fill="none"></path>\n<path d="m6.586 

## Save the Data: Stage 1

In [8]:
# Save the data to a JSON file
# IMPURITY #1: Without propper encoding, the characters are not saved correctly
with open("../data/Bacher_Dominik_studentA_stage1.json", 'w', encoding='utf-8') as file:
    json.dump(raw_content, file, ensure_ascii=False, indent=4)