In [223]:
import logging
import sys
import yaml
import bs4
import random
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import json
import numpy as np
import os

pd.set_option('display.max_colwidth', None)

In [80]:
def load_config():
    """Read the scraper's YAML settings file and return its ``settings`` section."""
    with open('../scraper_src/settings.yaml', 'r') as settings_file:
        parsed_yaml = yaml.safe_load(settings_file)
    # Only the "settings" mapping is of interest to the notebook
    return parsed_yaml["settings"]

# Define the constants for the scraper.
# The settings file is read once here; each constant below is one key of its "settings" section.
CONFIG = load_config()
USER_AGENTS = CONFIG["USER_AGENTS"]  # candidates that get_random_user_agent() chooses from
SLEEP_MIN = CONFIG["SLEEP_MIN"]  # lower bound of the random pause between requests
SLEEP_MAX = CONFIG["SLEEP_MAX"]  # upper bound of the random pause between requests
BASE_URL = CONFIG["BASE_URL"]  # URL template; filled in with BASE_URL.format(url_id)

def get_random_user_agent():
    """Pick one entry of ``USER_AGENTS`` at random and return it."""
    chosen_agent = random.choice(USER_AGENTS)
    return chosen_agent

def fetch_url_content(url, session=None, timeout=15, rate_limit_sleep=10):
    """Fetch a URL and return the HTML of its ``<body>`` element.

    Args:
        url: Address of the page to download.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created for this call and closed afterwards.
        timeout: Seconds passed to ``session.get`` as the request timeout.
        rate_limit_sleep: Seconds to pause after an HTTP 429 response.

    Returns:
        A ``(content, error)`` tuple. On success ``content`` is the ``<body>``
        markup as a string and ``error`` is ``None``. On failure ``content`` is
        ``None`` and ``error`` is a message describing what went wrong.
    """
    headers = {'User-Agent': get_random_user_agent()}

    # Remember whether the session is ours so we only close what we opened
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        response = session.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        body_content = BeautifulSoup(response.text, "html.parser").find('body')
        if body_content is None:
            # str(None) would produce the truthy string "None" and be mistaken
            # for real page content by the callers, so report it as an error.
            message = f"No <body> element found for {url}"
            logging.error(message)
            return None, message
        return str(body_content), None

    except requests.exceptions.HTTPError as he:
        # Take the status from the exception itself rather than from a local
        # that may not be bound if the error was raised before assignment.
        status_code = he.response.status_code if he.response is not None else None
        if status_code != 404:
            logging.error(f"Error for {url}: {he}")
        if status_code == 429:
            logging.error(f"Rate limited for {url}: {he} \nSleeping for {rate_limit_sleep} seconds.")
            sleep(rate_limit_sleep)
        return None, str(he)

    except requests.exceptions.RequestException as request_error:
        logging.error(f"RequestException for {url}: {request_error}")
        return None, str(request_error)

    finally:
        if owns_session:
            session.close()
    
def find_latest_fetch(*args):
    """Return the largest key (compared as ``int``) over all given mappings.

    Empty or falsy arguments contribute 0, and the result is never below 0.
    """
    # One candidate per mapping: its largest integer key, or 0 when it is empty
    candidates = [
        max(int(key) for key in mapping.keys()) if mapping else 0
        for mapping in args
    ]
    # The leading 0 reproduces the floor used when nothing larger is found
    return max([0, *candidates])
    
def fetch_all_data(range_of_url_ids, session=None):
    """Fetch the page for every id in ``range_of_url_ids``.

    Each id is substituted into ``BASE_URL`` and downloaded with
    ``fetch_url_content``, pausing a random ``SLEEP_MIN``..``SLEEP_MAX``
    interval after every request. A ``KeyboardInterrupt`` stops the loop early
    and whatever was collected so far is still returned.

    Args:
        range_of_url_ids: Iterable of ids to format into ``BASE_URL``.
        session: Optional ``requests.Session`` to reuse. When omitted, a single
            session is created for the whole run and closed at the end.

    Returns:
        A ``(raw_content, missing_content)`` tuple of dicts keyed by id:
        ``raw_content`` maps to the fetched content (or ``None`` when nothing
        was returned) and ``missing_content`` maps failed ids to their error
        message.
    """
    raw_content = {}
    missing_content = {}
    # Tracked separately so the interrupt handler never hits an unbound name
    last_url_id = None

    # Share one session across all requests instead of letting every
    # fetch_url_content call build its own.
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        print("Scraping initialized...")
        for processed, url_id in enumerate(range_of_url_ids, start=1):
            last_url_id = url_id
            url = BASE_URL.format(url_id)
            content, error = fetch_url_content(url, session)

            if content:
                raw_content[url_id] = content
            else:
                raw_content[url_id] = None
                logging.info(f"No content found for url_id: {url_id}")

            if error:
                missing_content[url_id] = error

            logging.info(f"Processed {url_id}")
            # Print success to keep the user knowing it's working
            if processed % 5 == 0:
                print(f"Processed {processed} pages")
            sleep(random.uniform(SLEEP_MIN, SLEEP_MAX))

    except KeyboardInterrupt:
        print("\nFetching interrupted by user.")
        print(f"Last URL ID scraped: {last_url_id}")
        logging.error(f"Fetching interrupted by user. Last URL ID scraped: {last_url_id}")

    finally:
        if owns_session:
            session.close()

    return raw_content, missing_content


def parse_movie_data(content, main_html_tag=None):
    """Parses movie data from the fetched content.

    NOTE(review): only this docstring is present in the cell, so calling the
    function currently does nothing and returns ``None``; neither ``content``
    nor ``main_html_tag`` is used yet.
    """
    

In [81]:
# Scrape every URL id from 2009 through 2024 (the range end is exclusive)
raw_content, missing_content = fetch_all_data(range_of_url_ids=range(2009, 2025))

Scraping initialized...
Processed 5 pages
Processed 10 pages
Processed 15 pages


## Scrape Categories by Year

In [86]:
all_categories = set()
categories_dictionary = dict()

for year, page_html in raw_content.items():
    # fetch_all_data stores None for pages that could not be downloaded
    if not page_html:
        logging.warning(f"No content for {year}; skipping category extraction.")
        continue
    soup = BeautifulSoup(page_html, "html.parser")
    # First filter the main div with the main content
    honorees = soup.find("div", id="quicktabs-tabpage-honorees-0")
    # Then filter the divs with the categories, which upon inspection are the ones with the class "view-grouping-header"
    category_headers = honorees.find_all("div", {"class": "view-grouping-header"})
    year_categories = [header.find("h2").text for header in category_headers]
    all_categories.update(year_categories)
    categories_dictionary[year] = year_categories

# Ensure they are sorted, can be converted to a list now
all_categories = sorted(all_categories)

# One row per year, one column per category; start at 0 and flag the
# categories that were present in each year with 1.
df_categories = pd.DataFrame(
    0,
    index=list(categories_dictionary.keys()),
    columns=all_categories,
    dtype=int,
)

for year, year_categories in categories_dictionary.items():
    for column in year_categories:
        df_categories.loc[year, column] = 1

df_categories.to_csv("../data/categories.csv")
df_categories.head()


Unnamed: 0,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,...,Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay)
2009,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2010,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2011,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2012,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2013,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1


## Scrape All Awards Data

In [189]:
exception_acting = ["actor", "actress"]


def _find_heading(award, label):
    """Return the first ``<h3>`` inside ``award`` whose text contains ``label`` (or None)."""
    return award.find('h3', string=lambda text: text and label in text)


def _read_entry(entry_div, cast_first):
    """Return ``(movie, cast)`` from the first two inner divs of an honoree entry.

    Acting categories list the person first and the movie second; every other
    category lists the movie first, hence the ``cast_first`` switch.
    """
    fields = entry_div.find_all("div")
    first, second = fields[0].text.strip(), fields[1].text.strip()
    return (second, first) if cast_first else (first, second)


def _parse_award(award, cast_first):
    """Extract winner and nominees of one award block.

    Raises ``AttributeError`` or ``IndexError`` when the expected headings or
    inner divs are missing from the markup.
    """
    winner_div = _find_heading(award, 'Winner').find_next_sibling('div')
    winner_movie, winner_cast = _read_entry(winner_div, cast_first)

    nominees_movies = []
    nominees_cast = []
    for nominee in _find_heading(award, 'Nominees').find_next_siblings('div'):
        movie, cast = _read_entry(nominee, cast_first)
        nominees_movies.append(movie)
        nominees_cast.append(cast)

    return {
        "winner_movie": winner_movie,
        "winner_cast": winner_cast,
        "nominees_movies": nominees_movies,
        "nominees_cast": nominees_cast
    }


all_oscars_awards = dict()

for year, page_html in raw_content.items():
    # fetch_all_data stores None for pages that could not be downloaded
    if not page_html:
        logging.warning(f"No content for {year}; skipping awards extraction.")
        continue
    soup = BeautifulSoup(page_html, "html.parser")
    # First filter the div with the awards
    honorees = soup.find("div", id="quicktabs-tabpage-honorees-0")
    view_content = honorees.find("div", {"class": "view-content"})
    all_awards = view_content.find_all("div", {"class": "view-grouping"})

    this_years_awards = dict()

    for award in all_awards:
        category = award.find("div", {"class": "view-grouping-header"}).find("h2").text
        # Acting awards have a different structure (person first, movie second)
        if any(keyword in category.lower() for keyword in exception_acting):
            this_years_awards[category] = _parse_award(award, cast_first=True)
            continue

        # Other categories sometimes have only winners and no nominees; those
        # are skipped. Only the parsing failures that this produces are caught,
        # so interrupts and unrelated errors still surface.
        try:
            parsed_award = _parse_award(award, cast_first=False)
        except (AttributeError, IndexError) as parse_error:
            logging.warning(f"Skipping category {category!r} for {year}: {parse_error}")
            continue
        this_years_awards[category] = parsed_award

    all_oscars_awards[year] = this_years_awards

with open("../data/all_oscars_awards.json", 'w', encoding='utf-8') as file:
    json.dump(all_oscars_awards, file, ensure_ascii=False, indent=4)

## Create DataFrame With All Awards Data

In [231]:
# Load the scraped awards from disk (years are string keys after the JSON round trip)
all_oscars_filename = "../data/all_oscars_awards.json"
if os.path.exists(all_oscars_filename):
    with open(all_oscars_filename, 'r', encoding='utf-8') as file:
        all_oscars_awards = json.load(file)
else:
    print(f"File {all_oscars_filename} not found. Exiting...")
    sys.exit(1)

# Collect every movie title that appears as a winner or as a nominee
unique_movies = set()
for ceremony in all_oscars_awards.values():
    for awardess in ceremony.values():
        unique_movies.add(awardess["winner_movie"])
        unique_movies.update(awardess["nominees_movies"])

# Case-insensitive order; the raw title breaks ties so the order is stable between runs
unique_movies = sorted(unique_movies, key=lambda title: (title.lower(), title))

# One row per movie, one column per award category (all_categories comes from the categories cell)
df_awards = pd.DataFrame(columns=["Movie Title", "Release Year"] + all_categories,
                         index=range(len(unique_movies)))
df_awards["Movie Title"] = unique_movies

for year, ceremony in all_oscars_awards.items():
    for category, awardess in ceremony.items():
        # Nominees are written first and the winner last: a film can hold both a
        # nomination and the win in one category (e.g. two actors from the same
        # movie), and the win must not be overwritten by "Nominated".
        # TODO: Add cast for each category
        for nominees_movie, _nominees_cast in zip(awardess["nominees_movies"], awardess["nominees_cast"]):
            is_nominee = df_awards["Movie Title"] == nominees_movie
            df_awards.loc[is_nominee, "Release Year"] = year
            df_awards.loc[is_nominee, category] = "Nominated"

        is_winner = df_awards["Movie Title"] == awardess["winner_movie"]
        df_awards.loc[is_winner, "Release Year"] = year
        df_awards.loc[is_winner, category] = "Winner"

df_awards.to_csv("../data/all_oscars_awards.csv")
df_awards.head()

Unnamed: 0,Movie Title,Release Year,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,...,Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay)
0,(I'm Gonna) Love Me Again,2020,,,,,,,,,...,Winner,,,,,,,,,
1,12 Years a Slave,2014,Nominated,Nominated,,Winner,,,,Winner,...,,Nominated,,,,,,,Winner,
2,127 Hours,2011,Nominated,,,,,,,Nominated,...,Nominated,,,,,,,,Nominated,
3,13 Hours: The Secret Soldiers of Benghazi,2017,,,,,,,,,...,,,,,,,Nominated,,,
4,13th,2017,,,,,,,,,...,,,,,,,,,,


In [233]:
# Keep only the title and year columns and write them to their own CSV
df_just_movies_and_year = df_awards.loc[:, ["Movie Title", "Release Year"]]
df_just_movies_and_year.to_csv("../data/just_movies_and_year.csv")