In [63]:
import logging
import sys
import yaml
import bs4
import random
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd

In [33]:
def load_config():
    """Read the scraper settings from the YAML settings file.

    Returns:
        The value stored under the top-level ``"settings"`` key of
        ``../scraper_src/settings.yaml``.
    """
    settings_path = '../scraper_src/settings.yaml'
    with open(settings_path, 'r') as settings_file:
        parsed = yaml.safe_load(settings_file)
    return parsed["settings"]

# Module-level constants for the scraper, read once from the "settings"
# section returned by load_config().
CONFIG = load_config()
# Pool of User-Agent strings that get_random_user_agent() picks from.
USER_AGENTS = CONFIG["USER_AGENTS"]
# Lower/upper bounds of the random pause between requests; the drawn value is
# handed to time.sleep() in fetch_all_data, so it is interpreted as seconds.
SLEEP_MIN = CONFIG["SLEEP_MIN"]
SLEEP_MAX = CONFIG["SLEEP_MAX"]
# URL template; fetch_all_data fills it with BASE_URL.format(url_id).
BASE_URL = CONFIG["BASE_URL"]

def get_random_user_agent():
    """Return one randomly chosen entry of the module-level ``USER_AGENTS``."""
    chosen_agent = random.choice(USER_AGENTS)
    return chosen_agent

def fetch_url_content(url, session=None):
    """Fetch ``url`` and return the markup of its ``<body>`` element.

    Args:
        url: Address to request.
        session: Optional ``requests.Session`` to reuse. When none is given a
            temporary session is created for this call and closed afterwards.

    Returns:
        A ``(content, error)`` tuple. On success ``content`` is the ``<body>``
        element rendered as a string and ``error`` is ``None``. On any failure
        (HTTP error status, other request error, or a response without a
        ``<body>`` element) ``content`` is ``None`` and ``error`` is a message
        describing the problem.
    """
    headers = {'User-Agent': get_random_user_agent()}

    # Remember whether the session is ours so it can be closed on the way out;
    # a caller-supplied session is left open for reuse.
    owns_session = not session
    if owns_session:
        session = requests.Session()

    try:
        response = session.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        body_content = BeautifulSoup(response.text, "html.parser").find('body')
        if body_content is None:
            # str(None) would yield the truthy string "None", which callers
            # would mistake for real page content.
            message = f"No <body> element found for {url}"
            logging.error(message)
            return None, message
        return str(body_content), None

    except requests.exceptions.HTTPError as he:
        # Use the response attached to the exception rather than the local
        # variable, which is unbound if the error was raised inside get().
        # Compare against None explicitly: a Response is falsy for error codes.
        status_code = he.response.status_code if he.response is not None else None
        if status_code != 404:
            logging.error(f"Error for {url}: {he}")
        if status_code == 429:
            logging.error(f"Rate limited for {url}: {he} \nSleeping for 10 seconds.")
            sleep(10)
        return None, str(he)

    except requests.exceptions.RequestException as request_error:
        logging.error(f"RequestException for {url}: {request_error}")
        return None, str(request_error)

    finally:
        if owns_session:
            session.close()
    

    
def fetch_all_data(range_of_url_ids, session=None):
    """Fetch the page for every id in ``range_of_url_ids``.

    Each id is substituted into ``BASE_URL`` and downloaded with
    ``fetch_url_content``; a random pause between ``SLEEP_MIN`` and
    ``SLEEP_MAX`` follows every request. A ``KeyboardInterrupt`` stops the
    loop early and whatever was collected so far is still returned.

    Args:
        range_of_url_ids: Iterable of ids to fetch.
        session: Optional session forwarded to ``fetch_url_content``.

    Returns:
        A ``(raw_content, missing_content)`` tuple of dicts keyed by id.
        ``raw_content`` maps every attempted id to its content, or ``None``
        when nothing was retrieved; ``missing_content`` maps the ids that
        produced an error to the error message.
    """
    raw_content = {}
    missing_content = {}
    # Tracked separately from the loop variable so the interrupt handler never
    # touches an unbound name when the interrupt arrives before the first id.
    last_url_id = None

    try:
        print("Scraping initialized...")
        for processed, url_id in enumerate(range_of_url_ids, start=1):
            last_url_id = url_id
            content, error = fetch_url_content(BASE_URL.format(url_id), session)

            if content:
                raw_content[url_id] = content
            else:
                raw_content[url_id] = None
                logging.info(f"No content found for url_id: {url_id}")

            if error:
                missing_content[url_id] = error

            logging.info(f"Processed {url_id}")
            # Periodic console output so the user can see progress
            if processed % 5 == 0:
                print(f"Processed {processed} pages")
            sleep(random.uniform(SLEEP_MIN, SLEEP_MAX))

    except KeyboardInterrupt:
        print("\nFetching interrupted by user.")
        print(f"Last URL ID scraped: {last_url_id}")
        logging.error(f"Fetching interrupted by user. Last URL ID scraped: {last_url_id}")
    return raw_content, missing_content


def parse_movie_data(content, main_html_tag=None):
    """Placeholder for parsing movie data out of fetched content.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None
    

In [34]:
# Fetch the pages for url ids 2009 through 2024 (range() excludes its end,
# hence the +1). No session is passed, so fetch_url_content creates its own.
raw_content, missing_content = fetch_all_data(range(2009, 2024+1))

Scraping initialized...
Processed 5 pages
Processed 10 pages
Processed 15 pages


## Scrape Categories by Year

In [75]:
all_categories = set()
categories_dictionary = dict()

for year, content in raw_content.items():
    # fetch_all_data stores None for pages that could not be downloaded;
    # skip them instead of crashing inside BeautifulSoup.
    if not content:
        logging.warning(f"No content for {year}; skipping category extraction")
        continue

    soup = BeautifulSoup(content, "html.parser")
    # First filter the main div with the main content
    honorees = soup.find("div", id="quicktabs-tabpage-honorees-0")
    if honorees is None:
        logging.warning(f"Honorees container not found for {year}; skipping")
        continue

    # Then filter the divs with the categories, which upon inspection are the
    # ones with the class "view-grouping-header"; the name is in their <h2>.
    headings = (div.find("h2") for div in honorees.find_all("div", {"class": "view-grouping-header"}))
    year_categories = [heading.text for heading in headings if heading is not None]

    all_categories.update(year_categories)
    categories_dictionary[year] = year_categories

# Ensure they are sorted, can be converted to a list now
all_categories = sorted(all_categories)

# Indicator matrix: one row per year, one column per category, 1 where the
# category appeared that year. Built directly as zero-filled integers so no
# NaN-filling or dtype inference pass is needed.
df_categories = pd.DataFrame(
    0,
    index=list(categories_dictionary),
    columns=all_categories,
    dtype=int,
)

for year, year_categories in categories_dictionary.items():
    for column in year_categories:
        df_categories.loc[year, column] = 1

df_categories.to_csv("../data/categories.csv")
df_categories


Unnamed: 0,Actor in a Leading Role,Actor in a Supporting Role,Actress in a Leading Role,Actress in a Supporting Role,Animated Feature Film,Animated Short Film,Art Direction,Best Picture,Cinematography,Costume Design,...,Music (Original Song),Production Design,Short Film (Animated),Short Film (Live Action),Sound,Sound Editing,Sound Mixing,Visual Effects,Writing (Adapted Screenplay),Writing (Original Screenplay)
2009,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2010,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2011,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2012,1,1,1,1,1,0,1,1,1,1,...,1,0,1,1,0,1,1,1,1,1
2013,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1
2014,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1
2015,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1
2016,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1
2017,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1
2018,1,1,1,1,1,0,0,1,1,1,...,1,1,1,1,0,1,1,1,1,1


In [None]:


# NOTE(review): `soup` and `profession` are not defined in this cell. `soup` is
# whatever an earlier cell left behind and `profession` is not assigned in any
# visible cell - confirm where both are meant to come from before relying on
# this cell after a kernel restart.
job_ads = soup.find_all('article', {'class' : 'Div-sc-1cpunnt-0'})

# Collect one record per ad and build the frame once at the end; concatenating
# inside the loop copies the whole frame on every iteration.
job_records = []

for job in job_ads:
    # Title and link target both live on the same anchor element.
    job_link = job.find('a', {'data-cy': 'job-link'})
    title = job_link.get('title') if job_link is not None else None
    query = job_link.get('href') if job_link is not None else None

    # The location is read from the second matching paragraph; an ad with
    # fewer than two would raise IndexError, so check the length explicitly.
    detail_paragraphs = job.find_all('p', {'class' : 'Span-sc-1ybanni-0'})
    location = detail_paragraphs[1].get_text() if len(detail_paragraphs) > 1 else None

    date_tag = job.select_one('p > span.ftUOUz')
    date = date_tag.get_text() if date_tag is not None else None

    job_records.append({
        "profession": profession,
        "title": title,
        "location": location,
        "date": date,
        "query": query
    })

# Column order matches what the previous concat-based construction produced.
df = pd.DataFrame(
    job_records,
    columns=['profession', 'title', 'query', 'location', 'date'],
)

In [None]:

# Add the relative directory "scraper" to the module search path so modules
# inside it can be imported; the path is relative to the working directory.
sys.path.append("scraper")




In [None]:
def find_latest_fetch(*args):
    """Return the largest integer key found across the given mappings.

    Args:
        *args: Mappings whose keys can be converted with ``int``. Empty or
            falsy arguments are ignored.

    Returns:
        The highest key as an ``int``, never less than 0; 0 when no mapping
        contributes a positive key.
    """
    per_mapping_max = [
        max(int(key) for key in data.keys())
        for data in args
        if data
    ]
    # 0 is the floor, so it takes part in the comparison as well.
    return max([0, *per_mapping_max])

In [None]:
# Send log records of level INFO and above to app.log in append mode, each
# line prefixed with a timestamp and the level name.
logging.basicConfig(
    filename='app.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def main():
    """Resume fetching movie data and persist the merged results.

    Loads the previously saved ``raw_content.json`` and
    ``missing_movies.json``, asks ``data_fetcher.fetch_movie_data`` to
    continue from the highest id found in either of them, merges the new
    results on success and always writes both files back, even when fetching
    failed.
    """
    # NOTE(review): data_storage, data_fetcher and utils are not imported in
    # any visible cell - presumably project modules; confirm they are imported
    # before this runs.
    # The context manager closes the session once fetching and saving are done.
    with requests.Session() as session:
        raw_content = data_storage.load_json_data("raw_content.json")
        missing_movies = data_storage.load_json_data("missing_movies.json")

        try:
            new_raw_content, new_missing_movies = data_fetcher.fetch_movie_data(
                start_idx=utils.find_latest_fetch(raw_content, missing_movies),
                session=session
            )

        except Exception as e:
            logging.error(f"Error during data fetching: {e}")
        else:
            # Merge only when fetching completed without raising.
            raw_content.update(new_raw_content)
            missing_movies.update(new_missing_movies)

        finally:
            # Persist in every case so data loaded at the start is kept.
            data_storage.save_to_json("raw_content.json", raw_content)
            data_storage.save_to_json("missing_movies.json", missing_movies)

# Run the scraper only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

In [None]:
import json
import logging
import os
import sys

def save_to_json(filename, data):
    """Serialise ``data`` as UTF-8 JSON into ``filename`` without risking the old file.

    The JSON is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``. If serialisation fails or the process is
    interrupted mid-write, the existing file is left untouched instead of
    being truncated.

    Args:
        filename: Destination path.
        data: JSON-serialisable object to store.

    Raises:
        TypeError: If ``data`` contains values ``json`` cannot serialise.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = f"{filename}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        os.replace(tmp_path, filename)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, then re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def load_json_data(filename):
    """Return the parsed JSON stored in ``filename``, or ``{}`` if the file is absent.

    A ``KeyboardInterrupt`` raised while loading is logged and converted into
    ``sys.exit`` so the script stops before anything is written.
    """
    try:
        # Guard clause: nothing saved yet means an empty result.
        if not os.path.exists(filename):
            return {}

        print(f"Loading existing data from {filename}...")
        with open(filename, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
        return loaded

    except KeyboardInterrupt:
        print("\nFetching interrupted by user before loading the data.")
        logging.error(f"Fetching interrupted by user before loading the data. Terminating the script to avoid data loss.")
        sys.exit("Terminating the script to avoid data loss.")