In [1]:
import pandas as pd

from bs4 import BeautifulSoup

import aswan

In [2]:
# Project object shared by the whole notebook: the handler classes below are
# registered on it and the later cells run, query and clean up through it.
# "imdb-example" is presumably the project's name/identifier — confirm
# against the aswan documentation.
project = aswan.Project("imdb-example")

In [3]:
@project.register_handler
class CelebHandler(aswan.RequestSoupHandler):
    """Parse a person's profile page into a name and a date of birth."""

    url_root = "https://www.imdb.com"

    def parse(self, soup: BeautifulSoup):
        """Extract the person's name and birth date from ``soup``.

        Args:
            soup: parsed HTML of the profile page.

        Returns:
            dict with keys ``"name"`` (stripped text of the first ``<span>``
            inside the first ``<h1>``) and ``"dob"`` (the ``datetime``
            attribute of the ``<time>`` tag inside the ``name-born-info``
            block, or ``None`` when that block, tag or attribute is absent).

        Raises:
            AttributeError: if the page has no ``<h1>``/``<span>`` heading;
                a page without a name is still treated as unparseable.
        """
        name = soup.find("h1").find("span").text.strip()

        # A page lacking birth info should still yield a record, the same
        # way MovieHandler reports a missing summary as None instead of
        # failing the whole parse.
        born_info = soup.find("div", id="name-born-info")
        born_time = born_info.find("time") if born_info is not None else None
        dob = born_time.get("datetime") if born_time is not None else None

        return {"name": name, "dob": dob}

In [4]:
@project.register_handler
class MovieHandler(aswan.RequestSoupHandler):
    """Parse a title's reference page and queue its leading cast members.

    ``parse`` registers the profile links found in the first
    ``n_cast_links`` photo cells of the cast table with ``CelebHandler`` and
    returns the title, summary and release year of the movie.
    """

    url_root = "https://www.imdb.com"

    # Number of cast photo cells (in page order) whose link gets registered.
    n_cast_links = 3

    def parse(self, soup: BeautifulSoup):
        """Extract movie details from ``soup`` and register cast links.

        Args:
            soup: parsed HTML of the title's reference page.

        Returns:
            dict with keys ``"title"`` (text of ``<title>`` without the
            " - Reference View - IMDb" suffix, stripped), ``"summary"``
            (stripped text of the first ``<div>`` in the overview section,
            ``None`` when the section is absent) and ``"year"`` (int, or
            ``None`` when the year element is absent).

        Raises:
            AttributeError: if the page has no ``<title>`` element.
            ValueError: if the year link text is not an integer.
        """
        # A page without a cast table yields no links instead of raising
        # AttributeError on ``None.find_all``.
        cast_table = soup.find("table", class_="cast_list")
        photo_cells = (
            []
            if cast_table is None
            else cast_table.find_all("td", class_="primary_photo")
        )

        cast_links = []
        for cell in photo_cells[: self.n_cast_links]:
            anchor = cell.find("a")
            # Skip photo cells that carry no usable link.
            if anchor is not None and anchor.get("href"):
                cast_links.append(anchor["href"])
        if cast_links:
            # One call with the whole list replaces a call per link.
            self.register_links_to_handler(cast_links, CelebHandler)

        ref_section = soup.find("section", class_="titlereference-section-overview")
        summary = None
        if ref_section is not None:
            # getattr covers an overview section without any <div>: the
            # summary is then the empty string.
            summary = getattr(ref_section.find("div"), "text", "").strip()

        # Guard the year lookup the same way as the summary.
        year_span = soup.find("span", class_="titlereference-title-year")
        year_link = year_span.find("a") if year_span is not None else None
        year = int(year_link.text) if year_link is not None else None

        return {
            "title": soup.find("title")
            .text.replace(" - Reference View - IMDb", "")
            .strip(),
            "summary": summary,
            "year": year,
        }

In [5]:
# Starting URLs for the crawl, keyed by the handler class that parses them.
seed_urls = {
    MovieHandler: [
        "https://www.imdb.com/title/tt1045772/reference",
        "https://www.imdb.com/title/tt2543164/reference",
    ],
    CelebHandler: ["https://www.imdb.com/name/nm0000190"],
}

# force_sync=True presumably makes the run execute synchronously in this
# process — confirm against the aswan documentation.
project.run(urls_to_register=seed_urls, force_sync=True)

2022-10-06 16:47.56 [info     ] running function setup         batch=prep
2022-10-06 16:47.56 [info     ] function setup returned None   batch=prep
2022-10-06 16:47.56 [info     ] running function _initiate_status batch=prep
2022-10-06 16:47.56 [info     ] function _initiate_status returned None batch=prep
2022-10-06 16:47.56 [info     ] running function _create_scheduler batch=prep
2022-10-06 16:47.56 [info     ] function _create_scheduler returned None batch=prep
2022-10-06 16:48.12 [info     ] running function join          batch=cleanup
2022-10-06 16:48.12 [info     ] function join returned None    batch=cleanup


In [6]:
# One row per MovieHandler event, built from each event's parsed content.
movie_records = [event.content for event in project.handler_events(MovieHandler)]
pd.DataFrame(movie_records)

Unnamed: 0,title,summary,year
0,I Love You Phillip Morris (2009),A cop turns con man once he comes out of the c...,2009
1,Arrival (2016),A linguist works with the military to communic...,2016


In [7]:
# One row per CelebHandler event, built from each event's parsed content.
celeb_records = [event.content for event in project.handler_events(CelebHandler)]
pd.DataFrame(celeb_records)

Unnamed: 0,name,dob
0,Ewan McGregor,1971-3-31
1,Jeremy Renner,1971-1-7
2,Leslie Mann,1972-3-26
3,Forest Whitaker,1961-7-15
4,Amy Adams,1974-8-20
5,Jim Carrey,1962-1-17
6,Matthew McConaughey,1969-11-4


In [8]:
# Tidy up after the example run. NOTE(review): presumably this discards the
# state left by the `project.run(...)` call above — confirm in the aswan docs.
project.cleanup_current_run()

In [9]:
# Purge the project's depot. NOTE(review): presumably this deletes the stored
# events, so the handler_events queries above would no longer return the
# collected data afterwards — confirm in the aswan docs.
project.depot.purge()