In [1]:
import aswan

In [2]:
project = aswan.Project("imdb-example")

In [3]:
celeb_table = project.get_prod_table("person")
movie_table = project.get_prod_table("movie")

In [4]:
@project.register_handler
class CelebHandler(aswan.RequestSoupHandler):
    url_root = "https://www.imdb.com"

    def parse(self, soup):
        return {
            "name": soup.find("h1").find("span").text.strip(),
            "dob": soup.find("div", id="name-born-info").find("time")["datetime"],
        }

In [5]:
@project.register_handler
class MovieHandler(aswan.RequestSoupHandler):
    url_root = "https://www.imdb.com"
    def parse(self, soup):

        for cast in soup.find("table", class_="cast_list").find_all("td", class_="primary_photo")[:3]:
            self.register_links_to_handler([cast.find("a")["href"]], CelebHandler)
        
        ref_section = soup.find("section",class_="titlereference-section-overview")
        summary = None
        if ref_section is not None:
            summary = getattr(ref_section.find("div"), "text", "").strip()
        return {
            "title": soup.find("title").text.replace(" - Reference View - IMDb", "").strip(),
            "summary": summary,
            "year": int(soup.find("span", class_="titlereference-title-year").find("a").text),
        }

In [6]:
@project.register_t2_integrator
class MovieIntegrator(aswan.FlexibleDfParser):
    handlers = [MovieHandler]

    def url_parser(self, url):
        return {"id": url.split("/")[-1]}

    def write_df(self, df):
        return movie_table.extend(df)


@project.register_t2_integrator
class CelebIntegrator(aswan.FlexibleDfParser):
    handlers = [CelebHandler]

    def write_df(self, df):
        return celeb_table.extend(df)

In [7]:
def add_init_urls():
    movie_urls = [
        "https://www.imdb.com/title/tt1045772/reference",
        "https://www.imdb.com/title/tt2543164/reference",
    ]

    person_urls = ["https://www.imdb.com/name/nm0000190"]
    project.add_urls_to_handler(MovieHandler, movie_urls)
    project.add_urls_to_handler(CelebHandler, person_urls)

In [8]:
add_init_urls()

In [9]:
project.run(with_monitor_process=True)

2022-07-12 01:05.14 [info     ] running function reset_surls   env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] function reset_surls returned None env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] running function expire_surls  env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] function expire_surls returned None env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] running function _register_starter_urls env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] function _register_starter_urls returned 0 env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] running function _create_scheduler env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] function _create_scheduler returned None env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ] running function start_monitor_process env=prod function_batch=run_prep
2022-07-12 01:05.14 [info     ]  monitor app at: http://localhost:6969
2022-07-12 01

In [10]:
movie_table.get_full_df()

Unnamed: 0,title,summary,year,id
0,Arrival (2016),A linguist works with the military to communic...,2016,reference
1,I Love You Phillip Morris (2009),A cop turns con man once he comes out of the c...,2009,reference


In [11]:
celeb_table.get_full_df()

Unnamed: 0,name,dob
0,Matthew McConaughey,1969-11-4
1,Leslie Mann,1972-3-26
2,Jeremy Renner,1971-1-7
3,Jim Carrey,1962-1-17
4,Forest Whitaker,1961-7-15
5,Ewan McGregor,1971-3-31
6,Amy Adams,1974-8-20


In [12]:
project.purge(True)