In [16]:
# Notebook parameters.
# YEAR: the year to scrape; may be an int or a numeric string (it is coerced
#       to an int in the next cell).
# BASE_DIR: base directory; scraped output is written beneath BASE_DIR / 'data'.
YEAR = "2020"
BASE_DIR = "."

In [17]:
# Coerce the YEAR parameter into an integer. When YEAR is neither an int nor
# a string that parses as one, the default of 1999 is kept.
arg_year = 1999
if isinstance(YEAR, int):
    arg_year = YEAR
elif isinstance(YEAR, str):
    try:
        arg_year = int(YEAR)
    except ValueError:
        # Best-effort parse: a non-numeric string keeps the default rather
        # than aborting the notebook. Only ValueError is caught so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass
arg_year

2020

In [30]:
import sys
import pathlib

# Rebind BASE_DIR from a plain string to a pathlib.Path so that later cells
# can build sub-paths with the "/" operator (e.g. BASE_DIR / 'data').
BASE_DIR = pathlib.Path(BASE_DIR)

In [1]:
# !pip install requests requests-html pandas

In [1]:
from dataclasses import dataclass
import pathlib
import pandas as pd
import requests
from requests_html import HTML

In [4]:
@dataclass
class ScrapeBoxOffice:
    """Scrape a boxofficemojo.com worldwide box-office table into a DataFrame.

    Typical use is ``ScrapeBoxOffice(year=2020, save=True).run()``, which
    downloads the page for the year, extracts the table matched by
    ``table_selector`` and optionally writes the raw HTML and/or a CSV
    beneath ``output_dir``.
    """

    # Root URL; when ``year`` is an int it is appended as ``<year>/``.
    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    # Year to scrape. Any non-int value (including None) means "no year":
    # the base endpoint is requested and outputs are named 'world'.
    year: int = None
    # Write the downloaded HTML to ``<output_dir>/html/<name>.html``.
    save_raw: bool = False
    # Default for ``run(save=...)``: write ``<output_dir>/<name>.csv``.
    save: bool = False
    # Directory that receives the CSV and the ``html`` sub-directory.
    output_dir: str = "."
    # CSS selector locating the table element in the page.
    table_selector: str = '.imdb-scroll-table'
    # Seconds to wait for the server before the HTTP request is abandoned;
    # without a timeout a stalled connection would block forever.
    timeout: float = 30.0

    def __post_init__(self):
        """Create per-instance result holders.

        These were previously class-level attributes, which made every
        instance share the same list / DataFrame objects.
        """
        self.table_data = []
        self.table_header_names = []
        self.df = pd.DataFrame()

    @property
    def name(self):
        """Base name for output files: the year if it is an int, else 'world'."""
        return self.year if isinstance(self.year, int) else 'world'

    def get_endpoint(self):
        """Return the URL to request: the base endpoint plus ``<year>/`` if set."""
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint

    def get_output_dir(self):
        """Return ``output_dir`` as a ``pathlib.Path``."""
        return pathlib.Path(self.output_dir)

    def extract_html_str(self, endpoint=None):
        """Download the page and return ``(html_text, status_code)``.

        Args:
            endpoint: URL to fetch; defaults to ``get_endpoint()``.

        Returns:
            A tuple ``(html_text, status)``. ``html_text`` is the response
            body when the status is exactly 200 and ``None`` otherwise.

        Side effects:
            When ``save_raw`` is set and the status is 200, the body is
            written (UTF-8) to ``<output_dir>/html/<name>.html``.
        """
        url = endpoint if endpoint is not None else self.get_endpoint()
        # No stream=True: the body is read eagerly, so the connection is
        # released even when the status is not 200 and the body is unused.
        response = requests.get(url, timeout=self.timeout)
        status = response.status_code
        if status != 200:
            return None, status
        html_text = response.text
        if self.save_raw:
            raw_output_dir = self.get_output_dir() / 'html'
            raw_output_dir.mkdir(exist_ok=True, parents=True)
            raw_path = raw_output_dir / f"{self.name}.html"
            # Explicit encoding: the platform default may be unable to
            # encode every character present in the page.
            raw_path.write_text(html_text, encoding='utf-8')
        return html_text, status

    def parse_html(self, html_str=''):
        """Extract header names and row cell texts from the first matching table.

        Args:
            html_str: HTML document text.

        Returns:
            ``(table_data, header_names)`` where ``table_data`` is a list of
            rows (each a list of cell texts) and ``header_names`` the texts
            of the ``th`` cells of the first row; or ``None`` when no element
            matches ``table_selector`` or the table contains no rows.

        Side effects:
            Stores the results on ``self.table_data`` and
            ``self.table_header_names`` when a table is parsed.
        """
        r_html = HTML(html=html_str)
        tables = r_html.find(self.table_selector)
        if len(tables) == 0:
            return None
        rows = tables[0].find("tr")
        if len(rows) == 0:
            # A table without any rows has no header to read.
            return None
        header_names = [th.text for th in rows[0].find('th')]
        table_data = [[td.text for td in row.find("td")] for row in rows[1:]]
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names

    def to_df(self, data=[], columns=[]):
        """Build a DataFrame from row lists and column names.

        The list defaults are never mutated here; they are only handed to
        ``pd.DataFrame``.
        """
        return pd.DataFrame(data, columns=columns)

    def run(self, save=False):
        """Download, parse and (optionally) save the table.

        Args:
            save: When left as ``False`` the instance's ``save`` setting is
                used, so passing ``False`` cannot override ``self.save``.

        Returns:
            The parsed table as a DataFrame (also stored on ``self.df``).

        Raises:
            Exception: the HTTP status is outside the 2xx range.
            RuntimeError: no table could be parsed from the page.
        """
        if save is False:
            save = self.save
        endpoint = self.get_endpoint()
        sys.stdout.write(f"Endpoint:\t{endpoint}\n")
        html_str, status = self.extract_html_str(endpoint=endpoint)
        sys.stdout.write(f"Response Status:\t{status}\n")
        if not 200 <= status < 300:
            raise Exception(f"Extraction failed, endpoint status {status} at {endpoint}")
        parsed = self.parse_html(html_str if html_str is not None else '')
        if parsed is None:
            # parse_html signals "nothing found" with None; fail with a clear
            # message instead of an unpacking TypeError.
            raise RuntimeError(
                f"No table matching {self.table_selector!r} found at {endpoint}"
            )
        data, headers = parsed
        sys.stdout.write(f"Rows:\t{len(data)}\nColumns:\t{len(headers)}\n")
        df = self.to_df(data=data, columns=headers)
        self.df = df
        if save:
            output_dir = self.get_output_dir()
            # The directory may not exist yet when save_raw is off.
            output_dir.mkdir(exist_ok=True, parents=True)
            filepath = output_dir / f'{self.name}.csv'
            df.to_csv(filepath, index=False)
            # Report only after the file has actually been written.
            sys.stdout.write(f"Saved to {filepath}")
        sys.stdout.write("\n\n")
        return self.df

In [5]:
# Scrape the configured year, keeping both the raw HTML and the parsed CSV
# beneath BASE_DIR / 'data', then preview the first rows of the result.
scraper = ScrapeBoxOffice(
    year=arg_year,
    output_dir=str(BASE_DIR / 'data'),
    save_raw=True,
    save=True,
)
df = scraper.run()
df.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1
0,1,Toy Story 3,"$1,066,969,703","$415,004,880",38.9%,"$651,964,823",61.1%
1,2,Alice in Wonderland,"$1,025,467,110","$334,191,110",32.6%,"$691,276,000",67.4%
2,3,Harry Potter and the Deathly Hallows: Part 1,"$976,536,918","$295,983,305",30.3%,"$680,553,613",69.7%
3,4,Inception,"$826,137,188","$292,576,195",35.4%,"$533,560,993",64.6%
4,5,Shrek Forever After,"$752,600,867","$238,736,787",31.7%,"$513,864,080",68.3%
