In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import re

In [68]:
# Start Chrome through Selenium with the command-line flags listed below
# (new headless mode and an explicit window size).
options = Options()
for chrome_flag in ("--headless=new", "--window-size=1920,1080"):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)

In [70]:
base_url = "https://www.the-numbers.com"
years = range(2018, 2026)
contents = []  # one dict per scraped chart row, accumulated across all years

# A header of the form "<4 digits> Gross" is renamed to the year-independent
# key "Year Gross" so that rows from different years share the same keys.
year_gross_header = re.compile(r"^\d{4}\s+Gross$")

for year in years:
    url = f"{base_url}/market/{year}/top-grossing-movies"
    print("Year page:", year, url)
    driver.get(url)
    time.sleep(1)  # fixed pause after navigation before reading the DOM

    # The chart is taken to be the first <table> on the page. Skip the year
    # with a message instead of failing with a bare IndexError on tables[0].
    tables = driver.find_elements(By.TAG_NAME, "table")
    if not tables:
        print("No table found, skipping year:", year)
        continue

    rows = tables[0].find_elements(By.TAG_NAME, "tr")
    if not rows:
        print("Table has no rows, skipping year:", year)
        continue

    # First row holds the <th> header cells; flatten multi-line header text.
    header_cells = rows[0].find_elements(By.TAG_NAME, "th")
    column_names = []
    for th in header_cells:
        name = th.text.replace("\n", " ").strip()
        column_names.append("Year Gross" if year_gross_header.match(name) else name)

    # At most the first 100 rows after the header are read.
    for row in rows[1:101]:
        tds = row.find_elements(By.TAG_NAME, "td")
        if not tds:
            continue

        # zip() pairs headers with cells and tolerates a row that has more
        # cells than there are headers (indexing would raise IndexError).
        row_dict = {name: td.text.strip() for name, td in zip(column_names, tds)}

        # The first link in the row is used as the movie page URL. Later cells
        # index row_dict["Movie URL"] directly, so a row without a usable link
        # is skipped rather than aborting the whole scrape.
        anchors = row.find_elements(By.TAG_NAME, "a")
        href = anchors[0].get_attribute("href") if anchors else None
        if not href:
            print("No movie link in row, skipping:", row_dict)
            continue
        if not href.startswith("http"):
            href = base_url + href
        row_dict["Movie URL"] = href

        row_dict["Year"] = year

        print(row_dict)
        contents.append(row_dict)

Year page: 2018 https://www.the-numbers.com/market/2018/top-grossing-movies
{'Rank': '1', 'Movie': 'Black Panther', 'Release Date': 'Feb 16, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Action', 'Year Gross': '$700,059,566', 'Tickets Sold': '76,845,177', 'Movie URL': 'https://www.the-numbers.com/movie/Black-Panther-(2018)#tab=summary', 'Year': 2018}
{'Rank': '2', 'Movie': 'Avengers: Infinity War', 'Release Date': 'Apr 27, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Action', 'Year Gross': '$678,815,482', 'Tickets Sold': '74,513,225', 'Movie URL': 'https://www.the-numbers.com/movie/Avengers-Infinity-War-(2017)#tab=summary', 'Year': 2018}
{'Rank': '3', 'Movie': 'Incredibles 2', 'Release Date': 'Jun 15, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Adventure', 'Year Gross': '$608,581,744', 'Tickets Sold': '66,803,704', 'Movie URL': 'https://www.the-numbers.com/movie/Incredibles-2#tab=summary', 'Year': 2018}
{'Rank': '4', 'Movie': 'Jurassic World: Fallen Kingdom', 'Release Date': 'Jun

In [71]:
# Labels looked up in the "movie_finances" table; each label is also the key
# under which the value is stored on the record.
finance_labels = (
    "Domestic Box Office",
    "International Box Office",
    "Worldwide Box Office",
)

for row_dict in contents:
    movie_url = row_dict["Movie URL"]
    print("Movie:", movie_url)
    driver.get(movie_url)
    time.sleep(1)

    # Collect all three figures first, then attach them to the record together.
    figures = {}
    for label in finance_labels:
        # Second cell of the row whose <b> text contains the label.
        xpath = f"//table[@id='movie_finances']//tr[.//b[contains(text(),'{label}')]]/td[2]"
        try:
            figures[label] = driver.find_element(By.XPATH, xpath).text.strip()
        except NoSuchElementException:
            # Row not present on this page: record the figure as missing.
            figures[label] = None

    row_dict.update(figures)

    print(row_dict)


Movie: https://www.the-numbers.com/movie/Black-Panther-(2018)#tab=summary
{'Rank': '1', 'Movie': 'Black Panther', 'Release Date': 'Feb 16, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Action', 'Year Gross': '$700,059,566', 'Tickets Sold': '76,845,177', 'Movie URL': 'https://www.the-numbers.com/movie/Black-Panther-(2018)#tab=summary', 'Year': 2018, 'Domestic Box Office': '$700,059,566', 'International Box Office': '$634,097,516', 'Worldwide Box Office': '$1,334,157,082'}
Movie: https://www.the-numbers.com/movie/Avengers-Infinity-War-(2017)#tab=summary
{'Rank': '2', 'Movie': 'Avengers: Infinity War', 'Release Date': 'Apr 27, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Action', 'Year Gross': '$678,815,482', 'Tickets Sold': '74,513,225', 'Movie URL': 'https://www.the-numbers.com/movie/Avengers-Infinity-War-(2017)#tab=summary', 'Year': 2018, 'Domestic Box Office': '$678,815,482', 'International Box Office': '$1,369,342,759', 'Worldwide Box Office': '$2,048,158,241'}
Movie: https://www.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [84]:
# Browser-like request headers sent with every movie-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/"
}

for row_dict in contents:
    movie_url = row_dict["Movie URL"]
    print("Movie:", movie_url)
    # A timeout keeps one stalled connection from hanging the whole loop.
    response = requests.get(url=movie_url, headers=headers, timeout=30)
    movie_page = BeautifulSoup(response.text, "html.parser")

    # Reset both values on every iteration so that a page missing part of the
    # expected markup yields None instead of the previous movie's value.
    production_budget = None
    mpaa_rating = None

    # Production budget: last <td> of the row whose text mentions both
    # "Production" and "Budget"; anything from the first "(" onward is dropped.
    production_cell = movie_page.find(string=lambda t: t and "Production" in t and "Budget" in t)
    if production_cell:
        production_cell_row = production_cell.find_parent("tr")
        if production_cell_row is not None:
            tds = production_cell_row.find_all("td")
            if tds:
                budget_text = tds[-1].get_text(strip=True)
                # An empty cell is stored as None, not "".
                production_budget = budget_text.split("(")[0].strip() or None

    # MPAA rating: text of the first link in the <td> that follows the
    # "MPAA Rating" label cell.
    mpaa_rating_cell = movie_page.find("td", string=lambda t: t and "MPAA" in t and "Rating" in t)
    if mpaa_rating_cell is not None:
        rating_td = mpaa_rating_cell.find_next_sibling("td")
        rating_a = rating_td.find("a") if rating_td is not None else None
        if rating_a is not None:
            mpaa_rating = rating_a.get_text(strip=True)

    row_dict["Production Budget"] = production_budget
    row_dict["MPAA Rating"] = mpaa_rating

    print(row_dict)

Movie: https://www.the-numbers.com/movie/Black-Panther-(2018)#tab=summary
{'Rank': '1', 'Movie': 'Black Panther', 'Release Date': 'Feb 16, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Action', 'Year Gross': '$700,059,566', 'Tickets Sold': '76,845,177', 'Movie URL': 'https://www.the-numbers.com/movie/Black-Panther-(2018)#tab=summary', 'Year': 2018, 'Domestic Box Office': '$700,059,566', 'International Box Office': '$634,097,516', 'Worldwide Box Office': '$1,334,157,082', 'Production Budget': '$200,000,000', 'MPAA Rating': 'PG-13'}
Movie: https://www.the-numbers.com/movie/Avengers-Infinity-War-(2017)#tab=summary
{'Rank': '2', 'Movie': 'Avengers: Infinity War', 'Release Date': 'Apr 27, 2018', 'Distributor': 'Walt Disney', 'Genre': 'Action', 'Year Gross': '$678,815,482', 'Tickets Sold': '74,513,225', 'Movie URL': 'https://www.the-numbers.com/movie/Avengers-Infinity-War-(2017)#tab=summary', 'Year': 2018, 'Domestic Box Office': '$678,815,482', 'International Box Office': '$1,369,342,759', 

In [85]:
# Preview only the first few records; the full list is too long to display.
contents[:5]

[{'Rank': '1',
  'Movie': 'Black Panther',
  'Release Date': 'Feb 16, 2018',
  'Distributor': 'Walt Disney',
  'Genre': 'Action',
  'Year Gross': '$700,059,566',
  'Tickets Sold': '76,845,177',
  'Movie URL': 'https://www.the-numbers.com/movie/Black-Panther-(2018)#tab=summary',
  'Year': 2018,
  'Domestic Box Office': '$700,059,566',
  'International Box Office': '$634,097,516',
  'Worldwide Box Office': '$1,334,157,082',
  'Production Budget': '$200,000,000',
  'MPAA Rating': 'PG-13'},
 {'Rank': '2',
  'Movie': 'Avengers: Infinity War',
  'Release Date': 'Apr 27, 2018',
  'Distributor': 'Walt Disney',
  'Genre': 'Action',
  'Year Gross': '$678,815,482',
  'Tickets Sold': '74,513,225',
  'Movie URL': 'https://www.the-numbers.com/movie/Avengers-Infinity-War-(2017)#tab=summary',
  'Year': 2018,
  'Domestic Box Office': '$678,815,482',
  'International Box Office': '$1,369,342,759',
  'Worldwide Box Office': '$2,048,158,241',
  'Production Budget': '$300,000,000',
  'MPAA Rating': 'PG-13'

In [87]:
# Build a DataFrame from the list of scraped record dicts.
df = pd.DataFrame(contents)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Rank,Movie,Release Date,Distributor,Genre,Year Gross,Tickets Sold,Movie URL,Year,Domestic Box Office,International Box Office,Worldwide Box Office,Production Budget,MPAA Rating
0,1,Black Panther,"Feb 16, 2018",Walt Disney,Action,"$700,059,566",76845177,https://www.the-numbers.com/movie/Black-Panthe...,2018,"$700,059,566","$634,097,516","$1,334,157,082","$200,000,000",PG-13
1,2,Avengers: Infinity War,"Apr 27, 2018",Walt Disney,Action,"$678,815,482",74513225,https://www.the-numbers.com/movie/Avengers-Inf...,2018,"$678,815,482","$1,369,342,759","$2,048,158,241","$300,000,000",PG-13
2,3,Incredibles 2,"Jun 15, 2018",Walt Disney,Adventure,"$608,581,744",66803704,https://www.the-numbers.com/movie/Incredibles-...,2018,"$608,581,744","$634,223,615","$1,242,805,359","$200,000,000",PG
3,4,Jurassic World: Fallen Kingdom,"Jun 22, 2018",Universal,Action,"$417,719,760",45852883,https://www.the-numbers.com/movie/Jurassic-Wor...,2018,"$417,719,760","$890,603,542","$1,308,323,302","$170,000,000",PG-13
4,5,Deadpool 2,"May 18, 2018",20th Century Fox,Action,"$324,512,774",35621600,https://www.the-numbers.com/movie/Deadpool-2#t...,2018,"$324,591,735","$461,770,635","$786,362,370","$110,000,000",R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,96,The Smashing Machine,"Oct 3, 2025",A24,Drama,"$11,411,388",1008964,https://www.the-numbers.com/movie/Smashing-Mac...,2025,"$11,411,388","$8,863,845","$20,275,233",,R
796,97,Sarah’s Oil,"Nov 7, 2025",Amazon MGM Studios,Drama,"$10,633,745",940207,https://www.the-numbers.com/movie/Sarahs-Oil-(...,2025,"$10,633,745",,,,PG
797,98,Becoming Led Zeppelin,"Feb 7, 2025",Sony Pictures Cla…,Documentary,"$10,403,808",919877,https://www.the-numbers.com/movie/Becoming-Led...,2025,"$10,403,808","$2,769,460","$13,173,268",,PG-13
798,99,Eddington,"Jul 18, 2025",A24,Black Comedy,"$10,109,484",893854,https://www.the-numbers.com/movie/Eddington-(2...,2025,"$10,109,484","$3,043,163","$13,152,647",,R


In [88]:
import numpy as np

# Columns scraped as formatted text that are converted to float below.
cols = ["Tickets Sold", "Year Gross", "Domestic Box Office", "International Box Office", "Worldwide Box Office", "Production Budget"]

for column in cols:
    as_text = df[column].astype(str)
    # Drop "$" and "," so only the numeric characters remain.
    digits_only = as_text.str.replace("[$,]", "", regex=True)
    # Missing values were stringified to "None" by astype(str); map them to NaN.
    df[column] = digits_only.replace("None", np.nan).astype(float)

In [90]:
# Show the first ten rows of the frame.
df.head(10)

Unnamed: 0,Rank,Movie,Release Date,Distributor,Genre,Year Gross,Tickets Sold,Movie URL,Year,Domestic Box Office,International Box Office,Worldwide Box Office,Production Budget,MPAA Rating
0,1,Black Panther,"Feb 16, 2018",Walt Disney,Action,700059566.0,76845177.0,https://www.the-numbers.com/movie/Black-Panthe...,2018,700059566.0,634097500.0,1334157000.0,200000000.0,PG-13
1,2,Avengers: Infinity War,"Apr 27, 2018",Walt Disney,Action,678815482.0,74513225.0,https://www.the-numbers.com/movie/Avengers-Inf...,2018,678815482.0,1369343000.0,2048158000.0,300000000.0,PG-13
2,3,Incredibles 2,"Jun 15, 2018",Walt Disney,Adventure,608581744.0,66803704.0,https://www.the-numbers.com/movie/Incredibles-...,2018,608581744.0,634223600.0,1242805000.0,200000000.0,PG
3,4,Jurassic World: Fallen Kingdom,"Jun 22, 2018",Universal,Action,417719760.0,45852883.0,https://www.the-numbers.com/movie/Jurassic-Wor...,2018,417719760.0,890603500.0,1308323000.0,170000000.0,PG-13
4,5,Deadpool 2,"May 18, 2018",20th Century Fox,Action,324512774.0,35621600.0,https://www.the-numbers.com/movie/Deadpool-2#t...,2018,324591735.0,461770600.0,786362400.0,110000000.0,R
5,6,Dr. Seuss’ The Grinch,"Nov 9, 2018",Universal,Adventure,267287030.0,29339959.0,https://www.the-numbers.com/movie/Dr-Seuss-The...,2018,272961295.0,241369400.0,514330700.0,75000000.0,PG
6,7,Jumanji: Welcome to the Jungle,"Dec 20, 2017",Sony Pictures,Adventure,235506359.0,25851411.0,https://www.the-numbers.com/movie/Jumanji-Welc...,2018,404508916.0,557123900.0,961632800.0,90000000.0,PG-13
7,8,Mission: Impossible—Fallout,"Jul 27, 2018",Paramount Pictures,Action,220159104.0,24166751.0,https://www.the-numbers.com/movie/Mission-Impo...,2018,220159104.0,566110500.0,786269600.0,178000000.0,PG-13
8,9,Ant-Man and the Wasp,"Jul 6, 2018",Walt Disney,Action,216648740.0,23781420.0,https://www.the-numbers.com/movie/Ant-Man-and-...,2018,216648740.0,406495900.0,623144700.0,130000000.0,PG-13
9,10,Solo: A Star Wars Story,"May 25, 2018",Walt Disney,Adventure,213767512.0,23465149.0,https://www.the-numbers.com/movie/Solo-A-Star-...,2018,213767512.0,179383800.0,393151300.0,330400000.0,PG-13


In [89]:
# Print the frame's column dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Rank                      800 non-null    object 
 1   Movie                     800 non-null    object 
 2   Release Date              800 non-null    object 
 3   Distributor               800 non-null    object 
 4   Genre                     800 non-null    object 
 5   Year Gross                800 non-null    float64
 6   Tickets Sold              800 non-null    float64
 7   Movie URL                 800 non-null    object 
 8   Year                      800 non-null    int64  
 9   Domestic Box Office       800 non-null    float64
 10  International Box Office  779 non-null    float64
 11  Worldwide Box Office      779 non-null    float64
 12  Production Budget         648 non-null    float64
 13  MPAA Rating               784 non-null    object 
dtypes: float64

In [91]:
# Write the frame to movies_roi.csv, omitting the index column.
df.to_csv("movies_roi.csv", index=False)