[Open in Colab](https://colab.research.google.com/drive/1aIXWV0tGMUmNYrKpZOykIsPQZuY-BFGT)

In [0]:
!pip install requests-html



In [0]:
import os
import re
import pandas as pd
import requests
from requests_html import HTML
from google.colab import drive

In [0]:
BASE_DIR = '/content/drive'
PROJECT_BASE_DIR = os.path.join(BASE_DIR, "My Drive", "Scraping", "BoxOfficeMojo")
os.makedirs(PROJECT_BASE_DIR, exist_ok=True)
drive.mount(BASE_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
RAW_DATA_OUT = os.path.join(PROJECT_BASE_DIR, 'raw.csv')
CLEANED_DATA_OUT = os.path.join(PROJECT_BASE_DIR, 'clean.csv')

In [0]:
URL = 'https://www.boxofficemojo.com/year/world/2019/'

In [0]:
r = requests.get(URL)
print(r.status_code) # 200-299 we're good. If 500-599 that's server error. #404 --> HTTP Status CODES
html_doc = r.text

200


In [0]:
html = HTML(html=html_doc)

In [0]:
# html.find(".mojo-gutter", first=True).text # .mojo-gutter

In [0]:
table = html.find(".imdb-scroll-table", first=True)
# print(table.text)

In [0]:
rows = table.find("tr")
header = {}
for i, row in enumerate(rows):
  header_cols = row.find('th')
  for sub_i, header_col in enumerate(header_cols):
    text = header_col.text
    if sub_i == 4:
      text = f"Domestic {text}"
    if sub_i == 6:
       text = f"Foreign {text}"
    header[sub_i] = text
print(header)

{0: 'Rank', 1: 'Release Group', 2: 'Worldwide', 3: 'Domestic', 4: 'Domestic %', 5: 'Foreign', 6: 'Foreign %'}


In [0]:
all_row_data = []
for i, row in enumerate(rows):
  cols = row.find("td")
  row_data = {}
  for sub_i, col in enumerate(cols):
    col_name = header[sub_i]
    row_data[col_name] = col.text
  all_row_data.append(row_data)

print(all_row_data)

[{}, {'Rank': '1', 'Release Group': 'Avengers: Endgame', 'Worldwide': '$2,797,800,564', 'Domestic': '$858,373,000', 'Domestic %': '30.7%', 'Foreign': '$1,939,427,564', 'Foreign %': '69.3%'}, {'Rank': '2', 'Release Group': 'The Lion King', 'Worldwide': '$1,656,405,082', 'Domestic': '$543,638,043', 'Domestic %': '32.8%', 'Foreign': '$1,112,767,039', 'Foreign %': '67.2%'}, {'Rank': '3', 'Release Group': 'Spider-Man: Far from Home', 'Worldwide': '$1,131,927,996', 'Domestic': '$390,532,085', 'Domestic %': '34.5%', 'Foreign': '$741,395,911', 'Foreign %': '65.5%'}, {'Rank': '4', 'Release Group': 'Captain Marvel', 'Worldwide': '$1,128,274,794', 'Domestic': '$426,829,839', 'Domestic %': '37.8%', 'Foreign': '$701,444,955', 'Foreign %': '62.2%'}, {'Rank': '5', 'Release Group': 'Toy Story 4', 'Worldwide': '$1,073,394,593', 'Domestic': '$434,038,008', 'Domestic %': '40.4%', 'Foreign': '$639,356,585', 'Foreign %': '59.6%'}, {'Rank': '6', 'Release Group': 'Joker', 'Worldwide': '$1,055,973,580', 'Dome

In [0]:
df = pd.DataFrame(all_row_data)
df.head()
df.to_csv(RAW_DATA_OUT, index=False)

In [0]:
df.set_index("Rank", inplace=True)
df.dropna(inplace=True)
df.head(n=10)

Unnamed: 0_level_0,Release Group,Worldwide,Domestic,Domestic %,Foreign,Foreign %
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%
2,The Lion King,"$1,656,405,082","$543,638,043",32.8%,"$1,112,767,039",67.2%
3,Spider-Man: Far from Home,"$1,131,927,996","$390,532,085",34.5%,"$741,395,911",65.5%
4,Captain Marvel,"$1,128,274,794","$426,829,839",37.8%,"$701,444,955",62.2%
5,Toy Story 4,"$1,073,394,593","$434,038,008",40.4%,"$639,356,585",59.6%
6,Joker,"$1,055,973,580","$332,373,580",31.5%,"$723,600,000",68.5%
7,Aladdin,"$1,050,693,953","$355,559,216",33.8%,"$695,134,737",66.2%
8,Frozen II,"$927,920,321","$343,566,681",37%,"$584,353,640",63%
9,Fast & Furious Presents: Hobbs & Shaw,"$758,910,100","$173,810,100",22.9%,"$585,100,000",77.1%
10,Ne Zha,"$700,547,754","$3,695,533",0.5%,"$696,852,221",99.5%


In [0]:
df['currency'] = 'USD'
df.head()

Unnamed: 0_level_0,Release Group,Worldwide,Domestic,Domestic %,Foreign,Foreign %,currency
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%,USD
2,The Lion King,"$1,656,405,082","$543,638,043",32.8%,"$1,112,767,039",67.2%,USD
3,Spider-Man: Far from Home,"$1,131,927,996","$390,532,085",34.5%,"$741,395,911",65.5%,USD
4,Captain Marvel,"$1,128,274,794","$426,829,839",37.8%,"$701,444,955",62.2%,USD
5,Toy Story 4,"$1,073,394,593","$434,038,008",40.4%,"$639,356,585",59.6%,USD


In [0]:
# df['Domestic'] = df['Domestic'].astype(int)
df.fillna(0, inplace=True)

In [0]:
def currency_to_int(currency_str):
  if isinstance(currency_str, int):
    return currency_str
  pattern = r'[^\d.]'
  amount = re.sub(pattern, '', currency_str)
  if amount == "":
    return 0
  return amount

In [0]:
cols_to_clean = ['Worldwide', 'Domestic', 'Foreign']
def clean_row_data(row, *args, **kwargs):
  for col in cols_to_clean:
    row[col] = currency_to_int(row[col])
  return row

df.apply(clean_row_data, axis=1)

df.head(n=10)

Unnamed: 0_level_0,Release Group,Worldwide,Domestic,Domestic %,Foreign,Foreign %,currency
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Avengers: Endgame,2797800564,858373000,30.7%,1939427564,69.3%,USD
2,The Lion King,1656405082,543638043,32.8%,1112767039,67.2%,USD
3,Spider-Man: Far from Home,1131927996,390532085,34.5%,741395911,65.5%,USD
4,Captain Marvel,1128274794,426829839,37.8%,701444955,62.2%,USD
5,Toy Story 4,1073394593,434038008,40.4%,639356585,59.6%,USD
6,Joker,1055973580,332373580,31.5%,723600000,68.5%,USD
7,Aladdin,1050693953,355559216,33.8%,695134737,66.2%,USD
8,Frozen II,927920321,343566681,37%,584353640,63%,USD
9,Fast & Furious Presents: Hobbs & Shaw,758910100,173810100,22.9%,585100000,77.1%,USD
10,Ne Zha,700547754,3695533,0.5%,696852221,99.5%,USD


In [0]:
for col in cols_to_clean:
  df[col] = df[col].astype(int) 

df.dtypes

Release Group    object
Worldwide         int64
Domestic          int64
Domestic %       object
Foreign           int64
Foreign %        object
currency         object
dtype: object

In [0]:
df.to_csv(CLEANED_DATA_OUT, index=True)

In [0]:
final_df = pd.read_csv(CLEANED_DATA_OUT)
final_df.set_index("Rank", inplace=True)
final_df.dtypes

Release Group    object
Worldwide         int64
Domestic          int64
Domestic %       object
Foreign           int64
Foreign %        object
currency         object
dtype: object