In [5]:
import os
import datetime
import requests
from requests_html import HTML
import pandas as pd

BASE_DIR = os.path.dirname("__file__")



def url_to_txt(url, filename="world.html", save=False):
    r = requests.get(url)
    if r.status_code == 200:
        html_text = r.text
        if save:
            with open(f"world-{year}.html", 'w') as f:
                f.write(html_text)
        return html_text
    return ""

def parse_and_extract(url, name='2020'):
    html_text = url_to_txt(url)
    r_html = HTML(html=html_text)
    table_class = ".imdb-scroll-table"
    r_table = r_html.find(table_class)
    #print(r_table)
    table_data = []
    header_names = []
    if len(r_table) == 1:
        parsed_table = r_table[0]
        rows = parsed_table.find("tr")
        header_row = rows[0]
        header_cols = header_row.find('th')
        header_names = [x.text for x in header_cols]

        for row in rows[1:]:
            #print(row.text)
            cols = row.find('td')
            row_data = []
            for i, col in enumerate(cols):
                print(i, col.text, '\n\n')
                row_data.append(col.text)
            table_data.append(row_data)
        df = pd.DataFrame(table_data, columns=header_names)
        path = os.path.join(BASE_DIR, 'scraped_data')
        os.makedirs(path, exist_ok=True)
        filepath = os.path.join('scraped_data', f'{name}.csv')
        df.to_csv(filepath, index=False)

def run(start_year=None, years_ago=10):
    if start_year == None:
        now = datetime.datetime.now()
        start_year = now.year        
    assert isinstance(start_year, int)
    assert isinstance(years_ago, int)
    assert len(f"{start_year}") == 4
    for i in range(0, years_ago+1):
        url = f"https://www.boxofficemojo.com/year/world/{start_year}"     
        parse_and_extract(url, name=start_year)
        print(f"Finished {start_year}")
        start_year -= 1

if __name__ == "__main__":
    run()

            
r = requests.get(url)
r.status_code



0 1 


1 Bad Boys for Life 


2 $424,617,855 


3 $204,417,855 


4 48.1% 


5 $220,200,000 


6 51.9% 


0 2 


1 Sonic the Hedgehog 


2 $306,766,470 


3 $146,066,470 


4 47.6% 


5 $160,700,000 


6 52.4% 


0 3 


1 Dolittle 


2 $243,365,738 


3 $77,047,065 


4 31.7% 


5 $166,318,673 


6 68.3% 


0 4 


1 Harley Quinn: Birds of Prey 


2 $201,858,461 


3 $84,158,461 


4 41.7% 


5 $117,700,000 


6 58.3% 


0 5 


1 The Invisible Man 


2 $130,558,746 


3 $64,914,050 


4 49.7% 


5 $65,644,696 


6 50.3% 


0 6 


1 Onward 


2 $111,291,825 


3 $61,555,145 


4 55.3% 


5 $49,736,680 


6 44.7% 


0 7 


1 The Call of the Wild 


2 $107,604,626 


3 $62,342,368 


4 57.9% 


5 $45,262,258 


6 42.1% 


0 8 


1 Fantasy Island 


2 $47,315,959 


3 $26,441,782 


4 55.9% 


5 $20,874,177 


6 44.1% 


0 9 


1 Underwater 


2 $40,882,928 


3 $17,291,078 


4 42.3% 


5 $23,591,850 


6 57.7% 


0 10 


1 The Man Standing Next 


2 $34,664,900 


3 $113,527 


4 0.3% 




200

In [5]:
r.text

'<!doctype html><html class="a-no-js" data-19ax5a9jf="dingo"><head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1.0" />\n                <meta charset="utf-8" />\n            <title dir="ltr">2020 Worldwide Box Office - Box Office Mojo</title><meta content="2020 Worldwide Box Office" name="title" />\n            <meta content="Box Office Mojo" property="og:site_name" />\n            <meta name="format-detection" content="telephone=no" />\n            <link href="https://m.media-amazon.com/images/G/01/boxofficemojo/v2/favicon._CB448965889_.ico" type="image/x-icon" rel="icon" />\n            <link rel="stylesheet" href="https://images-na.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|012LjolmrML.css,51AZ-Jz5kmL.css,51IB+wfP8qL.css,01evdoiemkL.css,01K+Ps1DeEL.css,01Vctty9pOL.css,314djKvMsUL.css,01ZTetsDh7L.css,11cMnOipjJL.css,01pbA9Lg3yL.css,21LK7jaicML.css,11L58Qpo0GL.css,21kyTi1FabL.css,0