In [11]:
!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import requests
import pandas as pd



In [12]:
def parse_html(html_content):

    """
    Parse the HTML content to extract movie information from a table.

    The first ``<table>`` in the document is used. The ``<th>`` cells of its
    first ``<tr>`` supply the column names, and every later ``<tr>`` that
    contains at least one ``<td>`` becomes one row of the result. Cell text
    is whitespace-stripped; values are kept as strings.

    Parameters:
    - html_content (str or bytes): The HTML content of the webpage.

    Returns:
    - pd.DataFrame: A DataFrame containing the extracted movie information.
      It is empty when no table or no usable header row is found (a message
      is printed in that case).
    """

    # Create an empty list to store all the results
    movies = []

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table containing the movie information
    table = soup.find('table')
    if table is None:
        print("No table found on the webpage.")
        return pd.DataFrame(movies)

    # The first <tr> is treated as the header row
    rows = table.find_all('tr')
    if not rows:
        print("No header row found in the table.")
        return pd.DataFrame(movies)

    # Column names come from the header cells
    columns = [col.get_text(strip=True) for col in rows[0].find_all('th')]
    if not columns:
        # Without column names every row would collapse to an empty dict,
        # so a header row with no <th> cells counts as a missing header.
        print("No header row found in the table.")
        return pd.DataFrame(movies)

    # Extract information for each movie
    for row in rows[1:]:  # Skip the header row
        movie_info = [data.get_text(strip=True) for data in row.find_all('td')]

        # Rows without <td> cells (e.g. repeated header or spacer rows)
        # carry no data and would otherwise become all-NaN rows.
        if not movie_info:
            continue

        movies.append(dict(zip(columns, movie_info)))

    return pd.DataFrame(movies)

In [13]:
def get_html_content(url,headers,next_button_x_path):
    """
    Scrape a multi-page table into a single DataFrame, using Selenium to
    move between pages.

    A Safari WebDriver opens ``url``. On every iteration the browser's
    current URL is downloaded again with ``requests`` (sending ``headers``),
    the HTML is parsed with ``parse_html``, and then the element located by
    ``next_button_x_path`` is clicked. The loop ends when that element does
    not appear within 10 seconds, or when any other error occurs (which is
    reported and treated as the end of the scrape, keeping the pages
    collected so far).

    Parameters:
    - url (str): The URL of the webpage to scrape.
    - headers (dict): Headers to be included in the HTTP request.
    - next_button_x_path (str): XPath of the 'Load More' or next button.

    Returns:
    - pd.DataFrame: A DataFrame containing the scraped information from all
      visited pages, with a fresh 0..n-1 index. Empty if no page was parsed.
    """
    # Imported here so the function stays self-contained within this cell.
    from selenium.common.exceptions import TimeoutException

    # Set up the Safari WebDriver
    driver = webdriver.Safari()
    df_list = []

    try:
        driver.get(url)

        # Parse the current page, then click the button to advance, in a loop
        while True:
            try:
                # URL the browser is currently showing (the start page on the
                # first pass, the page reached by the last click afterwards)
                current_page_url = driver.current_url

                # Fetch that page's HTML; the timeout keeps a stalled
                # connection from hanging the notebook indefinitely
                html_content = requests.get(
                    current_page_url, headers=headers, timeout=30
                ).content

                df_list.append(parse_html(html_content))

                # Find and click the button to load more content
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, next_button_x_path))
                )
                load_more_button.click()

                # Wait for some time to let the new content load
                time.sleep(1)  # Adjust this based on your website's loading time

            except TimeoutException:
                # The button never appeared: treated as the last page
                print("Loading done ")
                break
            except Exception as e:
                # Keep the best-effort behaviour (return what was collected)
                # but make the real cause visible instead of hiding it.
                print(f"Loading stopped early: {e!r}")
                break
    finally:
        # Always release the browser session, even when something failed
        driver.quit()

    # pd.concat raises ValueError on an empty list
    if not df_list:
        return pd.DataFrame()

    # Concatenate the list of DataFrames vertically
    return pd.concat(df_list, axis=0, ignore_index=True)


In [14]:
# Starting page of the chart to scrape.
url = "https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/"

# Headers passed along with the HTTP request made for each page's HTML.
headers = {
    'User-Agent': 'AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36',
}

# XPath of the button/link that is clicked to move on to further content.
next_button_x_path = '//*[@id="a-page"]/main/div/div/div[4]/ul/li[3]/a'

# Scrape the pages and collect all the movie information in one DataFrame.
result_dataframe = get_html_content(
    url=url,
    headers=headers,
    next_button_x_path=next_button_x_path,
)
result_dataframe

Loading done 


Unnamed: 0,Rank,Title,Worldwide Lifetime Gross,Domestic Lifetime Gross,Domestic %,Foreign Lifetime Gross,Foreign %,Year
0,1,Avatar,"$2,923,706,026","$785,221,649",26.9%,"$2,138,484,377",73.1%,2009
1,2,Avengers: Endgame,"$2,799,439,100","$858,373,000",30.7%,"$1,941,066,100",69.3%,2019
2,3,Avatar: The Way of Water,"$2,320,250,281","$684,075,767",29.5%,"$1,636,174,514",70.5%,2022
3,4,Titanic,"$2,264,743,305","$674,292,608",29.8%,"$1,590,450,697",70.2%,1997
4,5,Star Wars: Episode VII - The Force Awakens,"$2,071,310,218","$936,662,225",45.2%,"$1,134,647,993",54.8%,2015
...,...,...,...,...,...,...,...,...
995,996,Hot Shots!,"$181,096,164","$69,467,617",38.4%,"$111,628,547",61.6%,1991
996,997,Road to Perdition,"$181,001,478","$104,454,762",57.7%,"$76,546,716",42.3%,2002
997,998,Kill Bill: Vol. 1,"$180,906,076","$70,099,045",38.8%,"$110,807,031",61.2%,2003
998,999,The Scorpion King,"$180,630,907","$91,047,077",50.4%,"$89,583,830",49.6%,2002


In [6]:
# Write the scraped movie table to a CSV file, leaving out the DataFrame index.
result_dataframe.to_csv(
    path_or_buf='../external_dataset/boxofficemojo_dataframe.csv',
    index=False,
)