In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Fetch the first page of Box Office Mojo's top-lifetime-gross chart and parse it.
# NOTE(review): the table output saved further down suggests roughly 200 titles per
# page, so this single request presumably does not cover the full top 1000 - confirm.
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
html_page = requests.get(url, timeout=30)  # bound the wait so the cell cannot hang forever
html_page.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(html_page.content, 'html.parser')

In [3]:
# Function to convert money string to integer
def money_str_int(str):
    """Convert a dollar-formatted string such as '$245,000,000' to an int.

    Surrounding whitespace (spaces, newlines) is removed first, so text taken
    straight from an HTML element converts cleanly; then '$' characters at
    either end and all thousands separators are dropped.

    Parameters
    ----------
    str : str
        The money text. The name shadows the built-in ``str``; it is kept
        unchanged so existing keyword callers keep working.

    Returns
    -------
    int
        The amount as a whole number.

    Raises
    ------
    ValueError
        If what remains after cleaning is not a valid integer literal.
    """
    # Trim whitespace BEFORE stripping '$': strip('$') only removes characters at
    # the very ends, so a leading space would otherwise shield the dollar sign.
    cleaned = str.strip().strip('$').replace(',', '')
    return int(cleaned)

In [4]:
# Build table of top grossing movies
table = soup.findAll('table')[0]

# Take the column names from the same table the rows come from, so that <th>
# cells elsewhere in the document cannot misalign the header list.
headers = [col.text.strip() for col in table.findAll('th')] # column names
headers.append('URL')
rows = table.findAll('tr') # retrieve rows
data = []
base_url = 'https://www.boxofficemojo.com'

# Format table data (rows[0] is the header row, so start from the second row)
for row in rows[1:]:
    # Get list of text displayed on web page
    cell_data = row.findAll('td')
    if not cell_data:
        # No <td> cells means this is not a data row; skipping it keeps every
        # record the same width as the header list.
        continue
    cells = [cell.text for cell in cell_data]

    # Get URL for each movie (first link in the row) and append to list
    movie_url = base_url + row.find('a').attrs['href']
    cells.append(movie_url)

    # Add list to data
    data.append(cells)

# Passing the column names at construction also yields a correctly labelled
# (empty) frame when no rows were scraped, instead of a length-mismatch error.
df = pd.DataFrame(data, columns=headers)
df

Unnamed: 0,Rank,Title,Lifetime Gross,Year,URL
0,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,https://www.boxofficemojo.com/title/tt2488496/...
1,2,Avengers: Endgame,"$858,373,000",2019,https://www.boxofficemojo.com/title/tt4154796/...
2,3,Avatar,"$760,507,625",2009,https://www.boxofficemojo.com/title/tt0499549/...
3,4,Black Panther,"$700,426,566",2018,https://www.boxofficemojo.com/title/tt1825683/...
4,5,Avengers: Infinity War,"$678,815,482",2018,https://www.boxofficemojo.com/title/tt4154756/...
...,...,...,...,...,...
195,196,Coco,"$209,726,015",2017,https://www.boxofficemojo.com/title/tt2380307/...
196,197,Mission: Impossible - Ghost Protocol,"$209,397,903",2011,https://www.boxofficemojo.com/title/tt1229238/...
197,198,Wedding Crashers,"$209,255,921",2005,https://www.boxofficemojo.com/title/tt0396269/...
198,199,Sherlock Holmes,"$209,028,679",2009,https://www.boxofficemojo.com/title/tt0988045/...


In [22]:
def _parse_running_time(text):
    """Convert a running-time string such as '2 hr 18 min' to hours.

    Each number is interpreted by the unit word that follows it (a word starting
    with 'h' for hours, 'm' for minutes), so strings with only hours or only
    minutes are handled too.

    Returns the duration in hours rounded to hundredths, or None when no
    recognizable hour/minute component is present.
    """
    tokens = text.split()
    hours = None
    minutes = None
    # Walk adjacent (value, unit) pairs; pairs whose second item is not a unit are ignored.
    for value, unit in zip(tokens, tokens[1:]):
        unit = unit.lower()
        if unit.startswith('h'):
            hours = int(value)
        elif unit.startswith('m'):
            minutes = int(value)
    if hours is None and minutes is None:
        return None
    return round((hours or 0) + (minutes or 0) / 60, 2)


def get_movie_data(url):
    """Scrape the summary fields from a single movie page.

    Downloads ``url``, looks through the page's
    ``<div class="a-section a-spacing-none">`` elements, and reads the value that
    follows each of these labels: 'Domestic Distributor', 'Budget', 'MPAA',
    'Running Time' and 'Genres'.

    Parameters
    ----------
    url : str
        Address of the movie page to fetch.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'Distributer', 'Budget', 'Rating',
        'Running_Time_hrs' and 'Genres'. A field that is not found on the page
        holds the string 'No Data'. Budget is an int (via ``money_str_int``) and
        the running time is in hours rounded to hundredths.
    """
    movie_page = requests.get(url)
    movie = BeautifulSoup(movie_page.content, 'html.parser')
    divs = movie.findAll('div', class_='a-section a-spacing-none') #first section of table stored as divs instead of table

    # Variables will come back as 'No Data' if the webpage doesn't have this information
    distributer = 'No Data'
    budget = 'No Data'
    rating = 'No Data'
    duration = 'No Data'
    genres = 'No Data'

    for div in divs:
        spans = div.findAll('span')
        # Pair every span with the one after it: the label comes first, its value
        # second. Pairing this way never indexes past the end when a label happens
        # to be the last span in the div.
        for label_span, value_span in zip(spans, spans[1:]):
            label = label_span.text
            value = value_span.text
            if label == 'Domestic Distributor':
                distributer = value.replace('See full company information\n\n','') #remove extra link text from end
            elif label == 'Budget':
                budget = money_str_int(value) #convert budget string to integer
            elif label == 'MPAA':
                rating = value
            elif label == 'Running Time':
                hours = _parse_running_time(value)
                if hours is not None:
                    duration = hours
            elif label == 'Genres':
                genres = value.replace(' ','').replace('\n\n',',') #collapse the whitespace-separated genre list to comma-separated
            else:
                continue
            # Only the first recognized label in each div is used.
            break

    df_movie = pd.DataFrame([distributer, budget, rating, duration, genres]).transpose()
    df_movie.columns = ['Distributer', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres']

    return df_movie

In [19]:
# Get data for the first 100 movies.
# DataFrame.append was removed in pandas 2.0 (and re-copied the whole frame on every
# iteration), so gather the one-row frames in a list and concatenate them once.
movie_columns = ['Distributer', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres']
movie_frames = [get_movie_data(url) for url in df['URL'][:100]]
if movie_frames:
    df2 = pd.concat(movie_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; fall back to an empty frame with the expected columns
    df2 = pd.DataFrame(columns=movie_columns)
df2

Unnamed: 0,Distributer,Budget,Rating,Running_Time,Genres
0,Walt Disney Studios Motion Pictures,245000000,PG-13,2.3,"Action,Adventure,Sci-Fi"
1,Walt Disney Studios Motion Pictures,356000000,PG-13,3.02,"Action,Adventure,Drama,Sci-Fi"
2,Twentieth Century Fox,237000000,PG-13,2.7,"Action,Adventure,Fantasy,Sci-Fi"
3,Walt Disney Studios Motion Pictures,No Data,PG-13,2.23,"Action,Adventure,Sci-Fi"
4,Walt Disney Studios Motion Pictures,No Data,PG-13,2.48,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...
95,Lionsgate,120000000,PG-13,1.92,"Adventure,Drama,Fantasy,Romance"
96,Twentieth Century Fox,18000000,PG,2.07,"Action,Adventure,Fantasy,Sci-Fi"
97,Walt Disney Studios Motion Pictures,180000000,PG,2.38,"Adventure,Family,Fantasy"
98,Warner Bros.,225000000,PG-13,2.38,"Action,Adventure,Sci-Fi"


In [24]:
# Relabel the per-movie frame's columns, then place it side by side with the chart
# table; the inner join keeps only the index labels that appear in both frames.
df2.columns = ['Distributer', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres']
result = pd.concat(objs=[df, df2], join='inner', axis='columns')
result

Unnamed: 0,Rank,Title,Lifetime Gross,Year,URL,Distributer,Budget,Rating,Running_Time_hrs,Genres
0,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,https://www.boxofficemojo.com/title/tt2488496/...,Walt Disney Studios Motion Pictures,245000000,PG-13,2.3,"Action,Adventure,Sci-Fi"
1,2,Avengers: Endgame,"$858,373,000",2019,https://www.boxofficemojo.com/title/tt4154796/...,Walt Disney Studios Motion Pictures,356000000,PG-13,3.02,"Action,Adventure,Drama,Sci-Fi"
2,3,Avatar,"$760,507,625",2009,https://www.boxofficemojo.com/title/tt0499549/...,Twentieth Century Fox,237000000,PG-13,2.7,"Action,Adventure,Fantasy,Sci-Fi"
3,4,Black Panther,"$700,426,566",2018,https://www.boxofficemojo.com/title/tt1825683/...,Walt Disney Studios Motion Pictures,No Data,PG-13,2.23,"Action,Adventure,Sci-Fi"
4,5,Avengers: Infinity War,"$678,815,482",2018,https://www.boxofficemojo.com/title/tt4154756/...,Walt Disney Studios Motion Pictures,No Data,PG-13,2.48,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...,...,...,...,...,...
95,96,The Twilight Saga: Breaking Dawn - Part 2,"$292,324,737",2012,https://www.boxofficemojo.com/title/tt1673434/...,Lionsgate,120000000,PG-13,1.92,"Adventure,Drama,Fantasy,Romance"
96,97,Star Wars: Episode V - The Empire Strikes Back,"$292,194,960",1980,https://www.boxofficemojo.com/title/tt0080684/...,Twentieth Century Fox,18000000,PG,2.07,"Action,Adventure,Fantasy,Sci-Fi"
97,98,"The Chronicles of Narnia: The Lion, the Witch ...","$291,710,957",2005,https://www.boxofficemojo.com/title/tt0363771/...,Walt Disney Studios Motion Pictures,180000000,PG,2.38,"Adventure,Family,Fantasy"
98,99,Man of Steel,"$291,045,518",2013,https://www.boxofficemojo.com/title/tt0770828/...,Warner Bros.,225000000,PG-13,2.38,"Action,Adventure,Sci-Fi"


In [9]:
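# Ignore for now: disabled draft (commented out, not executed).
# Loop to get data from subsequent pages and append it to df.
# NOTE: it calls ranks/titles/earnings/years/movie_urls, which are not defined in the cells shown here,
# and it assigns 4 column names to 5 series - both need fixing before this is re-enabled.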
# base_url =  'https://www.boxofficemojo.com/'
# for i in range(4):
#     url = base_url + soup.findAll('li', class_='a-last')[0].find('a').attrs["href"]
#     html_page = requests.get(url)
#     soup = BeautifulSoup(html_page.content, 'html.parser')
#     df2 = pd.DataFrame([ranks(soup), titles(soup), earnings(soup), years(soup), movie_urls(soup)]).transpose()
#     df2.columns = ['Rank', 'Title', 'Lifetime_Gross', 'Year']
#     df = df.append(df2, ignore_index=True)

