In [1]:
#import packages
from bs4 import BeautifulSoup
import requests 
import pandas as pd

In [2]:
#define webiste url and get content
url = 'https://www.the-numbers.com/market/distributors'
r = requests.get(url)
soup = BeautifulSoup(r.content)

In [3]:
#clean up the html code
soup.prettify

<bound method Tag.prettify of <!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-1343128-1"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-1343128-1');
</script>
<meta content='(PICS-1.1 "https://www.icra.org/ratingsv02.html" l gen true for "https://www.the-numbers.com/" r (cb 1 lz 1 nz 1 oz 1 vz 1) "https://www.rsac.org/ratingsv01.html" l gen true for "https://www.the-numbers.com/" r (n 0 s 0 v 0 l 0))' http-equiv="PICS-Label"/>
<!--<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" >-->
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="telephone=no" name="format-detection"/> <!-- for apple mobile -->
<meta content="521546213" property="fb:admins">

In [4]:
#define data as a list
data = []

#find table
results = soup.select('table')[0]

#find all tr elements
rows = results.find_all(['tr'])

#find data in td tags
for row in rows:
    cols = row.find_all('td')
    #get text
    cols = [ele.text.strip() for ele in cols]
    #remove header row
    data.append(cols[1:])

#name columns
cols = ['Distributor', 'Number of Movies', 'Box Office Total', 'Ticket Sales', 'Market Share']

#define dataframe
#remove redundant "Rank" column
df = pd.DataFrame(data[1:], columns = cols)

#return head of table
df.head()

Unnamed: 0,Distributor,Number of Movies,Box Office Total,Ticket Sales,Market Share
0,Walt Disney,572,"$39,690,172,167",5668211991,16.94%
1,Warner Bros.,803,"$35,635,746,799",5132722643,15.21%
2,Sony Pictures,729,"$28,774,824,501",4257878719,12.28%
3,Universal,511,"$27,464,279,056",3938555708,11.72%
4,20th Century Fox,520,"$25,854,596,898",3792345700,11.03%


In [5]:
# Get HTML data for top 1000 grossing movies from Box Office Mojo
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
html_page = requests.get(url)
soup = BeautifulSoup(html_page.content, 'html.parser')

In [6]:
# Function to convert money string to integer
def money_str_int(str):
    number = int(str.strip('$').replace(',',''))
    return number

In [7]:
# Build table of top grossing movies
headers = [col.text.strip() for col in soup.findAll('th')] # column names
headers.append('URL')
rows = soup.findAll('table')[0].findAll('tr') # retrieve rows
data = []
base_url = 'https://www.boxofficemojo.com'

# Format table data
for row in rows[1:]:
    # Get list of text displayed on web page
    cell_data = row.findAll('td')
    cells = [cell.text for cell in cell_data] 
    
    # Get URL for each movie and append to list
    movie_url = base_url + row.find('a').attrs['href']
    cells.append(movie_url)
    
    # Add list to data
    data.append(cells)

df = pd.DataFrame(data)
df.columns = headers
df

Unnamed: 0,Rank,Title,Lifetime Gross,Year,URL
0,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,https://www.boxofficemojo.com/title/tt2488496/...
1,2,Avengers: Endgame,"$858,373,000",2019,https://www.boxofficemojo.com/title/tt4154796/...
2,3,Avatar,"$760,507,625",2009,https://www.boxofficemojo.com/title/tt0499549/...
3,4,Black Panther,"$700,426,566",2018,https://www.boxofficemojo.com/title/tt1825683/...
4,5,Avengers: Infinity War,"$678,815,482",2018,https://www.boxofficemojo.com/title/tt4154756/...
...,...,...,...,...,...
195,196,Coco,"$209,726,015",2017,https://www.boxofficemojo.com/title/tt2380307/...
196,197,Mission: Impossible - Ghost Protocol,"$209,397,903",2011,https://www.boxofficemojo.com/title/tt1229238/...
197,198,Wedding Crashers,"$209,255,921",2005,https://www.boxofficemojo.com/title/tt0396269/...
198,199,Sherlock Holmes,"$209,028,679",2009,https://www.boxofficemojo.com/title/tt0988045/...


In [8]:
# Iterate through above dataframe and pull data from each movie's page
def get_movie_data(url):
    movie_page = requests.get(url)
    movie = BeautifulSoup(movie_page.content, 'html.parser')
    divs = movie.findAll('div', class_='a-section a-spacing-none') #first section of table stored as divs instead of table

    # Variables will come back as 'No Data' if the webpage doesn't have this information
    distributer = 'No Data'
    budget = 'No Data'
    rating = 'No Data'
    duration = 'No Data'
    genres = 'No Data'

    for div in divs:
        spans = div.findAll('span')
        i=0
        for span in spans:      
            if span.text  == 'Domestic Distributor':
                distributer = spans[i+1].text.replace('See full company information\n\n','') #remove extra link text from end
                i+=1
                break
            if span.text == 'Budget':
                budget = money_str_int(spans[i+1].text) #convert budget string to integer
                i+=1
                break
            if span.text == 'MPAA':
                rating = spans[i+1].text
                i+=1
                break
            if span.text == 'Running Time':
                dur = spans[i+1].text.split() #imports duration as hours and minutes
                duration = round(int(dur[0]) + int(dur[2])/60, 2) #converts to hours rounded to hundreths
                i+=1
                break
            if span.text == 'Genres':
                genres = spans[i+1].text.replace(' ','').replace('\n\n',',') #.strip wasn't working for some reason
                i+=1
                break
            else:
                i+=1
            
    df_movie = pd.DataFrame([distributer, budget, rating, duration, genres]).transpose()
    df_movie.columns = ['Distributer', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres']

    return df_movie

In [11]:
# loop to get data for first 100 movies
df2 = pd.DataFrame(columns = ['Distributer', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres'])
for url in df['URL'][:100]:
    movie_data = get_movie_data(url)
    df2 = df2.append(movie_data, ignore_index=True)
df2

Unnamed: 0,Distributer,Budget,Rating,Running_Time_hrs,Genres
0,Walt Disney Studios Motion Pictures,245000000,PG-13,2.3,"Action,Adventure,Sci-Fi"
1,Walt Disney Studios Motion Pictures,356000000,PG-13,3.02,"Action,Adventure,Drama,Sci-Fi"
2,Twentieth Century Fox,237000000,PG-13,2.7,"Action,Adventure,Fantasy,Sci-Fi"
3,Walt Disney Studios Motion Pictures,No Data,PG-13,2.23,"Action,Adventure,Sci-Fi"
4,Walt Disney Studios Motion Pictures,No Data,PG-13,2.48,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...
95,Warner Bros.,150000000,PG-13,2.3,"Action,Adventure,Family,Fantasy,Mystery"
96,Lionsgate,120000000,PG-13,1.92,"Adventure,Drama,Fantasy,Romance"
97,Walt Disney Studios Motion Pictures,180000000,PG,2.38,"Adventure,Family,Fantasy"
98,Warner Bros.,225000000,PG-13,2.38,"Action,Adventure,Sci-Fi"


In [12]:
df2.columns = ['Distributer', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres']
result = pd.concat([df, df2], axis=1, join='inner')
result

Unnamed: 0,Rank,Title,Lifetime Gross,Year,URL,Distributer,Budget,Rating,Running_Time_hrs,Genres
0,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,https://www.boxofficemojo.com/title/tt2488496/...,Walt Disney Studios Motion Pictures,245000000,PG-13,2.3,"Action,Adventure,Sci-Fi"
1,2,Avengers: Endgame,"$858,373,000",2019,https://www.boxofficemojo.com/title/tt4154796/...,Walt Disney Studios Motion Pictures,356000000,PG-13,3.02,"Action,Adventure,Drama,Sci-Fi"
2,3,Avatar,"$760,507,625",2009,https://www.boxofficemojo.com/title/tt0499549/...,Twentieth Century Fox,237000000,PG-13,2.7,"Action,Adventure,Fantasy,Sci-Fi"
3,4,Black Panther,"$700,426,566",2018,https://www.boxofficemojo.com/title/tt1825683/...,Walt Disney Studios Motion Pictures,No Data,PG-13,2.23,"Action,Adventure,Sci-Fi"
4,5,Avengers: Infinity War,"$678,815,482",2018,https://www.boxofficemojo.com/title/tt4154756/...,Walt Disney Studios Motion Pictures,No Data,PG-13,2.48,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...,...,...,...,...,...
95,96,Harry Potter and the Order of the Phoenix,"$292,353,413",2007,https://www.boxofficemojo.com/title/tt0373889/...,Warner Bros.,150000000,PG-13,2.3,"Action,Adventure,Family,Fantasy,Mystery"
96,97,The Twilight Saga: Breaking Dawn - Part 2,"$292,324,737",2012,https://www.boxofficemojo.com/title/tt1673434/...,Lionsgate,120000000,PG-13,1.92,"Adventure,Drama,Fantasy,Romance"
97,98,"The Chronicles of Narnia: The Lion, the Witch ...","$291,710,957",2005,https://www.boxofficemojo.com/title/tt0363771/...,Walt Disney Studios Motion Pictures,180000000,PG,2.38,"Adventure,Family,Fantasy"
98,99,Man of Steel,"$291,045,518",2013,https://www.boxofficemojo.com/title/tt0770828/...,Warner Bros.,225000000,PG-13,2.38,"Action,Adventure,Sci-Fi"


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
url= 'https://www.the-numbers.com/home-market/distributors'
response = requests.get('https://www.the-numbers.com/home-market/distributors')
soup = BeautifulSoup(response.text, 'lxml')
art_body= soup.find_all('table', id = 'page_filling_chart')
for body in art_body:
    print(art_body.text)

In [3]:
df_list = pd.read_html(response.text) 
df = df_list[0]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 3 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Home Market Distributors            613 non-null    object
 1   No. of Movies                       613 non-null    int64 
 2   Total Domestic Home Market Revenue  613 non-null    object
dtypes: int64(1), object(2)
memory usage: 14.5+ KB


In [4]:
df.head()

Unnamed: 0,Home Market Distributors,No. of Movies,Total Domestic Home Market Revenue
0,Walt Disney Home Entertainment,388,"$10,722,434,842"
1,Universal Home Entertainment,1229,"$9,474,305,052"
2,Warner Home Video,959,"$9,283,862,934"
3,Fox Home Entertainment,855,"$8,862,395,595"
4,Sony Pictures Home Entertainment,1297,"$7,645,434,206"
