In [1]:
from __future__ import print_function, division

### Utilities

#### A simple web page cache

In [2]:
import os
import pickle
import requests
import uuid

In [3]:
_cache = None
_cache_dir = "./cache"
_cache_index = os.path.join(_cache_dir, 'index.pkl')

In [4]:
def cache_init():
    global _cache
    if _cache == None:
        if os.path.exists(_cache_index):
            with open(_cache_index, 'rb') as fd:
                _cache = pickle.load(fd)
        else:
            _cache = {}
    return _cache

def cache_get(key):
    return cache_init().get(key, '')

def cache_add(key, value):
    cache = cache_init()
    cache[key] = value
    with open(_cache_index, 'wb') as fd:
        pickle.dump(cache, fd)

In [5]:
def get_page(url):
    """Get a web page."""

    # Check if we have this page
    
    filename = cache_get(url)
    if filename and os.path.exists(filename):
        with open(filename, 'rb') as fd:
            return fd.read()

    # Otherwise, download the page ...
    
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    
    # ... and cache it

    global _cache_dir
    if not os.path.isdir(_cache_dir):
        os.mkdir(_cache_dir)
        
    if not filename:
        filename = os.path.join(_cache_dir, uuid.uuid4().hex)

    with open(filename, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=4096):
            fd.write(chunk)

    cache_add(url, filename)
    
    return r.text

#### Pretty printers

In [6]:
from IPython.display import display, HTML

In [7]:
def pp_dict(d):
    display(HTML(
        u'<table>{}</table>'.format(
            u''.join(u'<tr><td><b>{}</b></td><td>{}</td></tr>'.format(k, d[k]) for k in d))))

def pp_dictOflist(d):
    display(HTML(
        u'<table>{}</table>'.format(
            u''.join(u'<tr><td><b>{}</b></td>{}</tr>'.format(k,
                u''.join(u'<td>{}</td>'.format(v) for v in d[k])) for k in sorted(d.keys())))))
    
def pp_listOflist(l):
    display(HTML(
        u'<table>{}</table>'.format(
            u''.join(u'<tr>{}</tr>'.format(
                u''.join(u'<td>{}</td>'.format(v) for v in sublist)) for sublist in l))))

#### Conversion/parsing functions

In [8]:
def cvt_runtime(s):
    m = re.search('(\d+)\s+hrs.\s+(\d+)\s+min.', s)
    return 60 * int(m.group(1)) + int(m.group(2)) if m else float('nan')

def cvt_million(s):
    result = float('nan')
    regexs = [('\$([0-9.]+) million', 1), ('\$([0-9.]+)$', 1),
              ('\$([0-9,]+)', 1e-6), ('\$([0-9.]+)k', 1e-3)]
    if not s == 'N/A':
        for exp, factor in regexs:
            m = re.match(exp, s)
            if m:
                result = round(float(m.group(1).replace(',', '')) * factor, 3)
                break
        if result == float('nan'):
            print('warning: unrecognized budget amount \'{0:s}\''.format(s), file=sys.stderr)
    return result if result > 0 else float('nan')

_genres = {}
def cvt_genre(s):
    global _genres
    if not _genres:
        with open('genres.csv', mode='r') as fd:
            reader = csv.reader(fd)
            _genres = { row[0] : row[1] for row in reader }
    return _genres.get(s, 'Other')

def cvt_rating(s):
    return s.lower().replace('-', '')

#### Scraping functions

In [9]:
import csv
import numpy as np
import pandas as pd
import re
import sys
import urllib
from bs4 import BeautifulSoup

I scrape the average gross per movie for all actors, composers, directors, producers, and writers from boxofficemojo.com. This information is used calculate artist scores.

In [10]:
%%html
<h4>People Table:</h4>
<table cellspacing="1" cellpadding="5" border="0" width="98%"><tbody><tr bgcolor="#dcdcdc"><td align="center"><font size="2">Row</font></td><td align="center"><font size="2"><a href="/people/?view=Actor&amp;sort=person&amp;order=ASC&amp;p=.htm">Person<br>(Click to view)</a></font></td><td align="center"><font size="2"><a href="/people/?view=Actor&amp;sort=sumgross&amp;order=ASC&amp;p=.htm"><b>Total Gross</b></a></font></td><td colspan="2" align="center"><font size="2"><a href="/people/?view=Actor&amp;sort=nummovies&amp;order=DESC&amp;p=.htm"># Movies</a> / </font><font size="2"><a href="/people/?view=Actor&amp;sort=avggross&amp;order=DESC&amp;p=.htm">Average</a></font></td><td align="center"><font size="2"><a href="/people/?view=Actor&amp;sort=title&amp;order=ASC&amp;p=.htm">#1 Picture</a></font></td><td align="center"><font size="2"><a href="/people/?view=Actor&amp;sort=gross&amp;order=DESC&amp;p=.htm">Gross</a></font></td></tr><tr bgcolor="#ffffff"><td align="center"><font size="2">1</font></td><td><font size="2"><a href="./chart/?view=Actor&amp;id=samuelljackson.htm"><b>Samuel L. Jackson</b></a></font></td><td align="right"><font size="2"><b>$4,894.1</b></font></td><td align="right"><font size="2">71</font></td><td align="right"><font size="2">$68.9</font></td><td align="center"><font size="2"><a href="/movies/?id=avengers11.htm">The Avengers</a></font></td><td align="right"><font size="2">$623.4</font></td></tr><tr bgcolor="#f4f4ff"><td align="center"><font size="2">2</font></td><td><font size="2"><a href="./chart/?view=Actor&amp;id=harrisonford.htm"><b>Harrison Ford</b></a></font></td><td align="right"><font size="2"><b>$4,871.7</b></font></td><td align="right"><font size="2">41</font></td><td align="right"><font size="2">$118.8</font></td><td align="center"><font size="2"><a href="/movies/?id=starwars7.htm">Star Wars: The Force Awakens</a></font></td><td align="right"><font size="2">$936.7</font></td></tr><tr bgcolor="#ffffff"><td align="center"><font size="2">3</font></td><td><font size="2"><a href="./chart/?view=Actor&amp;id=tomhanks.htm"><b>Tom Hanks</b></a></font></td><td align="right"><font size="2"><b>$4,502.4</b></font></td><td align="right"><font size="2">46</font></td><td align="right"><font size="2">$97.9</font></td><td align="center"><font size="2"><a href="/movies/?id=toystory3.htm">Toy Story 3</a></font></td><td align="right"><font size="2">$415.0</font></td></tr><tr bgcolor="#f4f4ff"><td align="center"><font size="2">4</font></td><td><font size="2"><a href="./chart/?view=Actor&amp;id=morganfreeman.htm"><b>Morgan Freeman</b></a></font></td><td align="right"><font size="2"><b>$4,470.5</b></font></td><td align="right"><font size="2">61</font></td><td align="right"><font size="2">$73.3</font></td><td align="center"><font size="2"><a href="/movies/?id=darkknight.htm">The Dark Knight</a></font></td><td align="right"><font size="2">$534.9</font></td></tr></tbody></table>

0,1,2,3,4,5,6
Row,Person (Click to view),Total Gross,# Movies / Average,# Movies / Average,#1 Picture,Gross
1,Samuel L. Jackson,"$4,894.1",71,$68.9,The Avengers,$623.4
2,Harrison Ford,"$4,871.7",41,$118.8,Star Wars: The Force Awakens,$936.7
3,Tom Hanks,"$4,502.4",46,$97.9,Toy Story 3,$415.0
4,Morgan Freeman,"$4,470.5",61,$73.3,The Dark Knight,$534.9


In [11]:
URL_ARTISTS = 'http://www.boxofficemojo.com/people/?view={0}&pagenum={1}&sort=sumgross&order=DESC'

In [12]:
def scrape_artists(role, verbose=False):
    result = {}
    
    pageno = 1
    done = False
    while not done:
        url = URL_ARTISTS.format(role, pageno)
        soup = BeautifulSoup(get_page(url), 'lxml')
        head = soup.find(text=re.compile('\(Click to view\)'))
        if not head:
            break

        if verbose:
            print(url)
        done = True
        for tr in head.findParent('table').contents[1:]: # exclude header
            if tr != u'\n':
                name = tr.contents[2].text
                avg_gross = tr.contents[8].text
                result[name] = cvt_million(avg_gross)
                done = False
 
        pageno += 1

    return result

We'll scrape movie names, studio, genre, etc., from www.boxofficemojo.com.  
The list of movies comes from the yearly box office table, and the other information from the detail pages.

In [13]:
%%html
<h4>Box Office Table:</h4>
<table><tr bgcolor="#ffffff"><td align="center"><font size="2">Rank</font></td><td><font size="2">Movie Title (click to view)</font></td><td><font size="2">Studio</font></td><td align="right"><font size="2">Total Gross</font></td><td align="right"><font size="2">Theaters</font></td><td align="right"><font size="2">Opening Gross</font></td><td align="right"><font size="2">Theaters</font></td><td align="right"><font size="2">Open</a></font></td><td align="right"><font size="2">Close</font></td></tr><tr bgcolor="#ffffff"><td align="center"><font size="2">1</font></td><td><b><font size="2"><a href="/movies/?id=starwars2016.htm">Rogue One: A Star Wars Story</a></font></b></td><td><font size="2"><a href="/studio/chart/?studio=buenavista.htm">BV</a></font></td><td align="right"><font size="2"><b>$506,908,563</b></font></td><td align="right"><font size="2">4,157</font></td><td align="right"><font size="2">$155,081,681</font></td><td align="right"><font size="2">4,157</font></td><td align="right"><font size="2"><a href="/schedule/?view=bydate&amp;release=theatrical&amp;date=2016-12-16&amp;p=.htm">12/16</a></font></td><td align="right"><font size="2">-</font></td></tr><tr bgcolor="#f4f4ff"><td align="center"><font size="2">2</font></td><td><b><font size="2"><a href="/movies/?id=pixar2015.htm">Finding Dory</a></font></b></td><td><font size="2"><a href="/studio/chart/?studio=buenavista.htm">BV</a></font></td><td align="right"><font size="2"><b>$486,295,561</b></font></td><td align="right"><font size="2">4,305</font></td><td align="right"><font size="2">$135,060,273</font></td><td align="right"><font size="2">4,305</font></td><td align="right"><font size="2"><a href="/schedule/?view=bydate&amp;release=theatrical&amp;date=2016-06-17&amp;p=.htm">6/17</a></font></td><td align="right"><font size="2">12/8</font></td></tr><tr bgcolor="#ffffff"><td align="center"><font size="2">3</font></td><td><b><font size="2"><a href="/movies/?id=marvel2016.htm">Captain America: Civil War</a></font></b></td><td><font size="2"><a href="/studio/chart/?studio=buenavista.htm">BV</a></font></td><td align="right"><font size="2"><b>$408,084,349</b></font></td><td align="right"><font size="2">4,226</font></td><td align="right"><font size="2">$179,139,142</font></td><td align="right"><font size="2">4,226</font></td><td align="right"><font size="2"><a href="/schedule/?view=bydate&amp;release=theatrical&amp;date=2016-05-06&amp;p=.htm">5/6</a></font></td><td align="right"><font size="2">9/22</font></td></tr><table>
<h4>Movie Detail Table:</h4>
<table><tbody><tr bgcolor="#ffffff"><td colspan="2" align="center"><font size="4">Domestic Total Gross: <b>$24,252,420</b></font></td></tr><tr bgcolor="#ffffff"><td valign="top">Distributor: <b><a href="/studio/chart/?studio=universal.htm">Universal</a></b></td><td valign="top">Release Date: <b><nobr><a href="/schedule/?view=bydate&amp;release=theatrical&amp;date=2016-09-16&amp;p=.htm">September 16, 2016</a></nobr></b></td></tr><tr bgcolor="#ffffff"><td valign="top">Genre: <b>Romantic Comedy</b></td><td valign="top">Runtime: <b>2 hrs. 2 min.</b></td></tr><tr bgcolor="#ffffff"><td valign="top">MPAA Rating: <b>R</b></td><td valign="top">Production Budget: <b>$35 million</b></td></tr></tbody></table>

0,1,2,3,4,5,6,7,8
"Domestic Total Gross: $24,252,420","Domestic Total Gross: $24,252,420",,,,,,,
Distributor: Universal,"Release Date: September 16, 2016",,,,,,,
Genre: Romantic Comedy,Runtime: 2 hrs. 2 min.,,,,,,,
MPAA Rating: R,Production Budget: $35 million,,,,,,,
Rank,Movie Title (click to view),Studio,Total Gross,Theaters,Opening Gross,Theaters,Open,Close
1,Rogue One: A Star Wars Story,BV,"$506,908,563",4157,"$155,081,681",4157,12/16,-
2,Finding Dory,BV,"$486,295,561",4305,"$135,060,273",4305,6/17,12/8
3,Captain America: Civil War,BV,"$408,084,349",4226,"$179,139,142",4226,5/6,9/22

0,1
"Domestic Total Gross: $24,252,420","Domestic Total Gross: $24,252,420"
Distributor: Universal,"Release Date: September 16, 2016"
Genre: Romantic Comedy,Runtime: 2 hrs. 2 min.
MPAA Rating: R,Production Budget: $35 million

0,1
"Domestic Total Gross: $24,252,420","Domestic Total Gross: $24,252,420"
Distributor: Universal,"Release Date: September 16, 2016"
Genre: Romantic Comedy,Runtime: 2 hrs. 2 min.
MPAA Rating: R,Production Budget: $35 million


In [14]:
URL_MOVIES = 'http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr={0:d}&page={1:d}'
URL_DETAILS = 'http://www.boxofficemojo.com/movies/?id={0:s}'

In [15]:
def scrape_year(year, verbose=False):
    result = []

    pageno = 1
    done = False
    while not done:

        url = URL_MOVIES.format(year, pageno)
        soup = BeautifulSoup(get_page(url), 'lxml')
        head = soup.find(text=re.compile('Movie Title \(click to view\)'))
        if not head:
            break
            
        if verbose:
            print(url)
            
        done = True
        for tr in head.findParent('table').contents[1:-4]: # exclude header and footer
            name = tr.contents[1].text
            gross = cvt_million(tr.contents[3].text)

            href = tr.contents[1].find('a', href=re.compile('/movies'))['href']
            id_ = href[href.rfind('=')+1:]
            try:
                details, artists = scrape_details(id_)
                result.append([name, gross] + details + [artists])
            except:
                print('warning: error scraping \'{0:s}\' ({1:s})'.format(name, id_), 
                      file=sys.stderr)
                pass
            done = False

        pageno = pageno + 1

    return result

def scrape_details(id_):
    details = [float('nan')]*6
    artists = {}
    if id_:
        url = URL_DETAILS.format(id_)
        soup = BeautifulSoup(get_page(url), 'lxml')

        rating = soup.find(text=re.compile('MPAA Rating:'))
        if rating:
            table = rating.findParent('table')
            if table:
                cells = [t.text for t in table.findAll('td')] # gross, studio, release date, genre, runtime, rating, budget
                details = [cells[i][cells[i].rfind(':')+1:].strip() for i in [1, 3, 4, 5, 6]]
                details[1] = cvt_genre(details[1])
                details[2] = cvt_runtime(details[2])
                details[3] = cvt_rating(details[3])
                details[4] = cvt_million(details[4])
                try:
                    wwide = soup.find('b', text=re.compile('Worldwide:'))
                    ww_gross = wwide.parent.parent.find_all('td')[1].text.strip()
                    details.append(cvt_million(ww_gross))
                except:
                    details.append(float('nan'))
                    pass

        artist = soup.find(text=re.compile('Actor|Composer|Director|Producer|Writer'))
        if artist:
            artists = scrape_movie_artists(artist.findParent('table'), id_)

    return (details, artists)

def scrape_movie_artists(table, id_):
    result = {}
    if table:
        for row in table.contents:
            if 2 == len(row):
                role = re.sub('s*: *$', '', row.contents[0].text)
                name = []
                for s in row.contents[1].strings:
                    if name and (name[-1].endswith(' ') or s.string.startswith(' ')):
                        name[-1] = name[-1] + s.string
                    else:
                        name.append(s.string)
                result[role] = name

        for s in [item for sublist in result.values() for item in sublist]:
            if s.startswith('('):
                print('warning: invalid name \'{0:s}\' in {1:s}'
                      .format(s, id_), file=sys.stderr)
    return result

In [16]:
def scrape_movies(years, verbose=False):
    result = []
    
    movies = []
    for y in years:
        print('scraping {0}'.format(y))
        movies.extend(scrape_year(y, verbose))

    def rank(names, ranks):
        return sum([ranks.get(re.sub('(\*| \().*$', '', name), 0) for name in names])

    global _artists, _roles
    for movie in movies:
        names = movie[-1]
        ranks = [rank(names.get(role, []), _artists[role]) for role in _roles]
        result.append(movie[:-1] + ranks)

    return result

### Scrape artists and movies

In [17]:
data_range = range(2012, 2017)
data_filename = 'movies_{}_to_{}.csv'.format(min(data_range), max(data_range))

In [18]:
%%time
_roles = ['Actor', 'Composer', 'Director', 'Producer', 'Writer']
_artists = { role : scrape_artists(role) for role in _roles }

CPU times: user 15.4 s, sys: 524 ms, total: 16 s
Wall time: 16.1 s


In [19]:
_artists['Actor'].items()[:5]

[(u'Leonard Nimoy', 74.6),
 (u'Josh Hartnett', 37.7),
 (u'Shailene Woodley', 72.9),
 (u'Helena Bonham Carter', 78.4),
 (u'Patrick Wilson', 38.7)]

In [20]:
%%time
movies = scrape_movies(data_range, True)

scraping 2012
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=1
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=2
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=3
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=4
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=5
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=6
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2012&page=7
scraping 2013
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2013&page=1
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2013&page=2
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2013&page=3
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domest



http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2016&page=6
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2016&page=7
http://www.boxofficemojo.com/yearly/chart/?view=releasedate&view2=domestic&yr=2016&page=8
CPU times: user 1min 2s, sys: 25.8 s, total: 1min 28s
Wall time: 1min 38s


In [21]:
df_raw = pd.DataFrame(movies, columns = \
    ['name', 'us_gross', 'studio', 'genre', 'runtime', 'rating', 'budget', 'ww_gross',
     'actor', 'composer', 'director', 'producer', 'writer'])

In [22]:
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3494 entries, 0 to 3493
Data columns (total 13 columns):
name        3494 non-null object
us_gross    3494 non-null float64
studio      3491 non-null object
genre       3491 non-null object
runtime     3367 non-null float64
rating      3491 non-null object
budget      578 non-null float64
ww_gross    1968 non-null float64
actor       3482 non-null float64
composer    3494 non-null float64
director    3447 non-null float64
producer    3478 non-null float64
writer      3456 non-null float64
dtypes: float64(9), object(4)
memory usage: 354.9+ KB


Unnamed: 0,name,us_gross,studio,genre,runtime,rating,budget,ww_gross,actor,composer,director,producer,writer
0,Marvel's The Avengers,623.358,Buena Vista,action_adventure,142.0,pg13,220.0,1518.813,1060.1,75.7,278.1,1829.4,269.9
1,The Dark Knight Rises,448.139,Warner Bros.,action_adventure,165.0,pg13,250.0,1084.939,750.2,95.1,181.7,678.9,359.0
2,The Hunger Games,408.011,Lionsgate,action_adventure,142.0,pg13,78.0,694.395,868.1,73.2,147.4,762.2,583.5
3,Skyfall,304.36,Sony / Columbia,action_adventure,143.0,pg13,200.0,1108.561,370.6,53.1,119.1,290.6,310.4
4,The Hobbit: An Unexpected Journey,303.004,Warner Bros. (New Line),scifi_fantasy,166.0,pg13,,1021.104,827.3,65.6,177.9,410.7,822.7


#### Drop re-releases

In [23]:
df = df_raw.copy()

In [24]:
df = df[~df.name.str.contains('re-release')].copy()

#### Set missing Worldwide gross numbers to USA gross

In [25]:
df.ww_gross = np.where(df.ww_gross == 0, df['us_gross'], df['ww_gross'])

### Scrape budgets

Boxofficemojo doesn't have budget numbers for most films:

In [26]:
total = len(df)
with_budget = len(df[~df.budget.isnull()])
print('{0} ({2:.2%}) of {1} have budget'.format(with_budget, total, float(with_budget) / total))

578 (16.88%) of 3424 have budget


We'll some budget numbers from the-numbers.com. This is quick because it has all the information in one table:

In [27]:
%%html
<table><tbody><tr><th>&nbsp;</th><th>Release Date</th><th>Movie</th><th>Production Budget</th><th>Domestic Gross</th><th>Worldwide Gross</th></tr><tr><td class="data">1</td><td><a href="/box-office-chart/daily/2009/12/18">12/18/2009</a></td><td><b><a href="/movie/Avatar#tab=summary">Avatar</a></b></td><td class="data">$425,000,000</td><td class="data">$760,507,625</td><td class="data">$2,783,918,982</td></tr><tr></tr><tr><td class="data">2</td><td><a href="/box-office-chart/daily/2015/12/18">12/18/2015</a></td><td><b><a href="/movie/Star-Wars-Ep-VII-The-Force-Awakens#tab=summary">Star Wars Ep. VII: The Force Awakens</a></b></td><td class="data">$306,000,000</td><td class="data">$936,662,225</td><td class="data">$2,058,662,225</td></tr><tr></tr><tr><td class="data">3</td><td><a href="/box-office-chart/daily/2007/05/24">5/24/2007</a></td><td><b><a href="/movie/Pirates-of-the-Caribbean-At-Worlds-End#tab=summary">Pirates of the Caribbean: At World's End</a></b></td><td class="data">$300,000,000</td><td class="data">$309,420,425</td><td class="data">$963,420,425</td></tr></tbody></table>

Unnamed: 0,Release Date,Movie,Production Budget,Domestic Gross,Worldwide Gross
1.0,12/18/2009,Avatar,"$425,000,000","$760,507,625","$2,783,918,982"
,,,,,
2.0,12/18/2015,Star Wars Ep. VII: The Force Awakens,"$306,000,000","$936,662,225","$2,058,662,225"
,,,,,
3.0,5/24/2007,Pirates of the Caribbean: At World's End,"$300,000,000","$309,420,425","$963,420,425"


In [28]:
URL_THENUMBERS_BUDGETS = 'http://www.the-numbers.com/movie/budgets/all'

In [29]:
def scrape_thenumbers_budgets(target_years):
    d = {}

    url = URL_THENUMBERS_BUDGETS
    soup = BeautifulSoup(get_page(url), 'lxml')
    head = soup.find(text=re.compile('Release Date'))
    if head:
        table = head.findParent('table')
        if table:
            for tr in table.findAll('tr')[1:]: # skip header row
                contents = filter(lambda t: t != u'\n', tr.contents)
                if 6 != len(contents):
                    continue

                date, name = (contents[i].text for i in [1, 2])
                year = int(date[date.rfind('/')+1:])
                if not year in target_years and not target_years == None:
                    continue

                budget = cvt_million(contents[3].text)
                if not budget:
                    continue

                d[name] = budget
    return d

def set_budgets_old(df, dict_):
    def set_budget(row, dict_):
        name = row['name']
        return dict_[name] if name in dict_ else row['budget']
    df['budget'] = df.apply(set_budget, axis=1, args=(dict_,))

def set_budgets(df, dict_):
    df.budget = [dict_.get(name, budget) for name, budget in zip(df.name, df.budget)]


We'll get more budget numbers from IMDB. This is slow. We have to search for each movie, scrape the results page for the movie's id, then get the budget from the movie page.

In [30]:
URL_IMDB_SEARCH = 'http://www.imdb.com/find?q={0:s}&exact=true'
URL_IMDB_BUSINESS = 'http://www.imdb.com/title/{0:s}/business'

In [31]:
def scrape_imdb_budgets(names, verbose=False):
    d = {}

    for name in names:
        id_ = ''
        try:
            url = URL_IMDB_SEARCH.format(urllib.quote(name))
            soup = BeautifulSoup(get_page(url), 'lxml')
            if not soup:
                continue

            tag = soup.find(attrs={'name' : 'tt'})
            if not tag or not tag.parent:
                continue
            
            table = tag.parent.findNextSibling()
            if not table:
                continue

            for result in table.findAll(class_='result_text'):
                if not 'TV Episode' in result.text:
                    href = result.find('a')['href']
                    id_ = href.split('/')[2]
                    break

            if not id_:
                continue

            url = URL_IMDB_BUSINESS.format(id_)
            soup = BeautifulSoup(get_page(url), 'lxml')
            if not soup:
                continue

            tag = soup.find(text=re.compile('Budget'))
            if not tag or not tag.next:
                continue

            m = re.search('(\$[0-9,]+)', tag.next)
            if not m:
                if verbose:
                    print(u'warning: unrecognized budget {2:s} for \'{0:s}\' ({1:s})'
                          .format(name, id_, tag.next.strip()))
                continue

            d[name] = cvt_million(m.group(1))
        except requests.HTTPError:
            print(u'warning: error scraping \'{0:s}\' ({1:s})'.format(name, id_), 
                file=sys.stderr)
            pass


    return d

#### Scrape budgets from the-numbers.com

In [32]:
%%time
d = scrape_thenumbers_budgets(data_range)
set_budgets(df, d)
#pp_dict(d)

CPU times: user 4.92 s, sys: 576 ms, total: 5.5 s
Wall time: 5.54 s


In [33]:
total = len(df)
with_budget = len(df[~df.budget.isnull()])
print('{0} ({2:.2%}) of {1} have budget'.format(with_budget, total, float(with_budget) / total))

855 (24.97%) of 3424 have budget


#### Scrape budgets from imdb.com

In [34]:
%%time
names = [s.encode('utf-8') for s in df[df.budget.isnull()].name.values]
d = scrape_imdb_budgets(names)
set_budgets(df, d)
#pp_dict(d)

CPU times: user 2min 37s, sys: 3.59 s, total: 2min 40s
Wall time: 2min 49s


In [35]:
total = len(df)
with_budget = len(df[~df.budget.isnull()])
print('{0} ({2:.2%}) of {1} have budget'.format(with_budget, total, float(with_budget) / total))

1306 (38.14%) of 3424 have budget


### Save the data

In [36]:
df.to_csv(data_filename, encoding='utf-8', index=False)

### Inspect the data

In [37]:
df = pd.read_csv(data_filename, encoding='utf-8', index_col=None)

In [38]:
columns = ['name', 'budget', 'ww_gross', 'actor', 'director', 'producer', 'writer', 'genre', 'rating']

In [39]:
df2 = df[(df.T > 0).all()].copy()
df2[(df2.budget<5)&(df2.ww_gross>10)][columns].sort_values(by='budget', ascending=False)

Unnamed: 0,name,budget,ww_gross,actor,director,producer,writer,genre,rating
774,The Last Exorcism Part II,4.0,15.179,28.1,7.6,78.8,39.3,horror,pg13
2086,The Boy Next Door,4.0,52.426,120.5,53.2,59.4,35.4,drama,r
69,Sinister,3.0,77.712,17.4,92.9,70.7,221.1,horror,r
704,The Purge,3.0,89.329,42.6,68.2,286.6,43.4,drama,r
802,The Company You Keep,2.0,19.633,452.0,30.5,5.1,20.0,drama,r
772,Fruitvale Station,0.9,17.386,140.9,62.9,16.1,62.9,drama,r


In [40]:
df2[df2.ww_gross>1400][columns]

Unnamed: 0,name,budget,ww_gross,actor,director,producer,writer,genre,rating
0,Marvel's The Avengers,220.0,1518.813,1060.1,278.1,1829.4,269.9,action_adventure,pg13
2015,Star Wars: The Force Awakens,245.0,2068.224,3271.9,336.8,525.4,702.7,scifi_fantasy,pg13
2016,Jurassic World,215.0,1670.401,679.2,328.1,588.6,1296.0,action_adventure,pg13
2017,Avengers: Age of Ultron,250.0,1405.404,1378.2,278.1,1715.4,151.3,action_adventure,pg13
2019,Furious 7,190.0,1516.046,1058.1,101.7,357.7,153.0,action_adventure,pg13
