In [71]:
# Lines starting with % are IPython "magic" commands; they are not part of the Python language.
# In this case we're just telling the plotting library to draw things in
# the notebook, instead of in a separate window.
%matplotlib inline
# The "as ..." constructs below just alias the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import re
from collections import defaultdict
# pandas display options: a display width of 500, up to 100 columns shown,
# and the HTML representation of frames enabled in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn plot appearance: "whitegrid" style with the "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")

# HTML parsing libraries.
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests



In [2]:
# Download the movie budget page with requests.
# The timeout keeps this cell from hanging forever if the server never answers
# (requests waits indefinitely by default), and raise_for_status() stops here on
# an HTTP error instead of handing an error page to the parsing cell.
movie_list = requests.get("http://www.the-numbers.com/movie/budgets/all", timeout=30)
movie_list.raise_for_status()

In [85]:
# Parse the downloaded page into `movieframe`, a DataFrame indexed by movie title.
soup = BeautifulSoup(movie_list.text, "html.parser")
datarows = soup.find_all("td", attrs={"class": "data"})
# Titles are read from the <b> elements; the first one and the last three are
# dropped (presumably non-title page elements -- verify against the live page).
titlerows = soup.find_all("b")[1:]
titles = [row.get_text() for row in titlerows][:-3]

# Removes commas that sit between two digits and "$" signs directly before a
# digit, so "$425,000,000" becomes "425000000" and can be passed to int().
exp = r'((?<=\d),(?=\d))|(\$(?=\d))'

# Each title owns four consecutive "data" cells, in the column order named below.
# NOTE(review): keying by title means two movies with the same title overwrite
# each other (the later one wins). Left unchanged so the index stays unique.
moviedict = {}
for i, title in enumerate(titles):
    vals = datarows[i*4:(i*4)+4]
    moviedict[title] = [int(re.sub(exp, r'', val.get_text())) for val in vals]

movieframe = pd.DataFrame.from_dict(moviedict, orient='index')
movieframe.columns = ['budget_rank', 'budget', 'domestic_gross', 'worldwide_gross']
movieframe = movieframe.sort_values('budget_rank')
# Keep only movies with a positive domestic gross. The .copy() makes the result
# an independent frame, so adding columns below does not trigger SettingWithCopyWarning.
movieframe = movieframe[movieframe.domestic_gross > 0].copy()
# Vectorized column arithmetic. The previous map(lambda ...) form only worked on
# Python 2, where map() returned a list; on Python 3 it returns an iterator.
movieframe['dom_profits'] = movieframe['domestic_gross'] - movieframe['budget']
movieframe['total_profits'] = movieframe['worldwide_gross'] - movieframe['budget']
movieframe


Unnamed: 0,budget_rank,budget,domestic_gross,worldwide_gross,dom_profits,total_profits
Avatar,1,425000000,760507625,2783918982,335507625,2358918982
Pirates of the Caribbean: At World's End,2,300000000,309420425,963420425,9420425,663420425
Spectre,3,300000000,153702879,572702879,-146297121,272702879
The Dark Knight Rises,4,275000000,448139099,1084439099,173139099,809439099
The Lone Ranger,5,275000000,89289910,259989910,-185710090,-15010090
John Carter,6,275000000,73058679,282778100,-201941321,7778100
Tangled,7,260000000,200821936,586581936,-59178064,326581936
Spider-Man 3,8,258000000,336530303,890875303,78530303,632875303
The Avengers: Age of Ultron,9,250000000,459005868,1404705868,209005868,1154705868
The Hobbit: An Unexpected Journey,10,250000000,303003568,1017003568,53003568,767003568
