### Use BeautifulSoup to scrape box office data by MPAA rating for 1982 to 2016 from BoxOfficeMojo

Example page URLs:

**1982 - 132 movies**
* 1982 Page 1
http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?yr=1982&view=releasedate&sort=gross&order=DESC&m=1&rating=G|PG|PG-13|R&p=.htm
* 1982 Page 2
http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?page=2&yr=1982&rating=G|PG|PG-13|R&view=releasedate&m=1&p=.htm

**2016 - 357 movies**
* 2016 Page 1
http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?page=1&yr=2016&rating=G|PG|PG-13|R&view=releasedate&m=1&p=.htm
* 2016 Page 2
http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?page=2&yr=2016&rating=G|PG|PG-13|R&view=releasedate&m=1&p=.htm
* 2016 Page 3
http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?page=3&yr=2016&rating=G|PG|PG-13|R&view=releasedate&m=1&p=.htm
* 2016 Page 4
http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?page=4&yr=2016&rating=G|PG|PG-13|R&view=releasedate&m=1&p=.htm

In [1]:
import re
import urllib
from math import ceil

try:
    # Python 2 keeps urlencode directly in urllib
    from urllib import urlencode
except ImportError:
    # Python 3 moved it to urllib.parse
    from urllib.parse import urlencode

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def try_mojo_url(url, timeout=30):
    '''
    Fetch a BoxOfficeMojo page and parse it with BeautifulSoup.

    url: address of the page to download.
    timeout: seconds requests waits on the server before raising; without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns the parsed page (BeautifulSoup, lxml parser) when the server
    answers with status 200. For any other status the code is printed and
    None is returned, so callers must check for None before using the result.
    '''
    response = requests.get(url, timeout=timeout)
    status = response.status_code
    if status != 200:
        # Single-argument call form prints the same on Python 2 and Python 3.
        print(status)
        return None
    return BeautifulSoup(response.text, 'lxml')

In [3]:
def number(string):
    '''
    Convert a formatted number (currency, totals with thousands separators) to an int.

    Every non-digit character is removed before conversion, so '$1,234'
    becomes 1234. Minus signs and decimal points are removed as well.

    Returns the int, or the input unchanged when nothing can be parsed
    (no digits left, e.g. 'N/A', or the input is not text).
    '''
    try:
        return int(re.sub(r'\D', '', string))
    except (TypeError, ValueError):
        # ValueError: no digits remained; TypeError: input was not a string.
        return string

In [9]:
mojo_df = pd.DataFrame(data=None)

def get_data_from_table(df, movie_table, year):
    '''
    Append the movies listed in one BoxOfficeMojo chart table to df.

    df: DataFrame to extend; it is not modified in place.
    movie_table: BeautifulSoup element whose <tr> descendants are the chart
        rows. The first two and the last four <tr> elements are skipped
        (presumably header and footer/summary rows -- confirm against the page).
    year: release year, joined to each row's date text to form release_date.

    Returns a new DataFrame with one added row per movie and nine positional
    columns: rank, title, studio, mpaa, gross_revenue, gross_theaters,
    opening_revenue, opening_theaters, release_date. When the table holds
    no movie rows, df itself is returned.
    '''
    # Query the rows once instead of re-scanning the table on every iteration.
    table_rows = movie_table.find_all('tr')
    records = []
    for table_row in table_rows[2:-4]:
        cells = table_row.find_all('font')
        # Build the date as text: encoding to UTF-8 here would turn it into
        # bytes on Python 3, and str() could fail on non-ASCII text on Python 2.
        release_date = u'{0}/{1}'.format(cells[8].text, year)
        records.append((
            cells[0].text,          # rank
            cells[1].text,          # title
            cells[2].text,          # studio
            cells[3].text,          # mpaa
            number(cells[4].text),  # gross_revenue
            number(cells[5].text),  # gross_theaters
            number(cells[6].text),  # opening_revenue
            number(cells[7].text),  # opening_theaters
            release_date,
        ))
    if not records:
        return df
    # One concat per table replaces row-by-row DataFrame.append, which copied
    # the whole frame for every movie and no longer exists in pandas >= 2.0.
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [5]:
# Release years to scrape; np.arange excludes the stop value, so this is 1982-2016.
years = np.arange(1982, 2017)

base_url = 'http://www.boxofficemojo.com/yearly/chart/mpaarating.htm?'
# Query-string parameters shared by every chart request; 'yr' is filled in per year.
search_criteria = {'page': '1',
                   'yr': None,
                   'rating': 'G|PG|PG-13|R',
                   'view': 'releasedate',
                   'm': '1',
                   'p': '.'
                   }
page_1_urls = []

# Build the first-page chart URL for every year.
for year in years:
    search_criteria['yr'] = year
    # urlencode comes from the version-tolerant import at the top of the file
    # (urllib.urlencode exists only on Python 2).
    url = base_url + urlencode(search_criteria)
    page_1_urls.append(url)

In [6]:
all_urls = []

#get list of all links from first page
for url in page_1_urls:
    year = re.findall(r'yr=(\d*)', url)[0]
    all_urls.append(url)
    soup = try_mojo_url(url)
    if soup is None:
        # try_mojo_url returns None for a non-200 response; stop with the
        # failing URL instead of an AttributeError on None further down.
        raise RuntimeError('Could not load ' + url)
    #last link title bar with number of movies; only use on first page
    num_movies = soup.find_all('center')[1].find_all('a')[-1].text
    # Read the whole trailing run of digits rather than a fixed three
    # characters, so counts of any width are handled.
    # NOTE(review): assumes the link text ends with the year's movie count -- confirm on the live page.
    total_movies = re.search(r'(\d+)\s*$', num_movies).group(1)
    #number of pages for given year (100 movies per page)
    num_pages = int(ceil(float(total_movies) / 100))
    for page in np.arange(2, num_pages + 1):
        search_criteria['yr'] = year
        search_criteria['page'] = page
        # urlencode comes from the version-tolerant import at the top of the file.
        url = base_url + urlencode(search_criteria)
        all_urls.append(url)

In [7]:
# Total number of chart page URLs collected across all years.
len(all_urls)

118

In [10]:
#get all data for all urls
for url in all_urls:
    year = re.findall('yr=(\d*)', url)[0]
    soup = try_mojo_url(url)
    #table of movie data
    movie_table = soup.find_all('table')[9].find_all('tr')[1]
    mojo_df = get_data_from_table(mojo_df, movie_table, year)

In [11]:
# Number of movie rows scraped into mojo_df.
len(mojo_df)

10221

In [12]:
# Descriptive names for the positional columns 0-8 of mojo_df, in order.
column_names = (
    'rank',
    'title',
    'studio',
    'mpaa',
    'gross_revenue',
    'gross_theaters',
    'opening_revenue',
    'opening_theaters',
    'release_date',
)
# Map each integer column position to its name, then relabel in place.
columns = dict(enumerate(column_names))
mojo_df.rename(columns=columns, inplace=True)

In [13]:
# Preview the first rows of the renamed table.
mojo_df.head()

Unnamed: 0,rank,title,studio,mpaa,gross_revenue,gross_theaters,opening_revenue,opening_theaters,release_date
0,1,E.T.: The Extra-Terrestrial,Uni.,PG,359197037.0,1778,11835400.0,1103,6/11/1982
1,2,Tootsie,Col.,PG,177200000.0,1222,5540470.0,943,12/17/1982
2,3,An Officer and a Gentleman,Par.,R,129795554.0,1050,3304680.0,346,7/30/1982
3,4,Rocky III,UA,PG,124146897.0,1317,12431500.0,939,5/28/1982
4,5,Porky's,Fox,R,105492483.0,1605,7623990.0,1148,3/19/1982


In [15]:
# Save the scraped table to mojo.csv, encoded as UTF-8.
mojo_df.to_csv('mojo.csv', encoding='utf-8')