In this notebook, we use BeautifulSoup to scrape studio market share data from boxofficemojo.com and save the data as a CSV. We also do some exploratory analysis to get a feel for the data.

Example of a page we scrape (one page per year; this one is for 2000): http://www.boxofficemojo.com/studio/?view=company&view2=yearly&yr=2000&p=.htm

In [1]:
import requests
from time import sleep
from bs4 import BeautifulSoup
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles with a "seaborn-v0_8-"
# prefix and later releases dropped the old names, so pick whichever
# spelling this installation actually provides.
darkgrid_style = (
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
plt.style.use(darkgrid_style)

In [2]:
def build_url(year):
    """Return the yearly studio market-share page URL for ``year``."""
    return 'http://www.boxofficemojo.com/studio/?view=company&view2=yearly&yr={}&p=.htm'.format(year)


def _parse_market_share(string):
    """Convert a percentage string such as '15.5%' into a fraction (0.155)."""
    return float(string.replace('%', '')) / 100


def _parse_total_gross(string):
    """Convert a gross figure string into a dollar amount.

    Plain figures such as '$1,175.6' are scaled by one million. A figure
    carrying a trailing 'k' is scaled by one thousand instead
    (assumes 'k' marks thousands of dollars -- TODO confirm against the site).
    Previously the 'k' was expanded to '000' *and* the result was still
    multiplied by one million, inflating such values.
    """
    cleaned = string.strip().replace('$', '').replace(',', '')
    if cleaned.endswith('k'):
        return float(cleaned[:-1]) * 1e3
    return float(cleaned) * 1e6


# Column order of every row stored in ``data``; 'year' is supplied by the
# caller, the remaining names match the scraped table cells left to right.
columns = ['year', 'rank', 'distributor', 'market_share', 'total_gross', 'movies_tracked', 'movies_released']
# Accumulates the header row followed by one list per scraped table row.
data = [columns]
# Maps each scraped column name to the converter applied to its cell text.
value_functions = {
    'rank': int,
    'distributor': str,
    'market_share': _parse_market_share,
    'total_gross': _parse_total_gross,
    'movies_tracked': int,
    'movies_released': int
}

def get_data_for_year(year):
    """Scrape one year's studio table and append its rows to ``data``.

    Parameters
    ----------
    year : int
        Year substituted into the page URL by ``build_url``; it also becomes
        the first value of every appended row.

    Returns
    -------
    None
        Rows are appended to the module-level ``data`` list, each one laid
        out in the order given by ``columns``.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(build_url(year), timeout=30)
    # Fail loudly on an error page instead of trying to parse it as data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    tables = soup.find_all('table')
    # The second to last table is the one with the data
    rows = tables[-2].find_all('tr')
    # The first row is skipped (presumably the table header -- verify on the page).
    for row in rows[1:]:
        cell_strings = [cell.text for cell in row.find_all('td')]
        cells = [year]
        # Start at 1 because columns[0] is 'year', which is not a scraped cell.
        for i, cell_string in enumerate(cell_strings, 1):
            cells.append(value_functions[columns[i]](cell_string))
        data.append(cells)


In [3]:
years = range(2000, 2018) # January 2000 through December 2017
# Drop rows left over from an earlier run of this cell (data[0] is the header
# row) so re-executing it does not append every year a second time.
del data[1:]
for year in years:
    sleep(1) # Throttle requests
    get_data_for_year(year)

In [4]:
# newline='' lets the csv module control line endings itself (otherwise a
# blank line follows every row on Windows); utf-8 matches the default
# encoding pd.read_csv uses when the file is loaded back.
with open('studio_market_share_by_year.csv', 'w', newline='', encoding='utf-8') as studio_csv:
    writer = csv.writer(studio_csv)
    writer.writerows(data)

In [5]:
# Read the saved CSV into a DataFrame and preview its first five rows.
csv_path = 'studio_market_share_by_year.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,year,rank,distributor,market_share,total_gross,movies_tracked,movies_released
0,2000,1,Buena Vista,0.155,1175600000.0,30,21
1,2000,2,Universal,0.141,1069100000.0,19,13
2,2000,3,Warner Bros.,0.119,905300000.0,28,22
3,2000,4,Paramount,0.104,791100000.0,19,12
4,2000,5,DreamWorks SKG,0.103,777200000.0,12,10
