In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Scraping Broadway Data

This notebook scrapes Broadway's weekly gross revenue data from Playbill's website. The data is then compiled into a single DataFrame and saved to a .csv file, which is used by the Data Exploration and Modeling notebooks.

In [2]:
#Create a function that will scrape the dataset

# Seconds to wait for a response before giving up on a request.
_REQUEST_TIMEOUT = 30

# Show details keyed by the link found on the grosses page, so a show that
# appears in many weeks is only looked up once while this cell stays loaded.
_SHOW_DETAILS_CACHE = {}


def _fetch_soup(url):
    """Download `url` and return its HTML parsed into a BeautifulSoup tree."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on what is installed.
    return BeautifulSoup(response.text, 'html.parser')


def _get_show_details(show_url):
    """Return the Genre / award fields for one show link, or None if not found."""
    if show_url in _SHOW_DETAILS_CACHE:
        return _SHOW_DETAILS_CACHE[show_url]

    details = None
    # The link from the grosses table leads to an intermediate page ...
    production_soup = _fetch_soup(show_url)
    header = production_soup.find(
        'div', {'class': 'md:flex md:justify-between md:items-baseline'})
    show_link = header.find('a') if header is not None else None
    show_page_url = show_link.get('href') if show_link is not None else None

    if show_page_url:
        # ... whose first link in that header points at the actual show page.
        show_soup = _fetch_soup(show_page_url)

        subtitle = show_soup.find('div', {'class': 'bsp-bio-subtitle'})
        genre_list = subtitle.find_all('h5') if subtitle is not None else []
        # NOTE(review): the original comment says the first genre is always
        # "broadway" and the second or third should be used, yet the fallback
        # branch takes index 0. The original selection is kept as-is; confirm.
        if len(genre_list) >= 3:
            genre = genre_list[2].text
        elif len(genre_list) == 0:
            genre = 'Unknown'
        else:
            genre = genre_list[0].text

        awards = show_soup.find_all('td', {'data-label': 'Award'})
        details = {
            'Genre': genre,
            'Number of Awards': len(awards),
            # binary flag: 1 if the show page lists at least one award
            'Awards': 0 if len(awards) == 0 else 1,
        }

    _SHOW_DETAILS_CACHE[show_url] = details
    return details


def get_gross_data_for_week(week_url):
    """Scrape one weekly grosses page into a DataFrame indexed by 'Show'.

    Parameters
    ----------
    week_url : str
        URL of the weekly grosses page. The text after the last '=' must be
        the week date in YYYY-MM-DD format.

    Returns
    -------
    pandas.DataFrame
        One row per table row, indexed by 'Show', with a 'Week' column, the
        gross columns read from the row's <span> elements (empty text becomes
        NaN), and 'Genre', 'Number of Awards' and 'Awards' taken from the
        linked show page. When no show page can be located for a row, 'Genre'
        is 'Unknown' and both award columns are NaN. An empty frame is
        returned when the page has no table body or no usable rows.

    Raises
    ------
    requests.HTTPError
        If any requested page responds with an HTTP error status.
    ValueError
        If the date at the end of `week_url` is not in YYYY-MM-DD format.
    """
    #manual creation of columns
    columns = ['Show', 'Theater', 'This Week Gross', 'Potential Gross',
               'Diff $', 'Avg Ticket', 'Top Ticket', 'Seats Sold',
               'Seats in Theatre', 'Performances', 'Previews',
               '% Cap', 'Diff % cap']

    #working with the given week url, splitting it by = to get the date
    week_str = week_url.split('=')[-1]
    week_date = pd.to_datetime(week_str, format='%Y-%m-%d').date()

    empty_result = pd.DataFrame(index=pd.Index([], name='Show'))

    soup = _fetch_soup(week_url)
    table = soup.find('tbody')
    if table is None:
        # no data table on the page
        return empty_result

    records = []
    for row in table.find_all('tr'):
        cells = row.find_all('span')
        if len(cells) < len(columns):
            # cannot be mapped onto the expected columns by position; skip it
            continue

        # A fresh dict per row so nothing leaks in from the previous show.
        record = {}
        for name, cell in zip(columns, cells):
            record[name] = cell.text if cell.text != '' else np.nan
            if name == 'Show':
                # keep 'Week' as the first column after the 'Show' index
                record['Week'] = week_date

        # Defaults used when the row has no link or the show page is missing.
        record['Genre'] = 'Unknown'
        record['Number of Awards'] = np.nan
        record['Awards'] = np.nan

        #scraping the embedded link(s) to access specific show information
        for anchor in row.find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            details = _get_show_details(href)
            if details is not None:
                record.update(details)

        records.append(record)

    if not records:
        return empty_result

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(records).set_index('Show')

In [3]:
#Scraping Dataset

#weekly grosses pages from May 12 2024 to May 4 2025 (one year), newest first
first_week = '2024-05-12'
last_week = '2025-05-04'
week_dates = pd.date_range(start=first_week, end=last_week, freq='7D')[::-1]
url_list_2 = [
    'https://playbill.com/grosses?week=' + week.strftime('%Y-%m-%d')
    for week in week_dates
]

#sanity checks: exactly one year of weekly pages with the expected endpoints
assert len(url_list_2) == 52
assert url_list_2[0] == 'https://playbill.com/grosses?week=2025-05-04'
assert url_list_2[-1] == 'https://playbill.com/grosses?week=2024-05-12'

#scrape every week, then combine the weekly frames with a single concat
#(growing a dataframe inside the loop re-copies all earlier rows each time)
weekly_frames = [get_gross_data_for_week(week_url) for week_url in url_list_2]
df = pd.concat(weekly_frames)

df

Unnamed: 0_level_0,Week,Theater,This Week Gross,Potential Gross,Diff $,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Performances,Previews,% Cap,Diff % cap,Genre,Number of Awards,Awards
Show,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Wicked,2025-05-04,Gershwin Theatre,"$2,232,068.00",,"-$345,699.00",$144.86,$352.00,15408,1926,8,0,100.00%,0.00%,Original,33,1
The Picture of Dorian Gray,2025-05-04,Music Box Theatre,"$1,344,831.26",,"$25,612.38",$168.06,$497.00,8002,995,8,0,100.53%,-0.14%,Play,17,1
The Outsiders,2025-05-04,Bernard B. Jacobs Theatre,"$1,124,988.24",,"-$204,211.86",$138.00,$497.00,8152,1024,8,0,99.51%,-2.30%,Original,31,1
The Lion King,2025-05-04,Minskoff Theatre,"$1,999,729.00",,"-$320,876.00",$153.10,$222.00,13062,1696,8,0,96.27%,-2.15%,Original,32,1
The Last Five Years,2025-05-04,Hudson Theatre,"$613,501.45",,"-$146,227.05",$94.46,$425.00,6495,968,8,0,83.87%,-8.70%,One Act,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Appropriate,2024-05-12,Belasco Theatre,"$702,686.00",,"$43,422.80",$118.84,$297.00,5913,813,8,0,90.91%,2.66%,Original,23,1
An Enemy of the People,2024-05-12,Circle in the Square Theatre,"$1,060,185.20",,"$11,898.80",$155.02,$480.00,6839,828,8,0,103.25%,0.05%,Drama,13,1
Aladdin,2024-05-12,New Amsterdam Theatre,"$1,274,359.60",,"$140,138.20",$97.42,$215.50,13081,1727,8,0,94.68%,0.48%,Original,23,1
"A Beautiful Noise, The Neil Diamond Musical",2024-05-12,Broadhurst Theatre,"$929,935.30",,"$172,063.50",$149.97,$348.00,6201,1153,8,0,67.23%,12.67%,Original,10,1


In [4]:
#add season column

def get_season(date):
    """Map a date to a season name based on its month.

    December-February is 'Winter', March-May is 'Spring', June-August is
    'Summer'; every other month value is reported as 'Fall'.

    `date` may be any object exposing a `.month` attribute.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # anything not listed above falls through to 'Fall'
    return season_by_month.get(date.month, 'Fall')

# Label each row's week with its season, then display the updated frame.
season_labels = df['Week'].map(get_season)
df['Season'] = season_labels
df

Unnamed: 0_level_0,Week,Theater,This Week Gross,Potential Gross,Diff $,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Performances,Previews,% Cap,Diff % cap,Genre,Number of Awards,Awards,Season
Show,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Wicked,2025-05-04,Gershwin Theatre,"$2,232,068.00",,"-$345,699.00",$144.86,$352.00,15408,1926,8,0,100.00%,0.00%,Original,33,1,Spring
The Picture of Dorian Gray,2025-05-04,Music Box Theatre,"$1,344,831.26",,"$25,612.38",$168.06,$497.00,8002,995,8,0,100.53%,-0.14%,Play,17,1,Spring
The Outsiders,2025-05-04,Bernard B. Jacobs Theatre,"$1,124,988.24",,"-$204,211.86",$138.00,$497.00,8152,1024,8,0,99.51%,-2.30%,Original,31,1,Spring
The Lion King,2025-05-04,Minskoff Theatre,"$1,999,729.00",,"-$320,876.00",$153.10,$222.00,13062,1696,8,0,96.27%,-2.15%,Original,32,1,Spring
The Last Five Years,2025-05-04,Hudson Theatre,"$613,501.45",,"-$146,227.05",$94.46,$425.00,6495,968,8,0,83.87%,-8.70%,One Act,3,1,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Appropriate,2024-05-12,Belasco Theatre,"$702,686.00",,"$43,422.80",$118.84,$297.00,5913,813,8,0,90.91%,2.66%,Original,23,1,Spring
An Enemy of the People,2024-05-12,Circle in the Square Theatre,"$1,060,185.20",,"$11,898.80",$155.02,$480.00,6839,828,8,0,103.25%,0.05%,Drama,13,1,Spring
Aladdin,2024-05-12,New Amsterdam Theatre,"$1,274,359.60",,"$140,138.20",$97.42,$215.50,13081,1727,8,0,94.68%,0.48%,Original,23,1,Spring
"A Beautiful Noise, The Neil Diamond Musical",2024-05-12,Broadhurst Theatre,"$929,935.30",,"$172,063.50",$149.97,$348.00,6201,1153,8,0,67.23%,12.67%,Original,10,1,Spring


In [5]:
# Add a numeric month column derived from each row's week date.
week_timestamps = pd.to_datetime(df['Week'])
df['Month'] = week_timestamps.dt.month


In [6]:
#resetting the index so the show will be included in the csv file
#guarded so that re-running this cell does not add a stray 'index' column
if 'Show' not in df.columns:
    df = df.rename_axis('Show').reset_index()
df.head()

Unnamed: 0,Show,Week,Theater,This Week Gross,Potential Gross,Diff $,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Performances,Previews,% Cap,Diff % cap,Genre,Number of Awards,Awards,Season,Month
0,Wicked,2025-05-04,Gershwin Theatre,"$2,232,068.00",,"-$345,699.00",$144.86,$352.00,15408,1926,8,0,100.00%,0.00%,Original,33,1,Spring,5
1,The Picture of Dorian Gray,2025-05-04,Music Box Theatre,"$1,344,831.26",,"$25,612.38",$168.06,$497.00,8002,995,8,0,100.53%,-0.14%,Play,17,1,Spring,5
2,The Outsiders,2025-05-04,Bernard B. Jacobs Theatre,"$1,124,988.24",,"-$204,211.86",$138.00,$497.00,8152,1024,8,0,99.51%,-2.30%,Original,31,1,Spring,5
3,The Lion King,2025-05-04,Minskoff Theatre,"$1,999,729.00",,"-$320,876.00",$153.10,$222.00,13062,1696,8,0,96.27%,-2.15%,Original,32,1,Spring,5
4,The Last Five Years,2025-05-04,Hudson Theatre,"$613,501.45",,"-$146,227.05",$94.46,$425.00,6495,968,8,0,83.87%,-8.70%,One Act,3,1,Spring,5


In [7]:
# Write the combined dataset to disk (without the row index) for the EDA
# and prediction notebooks.
df.to_csv(path_or_buf='broadway_gross_data.csv', index=False)