In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 3)

# Load in Cleaned Dataset

- Prepare event dataset for analysis by cleaning up and determining assumptions.

In [2]:
ticket_sales_df = pd.read_csv(r'fox-theater-ticket-sales.csv')

In [3]:
ticket_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893 entries, 0 to 892
Data columns (total 18 columns):
Start Date          893 non-null object
End Date            751 non-null object
Num Shows           893 non-null int64
Headliner           893 non-null object
Support             579 non-null object
Tickets Sold        751 non-null float64
Gross USD           751 non-null float64
Gross Gate          751 non-null float64
Currency            751 non-null object
Venue Capacity      751 non-null float64
Percentage Sold     893 non-null float64
Ticket Price Min    751 non-null float64
Ticket Price Max    751 non-null float64
Venue Name          893 non-null object
Venue City          893 non-null object
Venue State         893 non-null object
Venue Country       893 non-null object
Promoter            739 non-null object
dtypes: float64(7), int64(1), object(10)
memory usage: 125.7+ KB


# Preliminary Data Cleaning

## Convert Date Features to datetime

In [4]:
# start_date = pd.to_datetime(ticket_sales_df['Start_Date'], format='%m/%d/%Y')
# end_date = pd.to_datetime(ticket_sales_df['End_Date'], format='%m/%d/%Y')
ticket_sales_df['Start Date'] = pd.to_datetime(ticket_sales_df['Start Date'], format='%m/%d/%Y').dt.date
ticket_sales_df['End Date'] = pd.to_datetime(ticket_sales_df['End Date'], format='%m/%d/%Y').dt.date

## Split multiple show rows into multiple rows

- **Assumption**: 
    - The dataset contains rows that represent multiple shows for a single artist (e.g. if an artist plays two shows and the start date is 01/02, the end date would be 01/03).  It's ideal to have each row of the dataset represent one show, so these rows will be split as necessary into multiple rows.  Ticket sales, gross revenue, etc. will be averaged across these rows.

### Generate duplicate rows for consecutive shows

In [5]:
# Split up dataframe by num_shows (1, 2 or 3)

ticket_sales_1show = ticket_sales_df[ticket_sales_df['Num Shows'] == 1]

ticket_sales_2shows = ticket_sales_df[ticket_sales_df['Num Shows'] == 2]
ticket_sales_x2 = pd.concat([ticket_sales_2shows]*2).sort_values(by='Start Date').reset_index(drop=True)

ticket_sales_3shows = ticket_sales_df[ticket_sales_df['Num Shows'] == 3]
ticket_sales_x3 = pd.concat([ticket_sales_3shows]*3).sort_values(by='Start Date').reset_index(drop=True)


In [6]:
#Fix the dates where an artist played two nights.

ticket_sales_x2['Start Date'][1::2] = ticket_sales_x2['End Date'][1::2]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Verify all dates are correct.
# Looks like there were 4 errors: 2012-02-12, 2012-11-25, 08-08-2018, and 09-10-2018.

dup_list = []
dup_dates = ticket_sales_x2.pivot_table(index=['Start Date'], aggfunc='size')

for index, value in dup_dates.items():
    if value == 2:
        dup_list.append(index)

dup_list

[datetime.date(2012, 2, 12),
 datetime.date(2012, 11, 25),
 datetime.date(2018, 8, 8),
 datetime.date(2018, 9, 10)]

In [8]:
# Doing a quick google search:
# duplicate 2012-02-12 show is 2012-02-13
# duplicate 2012-11-25 show is 2012-11-26
# duplicate 2018-08-08 show is 2018-08-07
# duplicate 2018-09-10 show is 2018-09-11

ticket_sales_x2.loc[29, 'Start Date'] = dt.date(2012, 2, 13)
ticket_sales_x2.loc[41, 'Start Date'] = dt.date(2012, 11, 26)
ticket_sales_x2.loc[140, 'Start Date'] = dt.date(2018, 8, 7)
ticket_sales_x2.loc[147, 'Start Date'] = dt.date(2018, 9, 11)

In [9]:
# Fix the dates where an artist played three nights.

ticket_sales_x3['Start Date'][1::3] += dt.timedelta(days=1)
ticket_sales_x3['Start Date'][2::3] = ticket_sales_x3['End Date'][2::3]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [10]:
# Verify all dates are correct. Looks like they are (i.e. no duplicates).

dup_dates = ticket_sales_x3.pivot_table(index=['Start Date'], aggfunc='size')
dup_dates.value_counts()


1    30
dtype: int64

### Split up ticket sales and gross USD across the multiple shows

In [11]:
# Write a function that will find the average tickets sold, 
# gross USD, and gross gate across multiple shows.

def multiple_show_means(df):
    
    df['Tickets Sold'] = df['Tickets Sold']/df['Num Shows']
    df['Gross USD'] = df['Gross USD']/df['Num Shows']
    df['Gross Gate']
        
    return df

In [12]:
ticket_sales_x3 = multiple_show_means(ticket_sales_x3)

In [13]:
ticket_sales_x2 = multiple_show_means(ticket_sales_x2)

### Update main dataframe with new rows and values

In [14]:
# Combine the separate dataframes into a single dataframe, and delete 
# the "end_date" column as it's no longer necessary.

multiple_shows_df = ticket_sales_x3.append(ticket_sales_x2)
ticket_clean_df = ticket_sales_1show.append(multiple_shows_df).sort_values(by='Start Date').reset_index(drop=True)
del ticket_clean_df['End Date']

In [15]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 17 columns):
Start Date          1001 non-null object
Num Shows           1001 non-null int64
Headliner           1001 non-null object
Support             644 non-null object
Tickets Sold        859 non-null float64
Gross USD           859 non-null float64
Gross Gate          859 non-null float64
Currency            859 non-null object
Venue Capacity      859 non-null float64
Percentage Sold     1001 non-null float64
Ticket Price Min    859 non-null float64
Ticket Price Max    859 non-null float64
Venue Name          1001 non-null object
Venue City          1001 non-null object
Venue State         1001 non-null object
Venue Country       1001 non-null object
Promoter            833 non-null object
dtypes: float64(7), int64(1), object(9)
memory usage: 133.1+ KB


## Replace Festival Names with Actual Headliners

- There are 9 nights of the Noise Pop Festival throughout the dataset. In order to facilitate web scraping of artist data, the headliner name will be replaced with the actual headliner from each corresponding night.

In [16]:
ticket_clean_df.loc[83, 'Headliner'] = 'Yoko Ono'
ticket_clean_df.loc[86, 'Headliner'] = 'The Magnetic Fields'
ticket_clean_df.loc[157, 'Headliner'] = 'Yo La Tengo'
ticket_clean_df.loc[218, 'Headliner'] = 'Porter Robinson'
ticket_clean_df.loc[479, 'Headliner'] = 'Geographer'
ticket_clean_df.loc[480, 'Headliner'] = 'New Pornographers'
ticket_clean_df.loc[704, 'Headliner'] = 'Vince Staples'
ticket_clean_df.loc[705, 'Headliner'] = 'Ty Segall'
ticket_clean_df.loc[817, 'Headliner'] = 'Tune-Yards'

# Export `ticket_clean_df` to CSV

In [17]:
ticket_clean_df.to_csv(r'fox-theater-all-shows.csv', index=False)