In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 3)

# Load in Cleaned Dataset

- Prepare event dataset for analysis by cleaning up and determining assumptions.

In [2]:
with open('Data/sql_past_shows_df.pkl','rb') as read_file:
    ticket_sales_df = pickle.load(read_file)

In [3]:
ticket_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751 entries, 0 to 750
Data columns (total 18 columns):
start_date          751 non-null object
end_date            751 non-null object
year                751 non-null int64
month               751 non-null object
season              751 non-null object
day_of_week         751 non-null object
time_of_week        751 non-null object
num_shows           751 non-null int64
show_type           751 non-null object
headliner           751 non-null object
support             579 non-null object
num_support         751 non-null int64
tickets_sold        751 non-null int64
gross_usd           751 non-null float64
venue_capacity      751 non-null int64
percentage_sold     751 non-null float64
ticket_price_min    751 non-null float64
ticket_price_max    751 non-null float64
dtypes: float64(4), int64(5), object(9)
memory usage: 105.7+ KB


# Additional Data Cleaning

## Split multiple show rows into multiple rows

- **Assumption**: 
    - The dataset contains rows that represent multiple shows for a single artist (e.g. if an artist plays two shows and the start date is 01/02, the end date would be 01/03).  It's ideal to have each row of the dataset represent one show, so these rows will be split as necessary into multiple rows.  Ticket sales, gross revenue, etc. will be averaged across these rows.

### Generate duplicate rows for consecutive shows

In [4]:
# Split up dataframe by num_shows (1, 2 or 3)

ticket_sales_1show = ticket_sales_df[ticket_sales_df['num_shows'] == 1]

ticket_sales_2shows = ticket_sales_df[ticket_sales_df['num_shows'] == 2]
ticket_sales_x2 = pd.concat([ticket_sales_2shows]*2).sort_values(by='start_date').reset_index(drop=True)

ticket_sales_3shows = ticket_sales_df[ticket_sales_df['num_shows'] == 3]
ticket_sales_x3 = pd.concat([ticket_sales_3shows]*3).sort_values(by='start_date').reset_index(drop=True)


In [5]:
#Fix the dates where an artist played two nights.

ticket_sales_x2['start_date'][1::2] = ticket_sales_x2['end_date'][1::2]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
# Verify all dates are correct.
# Looks like there were 4 errors: 2012-02-12, 2012-11-25, 08-08-2018, and 09-10-2018.

dup_list = []
dup_dates = ticket_sales_x2.pivot_table(index=['start_date'], aggfunc='size')

for index, value in dup_dates.items():
    if value == 2:
        dup_list.append(index)

dup_list

[datetime.date(2012, 2, 12),
 datetime.date(2012, 11, 25),
 datetime.date(2018, 8, 8),
 datetime.date(2018, 9, 10)]

In [7]:
# Doing a quick google search:
# duplicate 2012-02-12 show is 2012-02-13
# duplicate 2012-11-25 show is 2012-11-26
# duplicate 2018-08-08 show is 2018-08-07
# duplicate 2018-09-10 show is 2018-09-11

ticket_sales_x2.loc[29, 'start_date'] = dt.date(2012, 2, 13)
ticket_sales_x2.loc[41, 'start_date'] = dt.date(2012, 11, 26)
ticket_sales_x2.loc[140, 'start_date'] = dt.date(2018, 8, 7)
ticket_sales_x2.loc[147, 'start_date'] = dt.date(2018, 9, 11)

In [8]:
# Fix the dates where an artist played three nights.

ticket_sales_x3['start_date'][1::3] += dt.timedelta(days=1)
ticket_sales_x3['start_date'][2::3] = ticket_sales_x3['end_date'][2::3]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [9]:
# Verify all dates are correct. Looks like they are (i.e. no duplicates).

dup_dates = ticket_sales_x3.pivot_table(index=['start_date'], aggfunc='size')
dup_dates.value_counts()


1    30
dtype: int64

### Split up ticket sales and gross USD across the multiple shows

In [10]:
# Write a function that will find the average tickets sold, 
# gross USD, and gross gate across multiple shows.

def multiple_show_means(df):
    
    df['tickets_sold'] = df['tickets_sold']/df['num_shows']
    df['gross_usd'] = df['gross_usd']/df['num_shows']
        
    return df

In [11]:
ticket_sales_x3 = multiple_show_means(ticket_sales_x3)

In [12]:
ticket_sales_x2 = multiple_show_means(ticket_sales_x2)

### Update main dataframe with new rows and values

In [13]:
# Combine the separate dataframes into a single dataframe, and delete 
# the "end_date" column as it's no longer necessary.

multiple_shows_df = ticket_sales_x3.append(ticket_sales_x2)
ticket_clean_df = ticket_sales_1show.append(multiple_shows_df).sort_values(by='start_date').reset_index(drop=True)
del ticket_clean_df['end_date']

In [22]:
ticket_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 17 columns):
start_date          859 non-null object
year                859 non-null int64
month               859 non-null object
season              859 non-null object
day_of_week         859 non-null object
time_of_week        859 non-null object
num_shows           859 non-null int64
show_type           859 non-null object
headliner           859 non-null object
support             644 non-null object
num_support         859 non-null int64
tickets_sold        859 non-null float64
gross_usd           859 non-null float64
venue_capacity      859 non-null int64
percentage_sold     859 non-null float64
ticket_price_min    859 non-null float64
ticket_price_max    859 non-null float64
dtypes: float64(5), int64(4), object(8)
memory usage: 114.2+ KB


## Update date-related features

- The date features are incorrect for the newly-generated rows

In [26]:
# Add year, month, and day of week columns to dataframe.
date = pd.to_datetime(ticket_clean_df.start_date, format='%Y-%m-%d')
ticket_clean_df['year'] = date.dt.year
ticket_clean_df['month'] = date.dt.month
ticket_clean_df['day_of_week'] = date.dt.weekday

In [28]:
def month_helper(month_num):
    if month_num == 1:
        return 'January'
    elif month_num == 2:
        return 'February'
    elif month_num == 3:
        return 'March'
    elif month_num == 4:
        return 'April'
    elif month_num == 5:
        return 'May'
    elif month_num == 6:
        return 'June'
    elif month_num == 7:
        return 'July'
    elif month_num == 8:
        return 'August'
    elif month_num == 9:
        return 'September'
    elif month_num == 10:
        return 'October'
    elif month_num == 11:
        return 'November'
    else:
        return 'December'

In [33]:
ticket_clean_df['month'] = ticket_clean_df['month'].apply(month_helper)

In [35]:
# Create season column:

def season_helper(month):
    if month in ['December', 'January', 'February']:
        return 'Winter'
    if month in ['March', 'April', 'May']:
        return 'Spring'
    if month in ['June', 'July', 'August']:
        return 'Summer'
    else:
        return 'Fall'

In [36]:
ticket_clean_df['season'] = ticket_clean_df['month'].apply(season_helper)

In [30]:
def dow_helper(dow_num):
    if dow_num == 0:
        return 'Sunday'
    elif dow_num == 1:
        return 'Monday'
    elif dow_num == 2:
        return 'Tuesday'
    elif dow_num == 3:
        return 'Wednesday'
    elif dow_num == 4:
        return 'Thursday'
    elif dow_num == 5:
        return 'Friday'
    else:
        return 'Saturday'

In [37]:
ticket_clean_df['day_of_week'] = ticket_clean_df['day_of_week'].apply(dow_helper)

In [38]:
# Create weekday vs weekend column

def week_helper(day):

     if day in ['Thursday','Friday','Saturday']:
        return 'Weekend'
     else:
        return 'Weekday'

In [39]:
ticket_clean_df['time_of_week'] = ticket_clean_df['day_of_week'].apply(week_helper)

## Manual Cleaning of Band Names (to facilitate future web scraping)

In [64]:
ticket_clean_df.loc[612, 'Headliner'] = 'Peppa Pig'
ticket_clean_df.loc[687, 'Headliner'] = 'Miguel'
ticket_clean_df.loc[796, 'Headliner'] = 'Lauryn Hill'
ticket_clean_df.loc[806, 'Headliner'] = 'Ella Mai'
ticket_clean_df.loc[807, 'Headliner'] = 'Neal Schon'
ticket_clean_df.loc[833, 'Headliner'] = 'Ella Mai'
ticket_clean_df.loc[834, 'Headliner'] = "Chris D'Elia"
ticket_clean_df.loc[840, 'Headliner'] = 'The Specials'
ticket_clean_df.loc[841, 'Headliner'] = 'Little Feat'
ticket_clean_df.loc[844, 'Headliner'] = 'Bela Fleck & The Flecktones'
ticket_clean_df.loc[845, 'Headliner'] = 'Local Natives'
ticket_clean_df.loc[853, 'Headliner'] = 'Paul Simon'
ticket_clean_df.loc[854, 'Headliner'] = 'Kirk Franklin'
ticket_clean_df.loc[855, 'Headliner'] = 'Daniel Caesar'
ticket_clean_df.loc[857, 'Headliner'] = 'King Crimson'
ticket_clean_df.loc[858, 'Headliner'] = 'King Crimson'
ticket_clean_df = ticket_clean_df.drop(ticket_clean_df.index[725])

---

## Cleaning portion is complete for event dataset.
- **See "Artist_Feature_Web_Scraping" notebook for second half of data compilation.**
    - In that notebook, artist features will be scraped and added to the event dataset to create complete final dataset.
    
---