In [67]:
import pandas as pd
import datetime as dt
import pickle

# Load Datasets and Make Adjustments

## Main Dataset (`ticket_df`)

In [459]:
ticket_df = pd.read_csv(r'Data/Tableau_Archive/ticket_final_complete_R2.csv')

In [460]:
ticket_df.rename(columns={'Unnamed: 0':'index'}, inplace=True)
ticket_df.rename(columns={'Percentage Sold':'Percent Tickets Sold'}, inplace=True)

In [461]:
ticket_df.loc[649, 'Artist Genre'] = 'Hip Hop'
ticket_df.loc[759, 'Artist Genre'] = 'Hip Hop'

In [462]:
punk_metal = ['Punk', 'Metal']
other = ['Comedy', 'Reggae', 'Experimental', 'Podcast', 'Children Music', 'World', 'Jazz', 'Religious']


In [463]:
ticket_df['Artist Genre'].replace(punk_metal, 'Punk or Metal', inplace=True)
ticket_df['Artist Genre'].replace(other, 'Other', inplace=True)

In [464]:
ticket_df.Month = [month.strip() for month in ticket_df.Month.values]

In [465]:
# Create quarter column:

def quarter_helper(month):
    if month in ['January', 'February', 'March']:
        return 'Q1'
    elif month in ['April', 'May', 'June']:
        return 'Q2'
    elif month in ['July', 'August', 'September']:
        return 'Q3'
    elif month in ['October', 'November', 'December']:
        return 'Q4'

In [466]:
ticket_df.insert(4, 'Quarter', ticket_df['Month'].apply(quarter_helper))

In [467]:
ticket_df.rename(columns={'Quarter':'Q'}, inplace=True)
ticket_year = ticket_df.Year.astype('str')
ticket_df.Year = ticket_year
ticket_df.insert(5, 'Quarter', ticket_df['Year'] + ' ' + ticket_df['Q'])

In [468]:
ticket_df

Unnamed: 0,index,Event Date,Year,Month,Q,Quarter,Season,Day of Week,Time of Week,Number of Shows,...,Maximum Ticket Price,Ticket Range,Artist Start Date,Years Active,Artist Hometown,US Region,Local or Not,Miles From Home,Artist Genre,Main Genre
0,0,2/6/09,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1,...,32.5,0.0,1983,26,"Fullerton, California, United States",West Coast,US,364.697459,Punk or Metal,punk
1,1,2/7/09,2009,February,Q1,2009 Q1,Winter,Saturday,Weekend,1,...,32.5,0.0,1994,15,"San Francisco, California, United States",West Coast,Local,8.441116,Hip Hop,hip hop
2,2,2/13/09,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1,...,49.5,10.0,1988,21,"Brooklyn, New York, New York, United States",Northeast,US,2567.263768,Soul,soul
3,3,2/20/09,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1,...,40.0,2.5,2006,3,"Mogadishu, Banaadir, Somalia",International,International,9577.129183,Hip Hop,hip hop
4,4,2/21/09,2009,February,Q1,2009 Q1,Winter,Saturday,Weekend,1,...,35.0,0.0,1994,15,"Sacramento, California, United States",West Coast,Local,68.027617,Rock,alternative rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,853,8/14/19,2019,August,Q3,2019 Q3,Summer,Wednesday,Weekday,1,...,85.0,40.0,1993,26,"Fort Worth, Texas, United States",South,US,1447.368982,Other,gospel
854,854,8/23/19,2019,August,Q3,2019 Q3,Summer,Friday,Weekend,1,...,49.5,14.5,2017,2,"Toronto, Ontario, Canada",International,International,2261.182925,Hip Hop,r&b
855,855,8/31/19,2019,August,Q3,2019 Q3,Summer,Saturday,Weekend,1,...,249.5,190.0,1973,46,"Washington, Sunderland, Tyne and Wear, England...",West Coast,US,5152.434700,Rock,glam rock
856,856,9/5/19,2019,September,Q3,2019 Q3,Autumn,Thursday,Weekend,2,...,149.5,84.5,1969,50,"London, England, United Kingdom",International,International,5361.521946,Rock,progressive rock


In [458]:
# ticket_df.to_csv(r'Data/Tableau/ticket_final_complete_R3.csv')

## Location Dataset (`hometown_df`)

In [469]:
with open('Data/hometown_lat_long.pkl','rb') as read_file:
    hometown_df = pickle.load(read_file).reset_index()

In [470]:
del hometown_df['Venue Latitude']
del hometown_df['Venue Longitude']

In [471]:
hometown_df

Unnamed: 0,index,Headliner,Hometown Latitude,Hometown Longitude
0,0,Social Distortion,33.870821,-117.929417
1,1,Michael Franti & Spearhead,37.779026,-122.419906
2,2,Will Downing,40.650104,-73.949582
3,3,K'naan,2.042778,45.338564
4,4,CAKE,38.581572,-121.494400
...,...,...,...,...
853,853,Kirk Franklin,32.753177,-97.332746
854,854,Daniel Caesar,43.653482,-79.383935
855,855,Bryan Ferry,54.898614,-1.528827
856,856,King Crimson,51.507322,-0.127647


## Merge `ticket_df` and `hometown_df`

In [472]:
ticket_df = ticket_df.merge(hometown_df, how='left', on='index')
ticket_df.rename(columns={'Headliner_x':'Headliner'}, inplace=True)
del ticket_df['index']
del ticket_df['Headliner_y']

In [473]:
ticket_df

Unnamed: 0,Event Date,Year,Month,Q,Quarter,Season,Day of Week,Time of Week,Number of Shows,Show Type,...,Artist Start Date,Years Active,Artist Hometown,US Region,Local or Not,Miles From Home,Artist Genre,Main Genre,Hometown Latitude,Hometown Longitude
0,2/6/09,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1,single headliner,...,1983,26,"Fullerton, California, United States",West Coast,US,364.697459,Punk or Metal,punk,33.870821,-117.929417
1,2/7/09,2009,February,Q1,2009 Q1,Winter,Saturday,Weekend,1,single headliner,...,1994,15,"San Francisco, California, United States",West Coast,Local,8.441116,Hip Hop,hip hop,37.779026,-122.419906
2,2/13/09,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1,single headliner,...,1988,21,"Brooklyn, New York, New York, United States",Northeast,US,2567.263768,Soul,soul,40.650104,-73.949582
3,2/20/09,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1,festival,...,2006,3,"Mogadishu, Banaadir, Somalia",International,International,9577.129183,Hip Hop,hip hop,2.042778,45.338564
4,2/21/09,2009,February,Q1,2009 Q1,Winter,Saturday,Weekend,1,single headliner,...,1994,15,"Sacramento, California, United States",West Coast,Local,68.027617,Rock,alternative rock,38.581572,-121.494400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,8/14/19,2019,August,Q3,2019 Q3,Summer,Wednesday,Weekday,1,single headliner,...,1993,26,"Fort Worth, Texas, United States",South,US,1447.368982,Other,gospel,32.753177,-97.332746
854,8/23/19,2019,August,Q3,2019 Q3,Summer,Friday,Weekend,1,single headliner,...,2017,2,"Toronto, Ontario, Canada",International,International,2261.182925,Hip Hop,r&b,43.653482,-79.383935
855,8/31/19,2019,August,Q3,2019 Q3,Summer,Saturday,Weekend,1,single headliner,...,1973,46,"Washington, Sunderland, Tyne and Wear, England...",West Coast,US,5152.434700,Rock,glam rock,54.898614,-1.528827
856,9/5/19,2019,September,Q3,2019 Q3,Autumn,Thursday,Weekend,2,multiple shows,...,1969,50,"London, England, United Kingdom",International,International,5361.521946,Rock,progressive rock,51.507322,-0.127647


# Determine Benchmarks

## Quarterly Revenue Benchmark

In [279]:
benchmark_df = ticket_df[ticket_df.Year.isin([2009, 2010, 2011, 2012, 2013])]

In [273]:
benchmark_year_df = benchmark_df.groupby(['Year','Quarter']).agg({'Gross USD': 'sum'}).reset_index()
benchmark_year_df

Unnamed: 0,Year,Quarter,Gross USD
0,2009,Q1,533619.0
1,2009,Q2,2010819.0
2,2009,Q3,1677490.5
3,2009,Q4,1667777.5
4,2010,Q1,1344893.0
5,2010,Q2,1491048.0
6,2010,Q3,1110599.0
7,2010,Q4,1375903.0
8,2011,Q1,1111780.0
9,2011,Q2,1860808.0


In [274]:
benchmark_year_df.groupby(['Quarter']).agg({'Gross USD': 'mean'})

Unnamed: 0_level_0,Gross USD
Quarter,Unnamed: 1_level_1
Q1,1096951.1
Q2,1917670.9
Q3,1145646.3
Q4,1811395.1


## Quarterly Revenue Benchmark by Day of Week

In [322]:
def early_late_benchmarks(df, parameter, years):
    
    time_df = df[df.Year.isin(years)]
    
    benchmark_df = time_df.groupby(['Quarter',parameter]).agg({'Gross USD': 'mean'}).reset_index()
    
    benchmark_df['key'] = benchmark_df['Quarter'] + '_' + benchmark_df[parameter]
    
    del benchmark_df['Quarter']
    del benchmark_df[parameter]
    
    benchmark_df.rename(columns={'Gross USD':'Benchmark'}, inplace=True)
    
    return benchmark_df
    
    

In [335]:
def full_benchmark(df, parameter):
    
    early_df = df[df.Year.isin([2009, 2010, 2011, 2012, 2013, 2014])]
    late_df = df[df.Year.isin([2015, 2016, 2017, 2018, 2019])]
    
    early_benchmark_df = early_late_benchmarks(early_df, parameter, [2009, 2010, 2011, 2012, 2013])
    late_benchmark_df = early_late_benchmarks(late_df, parameter, [2016, 2017, 2018])
    
    early_df['key'] = early_df['Quarter'] + '_' + early_df[parameter]
    late_df['key'] = late_df['Quarter'] + '_' + late_df[parameter]
    
    early_df = early_df.merge(early_benchmark_df, how='left', on='key')
    late_df = late_df.merge(late_benchmark_df, how='left', on='key')
    
    complete_df = early_df.append(late_df)
    
    return complete_df
    
    

In [417]:
dow_df = ticket_df.groupby(['Year','Quarter','Day of Week']).agg({'Gross USD': 'sum'}).reset_index()

In [418]:
dow_df

Unnamed: 0,Year,Quarter,Day of Week,Gross USD
0,2009,Q1,Friday,247656.0
1,2009,Q1,Saturday,186563.0
2,2009,Q1,Sunday,99400.0
3,2009,Q2,Friday,390351.0
4,2009,Q2,Monday,89144.0
...,...,...,...,...
262,2019,Q3,Friday,760769.0
263,2019,Q3,Saturday,172015.0
264,2019,Q3,Thursday,243362.0
265,2019,Q3,Tuesday,208746.5


In [419]:
dow_benchmark_df = full_benchmark(dow_df, 'Day of Week')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [420]:
dow_benchmark_df

Unnamed: 0,Year,Quarter,Day of Week,Gross USD,key,Benchmark
0,2009,Q1,Friday,247656.0,Q1_Friday,305268.300000
1,2009,Q1,Saturday,186563.0,Q1_Saturday,333070.600000
2,2009,Q1,Sunday,99400.0,Q1_Sunday,57684.333333
3,2009,Q2,Friday,390351.0,Q2_Friday,342480.200000
4,2009,Q2,Monday,89144.0,Q2_Monday,158861.000000
...,...,...,...,...,...,...
117,2019,Q3,Friday,760769.0,Q3_Friday,737153.833333
118,2019,Q3,Saturday,172015.0,Q3_Saturday,430032.500000
119,2019,Q3,Thursday,243362.0,Q3_Thursday,487754.166667
120,2019,Q3,Tuesday,208746.5,Q3_Tuesday,225165.333333


In [421]:
dow_benchmark_df.rename(columns={'Quarter':'Q'}, inplace=True)
dow_year = dow_benchmark_df.Year.astype('str')
dow_benchmark_df.Year = dow_year
dow_benchmark_df.insert(2, 'Quarter', dow_benchmark_df['Year'] + ' ' + dow_benchmark_df['Q'])
dow_benchmark_df['% of Benchmark'] = round(100*dow_benchmark_df['Gross USD']/dow_benchmark_df['Benchmark'],2)


In [422]:
dow_benchmark_df

Unnamed: 0,Year,Q,Quarter,Day of Week,Gross USD,key,Benchmark,% of Benchmark
0,2009,Q1,2009 Q1,Friday,247656.0,Q1_Friday,305268.300000,81.13
1,2009,Q1,2009 Q1,Saturday,186563.0,Q1_Saturday,333070.600000,56.01
2,2009,Q1,2009 Q1,Sunday,99400.0,Q1_Sunday,57684.333333,172.32
3,2009,Q2,2009 Q2,Friday,390351.0,Q2_Friday,342480.200000,113.98
4,2009,Q2,2009 Q2,Monday,89144.0,Q2_Monday,158861.000000,56.11
...,...,...,...,...,...,...,...,...
117,2019,Q3,2019 Q3,Friday,760769.0,Q3_Friday,737153.833333,103.20
118,2019,Q3,2019 Q3,Saturday,172015.0,Q3_Saturday,430032.500000,40.00
119,2019,Q3,2019 Q3,Thursday,243362.0,Q3_Thursday,487754.166667,49.89
120,2019,Q3,2019 Q3,Tuesday,208746.5,Q3_Tuesday,225165.333333,92.71


In [423]:
dow_benchmark_df.to_csv(r'Data/Tableau/Quarterly_Revenue_DOW.csv')

## Quarterly Revenue Benchmark by Genre

In [424]:
genre_df = ticket_df.groupby(['Year','Quarter','Artist Genre']).agg({'Gross USD': 'mean'}).reset_index()



In [425]:
genre_df

Unnamed: 0,Year,Quarter,Artist Genre,Gross USD
0,2009,Q1,Hip Hop,85163.0
1,2009,Q1,Indie,99400.0
2,2009,Q1,Punk or Metal,91000.0
3,2009,Q1,Rock,98000.0
4,2009,Q1,Soul,74893.0
...,...,...,...,...
328,2019,Q3,Hip Hop,93072.5
329,2019,Q3,Other,86480.0
330,2019,Q3,Pop,245813.0
331,2019,Q3,Rock,180110.4


In [426]:
genre_benchmark_df = full_benchmark(genre_df, 'Artist Genre')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [427]:
genre_benchmark_df.rename(columns={'Quarter':'Q'}, inplace=True)

In [428]:
genre_year = genre_benchmark_df.Year.astype('str')

In [429]:
genre_benchmark_df.Year = genre_year

In [430]:
genre_benchmark_df.insert(2, 'Quarter', genre_benchmark_df['Year'] + ' ' + genre_benchmark_df['Q'])



In [431]:
genre_benchmark_df['% of Benchmark'] = round(100*genre_benchmark_df['Gross USD']/genre_benchmark_df['Benchmark'],2)


In [432]:
genre_benchmark_df

Unnamed: 0,Year,Q,Quarter,Artist Genre,Gross USD,key,Benchmark,% of Benchmark
0,2009,Q1,2009 Q1,Hip Hop,85163.0,Q1_Hip Hop,95225.500000,89.43
1,2009,Q1,2009 Q1,Indie,99400.0,Q1_Indie,76606.200000,129.75
2,2009,Q1,2009 Q1,Punk or Metal,91000.0,Q1_Punk or Metal,89718.750000,101.43
3,2009,Q1,2009 Q1,Rock,98000.0,Q1_Rock,99971.225000,98.03
4,2009,Q1,2009 Q1,Soul,74893.0,Q1_Soul,109386.500000,68.47
...,...,...,...,...,...,...,...,...
160,2019,Q3,2019 Q3,Hip Hop,93072.5,Q3_Hip Hop,97217.266667,95.74
161,2019,Q3,2019 Q3,Other,86480.0,Q3_Other,68140.166667,126.91
162,2019,Q3,2019 Q3,Pop,245813.0,Q3_Pop,134358.000000,182.95
163,2019,Q3,2019 Q3,Rock,180110.4,Q3_Rock,124573.777778,144.58


In [433]:
genre_benchmark_df.to_csv(r'Data/Tableau/Quarterly_Revenue_Genre.csv')

---

# Add all dates

In [474]:
ticket_dates = pd.to_datetime(ticket_df['Event Date'])

In [475]:
ticket_df['Event Date'] = ticket_dates

In [476]:
date1 = '2/6/09'
date2 = '9/6/19'
date_list = pd.Series(pd.date_range(date1, date2).tolist())

In [477]:
all_dates_df = pd.DataFrame(date_list)
all_dates_df.columns = ['Event Date']

In [478]:
all_dates_df = pd.to_datetime(all_dates_df['Event Date'], format = 'mm%/dd%/%yy')

In [479]:
all_dates_df = pd.DataFrame(all_dates_df)
all_dates_df.columns = ['Event Date']

In [480]:
all_dates_df = all_dates_df.merge(ticket_df, how='left', on='Event Date')

In [481]:
all_dates_df['Gross USD'].fillna(0, inplace=True)

In [482]:
all_dates_df['Gross USD'].value_counts()

0.0         3007
98000.0       32
82600.0       24
91000.0       13
70000.0       13
            ... 
173699.0       1
106700.0       1
61919.0        1
62371.0        1
64890.0        1
Name: Gross USD, Length: 619, dtype: int64

In [483]:
all_dates_df

Unnamed: 0,Event Date,Year,Month,Q,Quarter,Season,Day of Week,Time of Week,Number of Shows,Show Type,...,Artist Start Date,Years Active,Artist Hometown,US Region,Local or Not,Miles From Home,Artist Genre,Main Genre,Hometown Latitude,Hometown Longitude
0,2009-02-06,2009,February,Q1,2009 Q1,Winter,Friday,Weekend,1.0,single headliner,...,1983.0,26.0,"Fullerton, California, United States",West Coast,US,364.697459,Punk or Metal,punk,33.870821,-117.929417
1,2009-02-07,2009,February,Q1,2009 Q1,Winter,Saturday,Weekend,1.0,single headliner,...,1994.0,15.0,"San Francisco, California, United States",West Coast,Local,8.441116,Hip Hop,hip hop,37.779026,-122.419906
2,2009-02-08,,,,,,,,,,...,,,,,,,,,,
3,2009-02-09,,,,,,,,,,...,,,,,,,,,,
4,2009-02-10,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3860,2019-09-02,,,,,,,,,,...,,,,,,,,,,
3861,2019-09-03,,,,,,,,,,...,,,,,,,,,,
3862,2019-09-04,,,,,,,,,,...,,,,,,,,,,
3863,2019-09-05,2019,September,Q3,2019 Q3,Autumn,Thursday,Weekend,2.0,multiple shows,...,1969.0,50.0,"London, England, United Kingdom",International,International,5361.521946,Rock,progressive rock,51.507322,-0.127647


In [484]:
all_dates_df.to_csv(r'Data/ticket_final_complete_nulls.csv')
