# Generate StubHub ticket data for upcoming events along with ticket face value data

Get StubHub event data for events near given cities. Supplement with ticket face value data (scraped from SongKick) <br>Assume almost all ticket prices (from both StubHub and SongKick) include relevant fees. Prices should be interpreted as what the concertgoer actually pays. <br>

In [1]:
import datetime
import json
import numpy as np
import os
import pandas as pd
import re
import requests
import time
from bs4 import BeautifulSoup

## Define Functions

In [2]:
# GetEventsNearCity - returns info about events near a given city 
# Gives us everything except face value and ticket vendor info.
# Reference: https://developer.stubhub.com/store/site/pages/doc-viewer.jag?category=Search&api=EventSearchAPIv2&endpoint=searchforeventsv2&version=v2
def GetStubHubEventsNearCity(city, limit):
    url = 'https://api.stubhub.com/search/catalog/events/v2'
    query = {'city': city, 'minAvailableTickets':1, 'categoryName':'Concert tickets', 'limit': limit}
    headers = {'Authorization': 'Bearer y57VBex3X28F4XcMW2uq9sEfhisa'}
    r = requests.get(url, params=query, headers=headers, verify=True)
    df = CreateStubHubDataFrame(r)
    return df

def CreateStubHubDataFrame(stubhub_response):
    json_response = json.loads(stubhub_response.text)
    event_info = []
    for event in json_response['events']:
        event_info.append(
            {
                'artist':event['attributes'][0].get('value').encode('utf-8', errors='replace'),
                'date':str(event['dateLocal']),
                'event_id':str(event['id']),
                'min_price':str(event['ticketInfo'].get('minPrice')),
                'max_price':str(event['ticketInfo'].get('maxPrice')),
                'total_postings':str(event['ticketInfo'].get('totalPostings')),
                'total_tickets':str(event['ticketInfo'].get('totalTickets')),
                'venue':str(event['venue'].get('name')),
                'city':str(event['venue'].get('city')),
                'state':str(event['venue'].get('state')),
            }
        )
    sort_map = ['event_id', 'date', 'artist', 'venue', 'min_price', 'max_price', 'total_postings', 'total_tickets', 'city', 'state']
    TicketData = pd.DataFrame(event_info)
    TicketData = TicketData.ix[:, sort_map] # sort columns based on sort map
    TicketData = TicketData.sort_values(by='event_id', ascending=True)
    return TicketData

def DeleteNonConcerts(df):
    df = df[df['artist'].str.contains("PARKING PASSES") == False] # Get rid of parking pass tix
    df = df[df['venue'].str.contains("111 Minna Gallery") == False] # Hard-coded for SF (art gallery)
    df = df[df['artist'].str.contains("STRAWBERRY FIELDS") == False] # Get rid of STRAWBERRY FIELDS at BB King's Blues Club NYC
    df = df.reset_index() # reset numeric index
    del df['index']
    return df

def GenerateArtistVenueListFromDataframe(df):
    original_artist_venue_list = df[['event_id', 'artist', 'venue']].values.tolist()
    filtered_artist_venue_list = []
    # Send artist venue list items thru filter to correct bad values before SongKick query:
    for item in original_artist_venue_list:
        item = FilterArtistVenueList(item)
        filtered_artist_venue_list.append(item)
    #artist_venue_list = artist_venue_list[:210] # Limit list just for testing purposes.
    return filtered_artist_venue_list
    
# FilterArtistVenueList - for a list of form [event_id, artist, venue], run filters on values to prepare for SongKick
def FilterArtistVenueList(artist_venue_list):
    artist_venue_list[2] = artist_venue_list[2].replace('Nob Hill Masonic Auditorium', 'Masonic')
    artist_venue_list[2] = artist_venue_list[2].replace("Slims", "Slim's")
    artist_venue_list[2] = artist_venue_list[2].replace("Ruby Skye Nightclub", 'Ruby Skye')
    artist_venue_list[2] = artist_venue_list[2].replace("Bimbos 365 Club", "Bimbo's 365 Club")
    artist_venue_list[2] = artist_venue_list[2].replace("Stubbs BarBQ", "Stubb's BBQ")
    artist_venue_list[2] = artist_venue_list[2].replace("Emos", "Emo's")
    artist_venue_list[2] = artist_venue_list[2].replace("Frank Erwin Events Center", "Frank Erwin Center")
    artist_venue_list[2] = artist_venue_list[2].replace("ACL Live at The Moody Theater", "Austin City Limits Live at The Moody Theater")
    artist_venue_list[1] = artist_venue_list[1].replace("HARLEM GOSPEL CHOIR", "Jessica Carvo")
    return artist_venue_list

# SongKickArtistVenueSearch - for a given query (artist + venue), give us the Songkick search results page
def SongKickArtistVenueSearch(query):
    url = 'http://www.songkick.com/search'
    parameters = {'page':'1', 'per_page':'10', 'query':query, 'type':'upcoming'}
    r = requests.get(url, params=parameters)
    return r

# GetSongKickEventPage - for a given SongKick event URL tail, return the SongKick event HTML page
def GetSongKickEventPage(url_tail):
    base_url = 'http://www.songkick.com'
    full_url = base_url + url_tail
    r = requests.get(full_url)
    return r

# ConvertDateStringToDateTime - for a given date string (formatted by StubHub API), return a date object for date
def ConvertDateStringToDateTime(date_string):
    date_string = date_string.replace('T', '')[:-13].upper() # Get rid of time from date string.
    date_string = date_string.replace('-', ' ') # Get rid of dashes.
    date_time = datetime.datetime.strptime(date_string, "%Y %m %d")
    date = date_time.date()
    return date

# GetNumberOfDaysUntilDate - for a given date string (formatted by StubHub API), get the number of days from now until then
def GetNumberOfDaysUntilDate(date_string):
    date = ConvertDateStringToDateTime(date_string)
    now = datetime.datetime.now().date()
    delta = date - now
    return delta.days

# GetEchonestRateLimit - returns our current Echonest API rate limit (i.e., number of allowed calls per minute)
# http://developer.echonest.com/docs/v4#rate-limits
def GetEchonestRateLimit():
    url = 'http://developer.echonest.com/api/v4/artist/profile?api_key=SYMKX4PCI2YPULGTV&name=radiohead'
    r = requests.get(url, verify=True)
    rate_limit = int(r.headers['X-RateLimit-Limit'])
    return rate_limit

# GetEchonestInfo function - for a given SongKick Artist ID string, return a list with the relevant Echonest Info for the artist
# Return list is of the form: [SK_id, # blogs, # news, # reviews, discovery, famliliarity, hotttnesss, # years active]
def GetEchonestInfo(sk_id):
    url = ("http://developer.echonest.com/api/v4/artist/profile?api_key=SYMKX4PCI2YPULGTV&id=songkick:artist:"+sk_id+
            "&bucket=familiarity&bucket=hotttnesss&bucket=discovery&bucket=years_active&bucket=doc_counts")
    r = requests.get(url) #get data
    json_response = json.loads(r.text) #put data into json object
    if json_response['response']['status']['code'] == 0:
        # if we received valid results from echonest for the given ID:
        i = json_response['response']['artist']
    
        # convert the year in years_active to number of years active
        if i['years_active'] and (type(i['years_active'][0]['start']) is int): # if the years_active list is not empty    
            start_year = datetime.date(i['years_active'][0]['start'],1,1).year
            this_year = datetime.date(datetime.date.today().year,1,1).year
            num_years_active = this_year - start_year
        else:
            num_years_active = None

        info_list = [sk_id, i['doc_counts']['blogs'], i['doc_counts']['news'], i['doc_counts']['reviews'], 
                     i['discovery'], i['familiarity'], i['hotttnesss'], num_years_active]
    else:
        # if echonest returned a non-success response code (see here: http://developer.echonest.com/docs/v4#response-codes):
        error = "error_"+str(json_response['response']['status']['code'])
        info_list = [sk_id, error, error, error, error, error, error, error]
    return info_list

## Create StubHub Dataframe for a given metro area and get rid of non-concert events

In [3]:
city = 'New York' # Set Desired Metro Area Here
file_suffix = "NYC" # Hard code for output file suffixes

# Get Stubhub events for a given city and put resulting events data into TicketData dataframe
TicketData = GetStubHubEventsNearCity(city, 500)

print "Data count: "
print TicketData['event_id'].count()
TicketData.head()

Data count: 
422


Unnamed: 0,event_id,date,artist,venue,min_price,max_price,total_postings,total_tickets,city,state
21,9327664,2016-04-26T20:00:00-0400,Ana Moura,Carnegie Hall - Stern Auditorium,235.35,288.0,3,8.0,New York,NY
205,9343796,2016-06-03T12:58:00-0400,Governors Ball,Randalls Island,359.25,4999.01,124,321.0,New York,NY
412,9354230,2016-04-11T20:00:00-0400,David Gilmour,Madison Square Garden,137.02,4400.0,615,1523.0,New York,NY
3,9354233,2016-04-11T20:01:00-0400,PARKING PASSES ONLY David Gilmour,Madison Square Garden Parking Lots,26.1,40.0,5,38.0,New York,NY
411,9357410,2016-04-12T20:00:00-0400,David Gilmour,Madison Square Garden,125.22,6900.0,993,2429.0,New York,NY


In [4]:
# Get rid of rows that aren't actually concerts
TicketData = DeleteNonConcerts(TicketData)

print "New data count: "
print TicketData['event_id'].count()
TicketData.head()

New data count: 
383


Unnamed: 0,event_id,date,artist,venue,min_price,max_price,total_postings,total_tickets,city,state
0,9327664,2016-04-26T20:00:00-0400,Ana Moura,Carnegie Hall - Stern Auditorium,235.35,288.0,3,8.0,New York,NY
1,9343796,2016-06-03T12:58:00-0400,Governors Ball,Randalls Island,359.25,4999.01,124,321.0,New York,NY
2,9354230,2016-04-11T20:00:00-0400,David Gilmour,Madison Square Garden,137.02,4400.0,615,1523.0,New York,NY
3,9357410,2016-04-12T20:00:00-0400,David Gilmour,Madison Square Garden,125.22,6900.0,993,2429.0,New York,NY
4,9364603,2016-03-14T20:00:00-0400,Red Hot Chilli Pipers,B.B. King Blues Club & Grill,35.54,106.52,13,29.0,New York,NY


## Create List of Artist-Venue Pairs

In [5]:
# Generate list of artist-venue pairs from StubHub data to search on SongKick to get ticket info for each event.
artist_venue_list = GenerateArtistVenueListFromDataframe(TicketData)
print artist_venue_list

[['9327664', 'Ana Moura', 'Carnegie Hall - Stern Auditorium'], ['9343796', 'Governors Ball', 'Randalls Island'], ['9354230', 'David Gilmour', 'Madison Square Garden'], ['9357410', 'David Gilmour', 'Madison Square Garden'], ['9364603', 'Red Hot Chilli Pipers', 'B.B. King Blues Club & Grill'], ['9372090', 'Disturbed', 'Irving Plaza'], ['9386342', 'Underoath', 'Playstation Theater'], ['9388305', 'Afro Latin Jazz Orchestra', 'Apollo Theater'], ['9392063', 'Vance Joy', 'Beacon Theatre'], ['9395215', 'Skizzy Mars', 'SOBs'], ['9397104', 'Vance Joy', 'Beacon Theatre'], ['9408082', 'Galactic', 'Terminal 5'], ['9408640', 'Iron Maiden', 'Madison Square Garden'], ['9409378', 'The Cure', 'Madison Square Garden'], ['9413097', 'The Cure', 'Madison Square Garden'], ['9415426', 'Drew Holcomb and the Neighbors', 'Bowery Ballroom'], ['9416981', 'Luis Miguel', 'Madison Square Garden'], ['9420519', 'Jimmie Vaughan', 'B.B. King Blues Club & Grill'], ['9420557', 'B.B. King Blues Club & Grill', 'B.B. King Blu

## Get SongKick Event URLs

In [6]:
# Get Songkick event URLs for each event from the StubHub data (TIME CONSUMING FOR LARGE DATASETS)
sk_eventURL_list = []
for item in artist_venue_list:
    query = str(item[1]) + " " + str(item[2]) # "Artist Venue"
    response = SongKickArtistVenueSearch(query)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("p", class_="summary")
    if results:
        m = re.search("/concerts/[0-9]{8}[a-zA-Z-]*", str(results[0])) # Only look at first result.
        if m:
            to_append = [item[0], m.group(0)]
        else:
            to_append = [item[0], 'NoRegexp'] # Is this result even possible? Prob not?
    else:
        to_append = [item[0], 'NoResults']
    sk_eventURL_list.append(to_append)
    time.sleep(0.5) # Wait half a second (so that we don't send too many requests to SongKick to fast)

print sk_eventURL_list

[['9327664', '/concerts/23958309-ana-moura-at-isaac-stern-auditorium-carnegie-hall'], ['9343796', 'NoResults'], ['9354230', '/concerts/24385739-david-gilmour-at-madison-square-garden'], ['9357410', '/concerts/24385739-david-gilmour-at-madison-square-garden'], ['9364603', '/concerts/23704558-red-hot-chilli-pipers-at-bb-king-blues-club-and-grill'], ['9372090', '/concerts/24637034-disturbed-at-irving-plaza'], ['9386342', '/concerts/24721504-underoath-at-playstation-theater'], ['9388305', '/concerts/23909728-arturo-ofarrill-at-apollo-theater'], ['9392063', '/concerts/24877609-vance-joy-at-beacon-theatre'], ['9395215', '/concerts/25125294-skizzy-mars-at-sobs'], ['9397104', '/concerts/24877609-vance-joy-at-beacon-theatre'], ['9408082', '/concerts/25045704-galactic-at-terminal-'], ['9408640', '/concerts/25068179-iron-maiden-at-madison-square-garden'], ['9409378', '/concerts/25101574-cure-at-madison-square-garden'], ['9413097', '/concerts/25101574-cure-at-madison-square-garden'], ['9415426', '

## Check SongKick URL Results and write list to a txt file as a backup

In [7]:
# Check to see which events did not get SongKick results
sk_eventURL_list_df = pd.DataFrame(sk_eventURL_list, columns=('event_id', 'sk_result'))
no_sk_results_df = sk_eventURL_list_df[sk_eventURL_list_df['sk_result'] == "NoResults"]
no_sk_results_df = sk_eventURL_list_df.join(TicketData, how='inner', rsuffix='_noSK')

num_events = len(sk_eventURL_list)
missing_events = no_sk_results_df[no_sk_results_df['sk_result'] == "NoResults"]['venue'].count()

print("Total # of events: \n%d" % num_events)
print("Total number of missing events: \n%d" % missing_events) 
print "\nVenues missing data:"
print no_sk_results_df[no_sk_results_df['sk_result'] == "NoResults"]['venue'].value_counts() #check artists

Total # of events: 
383
Total number of missing events: 
67

Venues missing data:
B.B. King Blues Club & Grill            13
Irving Plaza                             8
Rose Theater at Lincoln Center           7
Randalls Island                          7
Playstation Theater                      3
Beacon Theatre                           3
The Appel Room at The Lincoln Center     2
Madison Square Garden                    2
Merkin Concert Hall                      2
Webster Hall                             2
The Studio at Webster Hall               2
Gramercy Theatre                         2
Stage 48                                 1
City Winery New York                     1
Town Hall New York                       1
Hill Country                             1
Alice Tully Hall                         1
Carnegie Hall - Stern Auditorium         1
Marlin Room at Webster Hall              1
Apollo Theater                           1
Metropolitan Museum of Art               1
ROCKS OFF Conce

In [8]:
# Write sk_eventURL_list to a text file in case we need to come back to it.
sk_url_list_file = open('Mid_Pipeline_Data/SongKickURLList_'+file_suffix+'.txt', 'w')
for item in sk_eventURL_list:
    print>>sk_url_list_file, item
print "\nBackup file written."


Backup file written.


## Get Ticket Information from SongKick URLs

In [9]:
# Generate a new DataFrame with Ticket face value info for each event. (TIME CONSUMING FOR LARGE DATASETS)
event_face_value_list = []
for event in sk_eventURL_list:
    if event[1] == "NoResults":
        # If we didn't get a URL tail, append "NoResults" and move to the next item in the for loop.
        event_face_value_list.append([event[0], event[1], event[1], event[1], event[1]])
        continue
    event_info = [event[0]] # Add Stubhub event ID to event_info list
    response = GetSongKickEventPage(event[1])
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get ticket vendor data.
    vendors = soup.find_all("span", class_="vendor")
    if vendors:
        vendor_string = str(vendors[0])
        vendor_string = vendor_string.replace('<span class="vendor">', "")
        vendor_string = vendor_string.replace('\n      </span>', '')
    else:
        vendor_string = "NoVendorClass"
    event_info.append(vendor_string)

    #Get face value for tickets according to SongKick
    price = soup.find_all("span", class_="price")
    if price:
        m = re.search("\$[0-9.]*", str(price[0])) # NOTE - this will get the minimum ticket price if there is a range.
        if m:
            price = m.group(0)
        else:
            price = "NoPriceNumber"
    else:
        price = "NoPriceClass"
    event_info.append(price)
    
    #Get ticket availability status (0 for still available, 1 for Sold Out)
    status = soup.find_all("div", class_="ticket-cell buy-button-container")
    if status:
        m = re.search("sold-out", str(status[0]))
        if m:
            status = 1 # Sold out.
        else:
            status = 0 # Not sold out.
    else:
        status = "NoSoldOutClass"
    event_info.append(status)
    
    #Get SongKick artist ID for artist
    artist_id = soup.find_all("h1", class_="summary")
    if artist_id:
        m = re.search("/artists/[0-9]{3,9}", str(artist_id[0])) # assume that a SK artist ID is between 3 and 8 #'s long
        if m:
            artist_id = m.group(0)
            artist_id = artist_id.replace('/artists/', '')
        else:
            artist_id = "NoArtistID"
    else:
        artist_id = "NoArtistIDClass"
    event_info.append(artist_id)
    
    
    event_face_value_list.append(event_info)
    time.sleep(0.5) # Wait half a second (so that we don't send too many requests to SongKick to fast)

FaceValue_df = pd.DataFrame(event_face_value_list, columns=['event_id', 'ticket_vendor', 'face_value', 'sold_out', 'sk_artist_id'])
FaceValue_df.head()

Unnamed: 0,event_id,ticket_vendor,face_value,sold_out,sk_artist_id
0,9327664,NoVendorClass,NoPriceClass,NoSoldOutClass,75888
1,9343796,NoResults,NoResults,NoResults,NoResults
2,9354230,Ticketmaster,$75.00,0,20890
3,9357410,Ticketmaster,$75.00,0,20890
4,9364603,Ticketmaster,$30.00,0,191126


## Join FaceValue Dataframe with original Dataframe

In [10]:
# Join FaceValue dataframe with original TicketData data frame
backup = TicketData.copy() # Make a backup of original dataframe before join just in case something goes wrong.
TicketData = TicketData.join(FaceValue_df, how='left', rsuffix='_fv')
del TicketData['event_id_fv']
TicketData.head()

Unnamed: 0,event_id,date,artist,venue,min_price,max_price,total_postings,total_tickets,city,state,ticket_vendor,face_value,sold_out,sk_artist_id
0,9327664,2016-04-26T20:00:00-0400,Ana Moura,Carnegie Hall - Stern Auditorium,235.35,288.0,3,8.0,New York,NY,NoVendorClass,NoPriceClass,NoSoldOutClass,75888
1,9343796,2016-06-03T12:58:00-0400,Governors Ball,Randalls Island,359.25,4999.01,124,321.0,New York,NY,NoResults,NoResults,NoResults,NoResults
2,9354230,2016-04-11T20:00:00-0400,David Gilmour,Madison Square Garden,137.02,4400.0,615,1523.0,New York,NY,Ticketmaster,$75.00,0,20890
3,9357410,2016-04-12T20:00:00-0400,David Gilmour,Madison Square Garden,125.22,6900.0,993,2429.0,New York,NY,Ticketmaster,$75.00,0,20890
4,9364603,2016-03-14T20:00:00-0400,Red Hot Chilli Pipers,B.B. King Blues Club & Grill,35.54,106.52,13,29.0,New York,NY,Ticketmaster,$30.00,0,191126


## Calculate number of days from present until each show

In [11]:
# Calculate number of days from now until the show for each row.
TicketData['days_to_show'] = TicketData['date'].apply(GetNumberOfDaysUntilDate)
TicketData.head()

Unnamed: 0,event_id,date,artist,venue,min_price,max_price,total_postings,total_tickets,city,state,ticket_vendor,face_value,sold_out,sk_artist_id,days_to_show
0,9327664,2016-04-26T20:00:00-0400,Ana Moura,Carnegie Hall - Stern Auditorium,235.35,288.0,3,8.0,New York,NY,NoVendorClass,NoPriceClass,NoSoldOutClass,75888,44
1,9343796,2016-06-03T12:58:00-0400,Governors Ball,Randalls Island,359.25,4999.01,124,321.0,New York,NY,NoResults,NoResults,NoResults,NoResults,82
2,9354230,2016-04-11T20:00:00-0400,David Gilmour,Madison Square Garden,137.02,4400.0,615,1523.0,New York,NY,Ticketmaster,$75.00,0,20890,29
3,9357410,2016-04-12T20:00:00-0400,David Gilmour,Madison Square Garden,125.22,6900.0,993,2429.0,New York,NY,Ticketmaster,$75.00,0,20890,30
4,9364603,2016-03-14T20:00:00-0400,Red Hot Chilli Pipers,B.B. King Blues Club & Grill,35.54,106.52,13,29.0,New York,NY,Ticketmaster,$30.00,0,191126,1


## Check Results

In [12]:
no_sk_results = TicketData[TicketData['face_value'] == 'NoResults']['venue'].count()
no_fv_sk = TicketData[TicketData['face_value'] == 'NoPriceClass']['venue'].count()
no_fv = no_sk_results + no_fv_sk
total_events = TicketData['event_id'].count()
pct_fv = (1 - float(no_fv)/float(total_events))*100

print("Number of events with no SK search results: \n%d" % no_sk_results)
print("Number of events found in SK but without FV prices in SK: \n%d" % no_fv_sk)
print("Total events with no face value price found: \n%d" % no_fv)
print("Total number of events: \n%d" % total_events)
print("Percent of events with FV's: \n%d" % pct_fv)
print "\nVenue value counts with no search results in SK: "
print TicketData[TicketData['face_value'] == 'NoResults']['venue'].value_counts()

Number of events with no SK search results: 
67
Number of events found in SK but without FV prices in SK: 
95
Total events with no face value price found: 
162
Total number of events: 
383
Percent of events with FV's: 
57

Venue value counts with no search results in SK: 
B.B. King Blues Club & Grill            13
Irving Plaza                             8
Rose Theater at Lincoln Center           7
Randalls Island                          7
Playstation Theater                      3
Beacon Theatre                           3
The Appel Room at The Lincoln Center     2
Madison Square Garden                    2
Merkin Concert Hall                      2
Webster Hall                             2
The Studio at Webster Hall               2
Gramercy Theatre                         2
Stage 48                                 1
City Winery New York                     1
Town Hall New York                       1
Hill Country                             1
Alice Tully Hall                       

# Write Pre-EchoNest Dataframe to CSV

In [13]:
# Write resulting dataframe to CSV.
TicketData.to_csv(path_or_buf="Mid_Pipeline_Data/PreEchonest_TicketData_"+file_suffix+".csv", index=False)

## Get Echonest Info for Artists

### TODO 3/13/16: Probably should wait until we have all metro area data stored and then just run this once over everything

In [14]:
# Get current Echonest rate limit and set sleep time so that we can wait an appropriate time between EN API calls
rate_limit = GetEchonestRateLimit()
sleep_time = (60/rate_limit) - 0.4 # subtract 0.4 seconds since that's (roughly) how long each call takes
print("Echonest rate limit = %s and sleep_time = %s \n" % (rate_limit, sleep_time))

Echonest rate limit = 20 and sleep_time = 2.6 



In [15]:
# Only include SK IDs for events where we actually found a real face value for the tickets.
list_of_sk_artist_ids = TicketData[(TicketData['face_value'] != 'NoPriceClass') 
           & (TicketData['face_value'] != 'NoResults')]['sk_artist_id']
# Get rid of duplicate SK IDs to save Echonest API calls
list_of_sk_artist_ids = set(list_of_sk_artist_ids)
print("Number of unique artist IDs in list_of_sk_artist_ids: %d" % len(list_of_sk_artist_ids))

Number of unique artist IDs in list_of_sk_artist_ids: 152


In [16]:
# Get Echonest data for each SK artist ID (TIME CONSUMING FOR LARGE LISTS OF SK ARTIST IDS):
echonest_artist_info_list = []
for artist_id in list_of_sk_artist_ids:
    if artist_id == None:
        continue # move on if we couldn't get a songkick id for an artist
    echonest_info_list = GetEchonestInfo(artist_id)
    echonest_artist_info_list.append(echonest_info_list)
    time.sleep(sleep_time) # Wait so that we don't go over the Echonest rate limit

print echonest_artist_info_list

[['362622', 3215, 551, 38, 0.475769856233892, 0.751473, 0.890075, 36], ['1183857', 1959, 48, 0, 0.604552420651678, 0.559962, 0.793894, 2], ['20179', 551, 39, 20, 0.35046372062147807, 0.616095, 0.503572, 29], ['636848', 2519, 239, 12, 0.4962594425955814, 0.613667, 0.710173, 5], ['4154881', 1441, 472, 2, 0.3466214328199724, 0.503933, 0.418708, 40], ['42687', 3581, 672, 100, 0.4252166980474489, 0.638884, 0.635266, 17], ['408487', 2395, 519, 95, 0.36669282926193814, 0.656079, 0.564829, 46], ['667410', 97, 3, 0, 0.4952242919933629, 0.37941, 0.517351, None], ['6664009', 537, 86, 0, 0.6313915450781337, 0.546165, 0.811859, 7], ['529844', 2581, 943, 112, 0.3836531448331694, 0.672066, 0.608591, 15], ['166269', 119, 119, 22, 0.4638165934587235, 0.386502, 0.48454, 37], ['4172111', 2505, 1226, 23, 0.4645984568889668, 0.567421, 0.61719, 5], ['296988', 3748, 1184, 93, 0.32453332387424677, 0.78699, 0.663087, 51], ['5183683', 678, 208, 15, 0.4524539497371307, 0.523811, 0.562663, 5], ['170739', 1809, 41

In [None]:
# Put echonest_artist_info_list into a dataframe to be joined with TicketData
en_artist_info_list_columns = ['sk_artist_id', 'num_blogs', 'num_news', 'num_reviews', 'discovery', 'familiarity',
                              'hotttnesss', 'num_years_active']
Echonest_artist_info_df = pd.DataFrame(echonest_artist_info_list, columns=en_artist_info_list_columns)
Echonest_artist_info_df.head(15)

## Merge EchoNest data with main TicketData dataframe

In [None]:
Joined_DF = pd.merge(TicketData, Echonest_artist_info_df, how='left', left_on='sk_artist_id', right_on='sk_artist_id')
Joined_DF.head(10)

In [None]:
# Inspect particular rows in joined DF:
Joined_DF[Joined_DF['sk_artist_id'] == '8709723'].head(10)

In [None]:
# Count how many events in the Dataframe have actual face value data
fv_count = Joined_DF[(Joined_DF['face_value'] != 'NoPriceClass') 
           & (Joined_DF['face_value'] != 'NoResults')]['face_value'].count()
print("Number of events with real face vales: %s" % fv_count)
print("Total number of events: %s" % Joined_DF['event_id'].count())

In [None]:
# Optional Sanity Check: inspect SongKick and Echonest data together 
Joined_DF[['artist', 'venue', 'city', 'ticket_vendor', 'face_value', 'sold_out', 'sk_artist_id',
          'num_blogs', 'num_news', 'discovery', 'hotttnesss', 'num_years_active']].head(20)

## Write Joined Dataframe to CSV

In [None]:
# Write resulting dataframe to CSV.
Joined_DF.to_csv(path_or_buf="TicketData_"+file_suffix+".csv", index=False)