# Mark's EDA

This first section queries the phish.net API to get a long csv of phish setlist data. 

In [1]:
### This script access data from the Phish.net API
### https://docs.phish.net
### Warning! limit use of this api to download and cache data locally
### too many or too large API calls and the app will be shutdown by API admin

import json
import pandas as pd
import requests
from tqdm import tqdm

# songNetwork API Key - get one for free on: https://phish.net/api
apiKey = '1512F21F881B46EA6528'

print("Getting song data...")
songLink = 'https://api.phish.net/v5/songs.json?apikey='+apiKey
songFile = requests.get(songLink)
songData = json.loads(songFile.text)['data']
songDF = pd.DataFrame({
    'songid': [ int(s['songid']) for s in songData ],
    'artist': [ s['artist'] for s in songData ],
    'times_played': [ int(s['times_played']) for s in songData ],
    'last_played': [ s['last_played'] for s in songData ],
    'debut': [ s['debut'] for s in songData ]
})

print("Getting show data...")
showLink = 'https://api.phish.net/v5/shows.json?apikey='+apiKey
showFile = requests.get(showLink)
showDict = json.loads(showFile.text)['data']
allPhishShows = [ int(sh['showid']) for sh in showDict if sh['artistid']=='1' ]

print("Getting setlist data...")
setLink = 'https://api.phish.net/v5/setlists.json?apikey='+apiKey
setFile = requests.get(setLink)
setDict = json.loads(setFile.text)['data']

# subset of desired keys from the setlist data, and datatypes
setKeys = {
    'showdate':str,   # date of the concert
    'set':str,        # set of the show (1,2,3 or encore)
    'position':int,   # relative position in the show
    'songid':int,     # song id number
    'slug':str,       # song name
    'trans_mark':str, # song transition marker
    'gap':int,        # number of shows since the song last played
    'isjam':str,      # categorical - "jam" song
    'city':str,       # venue city
    'state':str,      # venue state
    'country':str,    # venue country
    'venueid':int,    # venue id number
    'tourid':int,     # which tour the show was part of
    'showlength':int  # number of songs in the show max(position)
}

print('Parsing setlist data...')

# this parses the setlists into a dataframe indexed by song
# setlist with missing keys/values are excluded
allPhishSets = { k:[] for k in setKeys.keys() }
for showid in tqdm(allPhishShows):
    fullSet = {}
    setlist = [
        d for d in setDict if 'showid' in d and int(d['showid'])==showid
    ]
    for k,v in setKeys.items():
        if k=='showlength':
            fullSet[k] = [len(setlist)]*len(setlist)
        else:
            fullSet[k] = [ v(d.get(k)) for d in setlist ]
    if any(None in v for v in fullSet.values()):
        continue  # skips sets with incomplete information
    else:
        allPhishSets = {
            k: allPhishSets.get(k, []) + fullSet.get(k, []) for k in setKeys
        }

allPhishDF = pd.DataFrame(data=allPhishSets)

# only include "full" shows with 2 sets and an encore
completeSets = allPhishDF.groupby(by=['showdate', 'set'])\
                         .size()\
                         .reset_index(name='Count')\
                         .pivot(index='showdate',columns='set',values='Count')\
                         .dropna(subset=['1', '2', 'e'])

allPhishDF = allPhishDF[allPhishDF['showdate'].isin(completeSets.index)]
allPhishDF = allPhishDF.merge(songDF,on='songid',how='left')

allPhishDF.to_csv('../data/allphishsets.csv', index=False)
# with open('../data/allphishsets.json', 'w') as file:
#     file.write(json.dumps(allPhishDF.to_dict(orient='list')))

print("Complete!")

Getting song data...
Getting show data...
Getting setlist data...
Parsing setlist data...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2124/2124 [01:48<00:00, 19.54it/s]


Complete!


This section explores the downloaded data and transforms the infrequently played songs into the "wildcard" song.

In [2]:
import pandas as pd

df = pd.read_csv('../data/allphishsets.csv')

df = df.sort_values(by=['showdate','set','position'], ascending=[True, True, True])

In [3]:
# example of a show IE sentence
df[df['showdate']=='2000-09-17']

Unnamed: 0,showdate,set,position,songid,slug,trans_mark,gap,isjam,city,state,country,venueid,tourid,showlength,artist,times_played,last_played,debut
0,2000-09-17,1,1,242,guyute,",",5,0,Columbia,MD,USA,9,50,16,Phish,133.0,2023-08-02,1994-10-07
1,2000-09-17,1,2,45,back-on-the-train,",",7,0,Columbia,MD,USA,9,50,16,Trey Anastasio,149.0,2023-10-06,1999-06-30
2,2000-09-17,1,3,48,bathtub-gin,",",6,0,Columbia,MD,USA,9,50,16,Phish,298.0,2023-10-10,1989-05-26
3,2000-09-17,1,4,341,limb-by-limb,",",6,0,Columbia,MD,USA,9,50,16,Phish,154.0,2023-07-26,1997-06-13
4,2000-09-17,1,5,591,the-moma-dance,",",4,0,Columbia,MD,USA,9,50,16,Phish,193.0,2023-10-06,1998-06-30
5,2000-09-17,1,6,329,lawn-boy,",",16,0,Columbia,MD,USA,9,50,16,Phish,219.0,2023-07-19,1989-11-30
6,2000-09-17,1,7,208,fluffhead,",",17,0,Columbia,MD,USA,9,50,16,Phish,277.0,2023-10-14,1984-12-01
7,2000-09-17,1,8,576,the-curtain-with,>,10,0,Columbia,MD,USA,9,50,16,Phish,40.0,2023-04-17,1987-08-09
8,2000-09-17,1,9,110,chalk-dust-torture,,4,0,Columbia,MD,USA,9,50,16,Phish,501.0,2023-10-14,1991-02-01
9,2000-09-17,2,10,466,rock-and-roll,>,13,0,Columbia,MD,USA,9,50,16,The Velvet Underground,92.0,2023-08-25,1998-10-31


In [4]:
print(f"unique songs played: {len(df['slug'].unique())}")
print(f"unique one-off songs: {len(df[df['times_played']==1]['slug'].unique())}")
print(f"unique two-off songs: {len(df[df['times_played']==2]['slug'].unique())}")
print(f"unique three-off songs: {len(df[df['times_played']==3]['slug'].unique())}")
print(f"unique four-off songs: {len(df[df['times_played']==4]['slug'].unique())}")
print(f"unique five-off songs: {len(df[df['times_played']==5]['slug'].unique())}")

unique songs played: 890
unique one-off songs: 308
unique two-off songs: 101
unique three-off songs: 41
unique four-off songs: 30
unique five-off songs: 18


In [5]:
# because there are so many 1 or 2 -off songs, these are impossible to predict with the historical knowledge
# so they are converted to songid=0 IE "wildcard"
df.loc[df['times_played'] <= 2, 'songid'] = 0
df.loc[df['times_played'] <= 2, 'slug'] = 'wildcard'
df.loc[df['times_played'] <= 2, 'times_played'] = 510
print(f"unique one-off songs: {len(df[df['times_played']==1]['slug'].unique())}")
print(f"unique two-off songs: {len(df[df['times_played']==2]['slug'].unique())}")

unique one-off songs: 0
unique two-off songs: 0


In [6]:
print(f"total words (songs played): {len(df)}")
print(f"total paragraphs (tours): {len(df['tourid'].unique())}")
print(f"unique sentences (shows): {len(df['showdate'].unique())}")
print(f"unique vocabulary (songs): {len(df['slug'].unique())}")

total words (songs played): 33533
total paragraphs (tours): 103
unique sentences (shows): 1550
unique vocabulary (songs): 482


Get a songstring for each show.

In [8]:
songstring = df[['showdate','set','slug']].groupby(['showdate','set'])['slug']\
                                          .apply(lambda x: '|'.join(x)).reset_index()
songstring['full'] = songstring.apply(lambda row: f"set-{row['set']}|{row['slug']}", axis=1)

songstring = songstring[['showdate','full']].groupby(['showdate'])['full']\
                                            .apply(lambda x: '|'.join(x)).reset_index()

songstring

Unnamed: 0,showdate,full
0,1985-05-03,set-1|slave-to-the-traffic-light|mikes-song|da...
1,1986-04-01,set-1|quinn-the-eskimo-the-mighty-quinn|have-m...
2,1986-10-15,set-1|alumni-blues|makisupa-policeman|skin-it-...
3,1987-03-06,set-1|funky-bitch|good-times-bad-times|corinna...
4,1987-04-29,set-1|she-caught-the-katy-and-left-me-a-mule-t...
...,...,...
1545,2023-10-10,set-1|sigma-oasis|wildcard|theme-from-the-bott...
1546,2023-10-11,set-1|set-your-soul-free|funky-bitch|roggae|ki...
1547,2023-10-13,set-1|carini|rift|halleys-comet|ghost|albuquer...
1548,2023-10-14,set-1|runaway-jim|martian-monster|sample-in-a-...


In [16]:
# tokens
[ song for show in songstring['full'] for song in show.split('|') ]

['set-1',
 'slave-to-the-traffic-light',
 'mikes-song',
 'daves-energy-guide',
 'big-leg-emma',
 'set-2',
 'alumni-blues',
 'wild-child',
 'cant-you-hear-me-knocking',
 'jam',
 'cities',
 'wildcard',
 'set-3',
 'scarlet-begonias',
 'eyes-of-the-world',
 'whipping-post',
 'mcgrupp-and-the-watchful-hosemasters',
 'makisupa-policeman',
 'run-like-an-antelope',
 'wildcard',
 'set-e',
 'anarchy',
 'set-1',
 'quinn-the-eskimo-the-mighty-quinn',
 'have-mercy',
 'harry-hood',
 'wildcard',
 'daves-energy-guide',
 'icculus',
 'you-enjoy-myself',
 'set-2',
 'wildcard',
 'wildcard',
 'acdc-bag',
 'mcgrupp-and-the-watchful-hosemasters',
 'alumni-blues',
 'letter-to-jimmy-page',
 'alumni-blues',
 'dear-mrs-reagan',
 'set-e',
 'wildcard',
 'set-1',
 'alumni-blues',
 'makisupa-policeman',
 'skin-it-back',
 'cities',
 'i-am-hydrogen',
 'mcgrupp-and-the-watchful-hosemasters',
 'acdc-bag',
 'you-enjoy-myself',
 'lushington',
 'set-2',
 'peaches-en-regalia',
 'golgi-apparatus',
 'swing-low-sweet-chariot',

In [10]:
# https://www.tensorflow.org/text/tutorials/word2vec
# https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset