# Mark's EDA

This first section queries the phish.net API to get a long csv of phish setlist data. 

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Masking
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [7]:
### This script access data from the Phish.net API
### https://docs.phish.net
### Warning! limit use of this api to download and cache data locally
### too many or too large API calls and the app will be shutdown by API admin

import json
import pandas as pd
import requests
from tqdm import tqdm

# songNetwork API Key - get one for free on: https://phish.net/api
apiKey = '1512F21F881B46EA6528'

print("Getting song data...")
songLink = 'https://api.phish.net/v5/songs.json?apikey='+apiKey
songFile = requests.get(songLink)
songData = json.loads(songFile.text)['data']
songDF = pd.DataFrame({
    'songid': [ int(s['songid']) for s in songData ],
    'artist': [ s['artist'] for s in songData ],
    'times_played': [ int(s['times_played']) for s in songData ],
    'last_played': [ s['last_played'] for s in songData ],
    'debut': [ s['debut'] for s in songData ]
})

print("Getting show data...")
showLink = 'https://api.phish.net/v5/shows.json?apikey='+apiKey
showFile = requests.get(showLink)
showDict = json.loads(showFile.text)['data']
allPhishShows = [ int(sh['showid']) for sh in showDict if sh['artistid']=='1' ]

print("Getting setlist data...")
setLink = 'https://api.phish.net/v5/setlists.json?apikey='+apiKey
setFile = requests.get(setLink)
setDict = json.loads(setFile.text)['data']

# subset of desired keys from the setlist data, and datatypes
setKeys = {
    'showdate':str,   # date of the concert
    'set':str,        # set of the show (1,2,3 or encore)
    'position':int,   # relative position in the show
    'songid':int,     # song id number
    'slug':str,       # song name
    'trans_mark':str, # song transition marker
    'gap':int,        # number of shows since the song last played
    'isjam':str,      # categorical - "jam" song
    'city':str,       # venue city
    'state':str,      # venue state
    'country':str,    # venue country
    'venueid':int,    # venue id number
    'tourid':int,     # which tour the show was part of
    'showlength':int  # number of songs in the show max(position)
}

print('Parsing setlist data...')

# this parses the setlists into a dataframe indexed by song
# setlist with missing keys/values are excluded
allPhishSets = { k:[] for k in setKeys.keys() }
for showid in tqdm(allPhishShows):
    fullSet = {}
    setlist = [
        d for d in setDict if 'showid' in d and int(d['showid'])==showid
    ]
    for k,v in setKeys.items():
        if k=='showlength':
            fullSet[k] = [len(setlist)]*len(setlist)
        else:
            fullSet[k] = [ v(d.get(k)) for d in setlist ]
    if any(None in v for v in fullSet.values()):
        continue  # skips sets with incomplete information
    else:
        allPhishSets = {
            k: allPhishSets.get(k, []) + fullSet.get(k, []) for k in setKeys
        }

allPhishDF = pd.DataFrame(data=allPhishSets)

# only include "full" shows with 2 sets and an encore
completeSets = allPhishDF.groupby(by=['showdate', 'set'])\
                         .size()\
                         .reset_index(name='Count')\
                         .pivot(index='showdate',columns='set',values='Count')\
                         .dropna(subset=['1', '2', 'e'])

allPhishDF = allPhishDF[allPhishDF['showdate'].isin(completeSets.index)]
allPhishDF = allPhishDF.merge(songDF,on='songid',how='left')



Getting song data...
Getting show data...
Getting setlist data...
Parsing setlist data...


100%|███████████████████████████████████████| 2124/2124 [01:19<00:00, 26.60it/s]


In [8]:
allPhishDF.to_csv('data/allphishsets.csv', index=False)
# with open('../data/allphishsets.json', 'w') as file:
#     file.write(json.dumps(allPhishDF.to_dict(orient='list')))

print("Complete!")

Complete!


In [2]:
df = pd.read_csv('data/allphishsets.csv')

This section explores the downloaded data and transforms the infrequently played songs into the "wildcard" song.

In [3]:
# example of a show IE sentence
df[df['showdate']=='2000-09-17']

Unnamed: 0,showdate,set,position,songid,slug,trans_mark,gap,isjam,city,state,country,venueid,tourid,showlength,artist,times_played,last_played,debut
0,2000-09-17,1,1,242,guyute,",",5,0,Columbia,MD,USA,9,50,16,Phish,133.0,2023-08-02,1994-10-07
1,2000-09-17,1,2,45,back-on-the-train,",",7,0,Columbia,MD,USA,9,50,16,Trey Anastasio,149.0,2023-10-06,1999-06-30
2,2000-09-17,1,3,48,bathtub-gin,",",6,0,Columbia,MD,USA,9,50,16,Phish,298.0,2023-10-10,1989-05-26
3,2000-09-17,1,4,341,limb-by-limb,",",6,0,Columbia,MD,USA,9,50,16,Phish,154.0,2023-07-26,1997-06-13
4,2000-09-17,1,5,591,the-moma-dance,",",4,0,Columbia,MD,USA,9,50,16,Phish,193.0,2023-10-06,1998-06-30
5,2000-09-17,1,6,329,lawn-boy,",",16,0,Columbia,MD,USA,9,50,16,Phish,219.0,2023-07-19,1989-11-30
6,2000-09-17,1,7,208,fluffhead,",",17,0,Columbia,MD,USA,9,50,16,Phish,277.0,2023-10-14,1984-12-01
7,2000-09-17,1,8,576,the-curtain-with,>,10,0,Columbia,MD,USA,9,50,16,Phish,40.0,2023-04-17,1987-08-09
8,2000-09-17,1,9,110,chalk-dust-torture,,4,0,Columbia,MD,USA,9,50,16,Phish,501.0,2023-10-14,1991-02-01
9,2000-09-17,2,10,466,rock-and-roll,>,13,0,Columbia,MD,USA,9,50,16,The Velvet Underground,92.0,2023-08-25,1998-10-31


In [4]:
print(f"unique songs played: {len(df['slug'].unique())}")
print(f"unique one-off songs: {len(df[df['times_played']==1]['slug'].unique())}")
print(f"unique two-off songs: {len(df[df['times_played']==2]['slug'].unique())}")
print(f"unique three-off songs: {len(df[df['times_played']==3]['slug'].unique())}")
print(f"unique four-off songs: {len(df[df['times_played']==4]['slug'].unique())}")
print(f"unique five-off songs: {len(df[df['times_played']==5]['slug'].unique())}")

unique songs played: 890
unique one-off songs: 308
unique two-off songs: 101
unique three-off songs: 41
unique four-off songs: 30
unique five-off songs: 18


In [5]:
# because there are so many 1 or 2 -off songs, these are impossible to predict with the historical knowledge
# so they are converted to songid=0 IE "wildcard"
df.loc[df['times_played'] <= 2, 'songid'] = 0
df.loc[df['times_played'] <= 2, 'slug'] = 'wildcard'
df.loc[df['times_played'] <= 2, 'times_played'] = 510
print(f"unique one-off songs: {len(df[df['times_played']==1]['slug'].unique())}")
print(f"unique two-off songs: {len(df[df['times_played']==2]['slug'].unique())}")

unique one-off songs: 0
unique two-off songs: 0


In [6]:
print(f"total words (songs played): {len(df)}")
print(f"total paragraphs (tours): {len(df['tourid'].unique())}")
print(f"unique sentences (shows): {len(df['showdate'].unique())}")
print(f"unique vocabulary (songs): {len(df['slug'].unique())}")

total words (songs played): 33533
total paragraphs (tours): 103
unique sentences (shows): 1550
unique vocabulary (songs): 482


In [7]:
songstring = df[['showdate','set','slug']].groupby(['showdate','set'])['slug']\
                                          .apply(lambda x: '|'.join(x)).reset_index()
songstring['full'] = songstring.apply(lambda row: f"set-{row['set']}|{row['slug']}", axis=1)

songstring = songstring[['showdate','full']].groupby(['showdate'])['full']\
                                            .apply(lambda x: '|'.join(x)).reset_index()

songstring


Unnamed: 0,showdate,full
0,1985-05-03,set-1|slave-to-the-traffic-light|mikes-song|da...
1,1986-04-01,set-1|quinn-the-eskimo-the-mighty-quinn|have-m...
2,1986-10-15,set-1|alumni-blues|makisupa-policeman|skin-it-...
3,1987-03-06,set-1|funky-bitch|good-times-bad-times|corinna...
4,1987-04-29,set-1|she-caught-the-katy-and-left-me-a-mule-t...
...,...,...
1545,2023-10-10,set-1|sigma-oasis|wildcard|theme-from-the-bott...
1546,2023-10-11,set-1|set-your-soul-free|funky-bitch|roggae|ki...
1547,2023-10-13,set-1|carini|rift|halleys-comet|ghost|albuquer...
1548,2023-10-14,set-1|runaway-jim|martian-monster|sample-in-a-...


In [None]:
# Pre-processing: tokenizing songs

In [None]:

songs = songstring['full'].str.split('|').apply(lambda x: [song.replace('-', ' ') for song in x])
unique_songs = list(set(song for sublist in songs for song in sublist))
num_songs = len(unique_songs)

# Encode songs into numerical values
song_to_index = {song: i for i, song in enumerate(unique_songs)}
index_to_song = {i: song for i, song in enumerate(unique_songs)}

In [None]:
# 0 list in X, 1th list in y, 2nd list in X, 3rd list in y and so on. 

In [None]:


# Create input sequences and target sequences
input_sequences = []
target_sequences = []

for i in range(len(songs) - 1):
    input_seq = songs.iloc[i]
    target_seq = songs.iloc[i + 1]
    input_sequences.append([song_to_index[song] for song in input_seq])
    target_sequences.append([song_to_index[song] for song in target_seq])

# Pad sequences to make them of the same length
X = pad_sequences(input_sequences, padding='pre', truncating='pre')
y = pad_sequences(target_sequences, padding='pre', truncating='pre')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:


# Define the model
embedding_dim = 50  # Adjust as needed
model = Sequential()
model.add(Embedding(input_dim=num_songs, output_dim=embedding_dim, input_length=None))
model.add(Masking(mask_value=0.0)) 
model.add(LSTM(units=100,activation='tanh', kernel_initializer=he_normal, return_sequences=True))
model.add(LSTM(units=100,activation='tanh', kernel_initializer=he_normal, return_sequences=True))
model.add(Dropout(0.3))
model.add(Dense(units=num_songs, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f9390732f70>

In [77]:
# generate predictions until num predictions or Set 1
def generate_predictions(model, seed_sequence, stop_song_index, num_predictions=10):
    predicted_sequence = seed_sequence.copy()
    predicted_set = set(predicted_sequence)

    while len(predicted_sequence) < num_predictions:
        next_song_probs = model.predict(np.array([predicted_sequence]))[0][-1]
        # Exclude songs that have already been predicted
        valid_probs = [prob if i not in predicted_set else 0 for i, prob in enumerate(next_song_probs)]
        next_song_index = np.argmax(valid_probs)

        if next_song_index == stop_song_index:
            print("Got Set 1. ")
            break

        predicted_sequence.append(next_song_index)
        predicted_set.add(next_song_index)

    return predicted_sequence


In [42]:
# test example
seed_index = 5  # test example
#seed length
seed_length = 8 
seed_sequence = songstring['full'].iloc[seed_index].split('|')

# Tokenize the seed sequence
seed_sequence = [song.replace('-', ' ') for song in seed_sequence][:seed_length]
org_list =  [song.replace('-', ' ') for song in songstring['full'].iloc[seed_index].split('|')]
seed_sequence_indices = [song_to_index[song] for song in seed_sequence]

# pad/trunc seed to the required length
seed_sequence_padded = pad_sequences([seed_sequence_indices], padding='pre', truncating='pre')[0]


# stop if set 1 is encountered again
stop_song = 'set 1'
stop_song_index = song_to_index[stop_song]
# set number of predictions to the original playlist - seed length, i.e. remaining songs pending from that playlist.
num_predictions = len(org_list)- seed_length #10  

predicted_sequence = generate_predictions(model, seed_sequence_padded.tolist(), stop_song_index, num_predictions)

# numerical predictions back to song names
predicted_songs = [index_to_song[i] for i in predicted_sequence]

# Print the seed sequence and predicted sequence
print('\norg_list')
print(org_list)
print("\nSeed Sequence:")
print(seed_sequence)
print("\nPredicted Sequence:")
print(predicted_songs[seed_length:])




org_list
['set 1', 'you enjoy myself', 'lushington', 'possum', 'slave to the traffic light', 'sneakin sally through the alley', 'clod', 'peaches en regalia', 'the man who stepped into yesterday', 'avenu malkenu', 'the man who stepped into yesterday', 'makisupa policeman', 'ya mar', 'set 2', 'divided sky', 'funky bitch', 'harpua', 'bundle of joy', 'harpua', 'fluffhead', 'good times bad times', 'set e', 'golgi apparatus', 'corinna', 'letter to jimmy page']

Seed Sequence:
['set 1', 'you enjoy myself', 'lushington', 'possum', 'slave to the traffic light', 'sneakin sally through the alley', 'clod', 'peaches en regalia']

Predicted Sequence:
['wildcard', 'golgi apparatus', 'alumni blues', 'divided sky', 'suzy greenberg', 'buried alive', 'the landlady', 'foam', 'guelah papyrus']


In [None]:
# Assuming org_list, seed_sequence, stopword_list, and predicted_sequence are lists of songs

stopword_list = {'set 1', 'set 2', 'set 3', 'set 4', 'set 5', 'set 6', 'set e'}


# Convert the lists to sets
org_set = set(org_list)
seed_set = set(seed_sequence)

# Find the songs in org_list but not in seed_sequence
not_in_seed_sequence = org_set - seed_set

# Remove songs in stopword_list
filtered_songs = [song.strip() for song in not_in_seed_sequence if song not in stopword_list]

# Remove songs in stopword_list from predicted_sequence
predicted_sequence = [song.strip() for song in predicted_songs[seed_length:] if song not in stopword_list]

# Count the number of matching songs
matching_songs = [song for song in filtered_songs if song in predicted_sequence]
num_matching_songs = len(matching_songs)

# Calculate the percentage of matching songs
percentage_matching = (num_matching_songs / len(filtered_songs)) * 100

print("Matching Songs in Predicted Sequence:")
print(matching_songs)
print("Number of Matching Songs:", num_matching_songs)
print("Percentage of Matching Songs:", percentage_matching)


In [None]:
## Splitting by date

In [26]:
split_date = '2017-12-31' 

# Split the data
train_data = songstring[songstring['showdate'] < split_date]
test_data = songstring[songstring['showdate'] >= split_date]

# Tokenize songs in the training and testing data
train_songs = train_data['full'].str.split('|').apply(lambda x: [song.replace('-', ' ') for song in x])
test_songs = test_data['full'].str.split('|').apply(lambda x: [song.replace('-', ' ') for song in x])

# Create unique songs and encode into numerical values
all_songs = list(set(song for sublist in train_songs for song in sublist))
num_songs = len(all_songs)

song_to_index = {song: i for i, song in enumerate(all_songs)}
index_to_song = {i: song for i, song in enumerate(all_songs)}

# Encode sequences for training data
train_input_sequences = []
train_target_sequences = []

for i in range(len(train_songs) - 1):
    input_seq = train_songs.iloc[i]
    target_seq = train_songs.iloc[i + 1]
    train_input_sequences.append([song_to_index[song] for song in input_seq])
    train_target_sequences.append([song_to_index[song] for song in target_seq])
    
# Pad sequences for training and testing data
X_train = pad_sequences(train_input_sequences, padding='pre', truncating='pre')
y_train = pad_sequences(train_target_sequences, padding='pre', truncating='pre')
# Encode sequences for testing data
test_input_sequences = []
test_target_sequences = []

for i in range(len(test_songs) - 1):
    input_seq = test_songs.iloc[i]
    target_seq = test_songs.iloc[i + 1]

    # Handle missing songs in the dictionary
    input_sequence_indices = [song_to_index.get(song, -1) for song in input_seq]
    target_sequence_indices = [song_to_index.get(song, -1) for song in target_seq]

    # Filter out songs with index -1 (not found in the dictionary)
    input_sequence_indices = [index for index in input_sequence_indices if index != -1]
    target_sequence_indices = [index for index in target_sequence_indices if index != -1]

    test_input_sequences.append(input_sequence_indices)
    test_target_sequences.append(target_sequence_indices)

# Pad sequences for testing data
X_test = pad_sequences(test_input_sequences, padding='pre', truncating='pre')
y_test = pad_sequences(test_target_sequences, padding='pre', truncating='pre')

In [36]:


# model
embedding_dim = 50  
model = Sequential()
model.add(Embedding(input_dim=num_songs, output_dim=embedding_dim, input_length=None))
model.add(Masking(mask_value=0.0)) 
model.add(LSTM(units=100,activation='tanh', kernel_initializer=he_normal, return_sequences=True))
model.add(LSTM(units=100,activation='tanh', kernel_initializer=he_normal, return_sequences=True))
model.add(Dropout(0.3))
model.add(Dense(units=num_songs, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x7f93776b6130>

In [76]:
# test example
seed_index = 50  # test example
#seed length
seed_length =1 
seed_sequence = songstring['full'].iloc[seed_index].split('|')

# Tokenize the seed sequence
seed_sequence = [song.replace('-', ' ') for song in seed_sequence][:seed_length]
org_list =  [song.replace('-', ' ') for song in songstring['full'].iloc[seed_index].split('|')]
seed_sequence_indices = [song_to_index[song] for song in seed_sequence]

# pad/trunc seed to the required length
seed_sequence_padded = pad_sequences([seed_sequence_indices], padding='pre', truncating='pre')[0]


# stop if set 1 is encountered again
stop_song = 'set 1'
stop_song_index = song_to_index[stop_song]
# set number of predictions to the original playlist - seed length, i.e. remaining songs pending from that playlist.
num_predictions = len(org_list)- seed_length #10  

predicted_sequence = generate_predictions(model, seed_sequence_padded.tolist(), stop_song_index, num_predictions)

# numerical predictions back to song names
predicted_songs = [index_to_song[i] for i in predicted_sequence]

# Print the seed sequence and predicted sequence
print('\norg_list')
print(org_list)
print("\nSeed Sequence:")
print(seed_sequence)
print("\nPredicted Sequence:")
print(predicted_songs[seed_length:])



org_list
['set 1', 'golgi apparatus', 'the oh kee pa ceremony', 'suzy greenberg', 'you enjoy myself', 'walk away', 'bouncing around the room', 'acdc bag', 'the squirming coil', 'mikes song', 'i am hydrogen', 'weekapaug groove', 'carolina', 'set 2', 'dinner and a movie', 'ya mar', 'reba', 'wilson', 'take the a train', 'alumni blues', 'letter to jimmy page', 'alumni blues', 'foam', 'the ballad of curtis loew', 'david bowie', 'set e', 'i didnt know']

Seed Sequence:
['set 1']

Predicted Sequence:
['set 2', 'i wanna be like you', 'cant you hear me knocking', 'eyes of the world', 'wild child', 'scarlet begonias', 'possum', 'wildcard', 'golgi apparatus', 'alumni blues', 'divided sky', 'suzy greenberg', 'wilson', 'chalk dust torture', 'buried alive', 'the landlady', 'foam', 'the man who stepped into yesterday', 'llama', 'guelah papyrus', 'runaway jim', 'poor heart', 'sparkle', 'rift', 'reba']


In [None]:
#% of correct songs predicted

In [75]:
# Assuming org_list, seed_sequence, stopword_list, and predicted_sequence are lists of songs

stopword_list = {'set 1', 'set 2', 'set 3', 'set 4', 'set 5', 'set 6', 'set e'}


# Convert the lists to sets
org_set = set(org_list)
seed_set = set(seed_sequence)

# Find the songs in org_list but not in seed_sequence
not_in_seed_sequence = org_set - seed_set

# Remove songs in stopword_list
filtered_songs = [song.strip() for song in not_in_seed_sequence if song not in stopword_list]

# Remove songs in stopword_list from predicted_sequence
predicted_sequence = [song.strip() for song in predicted_songs[seed_length:] if song not in stopword_list]

# Count the number of matching songs
matching_songs = [song for song in filtered_songs if song in predicted_sequence]
num_matching_songs = len(matching_songs)

# Calculate the percentage of matching songs
percentage_matching = (num_matching_songs / len(filtered_songs)) * 100

print("Matching Songs in Predicted Sequence:")
print(matching_songs)
print("Number of Matching Songs:", num_matching_songs)
print("Percentage of Matching Songs:", percentage_matching)


Matching Songs in Predicted Sequence:
['possum', 'foam', 'wilson', 'golgi apparatus', 'divided sky', 'alumni blues']
Number of Matching Songs: 6
Percentage of Matching Songs: 24.0
