In [1]:
import pandas as pd
import os
from datetime import date, datetime, timedelta

from dateutil.relativedelta import relativedelta

In [2]:
today = datetime.today().strftime('%Y-%m-%d')
todayminusyear = (datetime.now() - relativedelta(years=1)).strftime('%Y-%m-%d')

In [3]:
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()
base_dir = os.path.dirname(script_dir)
data_dir = os.path.join(base_dir, "Data", "Phish", "From Web")

In [4]:
songdata = pd.read_csv(os.path.join(data_dir, "songdata.csv"))
venuedata = pd.read_csv(os.path.join(data_dir, "venuedata.csv"))
showdata = pd.read_csv(os.path.join(data_dir, "showdata.csv"))
showdata = showdata[showdata['exclude_from_stats'] != 1].copy().reset_index(drop=True)
transitiondata = pd.read_csv(os.path.join(data_dir, "transitiondata.csv"))
setlistdata = pd.read_csv(os.path.join(data_dir, "setlistdata.csv"))

In [5]:
showdata.tail(10)

Unnamed: 0,show_number,showid,showdate,venueid,tourid,exclude_from_stats,setlist_notes
2034,2161,1725991676,2024-10-27,1626,61.0,0,<p>Access Me was played for the first time sin...
2035,2162,1727805806,2024-12-28,157,210.0,0,<p>Reba did not contain the whistling ending. ...
2036,2163,1727805824,2024-12-29,157,210.0,0,<p>Bathtub Gin was unfinished. Trey teased Fra...
2037,2164,1727805841,2024-12-30,157,210.0,0,<p>Lengthwise was performed for the first time...
2038,2165,1727805859,2024-12-31,157,210.0,0,<p>Trey teased In Memory of Elizabeth Reed dur...
2039,2152,1718730861,2025-01-29,1481,208.0,0,<p>Trey teased Mike&#39;s Song in Weekapaug Gr...
2040,2153,1718730894,2025-01-30,1481,208.0,0,<p>Page teased We&#39;re Off to See the Wizard...
2041,2154,1718730936,2025-01-31,1481,208.0,0,<p>Wolfman&#39;s Brother contained Mike&#39;s ...
2042,2155,1718730981,2025-02-01,1481,208.0,0,"<p>Before Bathtub Gin, Trey said they would pl..."
2043,2166,1737486654,2025-04-18,1570,209.0,0,


In [6]:
last_show = showdata['show_number'].max() - 1
showdata[showdata['show_number'] == showdata['show_number'].max()].head()

Unnamed: 0,show_number,showid,showdate,venueid,tourid,exclude_from_stats,setlist_notes
2043,2166,1737486654,2025-04-18,1570,209.0,0,


In [7]:
setlist_by_song = pd.merge(setlistdata, showdata, on='showid', how='left').sort_values(['songid','show_number']).reset_index(drop=True)
setlist_by_song['gap'] = setlist_by_song.groupby('songid')['show_number'].diff()
setlist_by_song.loc[setlist_by_song.groupby('songid').head(1).index, 'gap'] = None 
print(setlist_by_song.columns)
setlist_by_song.head()

Index(['showid', 'uniqueid', 'songid', 'set', 'position', 'transition',
       'isreprise', 'isjam', 'isjamchart', 'jamchart_description', 'tracktime',
       'gap', 'is_original', 'soundcheck', 'footnote', 'exclude',
       'show_number', 'showdate', 'venueid', 'tourid', 'exclude_from_stats',
       'setlist_notes'],
      dtype='object')


Unnamed: 0,showid,uniqueid,songid,set,position,transition,isreprise,isjam,isjamchart,jamchart_description,...,is_original,soundcheck,footnote,exclude,show_number,showdate,venueid,tourid,exclude_from_stats,setlist_notes
0,1252808086,51796,1,1,1,2,0,0,0,,...,0,,Phish debut.,0,1084.0,1998-12-31,157.0,43.0,0.0,Princeâs 1999 made its Phish debut (appropri...
1,1485905928,379198,1,2,13,3,0,0,1,"After busting a 524 show gap, the band breaks ...",...,0,,,0,1840.0,2017-07-26,157.0,176.0,0.0,This show was night fiveÂ of Phish's Baker's D...
2,1620929979,461812,1,1,2,2,0,2,1,"Coming after a massive, show-opening ""2001"", ""...",...,0,,,0,2009.0,2021-10-28,1316.0,188.0,0.0,This show featured a setlist with all songs fe...
3,1644953954,474224,1,1,1,2,0,2,0,,...,0,,,0,2055.0,2022-08-13,3.0,189.0,0.0,<p>Trey teased 1999 in Fluffhead and Taste and...
4,1251078810,4401,2,2,11,2,0,2,1,Debut. Jam gradually settles down from high po...,...,1,,Debut.,0,277.0,2003-01-02,6.0,51.0,0.0,Gin included San-Ho-Zay teases. Trey reference...


In [8]:
print(songdata.columns)
songdata.head()

Index(['song_id', 'song', 'original_artist', 'debut_date'], dtype='object')


Unnamed: 0,song_id,song,original_artist,debut_date
0,2301,(I Can't Get No) Satisfaction,The Rolling Stones,2022-02-25
1,2947,...And Flew Away,Trey Anastasio,2023-07-14
2,1,1999,Prince,1998-12-31
3,2908,20-20 Vision,Gene Autry,2019-12-28
4,2,46 Days,Phish,2003-01-02


In [9]:
my_song_data = (setlist_by_song[setlist_by_song['isreprise'] == 0]
                .merge(songdata[['song_id', 'song', 'original_artist']], 
                       left_on='songid', 
                       right_on='song_id', 
                       how='left').drop(columns=['song_id'])
                .groupby(['song', 'is_original'])
                .agg({
                    'show_number': ['count', 'min', 'max'],
                    'gap': ['min', 'max', 'mean', 'median', 'std']
                })
                .reset_index()
                .round(2)
               )

my_song_data.columns = ['_'.join(col).strip() for col in my_song_data.columns.values]

In [10]:
# Rename columns for easier access
my_song_data = my_song_data.rename(columns={
    'song_': 'song', 
    'is_original_': 'is_original',
    'show_number_count': 'times_played_total', 
    'show_number_min': 'debut', 
    'show_number_max': 'last_played', 
    'gap_min': 'min_gap', 
    'gap_max': 'max_gap', 
    'gap_mean': 'avg_gap',
    'gap_median': 'med_gap',  
    'gap_std': 'std_gap'
})

my_song_data['is_original'] = my_song_data['is_original'].astype(int)
my_song_data['current_gap'] = last_show - my_song_data['last_played']

my_song_data.head()

Unnamed: 0,song,is_original,times_played_total,debut,last_played,min_gap,max_gap,avg_gap,med_gap,std_gap,current_gap
0,(I Can't Get No) Satisfaction,0,1,2014.0,2014.0,,,,,,151.0
1,...And Flew Away,0,1,2081.0,2081.0,,,,,,84.0
2,1999,0,4,1084.0,2055.0,46.0,756.0,323.67,169.0,379.43,110.0
3,20-20 Vision,0,1,1944.0,1944.0,,,,,,221.0
4,46 Days,1,150,277.0,2162.0,0.0,571.0,12.65,5.0,59.7,3.0


In [11]:
my_song_data = (my_song_data
                .merge(showdata[['show_number', 'showdate']], left_on='debut', right_on='show_number', how='left')
                .rename(columns={'showdate': 'debut_date'}).drop(columns=['show_number', 'debut'])
                .merge(showdata[['show_number', 'showdate']], left_on='last_played', right_on='show_number', how='left')
                .rename(columns={'showdate': 'ltp_date'}).drop(columns=['show_number', 'last_played'])
)[['song', 'is_original', 'times_played_total','debut_date','ltp_date','current_gap','avg_gap', 'med_gap', 'std_gap']]
my_song_data['gap_zscore'] = (my_song_data['current_gap'] - my_song_data['avg_gap']) / my_song_data['std_gap']

my_song_data.head()

Unnamed: 0,song,is_original,times_played_total,debut_date,ltp_date,current_gap,avg_gap,med_gap,std_gap,gap_zscore
0,(I Can't Get No) Satisfaction,0,1,2022-02-25,2022-02-25,151.0,,,,
1,...And Flew Away,0,1,2023-07-14,2023-07-14,84.0,,,,
2,1999,0,4,1998-12-31,2022-08-13,110.0,323.67,169.0,379.43,-0.563134
3,20-20 Vision,0,1,2019-12-28,2019-12-28,221.0,,,,
4,46 Days,1,150,2003-01-02,2024-12-28,3.0,12.65,5.0,59.7,-0.161642


In [12]:
five_years_ago = date.today() - timedelta(days=5*365)
print(five_years_ago)

2020-02-12


In [13]:
type(my_song_data['ltp_date'][0])

str

In [14]:
my_song_data['ltp_date'] = pd.to_datetime(my_song_data['ltp_date'], format='%Y-%m-%d').dt.date
my_song_data['ltp_date'][0:2]

0    2022-02-25
1    2023-07-14
Name: ltp_date, dtype: object

In [15]:
ck_plus = (my_song_data[(my_song_data['is_original'] == 1) & 
                        (my_song_data['times_played_total'] > 10)
                        &(my_song_data['ltp_date'] > five_years_ago)].copy()           
        .sort_values(by='gap_zscore', ascending=False)
        .reset_index(drop=True)
        .drop(columns=['is_original','debut_date', 'std_gap','gap_zscore'])
)
        
ck_plus['current_minus_avg'] = ck_plus['current_gap'] - ck_plus['avg_gap']
ck_plus['current_minus_med'] = ck_plus['current_gap'] - ck_plus['med_gap']

ck_plus.head(25)

Unnamed: 0,song,times_played_total,ltp_date,current_gap,avg_gap,med_gap,current_minus_avg,current_minus_med
0,Waiting All Night,33,2021-10-24,164.0,9.59,4.0,154.41,160.0
1,Yarmouth Road,32,2022-07-24,123.0,12.32,6.0,110.68,117.0
2,Suzy Greenberg,449,2024-07-19,35.0,4.7,3.0,30.3,32.0
3,Wombat,22,2022-08-10,112.0,17.1,12.0,94.9,100.0
4,Glide,117,2021-08-31,167.0,17.16,5.0,149.84,162.0
5,Sparkle,333,2023-09-01,66.0,6.31,3.0,59.69,63.0
6,Breath and Burning,17,2022-07-24,123.0,16.06,6.0,106.94,117.0
7,Ass Handed,19,2023-07-22,78.0,16.22,6.0,61.78,72.0
8,Makisupa Policeman,115,2023-07-11,86.0,17.89,11.0,68.11,75.0
9,The Landlady,215,2022-08-14,109.0,9.58,2.0,99.42,107.0


In [30]:
treys_notebook_data = (setlist_by_song[(setlist_by_song['isreprise'] == 0)&(setlist_by_song['showdate'] > todayminusyear)]
                .merge(songdata[['song_id', 'song']], 
                       left_on='songid', 
                       right_on='song_id',   
                       how='left').drop(columns=['song_id'])
)[['song', 'is_original', 'show_number', 'showdate', 'gap']]

In [31]:
treys_notebook = (
            treys_notebook_data
            .groupby(['song', 'is_original'])
            .agg({
                'show_number': ['count', 'max'],
                'gap': ['min', 'max', 'mean', 'median', 'std']
            })
            .reset_index()
            .round(2)
        )
treys_notebook.columns = ['_'.join(col).strip() for col in treys_notebook.columns.values]

treys_notebook = treys_notebook.rename(columns={
            'song_': 'song', 
            'is_original_': 'is_original',
            'show_number_count': 'times_played_in_last_year', 
            'show_number_max': 'last_played', 
            'gap_min': 'min_gap', 
            'gap_max': 'max_gap', 
            'gap_mean': 'avg_gap',
            'gap_median': 'med_gap',  
            'gap_std': 'std_gap'
        })

treys_notebook['is_original'] = treys_notebook['is_original'].astype(int)
treys_notebook['current_gap'] = last_show - treys_notebook['last_played']

treys_notebook = (
            treys_notebook
            .merge(
                showdata[['show_number', 'showdate']], 
                left_on='last_played', 
                right_on='show_number', 
                how='left'
            )
            .rename(columns={'showdate': 'ltp_date'})
            .drop(columns=['show_number', 'last_played'])
        )[['song', 'is_original', 'times_played_in_last_year', 'ltp_date',
           'current_gap', 'avg_gap', 'med_gap']]

treys_notebook = (
    treys_notebook[
        (treys_notebook['is_original'] == 1) & 
        (treys_notebook['current_gap'] > 3)
        ]
    .sort_values(by='times_played_in_last_year', ascending=False)
    .reset_index(drop=True)
    .drop(columns=['is_original'])
        )

treys_notebook.head(10)

Unnamed: 0,song,times_played_in_last_year,ltp_date,current_gap,avg_gap,med_gap
0,Steam,6,2024-10-25,6.0,8.33,8.5
1,Gumbo,4,2025-01-30,12.0,12.25,10.0
2,Roggae,4,2025-01-30,12.0,11.5,10.5
3,I Am Hydrogen,4,2025-01-29,13.0,15.5,12.5
4,Wading in the Velvet Sea,4,2024-10-27,4.0,15.5,15.0
5,The Wedge,4,2024-10-25,6.0,11.25,8.0
6,The Howling,4,2024-10-25,6.0,11.0,11.5
7,A Song I Heard the Ocean Sing,3,2024-07-28,28.0,14.67,14.0
8,Limb By Limb,3,2024-10-27,4.0,23.67,14.0
9,Mull,3,2025-01-29,13.0,20.67,20.0
