In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [2]:
today = datetime.today().strftime('%Y-%m-%d')
todayminusyear = (datetime.now() - relativedelta(years=1)).strftime('%Y-%m-%d')

In [3]:
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()
base_dir = os.path.dirname(script_dir)
data_dir = os.path.join(base_dir, "Data", "Phish")

In [4]:
songdata = pd.read_csv(os.path.join(data_dir, "songdata.csv"))
venuedata = pd.read_csv(os.path.join(data_dir, "venuedata.csv"))
showdata = pd.read_csv(os.path.join(data_dir, "showdata.csv"))
transition_data = pd.read_csv(os.path.join(data_dir, "transition_data.csv"))
setlistdata = pd.read_csv(os.path.join(data_dir, "setlistdata.csv"))

In [5]:
last_show = showdata['show_number'].max() - 1

In [6]:
setlist_by_song = pd.merge(setlistdata, showdata, on='showid', how='left').sort_values(['songid','show_number']).reset_index(drop=True)
setlist_by_song['gap'] = setlist_by_song.groupby('songid')['show_number'].diff()
setlist_by_song.loc[setlist_by_song.groupby('songid').head(1).index, 'gap'] = None 
setlist_by_song.head()

Unnamed: 0,showid,uniqueid,songid,set,position,transition,isreprise,isjam,isjamchart,jamchart_description,...,is_original,soundcheck,footnote,exclude,show_number,showdate,venueid,tourid,exclude_from_stats,setlist_notes
0,1252808086,51796,1,1,1,2,0,0,0,,...,0,,Phish debut.,0,1292.0,1998-12-31,157.0,43.0,0.0,Princeâs 1999 made its Phish debut (appropri...
1,1485905928,379198,1,2,13,3,0,0,1,"After busting a 524 show gap, the band breaks ...",...,0,,,0,1849.0,2017-07-26,157.0,176.0,0.0,This show was night fiveÂ of Phish's Baker's D...
2,1620929979,461812,1,1,2,2,0,2,1,"Coming after a massive, show-opening ""2001"", ""...",...,0,,,0,2012.0,2021-10-28,1316.0,188.0,0.0,This show featured a setlist with all songs fe...
3,1644953954,474224,1,1,1,2,0,2,0,,...,0,,,0,2058.0,2022-08-13,3.0,189.0,0.0,<p>Trey teased 1999 in Fluffhead and Taste and...
4,1251077390,4364,2,1,1,1,0,0,0,,...,1,,,1,1421.0,2002-12-14,227.0,61.0,1.0,Phish performed on <em>Saturday Night Live</em...


In [8]:
my_song_data = (setlist_by_song[setlist_by_song['isreprise'] == 0]
                .merge(songdata[['song_id', 'song', 'original_artist']], left_on='songid', right_on='song_id', how='left').drop(columns=['song_id'])
                .groupby(['song', 'is_original'])
                .agg({
                    'show_number': ['count', 'min', 'max'],
                    'gap': ['min', 'max', 'mean', 'median', 'std']
                })
                .reset_index()
                .round(2)
               )

my_song_data.columns = ['_'.join(col).strip() for col in my_song_data.columns.values]

# Rename columns for easier access
my_song_data = my_song_data.rename(columns={
    'song_': 'song', 
    'is_original_': 'is_original',
    'show_number_count': 'times_played_total', 
    'show_number_min': 'debut', 
    'show_number_max': 'last_played', 
    'gap_min': 'min_gap', 
    'gap_max': 'max_gap', 
    'gap_mean': 'avg_gap',
    'gap_median': 'med_gap',  
    'gap_std': 'std_gap'
})

my_song_data['is_original'] = my_song_data['is_original'].astype(int)
my_song_data['current_gap'] = last_show - my_song_data['last_played']

my_song_data = (my_song_data
                .merge(showdata[['show_number', 'showdate']], left_on='debut', right_on='show_number', how='left')
                .rename(columns={'showdate': 'debut_date'}).drop(columns=['show_number', 'debut'])
                .merge(showdata[['show_number', 'showdate']], left_on='last_played', right_on='show_number', how='left')
                .rename(columns={'showdate': 'ltp_date'}).drop(columns=['show_number', 'last_played'])
)[['song', 'is_original', 'times_played_total','debut_date','ltp_date','current_gap','avg_gap', 'med_gap', 'std_gap']]
my_song_data['gap_zscore'] = (my_song_data['current_gap'] - my_song_data['avg_gap']) / my_song_data['std_gap']


my_song_data.head()

Unnamed: 0,song,is_original,times_played_total,debut_date,ltp_date,current_gap,avg_gap,med_gap,std_gap,gap_zscore
0,(I Can't Get No) Satisfaction,0,1,2022-02-25,2022-02-25,143.0,,,,
1,...And Flew Away,0,1,2023-07-14,2023-07-14,84.0,,,,
2,1999,0,4,1998-12-31,2022-08-13,108.0,255.33,163.0,267.72,-0.550314
3,20-20 Vision,0,1,2019-12-28,2019-12-28,221.0,,,,
4,46 Days,1,151,2002-12-14,2025-01-30,2.0,4.95,4.0,3.11,-0.948553


In [10]:
my_song_data['ltp_date'] = pd.to_datetime(my_song_data['ltp_date'], format='%m/%d/%y', errors='coerce')
# fix this


five_years_ago = datetime.today() - timedelta(days=5*365)
ck_plus = (my_song_data[(my_song_data['is_original'] == 1)&(my_song_data['times_played_total'] > 10)&(my_song_data['ltp_date'] > five_years_ago)].copy()           
           .sort_values(by='gap_zscore', ascending=False).reset_index(drop=True).drop(columns=['is_original','debut_date', 'std_gap','gap_zscore'])
)
ck_plus['current_minus_avg'] = ck_plus['current_gap'] - ck_plus['avg_gap']
ck_plus['current_minus_med'] = ck_plus['current_gap'] - ck_plus['med_gap']

my_song_data.head()

Unnamed: 0,song,is_original,times_played_total,debut_date,ltp_date,current_gap,avg_gap,med_gap,std_gap,gap_zscore
0,(I Can't Get No) Satisfaction,0,1,2022-02-25,NaT,143.0,,,,
1,...And Flew Away,0,1,2023-07-14,NaT,84.0,,,,
2,1999,0,4,1998-12-31,NaT,108.0,255.33,163.0,267.72,-0.550314
3,20-20 Vision,0,1,2019-12-28,NaT,221.0,,,,
4,46 Days,1,151,2002-12-14,NaT,2.0,4.95,4.0,3.11,-0.948553


In [24]:
treys_notebook_data = (setlist_by_song[(setlist_by_song['isreprise'] == 0)&(setlist_by_song['showdate'] > todayminusyear)]
                .merge(songdata[['song_id', 'song']], left_on='songid', right_on='song_id', how='left').drop(columns=['songid'])
)[['song', 'is_original', 'show_number', 'showdate','gap']]

treys_notebook = (treys_notebook_data.groupby(['song', 'is_original'])
                  .agg({
                      'show_number': ['count', 'max'],
                      'gap': ['min', 'max', 'mean', 'median', 'std']})
                  .reset_index().round(2)
)

treys_notebook.columns = ['_'.join(col).strip() for col in treys_notebook.columns.values]

# Rename columns for easier access
treys_notebook = treys_notebook.rename(columns={
    'song_': 'song', 
    'is_original_': 'is_original',
    'show_number_count': 'times_played_in_last_year', 
    'show_number_max': 'last_played', 
    'gap_min': 'min_gap', 
    'gap_max': 'max_gap', 
    'gap_mean': 'avg_gap',
    'gap_median': 'med_gap',  
    'gap_std': 'std_gap'
})

treys_notebook['is_original'] = treys_notebook['is_original'].astype(int)
treys_notebook['current_gap'] = last_show - treys_notebook['last_played']

treys_notebook = (treys_notebook
                .merge(showdata[['show_number', 'showdate']], left_on='last_played', right_on='show_number', how='left')
                .rename(columns={'showdate': 'ltp_date'}).drop(columns=['show_number', 'last_played'])
)[['song', 'is_original', 'times_played_in_last_year','ltp_date','current_gap','avg_gap', 'med_gap']]

treys_notebook = (treys_notebook[(treys_notebook['is_original'] == 1)&(treys_notebook['current_gap'] > 3)]
                  .sort_values(by='times_played_in_last_year', ascending=False)
                  .reset_index(drop=True)
                  .drop(columns=['is_original'])
)
treys_notebook.head()

Unnamed: 0,song,times_played_in_last_year,ltp_date,current_gap,avg_gap,med_gap
0,Down with Disease,13,2024-12-30,6.0,3.38,4.0
1,Carini,9,2024-12-31,5.0,5.22,4.0
2,Slave to the Traffic Light,7,2024-12-31,5.0,5.14,5.0
3,Life Saving Gun,7,2024-12-31,5.0,6.57,7.0
4,Steam,6,2024-10-25,11.0,7.67,8.5


In [25]:
save_path = os.path.join(base_dir, "Data", "Phish")
ck_plus.to_csv(os.path.join(save_path, "ck_plus.csv"), index=False)
treys_notebook.to_csv(os.path.join(save_path, "treys_notebook.csv"), index=False)