In [1]:
import os
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from io import StringIO
from datetime import datetime, date

In [2]:
# Resolve the parent of the notebook's working directory; the loader functions
# in this notebook build their CSV paths from `script_directory`.
current_directory = os.getcwd()
script_directory = os.path.dirname(current_directory)
# Bare expression so the resolved path is shown as the cell output.
script_directory

'/Users/connorkitchings/Desktop/Repositories/Concerts'

# Functions

## Venue and Songlist

In [3]:
def load_venue_data(save=False):
    """Scrape the venue table from the All Things Umphreys venues page.

    The first HTML table on the page is used. Its positional index is exposed
    as a string ``id`` column and ``Last Played`` is converted to
    ``datetime.date`` values.

    Parameters
    ----------
    save : bool, default False
        When True, also write the frame to ``Data/UM/From Web/venues.csv``
        under the module-level ``script_directory`` and print a confirmation.

    Returns
    -------
    pandas.DataFrame
        The venue table with the added ``id`` column.

    Raises
    ------
    requests.HTTPError
        If the page responds with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    ValueError
        If the page contains no ``<table>`` elements.
    """
    venues_url = "https://allthings.umphreys.com/venues/"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(venues_url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
    soup = BeautifulSoup(response.text, 'html.parser')
    table_tags = soup.find_all('table')
    if not table_tags:
        # Fail with a message that names the page instead of a bare IndexError.
        raise ValueError(f"No tables found at {venues_url}")
    # pandas parses HTML from a file-like object, so wrap the markup in StringIO.
    tables = pd.read_html(StringIO(str(table_tags)))
    venue_data = tables[0].copy().reset_index(names='id')
    venue_data['id'] = venue_data['id'].astype(str)
    venue_data['Last Played'] = pd.to_datetime(venue_data['Last Played']).dt.date
    if save:
        venue_data.to_csv(os.path.join(script_directory, 'Data', 'UM', 'From Web', 'venues.csv'), index=False)
        print("Venues data saved.")
    return venue_data

In [4]:
def load_songlist_data(save=False):
    """Scrape the song list from the All Things Umphreys song index page.

    The second parsed HTML table is used. Rows are sorted by ``Song Name`` and
    the ``Debut Date`` / ``Last Played`` columns are converted to
    ``datetime.date`` values.

    Parameters
    ----------
    save : bool, default False
        When True, also write the frame to ``Data/UM/From Web/songlist.csv``
        under the module-level ``script_directory`` and print a confirmation.

    Returns
    -------
    pandas.DataFrame
        The song list sorted by song name with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If the page responds with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    ValueError
        If the page does not yield at least two tables.
    """
    songlist_url = "https://allthings.umphreys.com/song/"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(songlist_url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
    soup = BeautifulSoup(response.text, 'html.parser')
    table_tags = soup.find_all('table')
    if not table_tags:
        raise ValueError(f"No tables found at {songlist_url}")
    # pandas parses HTML from a file-like object, so wrap the markup in StringIO.
    tables = pd.read_html(StringIO(str(table_tags)))
    if len(tables) < 2:
        # The song list is read from the second table; say so when it is absent.
        raise ValueError(f"Expected at least two tables at {songlist_url}, found {len(tables)}")
    songlist_data = tables[1].copy().sort_values(by='Song Name').reset_index(drop=True)
    songlist_data['Debut Date'] = pd.to_datetime(songlist_data['Debut Date']).dt.date
    songlist_data['Last Played'] = pd.to_datetime(songlist_data['Last Played']).dt.date
    if save:
        songlist_data.to_csv(os.path.join(script_directory, 'Data', 'UM', 'From Web', 'songlist.csv'), index=False)
        print("Songlist data saved.")
    return songlist_data

## Setlists

In [5]:
def load_setlist_data(venue_data, update=False, save=False):
    """Return setlist data, either refreshed incrementally or rebuilt in full.

    Parameters
    ----------
    venue_data : pandas.DataFrame
        Venue table; only forwarded to ``update_setlist_data`` when ``update``
        is True.
    update : bool, default False
        True calls ``update_setlist_data(venue_data)``; False calls
        ``load_all_setlist_data()``.
    save : bool, default False
        When True, write the result to ``Data/UM/From Web/setlists.csv`` under
        the module-level ``script_directory`` and print a confirmation.

    Returns
    -------
    The object produced by the selected loader.
    """
    setlists = update_setlist_data(venue_data) if update else load_all_setlist_data()

    if not save:
        return setlists

    setlists.to_csv(script_directory+'/Data/UM/From Web/setlists.csv', index=False)
    print("Setlist data saved.")
    return setlists

In [61]:
def load_all_setlist_data():
    """Rebuild the complete setlist history by scraping every song page.

    The song index page is scanned for ``/song/<slug>`` links. Each linked page
    is downloaded, its first table is taken as that song's play history, the
    song title (the quoted text in the page ``<title>``) is inserted as the
    first column, and the ``Show Gap`` column is dropped. All per-song tables
    are concatenated and sorted by date (newest first) and song name.

    Returns
    -------
    pandas.DataFrame
        One row per song performance, with ``Footnote`` NaNs replaced by ''.

    Raises
    ------
    requests.HTTPError
        If any page responds with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    ValueError
        If the index page has no tables or song links, or a song page has no
        tables.
    """
    songlist_url = "https://allthings.umphreys.com/song/"
    request_timeout = 30  # seconds; keeps a stalled request from hanging the run

    setlists = []
    # One session reuses the connection across the many per-song requests.
    with requests.Session() as session:
        response = session.get(songlist_url, timeout=request_timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')
        index_tables = soup.find_all('table')
        if not index_tables:
            # Previously this path hit a NameError on an unassigned variable.
            raise ValueError(f"No tables found at {songlist_url}")

        # Capture the slug from every href="/song/<slug>" link; dict.fromkeys
        # drops repeated slugs (keeping first-seen order) so no page is fetched
        # and appended twice.
        pattern = r'href="/song/([^"]+)"'
        song_names = list(dict.fromkeys(re.findall(pattern, str(index_tables))))

        for song in song_names:
            song_url = songlist_url + song
            response = session.get(song_url, timeout=request_timeout)
            response.raise_for_status()  # Raise an exception for bad status codes
            soup = BeautifulSoup(response.text, 'html.parser')

            # The title is the first double-quoted run in <title>; fall back
            # when the tag is missing or holds no complete quoted pair.
            title_tag = soup.find('title')
            title_match = re.search(r'"(.*?)"', title_tag.get_text()) if title_tag else None
            title = title_match.group(1) if title_match else 'Unknown Title'

            table_tags = soup.find_all('table')
            if not table_tags:
                raise ValueError(f"No tables found at {song_url}")
            tables = pd.read_html(StringIO(str(table_tags)))
            song_table = tables[0].copy().sort_values(by='Date Played').reset_index(drop=True)
            # Add the title as the first column
            song_table.insert(0, 'Song Name', title)
            song_table['Date Played'] = pd.to_datetime(song_table['Date Played']).dt.date
            song_table = song_table.drop(columns=['Show Gap'])
            setlists.append(song_table)

    if not setlists:
        # pd.concat([]) would raise a less informative ValueError.
        raise ValueError(f"No song links found at {songlist_url}")

    setlists = pd.concat(setlists).reset_index(drop=True)
    setlists['Footnote'] = setlists['Footnote'].fillna('')
    setlists = setlists.sort_values(by=['Date Played', 'Song Name'], ascending=[False, True]).reset_index(drop=True)

    return setlists

In [60]:
def _url_slug_parts(*values):
    """Return the lower-cased, hyphen-separated form of each non-missing, non-empty value."""
    return [
        value.replace(' ', '-').lower()
        for value in values
        if pd.notna(value) and value != ''
    ]


def update_setlist_data(venue_data):
    """Append setlists for shows newer than the saved CSV to the saved data.

    Reads ``Data/UM/From Web/setlists.csv`` under the module-level
    ``script_directory`` and takes its maximum ``Date Played`` as the last
    known show. Venues whose ``Last Played`` is after that date and before
    today are visited; for each show date after the last known show on the
    venue page, the setlist page is scraped with
    ``get_setlist_from_setlist_url``.

    Parameters
    ----------
    venue_data : pandas.DataFrame
        Must provide ``Venue Name``, ``City``, ``State``, ``Country`` and
        ``Last Played`` (comparable with ``datetime.date``) columns.

    Returns
    -------
    pandas.DataFrame
        Existing plus new rows, sorted by date (newest first) and song name,
        with ``Footnote`` NaNs replaced by ''. When no new shows are found the
        existing data is returned in that same normalized form.

    Raises
    ------
    requests.HTTPError
        If a venue or setlist page responds with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    ValueError
        If a venue page contains no tables.
    """
    # Load existing setlist data
    existing_setlist_data = pd.read_csv(script_directory+'/Data/UM/From Web/setlists.csv')
    # The dates are stored as ISO strings, so the string maximum is the latest date.
    last_show = datetime.strptime(existing_setlist_data['Date Played'].max(), '%Y-%m-%d').date()
    print(f"Previous Last Show in Dataset: {last_show}")
    existing_setlist_data['Date Played'] = pd.to_datetime(existing_setlist_data['Date Played']).dt.date

    # Venues played after the last stored show but strictly before today.
    played_since = venue_data['Last Played'] > last_show
    played_before_today = venue_data['Last Played'] < datetime.today().date()
    missing_setlists_venues = venue_data[played_since & played_before_today].copy().reset_index(drop=True)
    # Adjust the venue name for entries including special characters or ending with ", The"
    missing_setlists_venues['Venue Name'] = missing_setlists_venues['Venue Name'].apply(
        lambda x: ('The ' + x[:-5] if x.endswith(', The') else x).replace('&', 'amp').replace("'", '039').replace("!", '')
    )

    base_venue_url = 'https://allthings.umphreys.com/venues/'
    base_setlist_url = 'https://allthings.umphreys.com/setlists/umphreys-mcgee-'
    request_timeout = 30  # seconds; keeps a stalled request from hanging the run

    new_setlists = []
    for _, row in missing_setlists_venues.iterrows():
        venue_name = row['Venue Name']
        city = row['City']
        state = row['State']
        # The same venue/city/state/country slug is used for both URLs.
        location_parts = _url_slug_parts(venue_name, city, state, row['Country'])
        venue_url = base_venue_url + '-'.join(location_parts)

        # Check Venue Page for all dates needed
        response = requests.get(venue_url, timeout=request_timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(response.text, 'html.parser')
        table_tags = soup.find_all('table')
        if not table_tags:
            raise ValueError(f"No tables found at {venue_url}")
        venue_table = pd.read_html(StringIO(str(table_tags)))[0].copy()
        show_dates = pd.to_datetime(venue_table['Date'], errors='coerce').dt.date
        relevant_dates = [d for d in show_dates if pd.notna(d) and d > last_show]
        for show_date in relevant_dates:
            # Build "month-day-year" without '%-d', which is a platform-specific
            # strftime extension and is rejected on Windows.
            date_slug = f"{show_date.strftime('%B')}-{show_date.day}-{show_date.year}".lower()
            setlist_url = base_setlist_url + '-'.join([date_slug] + location_parts) + '.html'
            new_setlist = get_setlist_from_setlist_url(setlist_url, show_date)
            new_setlist['Venue'] = f"{venue_name}, {city}, {state}"
            new_setlists.append(new_setlist)

    frames = [existing_setlist_data]
    if new_setlists:
        new_setlists = pd.concat(new_setlists).reset_index(drop=True)
        new_setlists = new_setlists[['Song Name', 'Date Played', 'Venue', 'Set', 'Song Before', 'Song After','Footnote']]
        new_setlists = new_setlists.sort_values(by=['Date Played', 'Song Name'], ascending=[False, True]).reset_index(drop=True)
        # Undo the URL-oriented substitutions made to the venue name above.
        # NOTE(review): this round trip is lossy (a literal "039" or the word
        # "amp" in a name would be rewritten) - confirm against real venue names.
        new_setlists['Venue'] = new_setlists['Venue'].str.replace(r'\bamp\b', '&', regex=True)
        new_setlists['Venue'] = new_setlists['Venue'].str.replace('039', "'", regex=False)
        new_setlists['Venue'] = np.where(new_setlists['Venue']=="Express Live, Columbus, OH", "Express Live!, Columbus, OH", new_setlists['Venue'])
        new_setlists['Venue'] = np.where(new_setlists['Venue']=='Kemba Live, Columbus, OH', 'KEMBA Live!, Columbus, OH', new_setlists['Venue'])
        new_setlists['Venue'] = np.where(new_setlists['Venue']=="Ram's Head Live, Baltimore, MD", "Ram's Head Live!, Baltimore, MD", new_setlists['Venue'])
        new_setlists['Venue'] = np.where(new_setlists['Venue']=="Virginia Credit Union Live, Richmond, VA", "Virginia Credit Union Live!, Richmond, VA", new_setlists['Venue'])
        # Drops the final four characters when the whole venue string ends in "the".
        new_setlists['Venue'] = new_setlists['Venue'].apply(lambda x: x[:-4] if x.strip().casefold().endswith('the') else x)
        frames.append(new_setlists)
    else:
        # pd.concat([]) raises ValueError, so an already current dataset used to crash here.
        print("No new setlists found; dataset is already up to date.")

    final_setlist = pd.concat(frames).sort_values(by=['Date Played', 'Song Name'], ascending=[False, True]).reset_index(drop=True)
    final_setlist['Footnote'] = final_setlist['Footnote'].fillna('')
    return final_setlist

In [27]:
def _normalize_set_label(label):
    """Map a set label to 'e' (contains 'Encore') or '1'/'2'/'3' (contains 'Set N'); otherwise return it unchanged."""
    if 'Encore' in label:
        return 'e'
    for number in ('1', '2', '3'):
        if f'Set {number}' in label:
            return number
    return label


def get_setlist_from_setlist_url(url, the_date):
    """Scrape one setlist page into a DataFrame with one row per song.

    Each ``<p>`` inside the ``setlist-body`` div that carries a
    ``<b class="setlabel">`` is treated as one set; paragraphs without a label
    are skipped. Songs are read from ``<span class="setlist-songbox">``
    elements, and ``[n]`` markers in their ``<sup>`` tags are resolved against
    the ``setlist-footnotes`` paragraph.

    Parameters
    ----------
    url : str
        Address of the setlist page.
    the_date : date-like
        Value stored in ``Date Played`` for every row (passed through
        ``pandas.to_datetime`` and reduced to a ``datetime.date``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Song Name``, ``Set``, ``Footnote``, ``Song Before``,
        ``Song After`` and ``Date Played``. ``Song Before`` / ``Song After``
        are '***' at the edges of a set. The frame is empty (same columns)
        when the page has no labelled sets with songs.

    Raises
    ------
    requests.HTTPError
        If the page responds with an error status.
    requests.Timeout
        If the server does not answer within the timeout.
    ValueError
        If the page has no ``setlist-body`` div.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes
    soup = BeautifulSoup(response.text, 'html.parser')
    setlist_body = soup.find('div', class_='setlist-body')
    if setlist_body is None:
        # Name the page instead of failing later with an AttributeError on None.
        raise ValueError(f"No setlist body found at {url}")

    # Extract footnotes to reference later: "[n] text" runs keyed by n.
    footnotes_dict = {}
    footnotes_section = soup.find('p', class_='setlist-footnotes')
    if footnotes_section:
        footnote_matches = re.findall(r'\[(\d+)\](.*?)(?=\[\d+\]|$)', footnotes_section.get_text(), re.DOTALL)
        for num, desc in footnote_matches:
            footnotes_dict[num] = desc.strip()

    all_songs = []
    for paragraph in setlist_body.find_all('p'):
        set_label = paragraph.find('b', class_='setlabel')
        if not set_label:
            continue
        current_set = _normalize_set_label(set_label.get_text().strip())

        # Collect this set's songs first so neighbours never cross a set boundary.
        set_songs = []
        for box in paragraph.find_all('span', class_='setlist-songbox'):
            # Prefer the link text; fall back to the box's own text.
            song_link = box.find('a')
            raw_name = (song_link if song_link else box).get_text().strip()
            # Remove a trailing transition symbol (',' or '>').
            song_name = re.sub(r'[,>]$', '', raw_name).strip()

            footnote_refs = []
            for sup in box.find_all('sup'):
                footnote_num = re.search(r'\[(\d+)\]', sup.get_text())
                if footnote_num:
                    footnote_refs.append(footnote_num.group(1))
            footnote_text = " ".join(
                footnotes_dict[ref] for ref in footnote_refs if ref in footnotes_dict
            ).strip()

            set_songs.append({
                'Song Name': song_name,
                'Set': current_set,
                'Footnote': footnote_text,
            })

        # '***' marks "no previous/next song in this set".
        last_index = len(set_songs) - 1
        for position, song_data in enumerate(set_songs):
            song_data['Song Before'] = set_songs[position - 1]['Song Name'] if position > 0 else '***'
            song_data['Song After'] = set_songs[position + 1]['Song Name'] if position < last_index else '***'
            all_songs.append(song_data)

    columns = ['Song Name', 'Set', 'Footnote', 'Song Before', 'Song After']
    if not all_songs:
        # Keep the schema stable so callers can still select columns and concat.
        return pd.DataFrame(columns=columns + ['Date Played'])

    df = pd.DataFrame(all_songs, columns=columns)
    df['Date Played'] = the_date
    df['Date Played'] = pd.to_datetime(df['Date Played']).dt.date

    return df

# Loading Data

In [9]:
# Scrape the venue table (save defaults to False, so nothing is written) and
# preview its first rows.
venue_data = load_venue_data()
venue_data.head()

Unnamed: 0,id,Venue Name,City,State,Country,Times Played,Last Played
0,0,1 Up - Colfax,Denver,CO,USA,1,2013-12-30
1,1,12th & Porter,Nashville,TN,USA,3,2003-10-02
2,2,20 Monroe Live,Grand Rapids,MI,USA,1,2017-02-02
3,3,203 East Marion Street,South Bend,IN,USA,1,1998-10-02
4,4,20th Century Theatre,Cincinnati,OH,USA,2,2002-11-15


In [10]:
# Scrape the song list, write it to songlist.csv (save=True), and preview its
# first rows.
songlist_data = load_songlist_data(save=True)
songlist_data.head()

Songlist data saved.


Unnamed: 0,Song Name,Original Artist,Debut Date,Last Played,Times Played Live,Avg Show Gap
0,"""Brendan & Jake switch rigs""",,2010-04-24,2021-08-22,2,442.33
1,"""Mirro Memories""",,2022-11-12,2022-11-12,1,193.0
2,"""Mrs Robinson's Strut""",,2011-04-02,2011-04-02,1,1219.0
3,#19,,2011-04-02,2011-04-02,1,1219.0
4,#5,,2003-05-15,2024-12-12,173,12.99


In [31]:
# Load the previously saved setlists, newest show first (ties ordered by song
# name), and convert 'Date Played' from strings to datetime.date for inspection.
existing_setlist_data = pd.read_csv(script_directory+'/Data/UM/From Web/setlists.csv').sort_values(by=['Date Played', 'Song Name'], ascending=[False, True]).reset_index(drop=True)
existing_setlist_data['Date Played'] = pd.to_datetime(existing_setlist_data['Date Played']).dt.date
existing_setlist_data.head()

Unnamed: 0,Song Name,Date Played,Venue,Set,Song Before,Song After,Footnote
0,Cemetery Walk,2025-03-01,"Johnny Mercer Theatre, Savannah, GA",1,Domino Theory,Cemetery Walk II,
1,Cemetery Walk II,2025-03-01,"Johnny Mercer Theatre, Savannah, GA",1,Cemetery Walk,Wellwishers,
2,Domino Theory,2025-03-01,"Johnny Mercer Theatre, Savannah, GA",1,Staircase >,Cemetery Walk,
3,Front Porch,2025-03-01,"Johnny Mercer Theatre, Savannah, GA",1,Wellwishers,***,
4,Hangover,2025-03-01,"Johnny Mercer Theatre, Savannah, GA",2,That's All >,> Puppet String,


In [50]:
# Spot-check how venue strings containing 'KEMBA' are spelled in the saved data.
existing_setlist_data[existing_setlist_data['Venue'].str.contains('KEMBA')]

Unnamed: 0,Song Name,Date Played,Venue,Set,Song Before,Song After,Footnote
1323,#5,2024-02-02,"KEMBA Live!, Columbus, OH",1,Conduit,> Wellwishers,
1324,Catshot,2024-02-02,"KEMBA Live!, Columbus, OH",1,***,> Den,
1325,Conduit,2024-02-02,"KEMBA Live!, Columbus, OH",1,It's Not Your Fault >,#5,
1326,Cut the Cable,2024-02-02,"KEMBA Live!, Columbus, OH",e,***,***,
1327,Den,2024-02-02,"KEMBA Live!, Columbus, OH",1,Catshot >,It's Not Your Fault,
1328,Flying,2024-02-02,"KEMBA Live!, Columbus, OH",2,Intentions Clear >,> Slacker,
1329,Higgins,2024-02-02,"KEMBA Live!, Columbus, OH",2,Hurt Bird Bath,> Walletsworth,
1330,Hurt Bird Bath,2024-02-02,"KEMBA Live!, Columbus, OH",2,***,Higgins,
1331,Intentions Clear,2024-02-02,"KEMBA Live!, Columbus, OH",2,Professor Wormbog,> Flying,
1332,It's Not Your Fault,2024-02-02,"KEMBA Live!, Columbus, OH",1,Den,> Conduit,


In [32]:
# First row's date in the saved data; the frame was sorted newest-first when it
# was loaded, so this is the most recent stored show.
existing_setlist_data['Date Played'][0]

datetime.date(2025, 3, 1)

In [33]:
# First row's date in the refreshed data.
# NOTE(review): in the visible notebook `setlist_data` is only assigned in a
# later cell, so on a fresh top-to-bottom run this line would raise NameError;
# consider moving it below that cell.
setlist_data['Date Played'][0]

datetime.date(2025, 3, 15)

In [63]:
# Incrementally refresh the setlists (update=True) using the scraped venue
# table, overwrite setlists.csv (save=True), and display the result.
setlist_data = load_setlist_data(venue_data, update=True, save=True)
setlist_data

Previous Last Show in Dataset: 2025-03-01
Setlist data saved.


Unnamed: 0,Song Name,Date Played,Venue,Set,Song Before,Song After,Footnote
0,Attachments,2025-03-15,"Treefort Music Hall, Boise, ID",2,Mantis,Hindsight,
1,Come As Your Kids,2025-03-15,"Treefort Music Hall, Boise, ID",e,***,***,
2,Comma Later,2025-03-15,"Treefort Music Hall, Boise, ID",1,No Excuses,Nothing Too Fancy,
3,Domino Theory,2025-03-15,"Treefort Music Hall, Boise, ID",1,It's Not Your Fault,Speak Up,
4,Glory,2025-03-15,"Treefort Music Hall, Boise, ID",2,Hindsight,***,
...,...,...,...,...,...,...,...
46110,Philosophy,1998-01-21,"Bridget McGuire's Filling Station, South Bend, IN",1,Muffburger Sandwich,> Linus and Lucy,"debut, Ben Folds Five"
46111,Philosophy,1998-01-21,"Bridget McGuire's Filling Station, South Bend, IN",1,Linus and Lucy >,***,
46112,Pour Some Sugar On Me,1998-01-21,"Bridget McGuire's Filling Station, South Bend, IN",e,***,***,"debut, Def Leppard"
46113,Red Baron,1998-01-21,"Bridget McGuire's Filling Station, South Bend, IN",1,Bob,> Divisions,"debut, Billy Cobham; with Birthday wishes from..."
