In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Lookup table: three-letter English month abbreviation -> zero-padded,
# two-digit month number as a string (e.g. 'Mar' -> '03').
MONTH_CONVERSION = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        (
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
        ),
        start=1,
    )
}

def get_soup(url, timeout=30):
    """Fetch a URL with an HTTP GET and parse the response body as HTML.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block indefinitely on an unresponsive host;
            with one, it raises a timeout exception instead.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes using
        the built-in ``'html.parser'``.

    The HTTP status code is not checked here: the body is parsed whatever
    the status, so error pages are returned as soup as well.
    """
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

def extract_date(soup):
    """Build a ``YYYY-MM-DD`` string from the page's month/day/year spans.

    Reads the text of the first ``<span>`` with class ``month``, ``day`` and
    ``year`` respectively. The month text is translated through
    ``MONTH_CONVERSION`` and the day text is left-padded with zeros to two
    characters.

    Raises:
        KeyError: If the month text is not a key of ``MONTH_CONVERSION``.
        AttributeError: If one of the spans is not found (``find`` returned
            ``None``).
    """
    month_span = soup.find('span', class_='month')
    month_number = MONTH_CONVERSION[month_span.text]

    day_span = soup.find('span', class_='day')
    padded_day = day_span.text.zfill(2)

    year_span = soup.find('span', class_='year')
    year_text = year_span.text

    return "{}-{}-{}".format(year_text, month_number, padded_day)

def extract_location(soup):
    """Return the venue recorded in the page's ``qc:venue`` meta tag.

    Looks up the first ``<meta>`` element whose ``property`` attribute is
    ``'qc:venue'`` and returns its ``content`` attribute. When no such
    element is found, the placeholder ``'Unknown'`` is returned instead.
    """
    venue_tag = soup.find('meta', property='qc:venue')
    if not venue_tag:
        return 'Unknown'
    return venue_tag['content']

def extract_tour_name(soup):
    """Extract the tour name from the soup object.

    Finds the first ``<a>`` whose ``title`` attribute contains the text
    ``'by Pearl Jam setlists'`` and returns the text of the first ``<span>``
    inside that link.

    Returns:
        The span text, or ``'Unknown'`` when no matching link exists or the
        matching link contains no ``<span>``.
    """
    def _is_tour_link(title):
        # ``title`` is None for anchors without a title attribute.
        return bool(title) and 'by Pearl Jam setlists' in title

    tour_link = soup.find('a', title=_is_tour_link)
    if not tour_link:
        return 'Unknown'

    # Guard the inner lookup too: a matching link without a <span> would
    # otherwise fail with AttributeError on ``None.text``.
    name_span = tour_link.find('span')
    return name_span.text if name_span is not None else 'Unknown'

def extract_songs(soup):
    """Group the song titles on a setlist page by set part.

    Walks every ``<li>`` in the order ``find_all`` returns them. An ``<li>``
    carrying the ``encore`` class switches the current part when the text of
    its first ``<span>`` contains ``'Encore 2:'`` (-> ``'Encore2'``) or
    ``'Encore:'`` (-> ``'Encore'``). Any ``<li>`` containing an
    ``<a class="songLabel">`` contributes that link's text to the current
    part. Songs seen before any encore marker go to ``'Main'``.

    Returns:
        A dict with the keys ``'Main'``, ``'Encore'`` and ``'Encore2'``, each
        mapping to a list of song titles in page order.
    """
    songs = {'Main': [], 'Encore': [], 'Encore2': []}
    current_part = 'Main'

    for li in soup.find_all('li'):
        if 'encore' in li.get('class', []):
            # Look the label up once; an encore item without a <span> keeps
            # the current part instead of failing on ``None.text``.
            label = li.find('span')
            label_text = label.text if label is not None else ''
            if 'Encore 2:' in label_text:
                current_part = 'Encore2'
            elif 'Encore:' in label_text:
                current_part = 'Encore'

        song_label = li.find('a', class_='songLabel')
        if song_label:
            songs[current_part].append(song_label.text)

    return songs

def scrape_setlists(base_url):
    """Scrape every setlist reachable from the paginated listing at base_url.

    Requests ``{base_url}?page=N`` for N = 1, 2, 3, ... and, for each
    ``<a class="summary url">`` link on a listing page, fetches the linked
    setlist page and extracts its date, location, tour name and songs.

    Pagination stops when a listing page's title is exactly
    ``"Page Not Found | setlist.fm"`` or when a listing page contains no
    setlist links at all. The second condition keeps the loop from running
    forever if the not-found page's title differs from the expected text.

    Args:
        base_url: URL of the listing page, without the ``?page=`` query.

    Returns:
        A list of dicts, one per song, with the keys ``'Date'``,
        ``'Location'``, ``'Tour'``, ``'Song Title'`` and ``'Setlist Part'``.
    """
    num = 0
    data = []

    while True:
        num += 1
        url = f"{base_url}?page={num}"
        soup = get_soup(url)

        # A page without a <title> element is treated as having an empty
        # title rather than failing on ``None.text``.
        page_title = soup.title.text if soup.title is not None else ''
        if page_title == "Page Not Found | setlist.fm":
            break

        links = soup.find_all('a', class_="summary url")
        if not links:
            # Nothing to scrape on this page: treat it as the end of the
            # listing instead of requesting further pages indefinitely.
            break

        for link in links:
            # The first two characters of the href are dropped before the
            # site root is prepended -- presumably hrefs look like
            # '../setlist/...'; verify against the live markup.
            setlist_url = "https://www.setlist.fm" + link['href'][2:]
            setlist_soup = get_soup(setlist_url)

            formatted_date = extract_date(setlist_soup)
            location = extract_location(setlist_soup)
            tour_name = extract_tour_name(setlist_soup)
            songs = extract_songs(setlist_soup)

            # One output row per song, tagged with the part it was played in.
            for part, song_list in songs.items():
                data.extend(
                    {
                        'Date': formatted_date,
                        'Location': location,
                        'Tour': tour_name,
                        'Song Title': song,
                        'Setlist Part': part,
                    }
                    for song in song_list
                )

    return data

# Listing page whose paginated results are walked by scrape_setlists().
base_url = "https://www.setlist.fm/setlists/pearl-jam-23d6b80b.html"

# Collect one record per song across all listed setlists.
data = scrape_setlists(base_url)

# Tabulate the records and write them to an Excel workbook, omitting the
# DataFrame index column.
df = pd.DataFrame(data=data)
df.to_excel('setlist.xlsx', index=False)