# Project 2
For project 2, we built an ETL pipeline to create a database containing data on Phish live shows from 1993-2023. First, we scrapped the website phish.net for setlists and the Wikipedia page for Phish concert tours and festivals for attendance and box office data. We then transformed the extracted data, reformatting columns, value formatting, and dropping rows that lacked useful data. We stored all of the results for each year in a variable, which were in turn each stored in a list. To load our data, we converted each variable to a DataFrame and then wrote the DataFrames to CSV files. Finally, we loaded the data from the CSV files directly to SQLite and PostgreSQL. There are many interesting questions that can be explored when analyzing the database we have prepared. For example, we could look at how City and Year affect Attendance and Attendance/Capacity. We could count the recurrence of previous Cities and weigh Attendance and Gross to create a predictive model to determine the likeliest cities to be announced for future show dates. 

In [1]:
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import shutil
from pathlib import Path
import sqlite3
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Text, String, DateTime, Float
import pycountry

## Part 1: Extract
#### Scrapping for Data

In [2]:
# Pull in setlist data
url = 'https://phish.net/setlists/phish/'
respons = requests.get(url)

phish_soup = BeautifulSoup(respons.text, 'html.parser')

Now we begin scraping the setlist data!

In [3]:
dates = phish_soup.find_all('span', class_='setlist-date')

date_strings = [date.text[-11:] for date in dates]

cleaned_date_strings = [date.strip() for date in date_strings]

cleaned_date_strings[0:5]

['04/21/2024', '04/20/2024', '04/19/2024', '04/18/2024', '02/24/2024']

In [4]:
venues = phish_soup.find_all('div', class_='setlist-venue')

try:
    venue_names = [venue.text.strip().title() for venue in venues]
except Exception as e:
    venue_names = [venue.find('span').text.strip() for venue in venues]

venue_names

['Sphere',
 'Sphere',
 'Sphere',
 'Sphere',
 'Moon Palace',
 'Moon Palace',
 'Moon Palace',
 'Moon Palace',
 'Moon Palace',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'United Center',
 'United Center',
 'United Center',
 'Ervin J. Nutter Center, Wright State University',
 'Ervin J. Nutter Center, Wright State University',
 'Bridgestone Arena',
 'Bridgestone Arena',
 'Bridgestone Arena',
 "Dick'S Sporting Goods Park",
 "Dick'S Sporting Goods Park",
 "Dick'S Sporting Goods Park",
 "Dick'S Sporting Goods Park",
 'Broadview Stage At Spac',
 'Broadview Stage At Spac',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'Madison Square Garden',
 'Td Pavilion At The Mann',
 'Td Pavilion At The Mann',
 "St. Joseph'S Health Amphitheater At Lakeview",
 'The Pavilion At Star Lake',
 'The Pavilion At Star Lake',
 'Live Oak Bank Pavil

In [None]:
venues

In [None]:
# Define a mapping for known exceptions
exception_mapping = {
    "Quintana Roo": "MX",
    "Cancun": "MX",  # Add other exceptions as needed
}

def get_country_code(location):
    # Check if the location is in the exception mapping
    if location in exception_mapping:
        return exception_mapping[location]
    try:
        # Check if the location is a valid country name
        return pycountry.countries.lookup(location).alpha_2
    except LookupError:
        # If the country is not found, return None
        return None

def get_state_or_country(location):
    # Split the location by comma to separate city and state/country
    parts = location.split(',')
    city = parts[0].strip().title()
    if len(parts) > 1:
        state_or_country = parts[1].strip().title()
        # Try to get country code
        country_code = get_country_code(state_or_country)
        if country_code:
            return city, country_code
        else:
            # If not a country, assume it's a state
            return city, state_or_country.upper()
    return city, None

locations = phish_soup.find_all('div', class_='setlist-location')
locations = [location.text.strip() for location in locations]

cities_states = [get_state_or_country(location) for location in locations]
cities = [city for city, _ in cities_states]
states_or_countries = [state_or_country for _, state_or_country in cities_states]

In [None]:
print(states_or_countries[:5])
print(cities_states[:5])
print(cities[:5])

In [None]:
# phish_p1_df = pd.DataFrame({
#     'Date': cleaned_date_strings,
#     'Venue': venue_names,
#     'City': cities,
#     'State': states
# })
# phish_p1_df

In [None]:
url2 = 'https://phish.net/setlists/?year='
all_years_dates = []
all_venues = []
all_cities = []
all_states = []

# Get the current year
current_year = datetime.now().year

# Loop from 1982 to the current year
for year in range(1982, current_year + 1):
    
    year_url = url2 + str(year)
    
    respons = requests.get(year_url)

    phish_soup = BeautifulSoup(respons.text, 'html.parser')
    
    # Dates
    dates = phish_soup.find_all('span', class_='setlist-date')

    date_strings = [date.text[-11:] for date in dates]

#     cleaned_date_strings = [date.strip().strftime('%Y%m%d') for date in date_strings]
    cleaned_date_strings = [datetime.strptime(date.strip(), '%m/%d/%Y').strftime('%Y-%m-%d') for date in date_strings]

    
    all_years_dates.extend(cleaned_date_strings)
    
    # Venues
    venues = phish_soup.find_all('div', class_='setlist-venue')

    venue_strings = [venue.find('span').text.strip().title().replace("'S", "'s") for venue in venues]
    
    all_venues.extend(venue_strings)
    
    # Locations/ City/ State
    locations = phish_soup.find_all('div', class_='setlist-location')

    locations = [location.text.strip() for location in locations]

    cities = [location.split(',')[0].title() for location in locations]
    all_cities.extend(cities)
    
    states = [location[-2:].upper() for location in locations]
    all_states.extend(states)


In [5]:
url2 = 'https://phish.net/setlists/?year='
all_years_dates = []
all_venues = []
all_cities = []
all_states = []

# Define a mapping for known exceptions
exception_mapping = {
    "Quintana Roo": "MX",
    "Cancun": "MX",  # Add other exceptions as needed
}

def get_country_code(location):
    # Check if the location is in the exception mapping
    if location in exception_mapping:
        return exception_mapping[location]
    try:
        # Check if the location is a valid country name
        return pycountry.countries.lookup(location).alpha_2
    except LookupError:
        # If the country is not found, return None
        return None

def get_state_or_country(location):
    # Split the location by comma to separate city and state/country
    parts = location.split(',')
    city = parts[0].strip().title()
    if len(parts) > 1:
        state_or_country = parts[1].strip().title()
        # Try to get country code
        country_code = get_country_code(state_or_country)
        if country_code:
            return city, country_code
        else:
            # If not a country, assume it's a state
            return city, state_or_country.upper()
    return city, None

# Get the current year
current_year = datetime.now().year

# Loop from 1982 to the current year
for year in range(1982, current_year + 1):
    year_url = url2 + str(year)
    response = requests.get(year_url)
    phish_soup = BeautifulSoup(response.text, 'html.parser')

    # Dates
    dates = phish_soup.find_all('span', class_='setlist-date')
    date_strings = [date.text[-11:] for date in dates]
    cleaned_date_strings = [datetime.strptime(date.strip(), '%m/%d/%Y').strftime('%Y-%m-%d') for date in date_strings]
    all_years_dates.extend(cleaned_date_strings)

    # Venues
    venues = phish_soup.find_all('div', class_='setlist-venue')
    venue_strings = [venue.find('span').text.strip().title().replace("'S", "'s") for venue in venues]
    all_venues.extend(venue_strings)

    # Locations/ City/ State
    locations = phish_soup.find_all('div', class_='setlist-location')
    locations = [location.text.strip() for location in locations]

    cities_states = [get_state_or_country(location) for location in locations]
    cities = [city for city, _ in cities_states]
    states_or_countries = [state_or_country for _, state_or_countries in cities_states]

    all_cities.extend(cities)
    all_states.extend(states_or_countries)

print(all_years_dates[:5])
print(all_venues[:5])
print(all_cities[:5])
print(all_states[:5])

NameError: name 'state_or_country' is not defined

In [None]:
phish_shows_df = pd.DataFrame({
    'Date': all_years_dates,
    'Venue': all_venues,
    'City': all_cities,
    'State': all_states
})
phish_shows_df