In [17]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import json

In [2]:
# URL of the Glastonbury 2025 lineup page
url = 'https://www.glastonburyfestivals.co.uk/line-up/line-up-2025/'

# Always pass a timeout: without one, requests waits indefinitely if the
# server stops responding. The tuple is (connect timeout, read timeout) in seconds.
response = requests.get(url, timeout=(5, 30))
response.raise_for_status()  # Raise an exception for HTTP errors

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
soup

<!DOCTYPE html>

<html lang="en-GB">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Glastonbury Festival - Line-Up 2025</title>
<link href="https://www.glastonburyfestivals.co.uk/wp-content/themes/glastonbury-2024/assets/favicon/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="https://www.glastonburyfestivals.co.uk/wp-content/themes/glastonbury-2024/assets/favicon/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="https://www.glastonburyfestivals.co.uk/wp-content/themes/glastonbury-2024/assets/favicon/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="https://www.glastonburyfestivals.co.uk/wp-content/themes/glastonbury-2024/assets/favicon/site.webmanifest?v=1749600525" rel="manifest"/>
<link color="#1e1d44" href="https://www.glastonburyfestivals.co.uk/wp-content/themes/glastonbury-2024/assets/favicon/safari-pinned-tab.svg" rel="mask-icon"/>
<meta conten

In [3]:
# Extract the performances from the parsed HTML.
# Each entry built below corresponds to one table row and records the artist,
# stage, day, start time and end time.
performances = []

# Matches clock times such as "22:15". Used instead of fixed-width string
# slices so that separator characters or extra whitespace around the times
# cannot leak into the extracted values.
time_pattern = re.compile(r'\d{1,2}:\d{2}')

# Find all stage containers
for container in soup.find_all('div', class_='stage-container'):
    # The stage name is read from the nearest preceding <h3 class="stage-name">
    stage_name = container.find_previous('h3', class_='stage-name').get_text(strip=True)

    for day in container.find_all('h4', class_='stage-day'):
        day_name = day.get_text(strip=True)
        table = day.find_next('table')
        if table is None:
            # No table follows this day heading, so there are no rows to read
            continue

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Rows with fewer than two <td> cells (e.g. header rows) are ignored
            if len(cells) >= 2:
                artist = cells[0].get_text(strip=True)
                timings = cells[1].get_text(strip=True)

                # First time found is the start, second is the end; a missing
                # time becomes an empty string rather than raising
                times = time_pattern.findall(timings)
                start_time = times[0] if times else ''
                end_time = times[1] if len(times) > 1 else ''

                performances.append({
                    'artist': artist.title(),
                    'stage': stage_name.title(),
                    'day': day_name.title(),
                    'start_time': start_time,
                    'end_time': end_time
                })

# Create DataFrame
extracted_df = pd.DataFrame(performances)

extracted_df

Unnamed: 0,artist,stage,day,start_time,end_time
0,The 1975,Pyramid Stage,Friday,22:15,23:45
1,Biffy Clyro,Pyramid Stage,Friday,20:15,21:15
2,Alanis Morissette,Pyramid Stage,Friday,18:15,19:15
3,Tba,Pyramid Stage,Friday,16:55,17:30
4,Burning Spear,Pyramid Stage,Friday,15:00,16:00
...,...,...,...,...,...
3891,Michael Rosen,Kidzfield Big Top,Sunday,11:45,12:15
3892,Alex & Laura'S Big Monster Party,Kidzfield Big Top,Sunday,11:10,11:35
3893,Fladam Presents: Hallowbean! (A Spook-Tastic M...,Kidzfield Big Top,Sunday,10:15,10:55
3894,Red Riding Hood,Kidzfield Big Top,Sunday,09:30,10:00


In [4]:
# List every distinct stage name found by the scraper, to pick from
print(extracted_df['stage'].unique())

# Main stages to extract artists from.
# Should add more stages when the project is working.
initial_stages = [
    'Pyramid Stage',
    'Other Stage',
    'West Holts Stage',
    'Woodsies',
    'The Park Stage',
    'Acoustic Stage',
    'Avalon Stage',
]
initial_stages

['Pyramid Stage' 'Other Stage' 'West Holts Stage' 'Woodsies'
 'The Park Stage' 'Acoustic Stage' 'Avalon Stage' 'Left Field' 'Arcadia'
 'Levels' 'Glade' 'Lonely Hearts Club' 'Assembly' 'Firmly Rooted'
 'The Information' 'Stonebridge Bar' 'Wishing Well' 'Scissors'
 'Free University Of Glastonbury' 'Hms Sweet Charity' 'Bimble Inn'
 'Humblewell Active Platform' 'Humblewell Retreat Yurt' 'Glade Dome'
 'Pier - 10 Aces Stage' 'Iicon' 'Genosys' 'Nyc Downlow' 'The Meatrack'
 'The Temple' 'The Rum Shack' 'Mez Yard' 'Kinetic' 'Temple Uprising'
 'Tree Stage' 'Babylon Uprising' 'Luna' 'Shangri-La Stage' 'Nomad' 'Lore'
 'Azaadi' 'Flying Bus' 'The Salon Carousel' 'Blind Tiger' 'Greenpeace'
 'The Apocalypse Museum\xa0@ Greenpeace' 'Small World Stage' 'Toad Hall'
 'Speakers Forum' 'Croissant Neuf' 'Croissant Neuf Bandstand'
 'Laboratory Stage' 'Ancient Futures' 'Lunched Out Lizards'
 'Mandala Stage' 'Bbc Introducing' 'The Hive' 'Pier Bandstand'
 'Deluxe Diner' 'The Rocket Lounge' 'Glasto Latino' 'Pilto

['Pyramid Stage',
 'Other Stage',
 'West Holts Stage',
 'Woodsies',
 'The Park Stage',
 'Acoustic Stage',
 'Avalon Stage']

In [5]:
# Keep only rows on the chosen stages, and drop the placeholder acts
# ("Tba" and "Patchwork") in the same selection
filtered_data = extracted_df[
    extracted_df['stage'].isin(initial_stages)
    & ~extracted_df['artist'].isin(["Tba", "Patchwork"])
]
filtered_data

Unnamed: 0,artist,stage,day,start_time,end_time
0,The 1975,Pyramid Stage,Friday,22:15,23:45
1,Biffy Clyro,Pyramid Stage,Friday,20:15,21:15
2,Alanis Morissette,Pyramid Stage,Friday,18:15,19:15
4,Burning Spear,Pyramid Stage,Friday,15:00,16:00
5,Cmat,Pyramid Stage,Friday,13:30,14:30
...,...,...,...,...,...
172,My Baby,Avalon Stage,Sunday,16:50,17:50
173,The Horne Section,Avalon Stage,Sunday,15:20,16:20
174,Brooke Combe,Avalon Stage,Sunday,13:55,14:50
175,Talisk,Avalon Stage,Sunday,12:30,13:25


In [6]:
# Number of performances left after filtering
len(filtered_data)

173

In [15]:
# Inspect the last 59 artist names in the filtered lineup
filtered_data.artist.tail(59)

118                                       Katy J Pearson
119                                        Geordie Greep
120                                          Melin Melyn
121                                         Ani Difranco
122                                        The Searchers
123                                       Dhani Harrison
124                                        Billie Marten
125                                           Skerryvore
126                                        Hugh Cornwell
127                                      Gabrielle Aplin
128                                         Tift Merritt
129                                           Nadia Reid
130                                 Our Man In The Field
131                                            Nick Lowe
132                                     Hothouse Flowers
133                                         Jeremy Loops
134                                          The Coronas
135                            

In [44]:
# Build a nested dictionary keyed by act name; each value holds the act's
# stage, day and start/finish times taken from the first five columns.
# Because the act name is the key, a later row with the same name replaces
# the earlier one.
to_save = {}
for act, stage_label, day_label, start, finish in filtered_data.iloc[:, :5].itertuples(index=False, name=None):
    slot_details = {
        'Stage': str(stage_label).strip(),
        'Day': str(day_label).strip(),
        'Start': str(start).strip(),
        'Finish': str(finish).strip()
    }
    # Newlines inside an act name are flattened into a comma-separated label
    to_save[act.replace("\n", ", ")] = {'Glasto Info': slot_details}
to_save

{'The 1975': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '22:15',
   'Finish': '23:45'}},
 'Biffy Clyro': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '20:15',
   'Finish': '21:15'}},
 'Alanis Morissette': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '18:15',
   'Finish': '19:15'}},
 'Burning Spear': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '15:00',
   'Finish': '16:00'}},
 'Cmat': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '13:30',
   'Finish': '14:30'}},
 'Supergrass': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '12:00',
   'Finish': '13:00'}},
 'Neil Young And The Chrome Hearts': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Saturday',
   'Start': '22:00',
   'Finish': '23:45'}},
 'Raye': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Saturday',
   'Start': '20:00',
   'Finish': '21:

In [45]:
# Act names in insertion order (iterating a dict yields its keys)
names = list(to_save)

In [46]:
# Use ChatGPT to clean up the names so the acts are in a format that can be used elsewhere - not currently used
# Create a pandas df with the Glastonbury names as the first column
import pandas as pd
import openai
from openai import OpenAI
import ast
import time
import json
# Read the OpenAI API key from the local secrets file rather than hardcoding it
with open('../../secrets/keys.json', 'r') as file:
    api_data = json.load(file)
openai_api_key = api_data['OpenAIKey']

# Client used for the name clean-up request
client = OpenAI(api_key=openai_api_key)

In [47]:
# One act name per line, ready to be embedded in the prompt
names_str = '\n'.join(names)

In [54]:
# Build the instruction prompt for the name clean-up request.
# It asks for a Python dict literal (parseable with ast.literal_eval) mapping
# each original lineup name to a cleaned artist name, and appends the
# newline-separated names from `names_str` under the "Lineup:" heading.
prompt = f"""Here is a list of artist names from a lineup, some of the artists are not very clear who is the main artist 
Please return a cleaned dictionary with:
- Remove entries like "tba" or "patchwork"
- Where the act says something like Neil Young And The Chrome Hearts, please return just the main artist e.g Neil Young
- If there are DJ sets, please just get the name of the artist doing it
- The cleaned version is needed to be used in the Genius and Spotify APIs
- Please only Return as a Python dictionary where the key is the original text and the value is the returned string, where ast.literal_eval can be used on the response string to create a dictionary in python

Lineup:
{names_str}
"""
# Display the rendered prompt so it can be checked before it is sent
prompt

'Here is a list of artist names from a lineup, some of the artists are not very clear who is the main artist \nPlease return a cleaned dictionary with:\n- Remove entries like "tba" or "patchwork"\n- Where the act says something like Neil Young And The Chrome Hearts, please return just the main artist e.g Neil Young\n- If there are DJ sets, please just get the name of the artist doing it\n- The cleaned version is needed to be used in the Genius and Spotify APIs\n- Please only Return as a Python dictionary where the key is the original text and the value is the returned string, where ast.literal_eval can be used on the response string to create a dictionary in python\n\nLineup:\nThe 1975\nBiffy Clyro\nAlanis Morissette\nBurning Spear\nCmat\nSupergrass\nNeil Young And The Chrome Hearts\nRaye\nJohn Fogerty\nThe Script\nBrandi Carlile\nKaiser Chiefs\nOlivia Rodrigo\nNoah Kahan\nNile Rodgers & Chic\nRod Stewart\nThe Libertines\nCeleste\nThe Selecter\nLoyle Carner\nBusta Rhymes\nGracie Abrams

In [55]:
# Send the clean-up prompt as a single user message (temperature 0.2)
response = client.chat.completions.create(
    temperature=0.2,
    model="gpt-4.1-nano",
    messages=[{"role": "user", "content": prompt}],
)

In [56]:
# Show the raw text of the first returned choice before parsing it
response.choices[0].message.content

'{\n"The 1975": "The 1975",\n"Biffy Clyro": "Biffy Clyro",\n"Alanis Morissette": "Alanis Morissette",\n"Burning Spear": "Burning Spear",\n"Cmat": "Cmat",\n"Supergrass": "Supergrass",\n"Neil Young And The Chrome Hearts": "Neil Young",\n"Raye": "Raye",\n"John Fogerty": "John Fogerty",\n"The Script": "The Script",\n"Brandi Carlile": "Brandi Carlile",\n"Kaiser Chiefs": "Kaiser Chiefs",\n"Olivia Rodrigo": "Olivia Rodrigo",\n"Noah Kahan": "Noah Kahan",\n"Nile Rodgers & Chic": "Nile Rodgers",\n"Rod Stewart": "Rod Stewart",\n"The Libertines": "The Libertines",\n"Celeste": "Celeste",\n"The Selecter": "The Selecter",\n"Loyle Carner": "Loyle Carner",\n"Busta Rhymes": "Busta Rhymes",\n"Gracie Abrams": "Gracie Abrams",\n"Franz Ferdinand": "Franz Ferdinand",\n"Wet Leg": "Wet Leg",\n"Inhaler": "Inhaler",\n"Rizzle Kicks": "Rizzle Kicks",\n"Fabio & Grooverider And The Outlook Orchestra": "Fabio & Grooverider",\n"Charli Xcx": "Charli Xcx",\n"Deftones": "Deftones",\n"Ezra Collective": "Ezra Collective",\

In [57]:
def _parse_name_mapping(raw_text):
    """Turn the model's text answer into a Python dictionary.

    The text is first stripped of an optional Markdown code fence, then parsed
    with ``ast.literal_eval`` (the format the prompt asks for). If that fails,
    ``json.loads`` is tried so that JSON-only literals such as ``null`` do not
    break the cell.

    Args:
        raw_text: The message content returned by the chat completion.

    Returns:
        The parsed dictionary.

    Raises:
        ValueError: If the content is missing, cannot be parsed, or does not
            evaluate to a dictionary.
    """
    if raw_text is None:
        raise ValueError("The model response has no text content to parse")

    cleaned = raw_text.strip()

    # Remove a surrounding ```lang ... ``` fence if the model added one
    fenced = re.fullmatch(r"```[\w-]*\s*(.*?)\s*```", cleaned, flags=re.DOTALL)
    if fenced:
        cleaned = fenced.group(1)

    try:
        # literal_eval only evaluates literals; it never executes code
        parsed = ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"Could not parse the model response as a dictionary: {cleaned[:200]!r}"
            ) from exc

    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a dictionary from the model, got {type(parsed).__name__}"
        )
    return parsed


# Turn the response string into a python dictionary
returned_dict = _parse_name_mapping(response.choices[0].message.content)
returned_dict

{'The 1975': 'The 1975',
 'Biffy Clyro': 'Biffy Clyro',
 'Alanis Morissette': 'Alanis Morissette',
 'Burning Spear': 'Burning Spear',
 'Cmat': 'Cmat',
 'Supergrass': 'Supergrass',
 'Neil Young And The Chrome Hearts': 'Neil Young',
 'Raye': 'Raye',
 'John Fogerty': 'John Fogerty',
 'The Script': 'The Script',
 'Brandi Carlile': 'Brandi Carlile',
 'Kaiser Chiefs': 'Kaiser Chiefs',
 'Olivia Rodrigo': 'Olivia Rodrigo',
 'Noah Kahan': 'Noah Kahan',
 'Nile Rodgers & Chic': 'Nile Rodgers',
 'Rod Stewart': 'Rod Stewart',
 'The Libertines': 'The Libertines',
 'Celeste': 'Celeste',
 'The Selecter': 'The Selecter',
 'Loyle Carner': 'Loyle Carner',
 'Busta Rhymes': 'Busta Rhymes',
 'Gracie Abrams': 'Gracie Abrams',
 'Franz Ferdinand': 'Franz Ferdinand',
 'Wet Leg': 'Wet Leg',
 'Inhaler': 'Inhaler',
 'Rizzle Kicks': 'Rizzle Kicks',
 'Fabio & Grooverider And The Outlook Orchestra': 'Fabio & Grooverider',
 'Charli Xcx': 'Charli Xcx',
 'Deftones': 'Deftones',
 'Ezra Collective': 'Ezra Collective',
 'A

In [58]:
# Attach the cleaned name to every act that the model returned an entry for;
# acts missing from the returned mapping are left without a 'Cleaned Name'
for original_name, entry in to_save.items():
    if original_name in returned_dict:
        entry['Cleaned Name'] = returned_dict[original_name]
to_save

{'The 1975': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '22:15',
   'Finish': '23:45'},
  'Cleaned Name': 'The 1975'},
 'Biffy Clyro': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '20:15',
   'Finish': '21:15'},
  'Cleaned Name': 'Biffy Clyro'},
 'Alanis Morissette': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '18:15',
   'Finish': '19:15'},
  'Cleaned Name': 'Alanis Morissette'},
 'Burning Spear': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '15:00',
   'Finish': '16:00'},
  'Cleaned Name': 'Burning Spear'},
 'Cmat': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '13:30',
   'Finish': '14:30'},
  'Cleaned Name': 'Cmat'},
 'Supergrass': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '12:00',
   'Finish': '13:00'},
  'Cleaned Name': 'Supergrass'},
 'Neil Young And The Chrome Hearts': {'Glasto Info': {'Stage': '

In [59]:
# Persist the nested lineup dictionary as JSON indented by four spaces
with open("../../data/mid/nested_dict.json", "w") as json_file:
    json_file.write(json.dumps(to_save, indent=4))