In [1]:
# Import Libraries
import requests
import re
import time
import pandas as pd
import numpy as np
import json
import os
import sqlite3
import Levenshtein
from bs4 import BeautifulSoup
from tqdm import tqdm

# Get Genius Keys and Tokens
with open('../../secrets/keys.json', 'r') as file:
    api_data = json.load(file)
    Genius_ID = api_data['GeniusID']
    Genius_Secret = api_data['GeniusSecret']
    Genius_Token = api_data['GeniusToken']

In [2]:
# Read in the list of cleaned artist
with open("../../data/mid/nested_dict_v2.json", "r") as f:
    loaded_dict = json.load(f)
cleaned_names = [artist_info['Cleaned Name'] for artist_info in loaded_dict.values()]
cleaned_names

['The 1975',
 'Biffy Clyro',
 'Alanis Morissette',
 'Burning Spear',
 'Cmat',
 'Supergrass',
 'Neil Young',
 'Raye',
 'John Fogerty',
 'The Script',
 'Brandi Carlile',
 'Kaiser Chiefs',
 'Olivia Rodrigo',
 'Noah Kahan',
 'Nile Rodgers',
 'Rod Stewart',
 'The Libertines',
 'Celeste',
 'The Selecter',
 'Loyle Carner',
 'Busta Rhymes',
 'Gracie Abrams',
 'Franz Ferdinand',
 'Wet Leg',
 'Inhaler',
 'Rizzle Kicks',
 'Fabio & Grooverider',
 'Charli Xcx',
 'Deftones',
 'Ezra Collective',
 'Amyl',
 'Weezer',
 'Beabadoobee',
 'Good Neighbours',
 'Alessi Rose',
 'The Prodigy',
 'Wolf Alice',
 'Snow Patrol',
 'Turnstile',
 'Joy Crookes',
 'Shaboozey',
 'Nadine Shah',
 'Louis Dunford',
 'Maribou State',
 'Badbadnotgood',
 'Denzel Curry',
 'En Vogue',
 'Vieux Farka Toure',
 'Glass Beams',
 'Ca7Riel',
 'Corto.Alto',
 'Doechii',
 'Amaarae',
 'Greentea Peng',
 'Yussef Dayes',
 'Kneecap',
 'Bob Vylan',
 'Nilüfer Yanya',
 'Infinity Song',
 'Overmono',
 'Parcels',
 'The Brian Jonestown Massacre',
 'Goat'

In [4]:
# Attempting one artist pull to see the response
url = "https://api.genius.com/search"
headers = {"Authorization": f"Bearer {Genius_Token}"}
params = {'q': "Amyl"}
    
response = requests.get(url, headers = headers, params = params)
data = response.json()
data

{'meta': {'status': 200},
 'response': {'hits': [{'highlights': [],
    'index': 'song',
    'type': 'song',
    'result': {'annotation_count': 10,
     'api_path': '/songs/2875227',
     'artist_names': 'Amy Shark',
     'full_title': 'Adore by\xa0Amy\xa0Shark',
     'header_image_thumbnail_url': 'https://images.genius.com/cecc4b3dc7a42e780072905927ad4e72.300x300x1.jpg',
     'header_image_url': 'https://images.genius.com/cecc4b3dc7a42e780072905927ad4e72.1000x1000x1.jpg',
     'id': 2875227,
     'lyrics_owner_id': 2034060,
     'lyrics_state': 'complete',
     'path': '/Amy-shark-adore-lyrics',
     'primary_artist_names': 'Amy Shark',
     'pyongs_count': 6,
     'relationships_index_url': 'https://genius.com/Amy-shark-adore-sample',
     'release_date_components': {'year': 2016, 'month': 8, 'day': 1},
     'release_date_for_display': 'August 1, 2016',
     'release_date_with_abbreviated_month_for_display': 'Aug. 1, 2016',
     'song_art_image_thumbnail_url': 'https://images.genius.

In [5]:
Levenshtein.distance("Amyl and the Sniffers", "Amyl")

17

In [7]:
url = "https://api.genius.com/search"
headers = {"Authorization": f"Bearer {Genius_Token}"}
most_likely_name = []
most_likely_id = []
# Do the above but for all artists in the list
for clean_name_i in cleaned_names:
    params = {'q': clean_name_i}
    response = requests.get(url, headers = headers, params = params)
    data_i = response.json()
    # If statement added to save time
    if data_i['response']['hits'][0]['result']['primary_artist']['name'].lower().strip() == clean_name_i.lower().strip():
        output_name = data_i['response']['hits'][0]['result']['primary_artist']['name']
        output_id = data_i['response']['hits'][0]['result']['primary_artist']['id']
    else:
        hit_names = [data_i['response']['hits'][i]['result']['primary_artist']['name'] for i in range(len(data_i['response']['hits']))]
        hit_ids = [data_i['response']['hits'][i]['result']['primary_artist']['id'] for i in range(len(data_i['response']['hits']))]
        hit_df = pd.DataFrame({"hit_id": hit_ids, "hit_name": hit_names, "artist_name":clean_name_i})
        # Use Levenshtein distance to make more accurate
        hit_df['lev_dist'] = hit_df.apply(
            lambda row: Levenshtein.distance(row['hit_name'].lower(), row['artist_name'].lower()),
            axis=1
        )
        # If the whole of the artist anme is in the hit_name then give a 1
        hit_df['in_return'] = hit_df.apply(
            lambda row: 1 if pd.Series(row['hit_name']).str.contains(row['artist_name'], regex=False).any() else 0,
            axis=1
        )
        hit_df.sort_values(['lev_dist'], inplace = True)
        # print(hit_df)
        if hit_df['in_return'].sum() > 0:
            ouput_df = hit_df[hit_df['in_return'] == 1]
            output_id = ouput_df.iloc[0, 0]
            output_name = ouput_df.iloc[0, 1]
        elif hit_df['lev_dist'].min() >= 0.8*len(clean_name_i) or hit_df['in_return'].min() >= 8:
            output_id = np.nan
            output_name = np.nan
        else:
            output_id = hit_df.iloc[0, 0]
            output_name = hit_df.iloc[0, 1]
    most_likely_name.append(output_name)
    most_likely_id.append(output_id)

In [8]:
outputs = pd.DataFrame({"names": cleaned_names, "most_likely_name":most_likely_name, "most_likely_id":most_likely_id})
outputs

Unnamed: 0,names,most_likely_name,most_likely_id
0,The 1975,The 1975,45824.0
1,Biffy Clyro,Biffy Clyro,62717.0
2,Alanis Morissette,Alanis Morissette,34995.0
3,Burning Spear,Burning Spear,208175.0
4,Cmat,CMAT,2450044.0
...,...,...,...
168,My Baby,,
169,The Horne Section,Alex Horne & The Horne Section,2119663.0
170,Brooke Combe,Brooke Combe,2720203.0
171,Talisk,Talisk,1460995.0


In [None]:
# Define functions to get artist song urls and also to retrieve songs fomr the artists songs

In [None]:
def get_artist_top_songs(artist_id, access_token, limit=10):
    url = f"https://api.genius.com/artists/{artist_id}/songs"
    headers = {"Authorization": f"Bearer {access_token}"}
    params = {
        "sort": "popularity",  # Sort by popularity
        "per_page": limit,
        "page": 1
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200:
        songs = response.json()["response"]["songs"]
        return [(song["title"], song["url"]) for song in songs]
    else:
        print(f"Error: {response.status_code}")
        return []

In [66]:
def return_song_lyrics(song_url):
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(song_url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all containers that have lyrics in
    lyrics_divs = soup.find_all("div", class_=lambda x: x and "Lyrics__Container" in x)

    if not lyrics_divs:
        lyrics_divs = soup.select(".song_body-lyrics p")

    raw_lines = []

    for div in lyrics_divs:
        lines = div.get_text(separator="\n", strip=True).split("\n")
        raw_lines.extend(lines)
        
    clean_lines = []

    for line in raw_lines:
        line = line.strip()
        # Skip lines with [Section headers]
        if re.match(r"^\[.*\]$", line):
            continue
        # Skip contributors and metadata
        if re.match(r"^\d+\s+Contributors$", line):
            continue
        if line.lower().endswith("lyrics") and len(line.split()) < 6:
            continue
        if re.match(r"^\s*(Sung by|Performed by|Featuring|Written by)", line, re.IGNORECASE):
            continue
        clean_lines.append(line)

    # Join into one clean text block
    full_lyrics = "; ".join(clean_lines)

    # Replace multiple spaces with one
    full_lyrics = re.sub(r"\s+", " ", full_lyrics).strip()

    # If there is a Read More element then we take ony what is after the Read More element
    full_lyrics = full_lyrics.split("Read More;", 1)[-1].strip()

    # Remove anything left in []s
    full_lyrics = re.sub(r'\[.*?\]', '', full_lyrics)

    return full_lyrics 

In [57]:
full_names = list(loaded_dict.keys())
print(len(cleaned_names))
print(len(full_names))
full_names

['The 1975',
 'Biffy Clyro',
 'Alanis Morissette',
 'Burning Spear',
 'Cmat',
 'Supergrass',
 'Neil Young And The Chrome Hearts',
 'Raye',
 'John Fogerty',
 'The Script',
 'Brandi Carlile',
 'Kaiser Chiefs',
 'Olivia Rodrigo',
 'Noah Kahan',
 'Nile Rodgers & Chic',
 'Rod Stewart',
 'The Libertines',
 'Celeste',
 'The Selecter',
 'Loyle Carner',
 'Busta Rhymes',
 'Gracie Abrams',
 'Franz Ferdinand',
 'Wet Leg',
 'Inhaler',
 'Rizzle Kicks',
 'Fabio & Grooverider And The Outlook Orchestra',
 'Charli Xcx',
 'Deftones',
 'Ezra Collective',
 'Amyl & The Sniffers',
 'Weezer',
 'Beabadoobee',
 'Good Neighbours',
 'Alessi Rose',
 'The Prodigy',
 'Wolf Alice',
 'Snow Patrol',
 'Turnstile',
 'Joy Crookes',
 'Shaboozey',
 'Nadine Shah',
 'Louis Dunford',
 'Maribou State',
 'Badbadnotgood',
 'Denzel Curry',
 'En Vogue',
 'Vieux Farka Toure',
 'Glass Beams',
 'Ca7Riel & Paco Amoroso',
 'Corto.Alto',
 'Doechii',
 'Amaarae',
 'Greentea Peng',
 'Yussef Dayes',
 'Kneecap',
 'Bob Vylan',
 'Nilüfer Yanya'

In [71]:
# Create a loop to go through the artists and retrieve their song lyrics
new_dict = loaded_dict.copy()
run_through = 0
for most_likely_id, long_name in tqdm(zip(outputs['most_likely_id'], full_names)):
    art_id = str(most_likely_id).replace('.0', '')
    art_top_songs = get_artist_top_songs(art_id, Genius_Token, 10)
    song_titles, lyrics = [], []
    for title, art_song_url in art_top_songs:
        song_lyrics = return_song_lyrics(art_song_url)
        song_titles.append(title)
        lyrics.append(song_lyrics)
    new_dict[long_name]['Genius Info'] = {"song_titles":song_titles, "lyrics":lyrics}
new_dict

55it [11:48,  9.68s/it]

Error: 404


83it [17:25,  8.87s/it]

Error: 404


135it [28:19,  9.34s/it]

Error: 404


142it [29:28,  8.51s/it]

Error: 404


152it [31:06,  8.39s/it]

Error: 404


164it [33:40,  9.61s/it]

Error: 404


169it [34:30,  8.53s/it]

Error: 404


173it [35:10, 12.20s/it]


{'The 1975': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '22:15',
   'Finish': '23:45'},
  'Cleaned Name': 'The 1975',
  'Spotify Info': {'Spotify Followers': 7800048, 'Popularity': 78},
  'Genius Info': {'song_titles': ['Somebody Else',
    'Love It If We Made It',
    'Robbers',
    'Sex',
    'Chocolate',
    'About You',
    'Sincerity Is Scary',
    'Girls',
    'Somebody Else (Girl Version)',
    'It’s Not Living (If It’s Not With You)'],
   'lyrics': ["So I heard you found somebody else; And at first, I thought it was a lie; I took all my things that make sounds; The rest I can do without; I don't want your body; But I hate to think about you with somebody else; Our love has gone cold; You're intertwining your soul with somebody else; I'm looking through you; While you're looking through your phone; And then leaving with somebody else; No, I don't want your body; But I'm picturing your body with somebody else (Else, else...); I don't want your body

In [85]:
# Quick check of how many songs lyrics were saved for each artist
for i in new_dict.keys():
    print(i)
    print(len(new_dict[i]['Genius Info']['song_titles']))
    print(len(new_dict[i]['Genius Info']['lyrics']))

The 1975
10
10
Biffy Clyro
10
10
Alanis Morissette
10
10
Burning Spear
9
9
Cmat
10
10
Supergrass
10
10
Neil Young And The Chrome Hearts
10
10
Raye
10
10
John Fogerty
10
10
The Script
10
10
Brandi Carlile
10
10
Kaiser Chiefs
10
10
Olivia Rodrigo
10
10
Noah Kahan
10
10
Nile Rodgers & Chic
10
10
Rod Stewart
10
10
The Libertines
10
10
Celeste
10
10
The Selecter
10
10
Loyle Carner
10
10
Busta Rhymes
10
10
Gracie Abrams
10
10
Franz Ferdinand
10
10
Wet Leg
10
10
Inhaler
10
10
Rizzle Kicks
10
10
Fabio & Grooverider And The Outlook Orchestra
10
10
Charli Xcx
10
10
Deftones
10
10
Ezra Collective
10
10
Amyl & The Sniffers
10
10
Weezer
10
10
Beabadoobee
10
10
Good Neighbours
10
10
Alessi Rose
10
10
The Prodigy
10
10
Wolf Alice
10
10
Snow Patrol
10
10
Turnstile
10
10
Joy Crookes
10
10
Shaboozey
10
10
Nadine Shah
10
10
Louis Dunford
10
10
Maribou State
10
10
Badbadnotgood
10
10
Denzel Curry
10
10
En Vogue
10
10
Vieux Farka Toure
10
10
Glass Beams
9
9
Ca7Riel & Paco Amoroso
10
10
Corto.Alto
10
10
Doe

In [86]:
# Using chatgpt to determine if an artist should have their lyrics used for NLP
import openai
from openai import OpenAI
import ast
import json
with open('../../secrets/keys.json', 'r') as file:
    api_data = json.load(file)
    openai_api_key = api_data['OpenAIKey']

client = openai.OpenAI(api_key = openai_api_key)

In [87]:
names_str = "\n".join(cleaned_names)

In [117]:
# Create a prompt to clean up the list of names
prompt = """Here is a list of artist names from a lineup, I am looking at analysing the lyrics of artists using NLP. Please indicate worth an
artist is worth conducting NLP on with a yes or a no. An artist that does not have a vocalist when they perform would likely be a no or
if most of their songs have very few words in. If an artist is a singer songwriter then the answer would definitely ne a yes. Bands such as the
prodigy or maribou state where there songs have a lot of lyrics should be a yes, same with ezra collective and badbadnotgood as they have lyrics in the
majority of their songs
Please return a cleaned python dictionary with that I will be able to use ast on to turn into a python dictionaty:
- Each artist as they key of the string 
- A simple yes or no as the value as to whether that artist should have its
An example of the return looks as follows:
{"artist_1":answer_1, "artist_2":answer_2, "artist_3":answer_3, "artist_4":answer_4}

Lineup:""" + names_str
prompt

'Here is a list of artist names from a lineup, I am looking at analysing the lyrics of artists using NLP. Please indicate worth an\nartist is worth conducting NLP on with a yes or a no. An artist that does not have a vocalist when they perform would likely be a no or\nif most of their songs have very few words in. If an artist is a singer songwriter then the answer would definitely ne a yes. Bands such as the\nprodigy or maribou state where there songs have a lot of lyrics should be a yes, same with ezra collective and badbadnotgood as they have lyrics in the\nmajority of their songs\nPlease return a cleaned python dictionary with that I will be able to use ast on to turn into a python dictionaty:\n- Each artist as they key of the string \n- A simple yes or no as the value as to whether that artist should have its\nAn example of the return looks as follows:\n{"artist_1":answer_1, "artist_2":answer_2, "artist_3":answer_3, "artist_4":answer_4}\n\nLineup:The 1975\nBiffy Clyro\nAlanis Mori

In [118]:
response = client.chat.completions.create(
    model="gpt-4.1-mini",
    messages=[
        {"role": "user", "content": prompt}
    ],
    temperature=0.2,
)
api_response = response.choices[0].message.content

In [119]:
# Remove the ```python markers and strip whitespace
cleaned_response = api_response.strip("```python\n").strip()

# Convert to dictionary
response_dict = ast.literal_eval(cleaned_response)
response_dict

{'The 1975': 'yes',
 'Biffy Clyro': 'yes',
 'Alanis Morissette': 'yes',
 'Burning Spear': 'yes',
 'Cmat': 'yes',
 'Supergrass': 'yes',
 'Neil Young': 'yes',
 'Raye': 'yes',
 'John Fogerty': 'yes',
 'The Script': 'yes',
 'Brandi Carlile': 'yes',
 'Kaiser Chiefs': 'yes',
 'Olivia Rodrigo': 'yes',
 'Noah Kahan': 'yes',
 'Nile Rodgers': 'no',
 'Rod Stewart': 'yes',
 'The Libertines': 'yes',
 'Celeste': 'yes',
 'The Selecter': 'yes',
 'Loyle Carner': 'yes',
 'Busta Rhymes': 'yes',
 'Gracie Abrams': 'yes',
 'Franz Ferdinand': 'yes',
 'Wet Leg': 'yes',
 'Inhaler': 'yes',
 'Rizzle Kicks': 'yes',
 'Fabio & Grooverider': 'no',
 'Charli Xcx': 'yes',
 'Deftones': 'yes',
 'Ezra Collective': 'yes',
 'Amyl': 'yes',
 'Weezer': 'yes',
 'Beabadoobee': 'yes',
 'Good Neighbours': 'yes',
 'Alessi Rose': 'yes',
 'The Prodigy': 'yes',
 'Wolf Alice': 'yes',
 'Snow Patrol': 'yes',
 'Turnstile': 'yes',
 'Joy Crookes': 'yes',
 'Shaboozey': 'yes',
 'Nadine Shah': 'yes',
 'Louis Dunford': 'yes',
 'Maribou State': 

In [121]:
new_dict['The 1975']['Genius Info']

{'song_titles': ['Somebody Else',
  'Love It If We Made It',
  'Robbers',
  'Sex',
  'Chocolate',
  'About You',
  'Sincerity Is Scary',
  'Girls',
  'Somebody Else (Girl Version)',
  'It’s Not Living (If It’s Not With You)'],
 'lyrics': ["So I heard you found somebody else; And at first, I thought it was a lie; I took all my things that make sounds; The rest I can do without; I don't want your body; But I hate to think about you with somebody else; Our love has gone cold; You're intertwining your soul with somebody else; I'm looking through you; While you're looking through your phone; And then leaving with somebody else; No, I don't want your body; But I'm picturing your body with somebody else (Else, else...); I don't want your body, I don't want your body; I don't want your body, I don't want your body; I don't want your body, I don't want your body; I don't want your body, I don't want your body; And c'mon baby (I know); This ain't the last time that I'll see your face; And c'mon 

In [123]:
for f_name, yes_no in zip(full_names, response_dict.values()):
    new_dict[f_name]['Genius Info']['should_analyse'] = yes_no
new_dict

{'The 1975': {'Glasto Info': {'Stage': 'Pyramid Stage',
   'Day': 'Friday',
   'Start': '22:15',
   'Finish': '23:45'},
  'Cleaned Name': 'The 1975',
  'Spotify Info': {'Spotify Followers': 7800048, 'Popularity': 78},
  'Genius Info': {'song_titles': ['Somebody Else',
    'Love It If We Made It',
    'Robbers',
    'Sex',
    'Chocolate',
    'About You',
    'Sincerity Is Scary',
    'Girls',
    'Somebody Else (Girl Version)',
    'It’s Not Living (If It’s Not With You)'],
   'lyrics': ["So I heard you found somebody else; And at first, I thought it was a lie; I took all my things that make sounds; The rest I can do without; I don't want your body; But I hate to think about you with somebody else; Our love has gone cold; You're intertwining your soul with somebody else; I'm looking through you; While you're looking through your phone; And then leaving with somebody else; No, I don't want your body; But I'm picturing your body with somebody else (Else, else...); I don't want your body

In [124]:
with open("../../data/mid/nested_dict_v3.json", "w") as f:
    json.dump(new_dict, f, indent=4)