# Lyrics Scraping

The following code is used to scrape the song lyrics from the Genius website using the Python libraries BeautifulSoup and Selenium.

In [None]:
pip install beautifulsoup4 requests selenium webdriver-manager

### Beautiful scoup

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import unicodedata
import re

def normalize_text(text):
    """Normalize characters in text, removing accents and handling dashes."""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'[^\w\s-]', '', text)
    return text.lower()

def format_slug(text, preserve_dashes=False):
    """Formats text to fit the URL structure of Genius.com, optionally preserving dashes."""
    text = normalize_text(text)
    text = re.sub(r'\s+', '-', text)
    return text

def scrape_song_lyrics(artist, title):
    """Use Selenium to navigate Google and scrape lyrics from Genius.com."""
    search_query = f"{artist} {title} lyrics genius"
    
    # Set up Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get("http://www.google.com")
    search = driver.find_element(By.NAME, "q")
    search.send_keys(search_query + Keys.RETURN)

    try:
        # Wait for search results to load and find the first Genius link
        wait = WebDriverWait(driver, 10)
        link = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[starts-with(@href, 'https://genius.com/')]")))
        link.click()
        
        # Wait for the lyrics page to load and scrape the lyrics
        lyrics_div = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.lyrics, [data-lyrics-container]")))
        lyrics = lyrics_div.text
        lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)  # Remove annotations
        lyrics = '\n'.join([line.strip() for line in lyrics.split('\n') if line.strip()])  # Clean up lines
        
        # Return lyrics and URL
        return lyrics.strip(), driver.current_url
    except Exception as e:
        print(f"Error processing lyrics: {e}")
    finally:
        driver.quit()

    return "Lyrics not found.", None

# Example usage
artist = "Coldplay"
title = "Yellow"
lyrics, url = scrape_song_lyrics(artist, title)
print(lyrics, url)

In [8]:
df_song = pd.read_csv('dataset_songs.csv')


In [9]:
df_song = df_song.drop("Unnamed: 0", axis=1)
df_song['Song Title'] = df_song['Song Title'].str.replace('"', '', regex=False)
df_song['Label']= 1

In [5]:
df_song = df_song[:]
df_song.to_csv("prepared_for_scraping.csv")

In [6]:
df_song

Unnamed: 0,Year,Song Title,Artist,Label
0,1960,Theme from A Summer Place,Percy Faith,1
1,1960,He'll Have to Go,Jim Reeves,1
2,1960,Cathy's Clown,The Everly Brothers,1
3,1960,Running Bear,Johnny Preston,1
4,1960,Teen Angel,Mark Dinning,1
...,...,...,...,...
6396,2023,"Bzrp Music Sessions, Vol. 53",Bizarrap and Shakira,1
6397,2023,Meltdown,Travis Scott featuring Drake,1
6398,2023,Put It on da Floor Again,Latto featuring Cardi B,1
6399,2023,Bloody Mary,Lady Gaga,1


In [6]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import unicodedata


def normalize_text(text):
    """Normalize characters in text, removing accents and handling dashes."""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')  # Convert accented characters.
    text = re.sub(r'[^\w\s-]', '', text)  # Keep dashes and remove other non-alphanumeric characters.
    return text.lower()

def format_slug(text, preserve_dashes=False):
    """Formats text to fit the URL structure of Genius.com, optionally preserving dashes."""
    text = normalize_text(text)
    text = re.sub(r'\s+', '-', text)  # Replace spaces with dashes.
    return text

def construct_urls(artist, title):
    """Constructs possible Genius URLs from artist and song title including variants."""
    primary_artist = artist.split(' featuring ')[0].split(' ft ')[0].split(' feat ')[0].split(' & ')[0].split(' and ')[0]
    title = format_slug(title, preserve_dashes=True)
    artists = [format_slug(a.strip()) for a in re.split(r' and | & ', primary_artist)]

    # Generate URLs for each combination of 'and' placements if multiple artists
    urls = []
    if len(artists) > 1:
        for i in range(1, len(artists)):
            artist_combination = ' and '.join(artists[:i]) + '-' + '-and-'.join(artists[i:])
            urls.append(f'https://genius.com/{artist_combination}-{title}-lyrics')
    else:
        urls.append(f'https://genius.com/{format_slug(primary_artist)}-{title}-lyrics')

    return urls

def scrape_song_lyrics(urls):
    """Attempt to scrape lyrics from a list of URLs and return the URL as well."""
    for url in urls:
        try:
            response = requests.get(url)
            if response.status_code == 404:
                print(f"URL Not Found: {url}")  # Print the URL that resulted in a 404
                continue  # Try next URL if 404 not found
            html = BeautifulSoup(response.text, 'html.parser')
            lyrics_div = html.find('div', class_='lyrics') or html.find('div', attrs={'data-lyrics-container': True})
            if lyrics_div:
                lyrics = lyrics_div.get_text()
                lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)  # Remove annotations
                lyrics = '\n'.join([line.strip() for line in lyrics.split('\n') if line.strip()])  # Clean up lines
                return lyrics.strip(), url
        except Exception as e:
            print(f"Error processing lyrics for URL {url}: {e}")
    return "Lyrics not found.", None

def apply_lyrics_and_url(row):
    lyrics, url = scrape_song_lyrics(construct_urls(row['Artist'], row['Song Title']))
    return pd.Series([lyrics, url], index=['Lyrics', 'URL'])


df_song[['Lyrics', 'URL']] = df_song.apply(apply_lyrics_and_url, axis=1)
df_song.to_csv("Billboard_1960-2023.csv")

URL Not Found: https://genius.com/brian-hyland-itsy-bitsy-teenie-weenie-yellow-polkadot-bikini-lyrics
URL Not Found: https://genius.com/hank-ballard-finger-poppin-time-lyrics
URL Not Found: https://genius.com/ferrante-theme-from-the-apartment-lyrics
URL Not Found: https://genius.com/the-browns-the-old-lamp-lighter-lyrics
URL Not Found: https://genius.com/the-four-preps-down-by-the-station-lyrics
URL Not Found: https://genius.com/skip-cherry-pie-lyrics
URL Not Found: https://genius.com/spencer-ross-tracys-theme-lyrics
URL Not Found: https://genius.com/johnny-beatnik-fly-lyrics
URL Not Found: https://genius.com/anita-bryant-my-little-corner-of-the-world-lyrics
URL Not Found: https://genius.com/the-highwaymen-michael-lyrics
URL Not Found: https://genius.com/the-shirelles-will-you-love-me-tomorrow-lyrics
URL Not Found: https://genius.com/ferrante-exodus-lyrics
URL Not Found: https://genius.com/dick-the-mountains-high-lyrics
URL Not Found: https://genius.com/shep-daddys-home-lyrics
URL Not 

In [7]:
def insert_newlines(lyrics):
    # Insert a newline before each capital letter that follows a non-space character
    modified_lyrics = re.sub(r'(?<=\S)([A-Z])', r'\n\1', lyrics)
    return modified_lyrics

# Apply the function to the Lyrics column
df_song['Lyrics'] = df_song['Lyrics'].apply(insert_newlines)
df_song.to_csv("Billboard1_1960-2023acapo.csv")

In [10]:
df_song = pd.read_csv("Billboard1_1960-2023acapo.csv")
df_song

Unnamed: 0.1,Unnamed: 0,Year,Song Title,Artist,Label,Lyrics,URL
0,0,1960,Theme from A Summer Place,Percy Faith,1,There's a summer place\nWhere it may rain or s...,https://genius.com/percy-faith-theme-from-a-su...
1,1,1960,He'll Have to Go,Jim Reeves,1,Put your sweet lips a little closer to the pho...,https://genius.com/jim-reeves-hell-have-to-go-...
2,2,1960,Cathy's Clown,The Everly Brothers,1,Don't want your love anymore\nDon't want your ...,https://genius.com/the-everly-brothers-cathys-...
3,3,1960,Running Bear,Johnny Preston,1,*vocalizations*\nOn the bank of the river\nSto...,https://genius.com/johnny-preston-running-bear...
4,4,1960,Teen Angel,Mark Dinning,1,Teen Angel\nTeen Angel\nTeen Angel\nThat fatef...,https://genius.com/mark-dinning-teen-angel-lyrics
...,...,...,...,...,...,...,...
6396,6396,2023,"Bzrp Music Sessions, Vol. 53",Bizarrap and Shakira,1,Lyrics not found.,
6397,6397,2023,Meltdown,Travis Scott featuring Drake,1,Yeah\nTensions is definitely rising\nT'd up ri...,https://genius.com/travis-scott-meltdown-lyrics
6398,6398,2023,Put It on da Floor Again,Latto featuring Cardi B,1,Ah What's happenin'? Big Latto Rip me out the ...,https://genius.com/latto-put-it-on-da-floor-ag...
6399,6399,2023,Bloody Mary,Lady Gaga,1,Money\nLove is just a history that they may pr...,https://genius.com/lady-gaga-bloody-mary-lyrics


In [14]:
df_song = pd.read_csv("Billboard1_1960-2023acapo.csv")
df_song
df_song = df_song[df_song["Lyrics"] == "Lyrics not found."]
df_song.to_csv("da_cercare.csv")

### Selenium

In [None]:
# .\nlp-env\Scripts\Activate.ps1
# cd nlp-env\

In [15]:
df_song = pd.read_csv("Billboard_1960-2023.csv")
df_song
df_song = df_song[df_song["Lyrics"] == "Lyrics not found."]
df_song.to_csv("da_cercare.csv")

In [16]:
df_song

Unnamed: 0.1,Unnamed: 0,Year,Song Title,Artist,Label,Lyrics,URL
18,18,1960,Itsy Bitsy Teenie Weenie Yellow Polkadot Bikini,Brian Hyland,1,Lyrics not found.,
24,24,1960,"Walk, Don't Run",The Ventures,1,Lyrics not found.,
36,36,1960,Because They're Young,Duane Eddy,1,Lyrics not found.,
48,48,1960,Finger Poppin' Time,Hank Ballard & The Midnighters,1,Lyrics not found.,
52,52,1960,Theme from The Apartment,Ferrante & Teicher,1,Lyrics not found.,
...,...,...,...,...,...,...,...
6346,6346,2023,Barbie World,Nicki Minaj and Ice Spice with Aqua,1,Lyrics not found.,
6382,6382,2023,Seven,Jungkook featuring Latto,1,Lyrics not found.,
6390,6390,2023,PRC,Peso Pluma and Natanael Cano,1,Lyrics not found.,
6394,6394,2023,Peaches & Eggplants,Young Nudy featuring 21 Savage,1,Lyrics not found.,


In [27]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

def scrape_song_lyrics(artist, title):
    search_query = f"{artist} {title} lyrics genius"

    # Set up Chrome options for better performance
    chrome_options = Options()
    chrome_options.add_argument("--disable-extensions")  # Disable extensions to speed up
    chrome_options.add_argument("--disable-gpu")  # GPU hardware acceleration isn't necessary for scraping
    chrome_options.add_argument("--headless")  # Run in headless mode if you don't need a GUI

    # Set up Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.set_page_load_timeout(30)  # Set the time to wait for a page to load

    try:
        driver.get("http://www.google.com")
        search = driver.find_element(By.NAME, "q")
        search.send_keys(search_query + Keys.RETURN)

        # Wait for search results to load and find the first Genius link
        wait = WebDriverWait(driver, 20)  # Increase timeout for waiting
        link = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[starts-with(@href, 'https://genius.com/')]")))
        link.click()

        # Wait for the lyrics page to load and scrape the lyrics
        lyrics_div = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.lyrics, [data-lyrics-container]")))
        lyrics = lyrics_div.text
        lyrics = '\n'.join([line.strip() for line in lyrics.split('\n') if line.strip()])  # Clean up lines

        return lyrics.strip(), driver.current_url
    except Exception as e:
        print(f"Error processing lyrics for {artist} - {title}: {e}")
        return "Lyrics not found.", None
    finally:
        driver.quit()


for index, row in df_song.iterrows():
    lyrics, url = scrape_song_lyrics(row['Artist'], row['Song Title'])
    df_song.at[index, 'Lyrics'] = lyrics
    df_song.at[index, 'URL'] = url
    # Save to CSV after each row is processed
    df_song.to_csv("updated_songs.csv", index=False)

Error processing lyrics for The Ventures - Walk, Don't Run: Message: 
Stacktrace:
	GetHandleVerifier [0x00F8C183+48259]
	(No symbol) [0x00F1CAB1]
	(No symbol) [0x00E10A17]
	(No symbol) [0x00E50BED]
	(No symbol) [0x00E50C9B]
	(No symbol) [0x00E8BC12]
	(No symbol) [0x00E70DE4]
	(No symbol) [0x00E89B9C]
	(No symbol) [0x00E70B36]
	(No symbol) [0x00E4570D]
	(No symbol) [0x00E462CD]
	GetHandleVerifier [0x01246613+2908435]
	GetHandleVerifier [0x01283C2B+3159851]
	GetHandleVerifier [0x0102513B+674875]
	GetHandleVerifier [0x0102B2FC+699900]
	(No symbol) [0x00F262B4]
	(No symbol) [0x00F22308]
	(No symbol) [0x00F2249C]
	(No symbol) [0x00F14C20]
	BaseThreadInitThunk [0x754C7BA9+25]
	RtlInitializeExceptionChain [0x76F5BE3B+107]
	RtlClearBits [0x76F5BDBF+191]

Error processing lyrics for Bill Black's Combo - White Silver Sands: Message: 
Stacktrace:
	GetHandleVerifier [0x00F8C183+48259]
	(No symbol) [0x00F1CAB1]
	(No symbol) [0x00E10A17]
	(No symbol) [0x00E50BED]
	(No symbol) [0x00E50C9B]
	(No symbo

In [33]:
df_song_trovate = df_song[df_song["Lyrics"] == "Lyrics not found."]

In [34]:
df_song_trovate

Unnamed: 0.1,Unnamed: 0,Year,Song Title,Artist,Label,Lyrics,URL
24,24,1960,"Walk, Don't Run",The Ventures,1,Lyrics not found.,
56,56,1960,White Silver Sands,Bill Black's Combo,1,Lyrics not found.,
89,89,1960,Beatnik Fly,Johnny and the Hurricanes,1,Lyrics not found.,
107,107,1961,Wheels,The String-A-Longs,1,Lyrics not found.,
110,110,1961,Calcutta,Lawrence Welk,1,Lyrics not found.,
...,...,...,...,...,...,...,...
3408,3408,1994,All for Love,"Bryan Adams, Rod Stewart and Sting",1,Lyrics not found.,
3665,3665,1996,Children,Robert Miles,1,Lyrics not found.,
4640,4640,2006,Stickwitu,Pussycat Dolls,1,Lyrics not found.,
5917,5917,2019,Eastside,"Benny Blanco, Halsey and Khalid",1,Lyrics not found.,


In [32]:
def insert_newlines(lyrics):
    # Insert a newline before each capital letter that follows a non-space character
    modified_lyrics = re.sub(r'(?<=\S)([A-Z])', r'\n\1', lyrics)
    return modified_lyrics


df_song_trovate.to_csv("Billboard_canzone_da_aggiungere_2.csv")