In [1]:
# importing packages

import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Run date formatted as YYYY-MM-DD; appended to the name of the CSV file
# written later in this notebook.
TodaysDate = time.strftime("%Y-%m-%d")


In [5]:
def retrieve_hyperlinks(main_url, timeout=30):
    """
    Extract all hyperlinks in 'main_url' and return a list with these hyperlinks.

    Parameters
    ----------
    main_url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server; forwarded to ``requests.get`` so a
        stalled connection cannot block the notebook forever.

    Returns
    -------
    list of str
        The non-empty 'href' value of every <a> tag, in document order.
        Duplicates are kept.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or an HTTP error status.
    """

    # Send request and catch response: r
    r = requests.get(main_url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of harvesting links from an error page
    r.raise_for_status()

    # Create a BeautifulSoup object from the HTML: soup
    soup = BeautifulSoup(r.text, "lxml")

    # 'a' tags define hyperlinks; a tag without href yields None
    hrefs = (a_tag.get('href') for a_tag in soup.find_all('a'))

    # Drop missing (None) and empty hrefs
    return [href for href in hrefs if href]

In [6]:
# Retrieve all hyperlinks from the artist listing pages
# (the listing is split over several pages, one URL per page)
urls = ['https://songteksten.net/artist/lyrics/1938/evanescence.html',
        'https://songteksten.net/artist/lyrics/1938/evanescence/page/2.html',
        'https://songteksten.net/artist/lyrics/1938/evanescence/page/3.html']

list_links_lyrics_songteksten_net = []

for url in urls:
    list_links_lyrics_songteksten_net.extend(retrieve_hyperlinks(url))

# Remove repetitions. sorted() makes the order reproducible: iterating a bare
# set of strings gives a different order on each kernel restart because string
# hashing is randomized per process.
list_links_lyrics_songteksten_net = sorted(set(list_links_lyrics_songteksten_net))

print('Number of links before filtering:', len(list_links_lyrics_songteksten_net))

Number of links before filtering: 233


In [7]:
# Derive the marker that identifies this artist's lyric pages from the first
# listing URL: '<host>/lyric/<second-to-last path segment>'.
# NOTE(review): this is a plain substring test, so it would also keep links
# whose segment merely starts with the same characters - confirm against the
# site's URL layout.

spliting = urls[0].split('/')
filter_lyrics = '/lyric/'.join([spliting[2], spliting[-2]])

# Keep only the links that contain the marker
list_links_lyrics_songteksten_net = list(
    filter(lambda link: filter_lyrics in link, list_links_lyrics_songteksten_net)
)

print('Number of links after filtering:', len(list_links_lyrics_songteksten_net))


Number of links after filtering: 86


In [9]:
def extract_lyric_from_url(url_lyric, timeout=30):
    """
    Download a lyric page from songteksten.net and extract the lyric text.

    The page is parsed with BeautifulSoup and prettified; the lyric is the
    part between the first '</h1>' and the 'buma-consent' <div>, with lines
    separated by '<br/>'.

    Parameters
    ----------
    url_lyric : str
        Address of the lyric page.
    timeout : float or tuple, optional
        Seconds to wait for the server; forwarded to ``requests.get``.

    Returns
    -------
    tuple (list of str, str)
        The non-empty, stripped lyric lines, and the same lines joined
        with '. ' into one string. Both are empty when no line is found.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or an HTTP error status.
    IndexError
        If the prettified page contains no '</h1>' followed by a newline.
    """

    # Send an HTTP request; fail loudly instead of parsing an error page
    r_lyric = requests.get(url_lyric, timeout=timeout)
    r_lyric.raise_for_status()

    # Parse the HTML and prettify it so tags end up on predictable lines
    soup_lyric_pretty = BeautifulSoup(r_lyric.text, "lxml").prettify()

    # Isolate the part that contains the lyric
    text = soup_lyric_pretty.split('</h1>\n')[1].split('<div class="buma-consent" role="alert">')[0]

    # One entry per <br/>-separated line, without newlines or outer whitespace
    list_lyrics = [item.replace('\n', '').strip() for item in text.split('<br/>\n')]

    # Remove empty elements. Built as a new list: calling remove() on the list
    # being iterated skips the element after each removal, so runs of blank
    # lines used to survive.
    list_lyrics = [item for item in list_lyrics if item]

    # At least one lyric does not follow the normal pattern and starts with a
    # stray <div ...> fragment; drop it. Guarded so an empty result cannot
    # raise IndexError.
    if list_lyrics and '<div' in list_lyrics[0]:
        list_lyrics = list_lyrics[1:]

    # Lyrics in string format
    lyrics = '. '.join(list_lyrics)

    # Return both list and string
    return list_lyrics, lyrics

In [10]:
# Song titles and lyric strings, filled in the same order as the links
list_lyrics_evanescence = []
list_title_lyrics_evanescence = []

for url_lyric in list_links_lyrics_songteksten_net:

    # Title: last path segment of the URL, then the piece right before its final '.'
    page_file = url_lyric.rsplit('/', 1)[-1]
    list_title_lyrics_evanescence.append(page_file.split('.')[-2])

    # extract_lyric_from_url is indexed with [1] to keep its second return value
    list_lyrics_evanescence.append(extract_lyric_from_url(url_lyric)[1])


In [11]:
# Create a dataframe with song titles and lyrics.
# Built as one chain without inplace=True, so re-running the cell always
# starts from the raw lists and gives the same result.

df = (
    pd.DataFrame({'song_title': list_title_lyrics_evanescence,
                  'lyrics': list_lyrics_evanescence})
    # lower case titles and replace '-' with a space
    .assign(song_title=lambda frame: frame['song_title']
            .str.replace('-', ' ', regex=False)
            .str.lower())
    # organize dataframe in alphabetical order
    .sort_values('song_title')
    .reset_index(drop=True)
)

# Save dataframe to .csv; create the output folder first so a fresh
# checkout without ./data does not fail here
os.makedirs("./data", exist_ok=True)
df.to_csv("./data/lyrics_evanescence_" + TodaysDate + ".csv", index=False)

# Preview - must be the last expression of the cell to be displayed
df.head()

# Within Temptation

In [12]:
# Retrieve all hyperlinks from the artist listing pages
# (the listing is split over several pages, one URL per page)
urls = ['https://songteksten.net/artist/lyrics/320/within-temptation.html',
        'https://songteksten.net/artist/lyrics/320/within-temptation/page/2.html',
        'https://songteksten.net/artist/lyrics/320/within-temptation/page/3.html']

list_links_lyrics_songteksten_net = []

for url in urls:
    list_links_lyrics_songteksten_net.extend(retrieve_hyperlinks(url))

# Remove possible duplicates. sorted() makes the order reproducible: iterating
# a bare set of strings gives a different order on each kernel restart because
# string hashing is randomized per process.
list_links_lyrics_songteksten_net = sorted(set(list_links_lyrics_songteksten_net))

print('Number of links before filtering:', len(list_links_lyrics_songteksten_net))

Number of links before filtering: 231


In [13]:
# Keep only the hyperlinks that point to lyric pages (specific to songteksten.net).
# The marker is '<host>/lyric/<second-to-last path segment>' of the first listing URL.
# NOTE(review): this is a plain substring test, so it would also keep links
# whose segment merely starts with the same characters - confirm against the
# site's URL layout.

spliting = urls[0].split('/')
filter_lyrics = '{}/lyric/{}'.format(spliting[2], spliting[-2])

list_links_lyrics_songteksten_net = [
    link
    for link in list_links_lyrics_songteksten_net
    if filter_lyrics in link
]

print('Number of links after filtering:', len(list_links_lyrics_songteksten_net))

Number of links after filtering: 74


In [14]:
# Song titles and lyric strings, filled in the same order as the links
list_title_lyrics_within_temptation = []
list_lyrics_within_temptation = []

for url_lyric in list_links_lyrics_songteksten_net:

    # Title: last path segment of the URL, then the piece right before its final '.'
    name_parts = url_lyric.split('/')[-1].split('.')
    list_title_lyrics_within_temptation.append(name_parts[-2])

    # extract_lyric_from_url is indexed with [1] to keep its second return value
    list_lyrics_within_temptation.append(extract_lyric_from_url(url_lyric)[1])