# azlyrics.com - Web Scraping Project
### M.H.

### <span style="color: red">The Objective:   <br> Creating a dataset of songs (including lyrics) recorded by artists whose names start with the letter H. </span>

In [1]:
# needed libraries

import math                           #<-- for converting elapsed seconds into whole hours and minutes
import re                             #<-- for conducting Regular Expression operations on Strings
from datetime import datetime         #<-- for recording current time
from random import randint            #<-- for random assignment of time lapses
from time import time, sleep          #<-- for incorporating time lapse in automatic web scraping
from urllib.parse import urljoin      #<-- for resolving relative links against the site root
from warnings import warn             #<-- for generating warnings when connection with server during auto collection is lost

import pandas as pd                   #<-- for manipulating data
import requests                       #<-- for requesting web content
from bs4 import BeautifulSoup as bs   #<-- for parsing through requested content

In [2]:
# loading the url of a page on azlyrics.com for artists whose names begin with H
url = "https://www.azlyrics.com/h.html"

In [3]:
# accessing webpage for letter H artists and checking the status of connection (200 would be OK);
# a timeout is passed because requests otherwise waits indefinitely when the server never answers
page = requests.get(url, timeout=30)
page.status_code

200

In [4]:
# parsing the downloaded HTML with the lxml parser
soup = bs(page.content, "lxml")
# previewing only the beginning of the document; printing the whole page would flood the notebook output
print(soup.prettify()[:1500])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
  <meta content="noarchive" name="robots"/>
  <title>
   Artists H at AZLyrics
  </title>
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css" rel="stylesheet"/>
  <link href="/local/az.css" rel="stylesheet"/>
  <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
  <!--[if lt IE 9]>
      <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
      <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
  <script charset="UTF-8" data-domain-script="464b7175-7273-4e0f-8753-e9a483d4a156" data-language="en" src="https://cookie-cdn.cookiepro.com/scriptte

In [5]:
# isolating the content in the main container on the page for letter H:
# collecting every element found for the artist-column class string (the result is a list of matches)
main = soup.find_all(class_="col-sm-6 text-center artist-col")

main

[<div class="col-sm-6 text-center artist-col">
 <a href="h/h1ghrmusic.html">H1GHR MUSIC</a><br/>
 <a href="h/haash.html">Ha-Ash</a><br/>
 <a href="e/emilyhackett.html">Hackett, Emily</a><br/>
 <a href="s/stevehackett.html">Hackett, Steve</a><br/>
 <a href="m/marikahackman.html">Hackman, Marika</a><br/>
 <a href="h/hacktivist.html">Hacktivist</a><br/>
 <a href="h/haddaway.html">Haddaway</a><br/>
 <a href="d/deitrickhaddon.html">Haddon, Deitrick</a><br/>
 <a href="h/hadestowncast.html">Hadestown Cast</a><br/>
 <a href="h/hadise.html">Hadise</a><br/>
 <a href="h/hadouken.html">Hadouken!</a><br/>
 <a href="h/haechan.html">Haechan</a><br/>
 <a href="b/benhaenow.html">Haenow, Ben</a><br/>
 <a href="h/haerts.html">Haerts</a><br/>
 <a href="h/haevn.html">HAEVN</a><br/>
 <a href="h/haftbefehl.html">Haftbefehl</a><br/>
 <a href="s/sammyhagar.html">Hagar, Sammy</a><br/>
 <a href="r/rasmushagen.html">Hagen, Rasmus</a><br/>
 <a href="h/haggard.html">Haggard</a><br/>
 <a href="m/merlehaggard.html">H

In [6]:
# the main container is divided into 2 columns:
# the first match is the left column of the list, the second match is the right column
left, right = main[0], main[1]

# view of the right column
right

<div class="col-sm-6 text-center artist-col">
<a href="m/marcoshernandez.html">Hernandez, Marcos</a><br/>
<a href="p/patrickhernandez.html">Hernandez, Patrick</a><br/>
<a href="t/tyherndon.html">Herndon, Ty</a><br/>
<a href="h/heroesdelsilencio.html">Héroes Del Silencio</a><br/>
<a href="m/maxherre.html">Herre, Max</a><br/>
<a href="z/zachherron.html">Herron, Zach</a><br/>
<a href="h/hers.html">Her's</a><br/>
<a href="j/jameshersey.html">Hersey, James</a><br/>
<a href="j/joehertz.html">Hertz, Joe</a><br/>
<a href="k/katieherzig.html">Herzig, Katie</a><br/>
<a href="s/surielhess.html">Hess, Suriel</a><br/>
<a href="a/arihest.html">Hest, Ari</a><br/>
<a href="h/heusslenfoire.html">Heuss L'Enfoiré</a><br/>
<a href="e/emmahewitt.html">Hewitt, Emma</a><br/>
<a href="h/hewitt.html">Hewitt, Jennifer Love</a><br/>
<a href="h/heyhihello.html">HeyHiHello</a><br/>
<a href="h/heymonday.html">Hey Monday</a><br/>
<a href="h/heyviolet.html">Hey Violet</a><br/>
<a href="h/hezekiahwalker.html">Hezekiah

In [7]:
# gathering the <a> elements (one per artist) from each column
left_a_tags = left.find_all("a")
right_a_tags = right.find_all("a")
# merging both columns (left first) into a single list; its length is the number of artists under letter H
page_artists = [*left_a_tags, *right_a_tags]
len(page_artists)

669

In [8]:
# building 2 parallel lists from the collected links:
# the displayed artist name and the tail portion (href) of the artist's URL
artists_names = [tag.text for tag in page_artists]
artists_links = [tag.get("href") for tag in page_artists]
# printing first 10 elements of artists_names for review
print(artists_names[:10])

['H1GHR MUSIC', 'Ha-Ash', 'Hackett, Emily', 'Hackett, Steve', 'Hackman, Marika', 'Hacktivist', 'Haddaway', 'Haddon, Deitrick', 'Hadestown Cast', 'Hadise']


In [9]:
# printing first 10 elements of artists_links for review
print(artists_links[0:10])

['h/h1ghrmusic.html', 'h/haash.html', 'e/emilyhackett.html', 's/stevehackett.html', 'm/marikahackman.html', 'h/hacktivist.html', 'h/haddaway.html', 'd/deitrickhaddon.html', 'h/hadestowncast.html', 'h/hadise.html']


In [10]:
# double-checking if the number of names is the same as the original number of artists
len(artists_names)


669

In [11]:
# double-checking if the number of url-tails is the same as the original number of artists
len(artists_links)

669

In [12]:
# building full artist URLs by prefixing every relative link in artists_links with the site root
artists_urls = ["https://www.azlyrics.com/" + tail for tail in artists_links]
artists_urls[:3]

['https://www.azlyrics.com/h/h1ghrmusic.html',
 'https://www.azlyrics.com/h/haash.html',
 'https://www.azlyrics.com/e/emilyhackett.html']

In [13]:
# creating the list of (artist name, artist page URL) tuples; the two source lists are paired by position
name_url_list = list(zip(artists_names, artists_urls))
# reviewing first 3 rows
name_url_list[0:3]

[('H1GHR MUSIC', 'https://www.azlyrics.com/h/h1ghrmusic.html'),
 ('Ha-Ash', 'https://www.azlyrics.com/h/haash.html'),
 ('Hackett, Emily', 'https://www.azlyrics.com/e/emilyhackett.html')]

### <span style="color: red">Scraping song titles and their respective URLs for all artists whose names start with H </span>

In [14]:
# parallel lists filled by the scraping loop: one entry is appended to each of the first four per song found
artist_name = []   # <-- artist name for each song
song_title = []    # <-- title of each song
song_url = []      # <-- URL of each song's lyrics page ("Missing" when the song has no link)
year = []          # <-- album year of each song ("Missing" when no year is listed)
lapses = []     # <-- elapsed seconds per artist-page request (including the pause between requests)

In [15]:
BASE_URL = "https://www.azlyrics.com/"   # <-- site root used to resolve relative lyrics links
REQUEST_TIMEOUT = 30                     # <-- seconds to wait for the server before giving up on a request


def _album_year(album_text):
    """Return the 4-digit year at the end of an album header text, or 'Missing'.

    The four characters preceding the last character of the text are the candidate
    (presumably headers end like '... (2020)' -- verify against the page layout).
    The candidate is accepted only if it is a full 4-digit number starting with 19 or 20,
    so the later conversion of the Year column to int cannot fail on partial matches.
    """
    candidate = album_text[-5:-1]
    if re.fullmatch(r'(19|20)[0-9]{2}', candidate):
        return candidate
    return 'Missing'


def _song_title_and_url(item_div):
    """Return (title, lyrics_url) for one 'listalbum-item' div.

    The URL is 'Missing' when the item has no link or the link has no href.
    """
    lyrics_link = item_div.find('a')
    if lyrics_link is None:
        return item_div.text, 'Missing'
    href = lyrics_link.get('href')
    if not href:
        return lyrics_link.text, 'Missing'
    # urljoin keeps absolute links as they are and resolves '../lyrics/...' against the site root
    # (str.lstrip("../") strips a *set of characters*, not a prefix, so it is not used here)
    return lyrics_link.text, urljoin(BASE_URL, href)


request_count = 0   # <-- reflects the number of artists whose page with songs was requested
begin_time = datetime.now().strftime("%H:%M:%S")   # <-- time when scraping started

# scraping titles of songs, their lyrics-URLs, and a year of recording for every artist in the collected list
for name, artist_url in name_url_list:
    start_time = time()   # <-- time when the artist's page is requested
    request_count += 1

    try:
        artist_page = requests.get(artist_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # a lost connection or timeout must not abort the whole run; this artist is skipped
        warn('Request: {}; Failed: {}'.format(request_count, error))
        artist_page = None
    else:
        if artist_page.status_code != 200:
            warn('Request: {}; Status code: {}'.format(request_count, artist_page.status_code))
            artist_page = None   # <-- error pages are not parsed for songs

    sleep(randint(6,15))    # <-- randomly generated time-lapse added between requests

    container = None
    if artist_page is not None:
        soup = bs(artist_page.content, 'lxml')
        # the song list is looked up by id first, then by the fallback container class
        container = soup.find('div', {'id': 'listAlbum'})
        if container is None:
            container = soup.find('div', {'class': 'col-xs-12 col-lg-8 text-center'})

    if container is not None:
        album_year = 'Missing'   # <-- year of the most recent album header seen; applies to the songs that follow
        for div in container.find_all('div'):
            css_classes = div.get('class') or []
            kind = css_classes[0] if css_classes else None
            if kind == 'album':
                album_year = _album_year(div.text)
            elif kind == 'listalbum-item':
                title, fixed_url = _song_title_and_url(div)
                artist_name.append(name)
                song_title.append(title)
                year.append(album_year)
                song_url.append(fixed_url)

    lapses.append(time() - start_time)

end_time = datetime.now().strftime("%H:%M:%S")    # <-- time when scraping ended
    

In [16]:
# calculating the total time this part of scraping took:
scrape_time = sum(lapses)   # <-- total number of seconds over all recorded lapses
# whole hours first (each hour has 3600 seconds), then whole minutes from the remaining seconds
scrape_time_hours, leftover_seconds = divmod(int(scrape_time), 3600)
scrape_time_min = leftover_seconds // 60
print(f'Total time of scraping was {scrape_time_hours} hours and {scrape_time_min} minutes')
print(f'Beginning time: {begin_time}')
print(f'End time: {end_time}')


Total time of scraping was 2 hours and 1 minutes
Beginning time: 09:33:46
End time: 11:35:26


In [17]:
len(lapses)

669

In [19]:
request_count

669

In [20]:
len(set(artist_name))

668

### <span style="color: red;"> Now, checking the lengths of all 4 lists filled during the web-scraping process.</span>

In [21]:
len(artist_name)

42529

In [22]:
len(song_title)

42529

In [23]:
len(year)

42529

In [24]:
len(song_url)

42529

In [25]:
# joining all 4 lists into one dataframe;
# building from a dict of columns makes pandas raise if the lists differ in length,
# whereas zip() would silently truncate to the shortest list and hide a misalignment
all_data = pd.DataFrame({
    'Artist_Name': artist_name,
    'Song_Title': song_title,
    'Year': year,
    'Lyrics_URL': song_url,
})
all_data.head(3)

Unnamed: 0,Artist_Name,Song_Title,Year,Lyrics_URL
0,H1GHR MUSIC,H1GHR,2020,https://www.azlyrics.com/lyrics/h1ghrmusic/h1g...
1,H1GHR MUSIC,Melanin Handsome,2020,https://www.azlyrics.com/lyrics/h1ghrmusic/mel...
2,H1GHR MUSIC,How We Rock,2020,https://www.azlyrics.com/lyrics/h1ghrmusic/how...


In [26]:
# backing up the raw collected data (no lyrics column yet) as a csv file; the row index is written as well
all_data.to_csv('all_data.csv')    # <-- insert preferable directory link where you want to save the file

In [27]:
# current size of dataframe
all_data.shape

(42529, 4)

In [29]:
# keeping only rows that have both a real year and a real lyrics URL (i.e. neither is "Missing")
has_year = all_data['Year'].ne('Missing')
has_url = all_data['Lyrics_URL'].ne('Missing')
all_data = all_data.loc[has_year & has_url]

# dropping any row that still has an empty value in some column
all_data = all_data.dropna(axis=0, how='any')

# size of data after the removal of rows
all_data.shape

(34838, 4)

In [32]:
# removing duplicate rows (the first occurrence is kept) and renumbering the index from 0
all_data = (
    all_data
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)
all_data.shape

(34760, 4)

In [33]:
# note: multiple occurrences (frequency > 1) of the same Lyrics_URL are taken to reflect recordings
# of the same song in different years or by different artists
all_data.describe()

Unnamed: 0,Artist_Name,Song_Title,Year,Lyrics_URL
count,34760,34760,34760,34760
unique,599,29530,71,33921
top,"Haggard, Merle",Intro,2020,https://www.azlyrics.com/lyrics/ferlinhusky/go...
freq,611,27,1854,6


In [34]:
# listing the distinct years present in the data, in ascending order
distinct_years = all_data['Year'].unique()
print(sorted(distinct_years))

['1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']


In [35]:
# casting the "Year" column from text to integer; the other columns keep their types
all_data = all_data.astype({'Year': int})

all_data.dtypes

Artist_Name    object
Song_Title     object
Year            int32
Lyrics_URL     object
dtype: object

### <span style="color: red">Now we scrape the lyrics for every song listed in the current dataframe and add them to the new column "Lyrics". </span>

In [None]:
div_error = 'ERROR'     # <-- placeholder stored as the lyrics whenever they cannot be extracted
urls = all_data['Lyrics_URL'].tolist()  # <-- lyrics-page URLs to visit, in dataframe row order
songs_lyrics = []       # <-- collected lyrics, one entry per processed URL

In [None]:
# The code below can be run in increments (e.g., for every 5K songs each time) or at once for all
# items in the all_data table.
# It is useful to know that many website servers have bot detection systems, mainly to protect servers from
# being overwhelmed with requests, thus your scraping session, if detected, may be terminated. In such a case,
# simply re-run this cell: it resumes at the first URL that has no entry in songs_lyrics yet, because exactly
# one entry is appended to songs_lyrics per processed URL (the two lists stay index-aligned).

REQUEST_TIMEOUT = 30               # <-- seconds to wait for the server before giving up on a request
first_index = len(songs_lyrics)    # <-- resume point: number of URLs already processed
last_index = len(urls)             # <-- ***lower this to scrape only part of the list in this run***

request_count = 0
lapses = []     # <-- the list of actual time lapses in the scraping process between requests
begin_time = datetime.now().strftime("%H:%M:%S") # <-- time when scraping started

# the loop variable is deliberately not called song_url, so the song_url list built earlier is not overwritten
for lyrics_url in urls[first_index:last_index]:
    start_time = time()   # <-- time when the song's page is requested
    request_count += 1
    lyrics = div_error    # <-- stays the placeholder unless the lyrics are extracted below

    try:
        page = requests.get(lyrics_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # a lost connection or timeout must not abort the run nor break the list alignment
        warn('Request: {}; Failed: {}'.format(request_count, error))
    else:
        if page.status_code != 200:
            warn('Request: {}; Status code: {}'.format(request_count, page.status_code))
        else:
            soup = bs(page.content, 'lxml')
            try:
                x = soup.find('div', {'class': 'col-xs-12 col-lg-8 text-center'})
                # NOTE(review): the lyrics are taken from the 6th nested div by position -- verify against the page layout
                y = x.find_all('div')[5].text
                lyrics = y.replace(",", ";")     # <-- replacing all commas with semicolons
            except (AttributeError, IndexError):
                # container not found (x is None) or fewer divs than expected: keep the placeholder
                pass

    songs_lyrics.append(lyrics)
    sleep(randint(8,17))    # <-- randomly generated time-lapses added between requests
    lapses.append(time() - start_time)

end_time = datetime.now().strftime("%H:%M:%S")    # <-- time when scraping ended
    

In [None]:
# calculating the total time this part of scraping took
# (plain integer arithmetic, so the cell does not depend on an import made in some other cell):
scrape_time = sum(lapses)   # <-- total number of seconds over all recorded lapses
# whole hours first (each hour has 3600 seconds), then whole minutes from the remaining seconds
scrape_time_hours, leftover_seconds = divmod(int(scrape_time), 3600)
scrape_time_min = leftover_seconds // 60
print(f'Total time of scraping was {scrape_time_hours} hours and {scrape_time_min} minutes')
print(f'Beginning time: {begin_time}')
print(f'End time: {end_time}')

In [None]:
# shows how many song lyrics were appended
len(songs_lyrics)

In [None]:
# shows how many laps times were recorded
len(lapses)

In [None]:
# number of processed page requests
request_count

In [None]:
all_data.shape

In [None]:
# adding collected lyrics as a new "Lyrics" column; assign() returns a new dataframe, so all_data stays unchanged
# and this cell can be re-run safely
if len(songs_lyrics) != len(all_data):
    # happens when the lyrics were scraped only partially (or the cell above was run more than once)
    raise ValueError(
        f'songs_lyrics has {len(songs_lyrics)} entries but all_data has {len(all_data)} rows; '
        'finish scraping before joining the lyrics'
    )
collected_data = all_data.assign(Lyrics=songs_lyrics)
collected_data.shape

In [None]:
collected_data.dtypes

In [None]:
# writing all collected raw data to a .csv file (the row index is not written):
collected_data.to_csv(path_or_buf='collected_data.csv', index=False)  #<--insert preferable directory link where you want to save the file

### <span style="color: red;"> Post-Web-Scraping Tasks and Saving of the Final Dataset Ready for Future Analysis.</span>

In [36]:
# reading the previously saved raw data back from the .csv file
data = pd.read_csv(filepath_or_buffer='collected_data.csv')

In [37]:
data.columns

Index(['Artist_Name', 'Song_Title', 'Year', 'Lyrics_URL', 'Lyrics'], dtype='object')

In [38]:
# checking if there are any rows with missing values:
data.isna().sum()

Artist_Name    0
Song_Title     0
Year           0
Lyrics_URL     0
Lyrics         0
dtype: int64

In [39]:
# checking if there are any missing or incorrect values in the lyrics column:
# rows whose Lyrics value is only two newlines, the "ERROR" placeholder, or an empty string
data[data['Lyrics'].isin(["\n\n", "ERROR", ''])]

Unnamed: 0,Artist_Name,Song_Title,Year,Lyrics_URL,Lyrics
2364,"Haley, Gavin",Intro,2022,https://www.azlyrics.com/lyrics/gavinhaley/int...,\n\n
2365,"Haley, Gavin",Blue Hour,2022,https://www.azlyrics.com/lyrics/gavinhaley/blu...,\n\n
2366,"Haley, Gavin",Lottery,2022,https://www.azlyrics.com/lyrics/gavinhaley/lot...,\n\n
2368,"Haley, Gavin",Heroes,2022,https://www.azlyrics.com/lyrics/gavinhaley/her...,\n\n
2369,"Haley, Gavin",Drifting Away,2022,https://www.azlyrics.com/lyrics/gavinhaley/dri...,\n\n
...,...,...,...,...,...
32966,HUNNY,"New Recording 122 August 19, 2020",2022,https://www.azlyrics.com/lyrics/hunny/newrecor...,\n\n
34101,"Hutchinson, Eric",Right Side Of History,2022,https://www.azlyrics.com/lyrics/erichutchinson...,\n\n
34102,"Hutchinson, Eric",The Littlest Candle,2022,https://www.azlyrics.com/lyrics/erichutchinson...,\n\n
34103,"Hutchinson, Eric",Pick Up The Pace,2022,https://www.azlyrics.com/lyrics/erichutchinson...,\n\n


In [40]:
# checking if there are any missing values:
data.isna().sum()

Artist_Name    0
Song_Title     0
Year           0
Lyrics_URL     0
Lyrics         0
dtype: int64

In [41]:
# current size of data
data.shape

(34688, 5)

In [42]:
# dropping rows whose Lyrics value is a placeholder or empty marker, then renumbering the index from 0
bad_lyrics = data['Lyrics'].isin(["\n\n", "ERROR", ''])
data = data.loc[~bad_lyrics].reset_index(drop=True)

# new size of dataset
data.shape

(34572, 5)

In [43]:
# viewing the very last row in the dataframe
data.tail(1)

Unnamed: 0,Artist_Name,Song_Title,Year,Lyrics_URL,Lyrics
34571,HyunA,FLOWER SHOWER,2021,https://www.azlyrics.com/lyrics/hyuna/flowersh...,\n\n[Romanized:]\n\nI just wanna be your flowe...


In [44]:
# viewing the sample data for one song in the created dataset;
# this is the printout of data contained in the last row in the final dataset
# (selected by position, so it keeps working when the number of rows changes):
last_song = data.iloc[-1]

print(f'Artist_Name: {last_song["Artist_Name"]}')
print(f'Song_Title: {last_song["Song_Title"]}')
print(f'Year: {last_song["Year"]}')
print(f'Lyrics_URL: {last_song["Lyrics_URL"]}')
print('-----------------------------------------------------')

# the lyrics are listed in 3 linguistic forms: Romanized, Korean (original), and English translation.
print(f'Lyrics: {last_song["Lyrics"]}')

Artist_Name: HyunA
Song_Title: FLOWER SHOWER
Year: 2021
Lyrics_URL: https://www.azlyrics.com/lyrics/hyuna/flowershower.html
-----------------------------------------------------
Lyrics: 

[Romanized:]

I just wanna be your flower saeppalgaeyo nan
Hyanggiroun hyanggiro beolttedeuri kkoyeo
Hok shideureodo Don't you worry dashi pijana
Pieonaneun nae Color shiseondeuri moyeo

Ppanhi nareul barabwa geureon gwanshimi nan
Shilchi ana shilchi ana Eh eh
Dashi pieonaneun kkot Just gotta let it go OK
Can you hear me now

Ije kkotgillo na georeogal kkeoya
Geurae kkotteullo shaweohal kkeoya

Take a flower shower (woo hoo woo hoo)
Ssodajineun Flower
Sing it; sing it babe
Wing it; wing it babe
Bring it; bring it babe
Kkotyanggiga nane Eh eh
Like doomda doomda
Like doomda doomda

I'mma be forever young saeparaeyo nan
Bomi omyeon Get set go saero pieonayo
Seodureuji ma gyejeolcheoreom doraojana
Meoributeo balkkeutkkaji hyeonaneun ppalgaeyo

Nal jom naebeoryeo dweobwa maeume eomneun mal
Give me your lik

In [45]:
# earliest and latest recording years contained in the created dataset:
recording_years = data['Year']
print(recording_years.min(), recording_years.max())

1952 2022


In [46]:
# writing the final cleaned dataset (songs by artists whose name starts with letter H) to a .csv file, without the row index
data.to_csv(path_or_buf='h_artists_songs.csv', index=False)  #<-- you can specify here the intended directory link to save this file

In [47]:
data.shape

(34572, 5)