# Import libraries

This code was developed by Christoph Frost: https://www.linkedin.com/in/christophfrost/

In [1]:
import requests
import bs4
import os
import time
import re

# Define necessary functions

## Does the website exist?

In [2]:
def existing_url(res):
    """Classify the response of an artist-page request.

    Args:
        res: Response object of the request; its ``status_code`` and
            ``text`` attributes are read.

    Returns:
        ``'No lyrics'`` if the status code is not 200,
        ``'Page not exists'`` if the page heading carries the site's
        "no matching artist" message, and ``'exists'`` otherwise.
    """
    if res.status_code != 200:  # anything but 200 is an error (server, host, ...)
        return 'No lyrics'

    # Response is 200, so parse the page and look at its heading.
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    heading = soup.select_one('div>hgroup>h2')
    if heading is None:
        # select_one() yields None when nothing matches the selector. Without
        # that heading there is no "artist not found" message to detect, so
        # the page is treated as existing instead of raising AttributeError.
        return 'exists'

    soup_text = heading.text
    print(soup_text)

    # Error text that was observed on the "artist not found" page.
    error_text = "We couldn't find any artists matching your query."
    # Compare case-insensitively and ignore surrounding whitespace.
    if soup_text.strip().lower() == error_text.lower():
        return 'Page not exists'

    return 'exists'

## Duplicate handling

In [3]:
def duplicate_handling(res, title_links):
    """Collect one ``[title, link]`` pair per distinct song title on a page.

    Every anchor matching ``td.tal.qx>strong>a`` is read. Its text is cut at
    the first " (", " [", "?" or "!" and lower-cased, so that variants such
    as "Song (Live)" and "Song [Remix]" collapse onto the same title.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.
        title_links: List of ``[title, link]`` pairs collected so far
            (title at index 0). It is extended in place.

    Returns:
        The same ``title_links`` list, with the newly found pairs appended.
    """
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    song_select = soup.select('td.tal.qx>strong>a')
    print(len(song_select))

    # Raw string: the backslashes are regex escapes, not string escapes.
    # Splits at " (", " [", or at "?"/"!" with an optional whitespace before.
    title_cut = re.compile(r'\s\(|\s\[|\s?[?!]')

    # Titles already present, kept in a set for O(1) duplicate checks.
    seen_titles = {pair[0] for pair in title_links}

    for song in song_select:
        # Only the part before the first separator is the normalized title.
        title = title_cut.split(song.text, maxsplit=1)[0].lower()
        if title in seen_titles:
            continue
        seen_titles.add(title)
        # Drop the leading "/" so the link can be appended to the base URL.
        link = song['href'].replace('/', '', 1)
        title_links.append([title, link])

    return title_links

## Download lyrics and save in folder

In [4]:
def download_save(newpath, url, title_links, max_songs=150, delay=1):
    """Download lyrics for the collected songs and save them as text files.

    For each of the first ``max_songs`` entries the lyric page is requested
    and the text of its ``<pre id="lyric-body-text">`` element is written to
    ``<newpath>/<title>.txt``, where every non-word character of the title
    is replaced by an underscore. A summary of the counts is printed at the
    end.

    Args:
        newpath: Folder the lyric files are written to; created on the first
            successful download if it does not exist yet.
        url: Base URL that each song's link fragment is appended to.
        title_links: List of ``[title, link]`` pairs.
        max_songs: Maximum number of entries to try (``None`` tries all).
        delay: Seconds to sleep before each request.

    Returns:
        None.
    """
    mislead = 0
    download = 0
    songs = title_links[:max_songs]

    for position, song in enumerate(songs):
        time.sleep(delay)  # pause between requests
        lyric_url = url + song[1]  # song[1] is the link fragment of the song
        response = requests.get(lyric_url)
        soup = bs4.BeautifulSoup(response.text, 'lxml')

        lyric_body = soup.find('pre', {'id': 'lyric-body-text'})
        if lyric_body is None:
            # The link does not actually lead to a page containing lyrics.
            print(f'Link of song {song[0]} does not lead to any lyrics!')
            mislead += 1
            continue

        print(song[0])
        # song[0] is the title; make it safe to use as a file name.
        title = re.sub(r'\W', '_', song[0])
        print(title)

        os.makedirs(newpath, exist_ok=True)  # creates folder if necessary
        file_path = os.path.join(newpath, title + '.txt')
        # Explicit UTF-8 so lyrics with non-ASCII characters can always be
        # written, whatever the platform's default encoding is.
        with open(file_path, 'w', encoding='utf-8') as response_file:
            response_file.write(lyric_body.text)
        print(position)
        download += 1

    print(f'The number of misleading links: {mislead}')
    print(f'The number of downloaded files: {download}')
    # Report the songs actually attempted, not the length of the full list.
    print(f'The number of all tried songs: {len(songs)}')

# Run the Scraping Program

In [5]:
# Base address of the lyrics site and the directory the artist folders go in.
url = 'https://www.lyrics.com/'
path = os.getcwd()
artist_sum = 0  # number of artists whose lyrics have been saved so far

# Keep asking for artists until the lyrics of two of them have been saved.
while artist_sum < 2:
    title_links = []

    # 0. Ask which artist to scrape and build the artist page address.
    artist = input('The lyrics of which artist you would like to scrape?').lower()
    artist_url = ''.join((url, 'artist/', artist))
    print(artist_url)

    # Request the artist page.
    res = requests.get(artist_url)
    print(res.status_code)

    # 1. Is the artist page available? If not, ask again.
    exist = existing_url(res)
    print(exist)
    if exist in ('No lyrics', 'Page not exists'):
        print(f'{artist} is not available. Please enter another artist.')
        continue

    # 2. Collect [title, link] pairs, dropping "(...)"/"[...]" variants and
    #    duplicates; the result is a nested list [[a1, a2], [b1, b2], ...].
    title_links = duplicate_handling(res, title_links)

    # 3. Require at least 100 different songs; otherwise ask again.
    if len(title_links) < 100:
        print(f'{artist} has just {len(title_links)} different songs instead 100. Please enter another artist.')
        continue
    print(f'{artist} has {len(title_links)} different songs. Congrats!')

    # 4. Download the lyrics into a folder named after the artist.
    print(f'The lyrics of the {len(title_links)} songs of {artist} are going to be saved now.')
    newpath = '/'.join((path, artist))  # folder for this artist
    download_save(newpath, url, title_links)

    artist_sum = artist_sum + 1

KeyboardInterrupt: Interrupted by user

In [None]:
# Distinct titles among the [title, link] pairs currently in title_links.
output = {pair[0] for pair in title_links}
print(output)
len(output)