In [6]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import pickle5 as pickle

## Writing a function that downloads lyrics directly and stores them in a dict

In [2]:
def main():
    """
    Interactively collects the lyrics of one artist from lyrics.com.

    Prompts for the artist page suffix, gathers the unique song links from that
    artist page, downloads the lyrics that are available, prints a short
    summary and hands back the collected lyrics.
    :return: the lyrics dict produced by download_save_lyrics
    """
    # The user only types the part of the URL that follows ".../artist/"
    suffix = input("What is the page of your artist - https://www.lyrics.com/artist/...?:")
    url = "https://www.lyrics.com/artist/" + suffix

    # Relative song links (de-duplicated) plus the titles that showed up more than once
    song_links, repeated_titles = get_unique_links(url)

    # Links that actually had lyrics, the artist name, and the lyrics dict
    links_with_lyrics, artist, lyrics_by_title = download_save_lyrics(song_links)

    # Summary of what was found and what could be downloaded
    print_stats(artist, song_links, links_with_lyrics, repeated_titles)

    return lyrics_by_title


def get_unique_links(link):
    """
    Generates a list with relative links to songs on lyrics.com and a list with titles that have been listed multiple times
    :param link: Link to artist page as string
    :return: two lists: 1. unique links to lyrics (first occurrence per title, in page order),
             2. Titles of songs listed more than once (one entry per repeated occurrence)
    """
    r = requests.get(link)

    # Raw strings: "\/" and "\d" are invalid escape sequences in ordinary string
    # literals and trigger warnings on current Python versions. The patterns are
    # compiled once here instead of being rebuilt for every link.
    link_pattern = re.compile(r'"(/lyric/.*?)"')
    title_pattern = re.compile(r"/lyric/\d*/.*?/(.*?)$")

    seen_titles = set()  # set gives constant-time "already seen?" checks
    unique_rel_links = []
    multiple = []

    for rel_link in link_pattern.findall(r.text):
        title_match = title_pattern.search(rel_link)
        # A link without the /lyric/<id>/<artist>/<title> shape has no title to
        # extract; de-duplicate it on the whole link instead of raising IndexError.
        title = title_match.group(1) if title_match else rel_link

        if title in seen_titles:
            multiple.append(title)
        else:
            seen_titles.add(title)
            unique_rel_links.append(rel_link)

    return unique_rel_links, multiple


def download_save_lyrics(list_of_links):
    """
    Downloads every song page in list_of_links and collects the lyrics that are available
    :param list_of_links: relative links to song pages on lyrics.com (e.g. "/lyric/<id>/<artist>/<title>")
    :return: three values: 1. the links whose page contained lyrics (input order, input list is not modified),
             2. the artist name read from the page ("" if it was never found),
             3. dict mapping song title to lyrics with line breaks replaced by spaces;
                a later song with the same title overwrites the earlier entry
    """
    artist = ""
    dic = {}
    new_list = []

    for rel_link in list_of_links:
        # The relative links start with "/"; strip it so the URL does not contain "//"
        link = "http://www.lyrics.com/" + rel_link.lstrip("/")
        r = requests.get(link)
        lyrics_page = BeautifulSoup(r.text, "html.parser")

        # Only proceed when both the lyrics body and the title can be found;
        # find() returns None for a missing element, so check before using .text
        body_tag = lyrics_page.find(id="lyric-body-text")
        title_tag = lyrics_page.find(id="lyric-title-text")
        if body_tag is None or title_tag is None:
            continue

        # Pick up the artist name from the first page that provides it
        if not artist:
            artist_tag = lyrics_page.find("div", class_="artist-thumb")
            if artist_tag is not None:
                artist = artist_tag.get("data-artist", "")

        # Flatten the lyrics to a single line and store them under the song title
        lyrics = body_tag.text.replace("\n", " ").replace("\r", " ")
        dic[title_tag.text] = lyrics
        new_list.append(rel_link)

    return new_list, artist, dic


def print_stats(artist, old_list, new_list, multiple_entries):
    """
    Prints a four-line summary of a scraping run
    :param artist: name of the artist as string
    :param old_list: all unique song entries that were found
    :param new_list: the entries for which lyrics were available
    :param multiple_entries: titles that were listed more than once
    :return: None
    """
    n_unique = len(old_list)
    n_repeated = len(multiple_entries)
    n_with_lyrics = len(new_list)

    report = (
        f"Your chosen artist is: {artist}\n"
        f"Number of unique song entries found on lyrics.com: {n_unique}\n"
        f"Number of songs that were listed multiple times: {n_repeated}\n"
        f"Number of songs with lyrics: {n_with_lyrics}"
    )
    print(report)

In [3]:
# Run the interactive scraper; main() prompts for the artist page suffix and
# returns the lyrics dict, which is kept in lyrics_lc for the pickle cell below.
lyrics_lc = main()

Your chosen artist is: Loyle Carner
Number of unique song entries found on lyrics.com: 35
Number of songs that were listed multiple times: 21
Number of songs with lyrics: 27


## Save file through pickle

In [7]:
# Serialise the scraped lyrics dict to disk; the ./data directory must already exist
with open('./data/lyrics_lc.pkl', 'wb') as pickle_file:
    pickle.dump(lyrics_lc, pickle_file)