In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pickle5 as pickle

## 1. Write a function that gets the relative links of songs via regex

In [2]:
def get_unique_links(link):
    """
    Collects the relative song links found on a lyrics.com artist page, keeping one link per title.

    The page is fetched with ``requests`` and scanned for quoted ``/lyric/...`` paths. The title
    is the last part of such a path (``/lyric/<id>/<artist>/<title>``). The first link seen for a
    title is kept; every later link with the same title is recorded as a duplicate. Links that do
    not have the ``<id>/<artist>/<title>`` shape are skipped.

    :param link: Link to artist page as string
    :return: two lists: 1. unique relative links to lyrics (first occurrence per title, in page
             order), 2. titles of songs listed more than once (one entry per repeated occurrence,
             so a title listed three times appears twice)
    """
    # Raw strings keep the regex text literal instead of relying on Python passing unknown
    # string escapes such as "\/" and "\d" through unchanged.
    link_pattern = re.compile(r'"(/lyric/.*?)"')
    # Compiled once here rather than being rebuilt for every link inside the loop.
    title_pattern = re.compile(r"/lyric/\d*/.*?/(.*?)$")

    # Without a timeout a stalled connection would block the notebook cell indefinitely.
    r = requests.get(link, timeout=30)

    seen_titles = set()  # set membership keeps the duplicate check O(1) per link
    unique_rel_links = []
    multiple = []

    for match in link_pattern.findall(r.text):
        title_match = title_pattern.search(match)
        if title_match is None:
            # Not a song link of the expected shape; skip it instead of failing with IndexError.
            continue

        song_title = title_match.group(1)
        if song_title in seen_titles:
            multiple.append(song_title)
        else:
            seen_titles.add(song_title)
            unique_rel_links.append(match)

    return unique_rel_links, multiple

In [3]:
# Fetch the Frightened Rabbit artist page (network request) and split its song links into
# the unique links and the titles that were listed again.
links_fr, double_fr = get_unique_links("https://www.lyrics.com/artist/Frightened-Rabbit/807408")

In [4]:
# Summary of the scrape. Note that len(double_fr) counts every repeated listing, so a title
# that appears three times on the page contributes two to the second number.
print(f"There are {len(links_fr)} unique songs of 'Frightened Rabbit' listed on lyrics.com.\n"
      f"{len(double_fr)} songs have been listed more than one time.")

There are 63 unique songs of 'Frightened Rabbit' listed on lyrics.com.
60 songs have been listed more than one time.


In [5]:
# Same scrape for The Gaslight Anthem: unique song links plus the titles listed again.
links_ga, double_ga = get_unique_links("https://www.lyrics.com/artist/The-Gaslight-Anthem/909493")

In [6]:
# Summary of the scrape. len(double_ga) counts every repeated listing, not distinct titles.
print(f"There are {len(links_ga)} unique songs of 'Gaslight Anthem' listed on lyrics.com.\n"
      f"{len(double_ga)} songs have been listed more than one time.")

There are 94 unique songs of 'Gaslight Anthem' listed on lyrics.com.
90 songs have been listed more than one time.


## 2. Download the pages with the lyrics and save each one as an HTML file

In [7]:
def download_lyrics_save_html(list_of_links, path):
    """
    Downloads the pages with the lyrics and returns a list with the relative link of songs that
    actually have lyrics on lyrics.com

    Every page that contains an element with the id ``lyric-body-text`` is written to
    ``<path><n>.html``, where ``n`` starts at 1 and only increases for saved pages, so the files
    are numbered without gaps. Pages without that element are not saved. The input list is not
    modified.

    :param list_of_links: List of relative links of songs on lyrics.com
    :param path: Path and filename prefix of the generated HTML-files (the target directory must
                 already exist)
    :return: A list with the relative link of songs that actually have lyrics on lyrics.com,
             in the same order as the files were numbered
    """

    links_with_lyrics = []
    index = 1  # number of the next file to write; advanced only after a successful save

    for rel_link in list_of_links:
        # The links produced by get_unique_links start with "/"; strip it so the URL does not
        # contain "//" right after the host name.
        link = "http://www.lyrics.com/" + rel_link.lstrip("/")
        # Without a timeout a stalled connection would block the notebook cell indefinitely.
        r = requests.get(link, timeout=30)
        lyrics_page = BeautifulSoup(r.text, "html.parser")

        # Only save the file if the page contains the id
        if lyrics_page.find(id="lyric-body-text") is None:
            continue

        user_path = path + str(index) + ".html"
        # NOTE(review): the file is written with the platform default encoding; whatever reads
        # these files back has to use the same encoding.
        with open(user_path, 'w') as the_file:
            the_file.write(r.text)
        index += 1
        links_with_lyrics.append(rel_link)

    return links_with_lyrics

In [8]:
# Download every Frightened Rabbit song page and keep the links whose page had lyrics.
# Files are written as ./data/Frightened Rabbit/frightened_rabbit_<n>.html; the directory
# must already exist because the files are opened with plain open(..., 'w').
new_links_fr = download_lyrics_save_html(links_fr, "./data/Frightened Rabbit/frightened_rabbit_")

In [9]:
# Same download for The Gaslight Anthem; files go to ./data/Gaslight Anthem/gaslight_anthem_<n>.html.
new_links_ga = download_lyrics_save_html(links_ga, "./data/Gaslight Anthem/gaslight_anthem_")

## 3. Load the HTML files, extract the lyrics with BeautifulSoup() and store them in a dict

In [10]:
def load_html_get_lyrics(nr_songs, path):
    """
    Reads the numbered HTML-files and collects the lyrics of every song in a dict.

    The files ``<path>1.html`` up to ``<path><nr_songs>.html`` are parsed one after another.
    The text of the element with id ``lyric-title-text`` becomes the key and the text of the
    element with id ``lyric-body-text`` the value, with newlines and carriage returns replaced
    by spaces. A title that occurs in more than one file keeps the lyrics of the last file.

    :param nr_songs: Amount of HTML-files as int
    :param path: Path and filename prefix under which the HTML-files are stored
    :return: A dict with the following format '[song title] : [lyrics]'
    """
    lyrics_by_title = {}

    # The files are numbered starting at 1, not 0.
    for file_number in range(1, nr_songs + 1):
        html_file = path + str(file_number) + ".html"
        with open(html_file, 'r') as handle:
            soup = BeautifulSoup(handle, "html.parser")
            heading = soup.find(id="lyric-title-text").text
            body = soup.find(id="lyric-body-text").text
            # Flatten the lyrics onto a single line.
            single_line = body.replace("\n", " ").replace("\r", " ")
            lyrics_by_title[heading] = single_line

    return lyrics_by_title

In [11]:
# One file was written per link returned by the download step, so len(new_links_fr) is the
# number of Frightened Rabbit files to read back.
lyrics_fr = load_html_get_lyrics(len(new_links_fr), "./data/Frightened Rabbit/frightened_rabbit_")

In [12]:
# Read back the Gaslight Anthem files; len(new_links_ga) is the number of files written.
lyrics_ga = load_html_get_lyrics(len(new_links_ga), "./data/Gaslight Anthem/gaslight_anthem_")

## 4. Save the dicts via pickle

In [13]:
# Persist the Frightened Rabbit lyrics dict ('[song title] : [lyrics]') as a pickle file.
# NOTE(review): the file is named 'lyrics_fa.pkl' while the variable is 'lyrics_fr' - this
# looks like a typo; check what loads this file before renaming it.
with open('./data/lyrics_fa.pkl', 'wb') as f:
    pickle.dump(lyrics_fr, f)

In [14]:
# Persist the Gaslight Anthem lyrics dict ('[song title] : [lyrics]') as a pickle file.
with open('./data/lyrics_ga.pkl', 'wb') as f:
    pickle.dump(lyrics_ga, f)