# Lyrics Project

In [1]:
import requests
from bs4 import BeautifulSoup
import re

### Access the Artist URL

In [2]:
url = 'https://www.lyrics.com/artist/R%C3%B3is%C3%ADn-Murphy/350933'

In [3]:
response = requests.get(url)

In [4]:
# Check the status code
response.status_code

# 200: the request was valid and we got what we asked for
# 404: page not found
# Any 4xx code (400-499) means that we made a mistake or are not authorized
# Any 5xx code (500-599) means that there is an error on the server side

200

In [5]:
html_artist = response.text

In [6]:
html_artist[:200]

'\n<!doctype html>\n<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->\n<!--[if IE 8]>    <html '

In [7]:
# Retrieves the artist name, keeping accented characters and spaces.
# re.findall returns a list of strings, hence the [0] index.

artist_name = re.findall('Albums by <strong>([^<]+)', html_artist)[0]
artist_name.replace(' ', '_')

'Róisín_Murphy'

In [8]:
parsed_html_artist = BeautifulSoup(html_artist)

In [9]:
type(parsed_html_artist)

bs4.BeautifulSoup

In [10]:
html_links = parsed_html_artist.find_all(attrs={'class': 'tal qx'})
len(html_links) # this returns the number of songs

121

In [11]:
html_links[0]

<td class="tal qx"><strong><a href="/lyric/29625905/R%C3%B3is%C3%ADn+Murphy/Look+Around+You+%5BRaik+Remix%5D">Look Around You [Raik Remix]</a></strong></td>

In [12]:
# Each table cell wraps a single <a> tag whose href is the site-relative path of the song page.
# The href is read straight from the parsed tag instead of regex-stripping the superfluous
# markup around it: the regex approach returns garbage when a title contains ="...
# or when the <a> tag carries another attribute after href.
# The https://www.lyrics.com prefix is added to each path before appending it to the list.

lyrics_links = []
for entry in html_links:
    anchor = entry.find('a')
    if anchor is None or not anchor.get('href'):
        continue  # cell without a usable link
    lyrics_links.append('https://www.lyrics.com' + anchor['href'])

In [15]:
lyrics_links[:3]

['https://www.lyrics.com/lyric/29625905/R%C3%B3is%C3%ADn+Murphy/Look+Around+You+%5BRaik+Remix%5D',
 'https://www.lyrics.com/lyric/29574756/R%C3%B3is%C3%ADn+Murphy/Look+Around+You+%5BOriginal+Mix%5D',
 'https://www.lyrics.com/lyric/29464167/R%C3%B3is%C3%ADn+Murphy/Look+Around+You']

### Access the URL of one song

In [16]:
response_song = requests.get(lyrics_links[0])

In [17]:
response_song.status_code

200

In [18]:
html_song = response_song.text

In [19]:
soup_song = BeautifulSoup(html_song)

In [22]:
html_lyrics = str(soup_song.find(attrs={'class': "lyric-body"}))

In [23]:
html_lyrics[:200]

'<pre class="lyric-body" data-lang="en" dir="ltr" id="lyric-body-text">I can see a sky in the <a href="https://www.definitions.net/definition/place" style="color:#222;">place</a> where we\'re happy\nI ca'

In [24]:
# Remove all the noisy links in the lyrics
lyrics = re.sub('<.+?>+?', '', html_lyrics)

In [25]:
# Remove every square bracket from the lyrics text.
# A raw string is used because '\[' in a normal string literal is an invalid escape sequence.
lyrics = re.sub(r'[\[\]]', '', lyrics)
lyrics

"I can see a sky in the place where we're happy\nI can see it all now, our friends are all there\nI close my eyes and I know that you're actually with me\nI'm going in, i'm going into a dream state\nAs it real as it feels\nNo, no, don't turn back\nCan't just turn around\n???\nAnd my heart beating faster\n\nI go anywhere to find you\nAny place beside you\nI gonna stay right where I found you\nI look around me\nI go anywhere to find you\nAny place beside you\nI stay right where I found you\nI look around me\nIt's just astounding\n\nWe're not apart\nNo, we're just a part of the landscape\nI can see it all now\n\nI go anywhere to find you\nAny place beside you\nI stay right where I found you\nI look around me\n\nI go anywhere to find you\nAny place beside you\nI stay right where I found you\nI look around me\n\nImaging me a landscape\nI can see it all night\nYour dancing in the devils line feels right\nBab ba badab aba a\n\nI go anywhere to find you\nAny place beside you\nI'm gonna stay ri

In [None]:
# Store the cleaned lyrics string as a plain-text file inside the songs folder
with open('./songs/test.txt', 'w') as out_file:
    out_file.write(lyrics)

## Loop the links and save lyrics as separate files

In [None]:
# Download every song page and save its lyrics as ./songs/<Artist_Name>_<NNN>.txt.
# The artist name is taken from `artist_name` (extracted from the artist page above),
# with spaces replaced by underscores, so accents and spaces are handled for any artist.

file_prefix = './songs/' + artist_name.replace(' ', '_') + '_'
# Zero-padding width: enough leading 0's before the number to sort the files easily.
num_len = len(str(len(lyrics_links)))

i = 1
for song in lyrics_links:
    response_song = requests.get(song)
    html_song = response_song.text
    soup_song = BeautifulSoup(html_song)
    lyric_body = soup_song.find(attrs={'class': "lyric-body"})
    if lyric_body is None:
        continue  # no lyrics on this page; avoids writing the literal text "None"
    # get_text() drops the markup (the definition links) and decodes HTML entities
    lyrics = re.sub(r'[\[\]]', '', lyric_body.get_text())
    with open(file_prefix + str(i).zfill(num_len) + '.txt', 'w') as f:
        f.write(lyrics)
    i += 1

## Define function

In [42]:
def scrape_lyrics(url):
    '''
    Scrapes the lyrics of all songs of an artist provided their page on lyrics.com.
    The lyrics are then saved as separate text files in a ./songs directory inside the
    current working directory; the directory is created if it does not exist yet.

    Files are named <Artist_Name>_<NNN>.txt, where spaces in the artist name are
    replaced by underscores and NNN is a running number zero-padded to the number of
    digits in the link count. Songs whose page has no lyric body are skipped and do
    not use up a number.

    Parameters
    ----------
    url : str
        the url of artist page on lyrics.com
    '''
    import os  # local import: only needed to create the output directory

    html_artist = requests.get(url).text
    artist_name = re.findall('Albums by <strong>([^<]+)', html_artist)[0]
    soup_artist = BeautifulSoup(html_artist)

    # TODO: also collect the song titles here so that duplicate songs can be dropped.
    lyrics_links = []
    for entry in soup_artist.find_all(attrs={'class': 'tal qx'}):
        # Read the href from the parsed <a> tag rather than regex-stripping str(entry)
        anchor = entry.find('a')
        if anchor is None or not anchor.get('href'):
            continue  # cell without a usable link
        lyrics_links.append('https://www.lyrics.com' + anchor['href'])

    os.makedirs('./songs', exist_ok=True)
    file_prefix = './songs/' + artist_name.replace(' ', '_') + '_'
    # zfill(num_len) makes sure there are enough leading 0's before the number to sort files easily.
    num_len = len(str(len(lyrics_links)))

    i = 1
    for song in lyrics_links:
        soup_song = BeautifulSoup(requests.get(song).text)
        lyric_body = soup_song.find(attrs={'class': "lyric-body"})
        if lyric_body is None:
            continue  # skips empty lyrics
        # get_text() drops the markup and decodes HTML entities such as &amp;
        lyrics = re.sub(r'[\[\]]', '', lyric_body.get_text())
        with open(file_prefix + str(i).zfill(num_len) + '.txt', 'w') as f:
            f.write(lyrics)
        i += 1


## Lines for testing code

In [43]:
scrape_lyrics('https://www.lyrics.com/artist/Diamanda-Gal%C3%A1s/4330')

In [60]:
str(html_links[0])[40:]

'ic/29625905/R%C3%B3is%C3%ADn+Murphy/Look+Around+You+%5BRaik+Remix%5D">Look Around You [Raik Remix]</a></strong></td>'

In [66]:
re.findall('">(.+?)<.+>', str(html_links[0])[40:])[0]

'Look Around You [Raik Remix]'

## Python file

In [23]:
import requests
from bs4 import BeautifulSoup
import re
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
##
from collections import Counter


In [1]:
url = 'https://www.lyrics.com/artist/Arca'

In [94]:
def scrape_links(url):
    '''
    Scrapes the links to all songs of an artist provided their page on lyrics.com.
    Songs are de-duplicated by title: only the first link for each title is kept.

    Parameters
    ----------
    url : str
        the url of artist page on lyrics.com

    Returns
    -------
    tuple
        (lyrics_links, artist_name, song_titles), where
        lyrics_links is a list of strings with the full links to the song lyrics,
        artist_name is the artist name as written in the page heading, and
        song_titles is a list of strings with the song titles, in the same order as lyrics_links

    Raises
    ------
    IndexError
        if the "Albums by <artist>" heading is not found in the page
    '''

    response = requests.get(url)
    html_artist = response.text
    name_matches = re.findall('Albums by <strong>([^<]+)', html_artist)
    if not name_matches:
        # Same exception type as the former bare [0] lookup, but with an explanation
        raise IndexError('no "Albums by <artist>" heading found at ' + str(url))
    artist_name = name_matches[0]
    soup_artist = BeautifulSoup(html_artist, features="lxml")

    lyrics_links = []
    song_titles = []
    seen_titles = set()  # constant-time duplicate check; song_titles keeps the order
    for entry in soup_artist.find_all(attrs={'class': 'tal qx'}):
        anchor = entry.find('a')
        if anchor is None or not anchor.get('href'):
            continue  # cell without a usable link
        # Take the title and path from the parsed tag instead of splitting str(tag) on '">'
        song_title = anchor.get_text()
        if song_title in seen_titles:  # this removes duplicates
            continue
        seen_titles.add(song_title)
        lyrics_links.append('https://www.lyrics.com' + anchor['href'])
        song_titles.append(song_title)
    return lyrics_links, artist_name, song_titles

In [13]:
url = 'https://www.lyrics.com/artist/Laurie-Anderson/3545'

In [96]:
len(scrape_links(url)[2])

92

In [65]:
lyrics_links = scrape_links(url)

In [66]:
lyrics_links

(['https://www.lyrics.com/lyric/35252944/Laurie+Anderson/Shining+Star+%28Makin%27+My+Love%29'],
 'Laurie Anderson')

In [25]:
response = requests.get(url)
html_artist = response.text
soup_artist = BeautifulSoup(html_artist, features="lxml")
html_links = soup_artist.find_all(attrs={'class': 'tal qx'})

In [42]:
test = str(html_links[1].find_all('a')[0])

In [56]:
a, b = re.findall('\/l[^<]+', test)[0].split(sep='">')

'/lyric/35252944/Laurie+Anderson/Shining+Star+%28Makin%27+My+Love%29'

In [7]:
response = requests.get(url)
html_artist = response.text
re.findall('Albums by <strong>([^<]+)', html_artist)

[]

In [None]:
def scrape_lyrics(url):
    '''
    Scrapes the lyrics of all songs of an artist provided their page on lyrics.com.
    The lyrics are then saved as separate text files in a ./songs directory inside the
    current working directory; the directory is created if it does not exist yet.

    Files are named <Artist_Name>_<NNN>.txt, where spaces in the artist name are
    replaced by underscores and NNN is a running number zero-padded to the number of
    digits in the link count. Songs whose page has no lyric body are skipped and do
    not use up a number, so the saved files are numbered without gaps.

    Parameters
    ----------
    url : str
        the url of artist page on lyrics.com

    Returns
    -------
    int
        the number of lyrics files that were written
    '''
    import os  # local import: only needed to create the output directory

    # scrape_links returns (lyrics_links, artist_name, song_titles); only the first two are used
    lyrics_links, artist_name = scrape_links(url)[:2]

    os.makedirs('./songs', exist_ok=True)
    file_prefix = './songs/' + artist_name.replace(' ', '_') + '_'
    # zfill(num_len) makes sure there are enough leading 0's before the number to sort files easily.
    num_len = len(str(len(lyrics_links)))  # length of number of songs

    num_saved = 0
    for song in lyrics_links:
        response_song = requests.get(song)
        soup_song = BeautifulSoup(response_song.text, features="lxml")
        lyric_body = soup_song.find(attrs={'class': "lyric-body"})
        if lyric_body is None:
            continue  # skips empty lyrics
        # get_text() drops the markup and decodes HTML entities such as &amp;
        lyrics = re.sub(r'[\[\]]', '', lyric_body.get_text())
        num_saved += 1
        with open(file_prefix + str(num_saved).zfill(num_len) + '.txt', 'w') as f:
            f.write(lyrics)
    # The former per-song list_lyrics_artist(artist_name, i) call was dropped: its result
    # was discarded and it looked for files padded to a different width than the one used here.
    return num_saved

In [None]:
def list_lyrics_artist(artist_name, num_songs):
    '''
    Loads the saved lyrics of one artist and builds a matching list of labels.

    Reads ./songs/<artist>_<k>.txt for k = 1 .. num_songs, where <artist> is artist_name
    with spaces replaced by underscores and <k> is zero-padded to the number of digits
    in num_songs.

    Returns
    -------
    tuple
        (lyrics_list, artist_list): the text of each file, and artist_name repeated
        num_songs times.
    '''
    labels = [artist_name] * num_songs
    pad_width = len(str(num_songs))
    file_stem = './songs/' + artist_name.replace(' ', '_') + '_'
    texts = []
    song_number = 1
    while song_number <= num_songs:
        file_path = file_stem + str(song_number).zfill(pad_width) + '.txt'
        with open(file_path, 'r') as handle:
            texts.append(handle.read())
        song_number += 1
    return texts, labels

In [None]:
def build_model(X, y):
    '''
    Trains a Logistic Regression model on bag-of-words counts of the given texts.

    Parameters
    ----------
    X : list of str
        the texts (lyrics) to learn from
    y : list
        the label (artist) of each text, in the same order as X

    Returns
    -------
    tuple
        (model, cv, test_score): the fitted LogisticRegression, the fitted
        CountVectorizer needed to transform new texts, and the accuracy on the
        held-out part of the train/test split
    '''
    # Imported locally because the visible import cell does not bring CountVectorizer in
    from sklearn.feature_extraction.text import CountVectorizer

    Xtrain, Xtest, ytrain, ytest = tts(X, y)
    cv = CountVectorizer()
    # The vocabulary is learned from the training texts only, then reused for the test texts
    Xtrain_counts = cv.fit_transform(Xtrain)
    model = LogisticRegression()
    model.fit(Xtrain_counts, ytrain)
    test_score = model.score(cv.transform(Xtest), ytest)
    return model, cv, test_score

In [None]:
## Main Program:

# Accumulators for the model input: X for lyrics, y for artist labels.
X = []
y = []

# NOTE(review): list_lyrics_artist is defined with two required positional parameters
# (artist_name, num_songs); calling it without arguments raises TypeError.
lyrics_list, artist_list = list_lyrics_artist()
# NOTE(review): append adds each list as one nested element, so X and y become
# one-element lists of lists. Presumably extend was intended -- confirm before training.
X.append(lyrics_list)
y.append(artist_list)