# Web scraping tool for getting the lyrics of all Bob Dylan songs

In [1]:
import numpy as np
import urllib
import pandas as pd
import re
import lxml.html
import unicodedata
import os

We get a table of all of Bob Dylan's songs from Wikipedia:

In [2]:
# Wikipedia article that tabulates the songs written by Bob Dylan.
urlt = 'https://en.wikipedia.org/wiki/List_of_songs_written_by_Bob_Dylan'

# read_html parses the page's HTML tables into a list of DataFrames;
# header=0 makes the first row of each table the column names.
tabele = pd.read_html(io=urlt, header=0)

# The song list is the first table on the page.
df = tabele[0]


The dataframe looks like this:

In [3]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,Recorded,Song title,Writer(s),(First) Album release,Released,Notes,Unnamed: 6
0,1990,"10,000 Men",Dylan,Under the Red Sky,1990,,
1,1990,2 X 2,Dylan,Under the Red Sky,1990,,
2,1966,4th Time Around,Dylan,Blonde on Blonde,1966,,
3,1990,7 Deadly Sins,"Dylan, Jeff Lynne, Tom Petty, George Harrison",Traveling Wilburys Vol. 3,1990,[1],
4,1974,Abandoned Love,Dylan,Biograph,1985,,


The table holds 523 songs, which is a good amount of material to work with:

In [4]:
# Number of rows in the table, i.e. how many songs were scraped.
df.shape[0]

523

In [5]:
# Pull the 'Song title' column out of the table as a plain Python list.
cantece = df.loc[:, 'Song title'].tolist()

The `Song` class is taken from [Sebastian Raschka's GitHub repository](https://github.com/rasbt/datacollect/tree/master/collect_lyrics).

In [6]:
class Song(object):
    """A single song whose lyrics can be fetched from LyricWikia.

    Attributes:
        artist: artist name after normalisation by ``__format_str``.
        title: song title after normalisation by ``__format_str``.
        url: LyricWikia page URL; ``None`` until ``lyricwikia()`` is called.
        lyric: lyrics text; ``None`` until ``lyricwikia()`` is called and
            ``''`` when the page could not be fetched or has no lyric box.
    """

    def __init__(self, artist, title):
        """Store the normalised artist and title; nothing is fetched yet."""
        self.artist = self.__format_str(artist)
        self.title = self.__format_str(title)
        self.url = None
        self.lyric = None

    def __format_str(self, s):
        """Return ``s`` trimmed, stripped of accents and title-cased."""
        s = s.strip()
        try:
            # Decompose accented characters (NFD) and drop the combining
            # marks (category 'Mn'), e.g. 'é' -> 'e'.
            s = ''.join(c for c in unicodedata.normalize('NFD', s)
                        if unicodedata.category(c) != 'Mn')
        except TypeError:
            # Not a text string (e.g. bytes): best effort, keep it as is.
            pass
        # NOTE: str.title() also upper-cases letters that follow digits or
        # apostrophes ("4th" -> "4Th").
        return s.title()

    def __quote(self, s):
        """Return ``s`` with spaces turned into underscores, URL-quoted."""
        # Imported explicitly: a bare ``import urllib`` does not guarantee
        # that the ``urllib.parse`` submodule has been loaded.
        import urllib.parse
        return urllib.parse.quote(s.replace(' ', '_'))

    def __make_url(self):
        """Build the ``Artist:Title`` page URL and store it in ``self.url``."""
        artist = self.__quote(self.artist)
        title = self.__quote(self.title)
        artist_title = '%s:%s' % (artist, title)
        # NOTE(review): confirm this host still serves lyric pages.
        self.url = 'http://lyrics.wikia.com/' + artist_title

    def update(self, artist=None, title=None):
        """Replace the artist and/or title; falsy arguments are ignored."""
        if artist:
            self.artist = self.__format_str(artist)
        if title:
            self.title = self.__format_str(title)

    def lyricwikia(self):
        """Download the song page and extract its lyrics.

        Returns:
            The lyrics as a single string, also stored in ``self.lyric``.
            An empty string is returned (and stored) when the page cannot
            be read or contains no ``.lyricbox`` element.
        """
        self.__make_url()
        try:
            doc = lxml.html.parse(self.url)
            lyricbox = doc.getroot().cssselect('.lyricbox')[0]
        except (IOError, IndexError):
            # IOError: the page could not be read.
            # IndexError: the page has no '.lyricbox' element.
            self.lyric = ''
            return self.lyric

        lyrics = []
        for node in lyricbox:
            # Each <br> ends a line; the text of the line that follows is
            # carried in the node's tail.
            if node.tag == 'br':
                lyrics.append('\n')
            if node.tail is not None:
                lyrics.append(node.tail)
        self.lyric = ''.join(lyrics).strip()
        return self.lyric

We now have all of his song titles in a list called __cantece__. We can fetch the lyrics of each of these songs and write them to a text file called _dylansongs.txt_.

In [14]:
# Fetch the lyrics of every title and write them, one song after another,
# to dylansongs.txt. The `with` block closes the file even if the loop is
# interrupted; UTF-8 keeps non-ASCII characters in the lyrics writable on
# every platform.
with open('dylansongs.txt', 'w', encoding='utf-8') as f:
    for cantec in cantece:
        try:
            song = Song(artist='Bob Dylan', title=cantec)
            lyr = song.lyricwikia()
        except Exception:
            # Best effort: mark the song with a placeholder and keep going.
            # `Exception` (not a bare except) leaves Ctrl-C / kernel
            # interrupt working during the long scrape.
            lyr = 'lupa'

        # Guard against a None result so the literal text "None" never
        # ends up in the corpus.
        f.write('' if lyr is None else str(lyr))
        # Separate songs with a real newline (the original wrote '/n').
        f.write('\n')

In [10]:
# Peek at the first five song titles.
print(cantece[0:5])

['10,000 Men', '2 X 2', '4th Time Around', '7 Deadly Sins', 'Abandoned Love']
