In [1]:
from scrapy import Selector, Request
import scrapy
import requests

## 1. Scraping for links to lyrics

In [2]:
url = 'https://genius.com/artists/Dua-lipa' 
html = requests.get( url ).content
sel = Selector(text=html)

In [3]:
print( "There is this number of elements in the html code: ", len( sel.xpath('//*') ) )

There is this number of elements in the html code:  424


In [4]:
# getting all links to song lyrics

links_dualipa = sel.css('a.mini_card::attr(href)').extract()

In [5]:
links_dualipa2 = sel.xpath('/html//*[contains(@class, "mini_card")]/@href').extract()

In [6]:
links_dualipa2

['https://genius.com/Dua-lipa-new-rules-lyrics',
 'https://genius.com/Dua-lipa-dont-start-now-lyrics',
 'https://genius.com/Dua-lipa-idgaf-lyrics',
 'https://genius.com/Dua-lipa-blow-your-mind-mwah-lyrics',
 'https://genius.com/Dua-lipa-levitating-lyrics',
 'https://genius.com/Dua-lipa-break-my-heart-lyrics',
 'https://genius.com/Dua-lipa-be-the-one-lyrics',
 'https://genius.com/Dua-lipa-homesick-lyrics',
 'https://genius.com/Dua-lipa-physical-lyrics',
 'https://genius.com/Dua-lipa-were-good-lyrics']

In [7]:
links_dualipa

['https://genius.com/Dua-lipa-new-rules-lyrics',
 'https://genius.com/Dua-lipa-dont-start-now-lyrics',
 'https://genius.com/Dua-lipa-idgaf-lyrics',
 'https://genius.com/Dua-lipa-blow-your-mind-mwah-lyrics',
 'https://genius.com/Dua-lipa-levitating-lyrics',
 'https://genius.com/Dua-lipa-break-my-heart-lyrics',
 'https://genius.com/Dua-lipa-be-the-one-lyrics',
 'https://genius.com/Dua-lipa-homesick-lyrics',
 'https://genius.com/Dua-lipa-physical-lyrics',
 'https://genius.com/Dua-lipa-were-good-lyrics']

## 2. Scraping for lyrics

In [8]:
url = 'https://genius.com/Dua-lipa-dont-start-now-lyrics' 
html = requests.get( url ).content
sel = Selector(text=html)

In [9]:
song_title = sel.css('title::text').extract_first().strip()

In [10]:
song_title

"Dua\xa0Lipa – Don't Start Now Lyrics | Genius Lyrics"

In [11]:
song_lyrics = sel.xpath(
            ('/html//*[contains(@class,"lyric")]//a/text()|/html//*[contains(@class,"lyric")]//p/text()')).extract()

In [12]:
song_lyrics = [line.strip() for line in song_lyrics]

In [13]:
song_lyrics

[]

## 3. Write the Spider 

In [14]:
# writing a scrapy spider to scrape the lyrics in every link

# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class Lyrics_Spider(scrapy.Spider):
  name = "lyrics_spider"
  # start_requests method
  def start_requests(self):
    yield scrapy.Request(url = "https://genius.com/artists/Dua-lipa",
                         callback = self.parse_links)
  # First parsing method
  def parse_links(self, response):
    lyrics_links = response.css('a.mini_card::attr(href)').extract()
    for url in lyrics_links:
      yield response.follow(url = url,
                            callback = self.parse_lyrics)
  # Second parsing method
  def parse_lyrics(self, response):
    song_title = response.css('title::text').extract_first().strip()
    song_lyrics = response.xpath(
            ('/html//*[contains(@class,"lyric")]//a/text()|/html//*[contains(@class,"lyric")]//p/text()')).extract()
    
    song_lyrics = [line.strip() for line in song_lyrics]
    lyrics_dict[song_title] = song_lyrics


In [15]:
# Initialize the dictionary **outside** of the Spider class
lyrics_dict = dict()

# Run the Spider
process = CrawlerProcess()
process.crawl(Lyrics_Spider)
process.start()


2021-05-21 17:01:43 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-05-21 17:01:43 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.5 (default, Sep  4 2020, 02:22:02) - [Clang 10.0.0 ], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform macOS-10.14.6-x86_64-i386-64bit
2021-05-21 17:01:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-05-21 17:01:43 [scrapy.crawler] INFO: Overridden settings:
{}
2021-05-21 17:01:43 [scrapy.extensions.telnet] INFO: Telnet Password: ae308fd7e1975c65
2021-05-21 17:01:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2021-05-21 17:01:43 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddle

In [16]:
lyrics_dict.keys()

dict_keys(['Dua\xa0Lipa – New Rules Lyrics | Genius Lyrics', "Dua\xa0Lipa – We're Good Lyrics | Genius Lyrics", 'Dua\xa0Lipa – Break My Heart Lyrics | Genius Lyrics', 'Dua\xa0Lipa – Levitating Lyrics | Genius Lyrics', 'Dua\xa0Lipa – Homesick Lyrics | Genius Lyrics', 'Be the One Lyrics - Dua Lipa | Genius Lyrics', 'Dua\xa0Lipa – Blow Your Mind (Mwah) Lyrics | Genius Lyrics', 'Dua\xa0Lipa – Physical Lyrics | Genius Lyrics', 'Dua\xa0Lipa – IDGAF Lyrics | Genius Lyrics', "Dua\xa0Lipa – Don't Start Now Lyrics | Genius Lyrics"])

## 4. Scraping more songs (link to more links)

In [17]:
url = 'https://genius.com/artists/Dua-lipa' 
html = requests.get( url ).content
sel = Selector(text=html)

links_dualipa_albums = sel.css('a.vertical_album_card::attr(href)').extract()

print(links_dualipa_albums)

2021-05-21 17:01:47 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): genius.com:443
2021-05-21 17:01:47 [urllib3.connectionpool] DEBUG: https://genius.com:443 "GET /artists/Dua-lipa HTTP/1.1" 200 None


['https://genius.com/albums/Dua-lipa/Future-nostalgia-the-moonlight-edition', 'https://genius.com/albums/Dua-lipa/Future-nostalgia-bonus-edition', 'https://genius.com/albums/Dua-lipa/Future-nostalgia', 'https://genius.com/albums/Dua-lipa/Dont-start-now-remixes', 'https://genius.com/albums/Dua-lipa/Dua-lipa-complete-edition', 'https://genius.com/albums/Dua-lipa/New-rules-remixes']


In [18]:
links_dualipa = []
url = 'https://genius.com/artists/Dua-lipa' 

for link in links_dualipa_albums:
    html = requests.get( link ).content
    sel = Selector(text=html)
    links_in_album = sel.css('a.u-display_block::attr(href)').extract()
    links_dualipa.extend(links_in_album)


2021-05-21 17:01:47 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): genius.com:443
2021-05-21 17:01:47 [urllib3.connectionpool] DEBUG: https://genius.com:443 "GET /albums/Dua-lipa/Future-nostalgia-the-moonlight-edition HTTP/1.1" 200 None
2021-05-21 17:01:47 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): genius.com:443
2021-05-21 17:01:48 [urllib3.connectionpool] DEBUG: https://genius.com:443 "GET /albums/Dua-lipa/Future-nostalgia-bonus-edition HTTP/1.1" 200 None
2021-05-21 17:01:48 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): genius.com:443
2021-05-21 17:01:49 [urllib3.connectionpool] DEBUG: https://genius.com:443 "GET /albums/Dua-lipa/Future-nostalgia HTTP/1.1" 200 None
2021-05-21 17:01:49 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): genius.com:443
2021-05-21 17:01:50 [urllib3.connectionpool] DEBUG: https://genius.com:443 "GET /albums/Dua-lipa/Dont-start-now-remixes HTTP/1.1" 200 None
2021-05-21 17:01:5

In [19]:
len(links_dualipa)

96