# Module 1 Assignment

In [166]:
# Import relevant libaries

import os
import datetime
import re

# Libraries for lyrics scrape section

import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random

# Additional libraries

import shutil

---

## Part 1: Pulling and Storing Lyric Links

#### *Choose two musical artists for your project. Both should have at least twenty songs on their artist page on AZLyrics.*

### Adele & Eminem

#### *Note on Rate Limiting*

Whenever you call `requests.get` to retrieve a page, put a `time.sleep(5 + 10*random.random())` on the next line. This will help you not to get blocked. If you do get blocked, which you can identify if the returned pages are not correct, just request a lyrics page through your browser. You'll be asked to perform a CAPTCHA and then your requests should start working again.


In [167]:
# Store artist name and link in dictionary

artists = {
    "adele":"https://www.azlyrics.com/a/adele.html",
    "eminem":"https://www.azlyrics.com/e/eminem.html"
}

Q: Take a look at the `robots.txt` page on www.azlyrics.com. Is the scraping we are about to do allowed or disallowed by this page? How do you know?

A: Glancing at the robots.txt page provides the following information:

```
        User-agent: *
        Disallow: /lyricsdb/
        Disallow: /song/
        Allow: /

        User-agent: 008
        Disallow: /
```

Based on the `Disallow` keys, the only endpoints that are not allowed to be scraped are those that include either `/lyricsdb` and `/song`. Since the endpoints being hit in this project are artist names and lyrics of specific songs (not the lyricsdb directly nor the audio file of the song itself), every other endpoint should be fair game unless the `User-agent` is `008`.

### *Scrape Link List*

In [168]:
# Create dictionary to hold lyric links

lyrics_pages = defaultdict(list)

for artist, artist_page in artists.items() :

    # Request the page and sleep

    r = requests.get(artist_page)
    time.sleep(5 + 10*random.random())

    # Convert HTML response to text using bs4

    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract all song divs containing links to lyrics

    link_divs = soup.find_all('div',attrs={'class':'listalbum-item'})

    # Extract links from divs and store in list

    link_list = []
    for div in link_divs:
        href = div.find('a', href=True)
        link = f"https://www.azlyrics.com{href['href']}"
        link_list.append(link)

    # Store in `lyrics_pages` with key = artist & value = link_list

    lyrics_pages[artist] = link_list


In [169]:
# Check song count

for artist, lp in lyrics_pages.items():
    assert(len(set(lp)) > 20)

In [170]:
# Calculate time to pull all lyrics with delay of `5 + 10*random.random()` seconds 

for artist, links in lyrics_pages.items() :
    print(f"For {artist.capitalize()} we have {len(links)} song lyric links.")
    print(f"The full pull for this artist will take {round(len(links)*10/3600,2)} hours.")

There are 393 listed song lyrics for Adele
For adele we have 71.
The full pull will take for this artist will take 0.2 hours.
There are 393 listed song lyrics for Eminem
For eminem we have 412.
The full pull will take for this artist will take 1.14 hours.


---

## Part 2: Pulling and Storing Lyric Pages

In [171]:
# Function for creating filename based on song title and writing to text file using that name

def generate_filename_from_link(link) :
    
    if not link :
        return None
    
    # Drop the http or https and the html

    name = link.replace("https","").replace("http","")
    name = link.replace(".html","")
    name = name.replace("/lyrics/","")
    
    # Replace useless characters with UNDERSCORE

    name = name.replace("://","").replace(".","_").replace("/","_")
    
    # Tack on .txt

    name = name + ".txt"
    
    return(name)

In [172]:
# Code for checking if lyrics folder exists
# Using shutil.rmtree to remove and create a new one if it does

if os.path.isdir("lyrics") : 
    shutil.rmtree("lyrics/")

os.mkdir("lyrics")

### *Scrape Page Lyrics*

In [173]:
url_stub = "https://www.azlyrics.com" 
start = time.time()

total_pages = 0

for artist, links in lyrics_pages.items() :
    
    # 1. Build a subfolder for the artist

    if os.path.isdir(f"lyrics/{artist}"): 
        shutil.rmtree(f"lyrics/{artist}")
    os.mkdir(f"lyrics/{artist}")

    # 2. Iterate over the lyrics pages (30 pages per artist)

    for link in links[0:30] :
    
        # 3. Request the page and sleep

        r = requests.get(link)
        time.sleep(5 + 10*random.random())

        # Convert HTML response to text using bs4

        soup = BeautifulSoup(r.text, 'html.parser')

        # 4. Extract title and lyrics from page

        ringtone = soup.find('div', attrs = {'class' : 'ringtone'})
        title = soup.select_one('b:nth-child(5)').text
        lyrics = ringtone.find_next_sibling("div").text

        # 5. Write out title and lyrics split by two returns

        song_lyrics = f"{title}\n\n{lyrics}"

        # Using generate_filename_from_url results in ugly file names for me

            # fname = generate_filename_from_link(link)
            # Example Output: httpswww_azlyrics_comadele_daydreamer.txt

        # Below seems like a cleaner approach
        
        file_title = title.replace('"','').replace(' ', '_')
        fname = f"{file_title}.txt"

        # Write lyrics file to appropriate directory

        f = open(f"lyrics/{artist}/{fname}", "w")
        f.write(song_lyrics)
        f.close()
    

---

## Evaluation

### Checking Lyrics 

The output from your lyrics scrape should be stored in files located in this path from the directory:
`/lyrics/[Artist Name]/[filename from URL]`. This code summarizes the information at a high level to help the instructor evaluate your work. 

In [183]:
# Simple word extractor from Peter Norvig: https://norvig.com/spell-correct.html
def words(text): 
    return re.findall(r'\w+', text.lower())

In [186]:
# Minor alterations made here so the evaluation works

artist_folders = os.listdir("lyrics/")
artist_folders = [f for f in artist_folders if os.path.isdir("lyrics/" + f)]

for artist in artist_folders : 
    artist_files = os.listdir("lyrics/" + artist)
    artist_files = [f for f in artist_files if 'txt' in f or 'csv' in f or 'tsv' in f]

    print(f"For {artist.capitalize()} we have {len(artist_files)} files.")

    artist_words = []

    for f_name in artist_files : 
        # with open("lyrics/" + artist + "/" + f_name) as infile : 
        with open(f"lyrics/{artist}/{f_name}", "r") as infile:
            artist_words.extend(words(infile.read()))

    print(f"For {artist.capitalize()} we have roughly {len(artist_words)} words, {len(set(artist_words))} are unique.")

For Eminem we have 29 files.
For Eminem we have roughly 18516 words, 3021 are unique.
For Adele we have 30 files.
For Adele we have roughly 9299 words, 986 are unique.
