In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, Comment
import requests
import re
import random
from unidecode import unidecode
import time
from torpy.http.requests import TorRequests
import urllib3
from contextlib import contextmanager
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import functools
import logging
logging.basicConfig(level=logging.INFO)

In [2]:
def logger(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logging.info(
            f"Running '{func.__name__}'."
        )
        result = func(*args, **kwargs)
        logging.info(f"The '{func.__name__}' function returned.")
        return result
    return wrapper

In [3]:
@logger
def create_letter_url(letter):    

    query = f'https://www.azlyrics.com/{letter}.html'
    
    return query

In [4]:
@logger
def get_artist_urls(url, AGENT_LIST, sess):    

    urls = []
    
    response = request_handler(url, AGENT_LIST, sess)
    
    print(response.text)
    
    if response:
    
        soup = BeautifulSoup(response.text, 'html.parser')

        divs = soup.find_all('div', class_='col-sm-6 text-center artist-col')

        for div in divs:
            for a_tag in div.find_all('a'):
                link = a_tag.get('href')
                if link:
                    absolute_link = urljoin(url, link)
                    urls.append(absolute_link)
                    
        print("artist_urls", urls)
                    
    else:
        print("No Response Found", response)
    
    return urls

In [5]:
@logger
def get_song_urls(url, AGENT_LIST, sess):    
    
    urls = []
    
    response = request_handler(url, AGENT_LIST, sess)
    
    print(response.text)
    
    if response:
    
        soup = BeautifulSoup(response.text, 'html.parser')

        divs = soup.find_all('div', class_='listalbum-item')

        for div in divs:
            for a_tag in div.find_all('a'):
                link = a_tag.get('href')
                if link:
                    absolute_link = urljoin(url, link)
                    urls.append(absolute_link)
                    
                    
        print("song_urls", urls)
        
    else:
        print("No Response Found", response)               
                    
    return urls

In [6]:
@logger
def get_lyrics_from_url(url, AGENT_LIST, sess):
    
    print(url)
    
    response = request_handler(url, AGENT_LIST, sess)
    
    if response:
    
        soup = BeautifulSoup(response.text, 'html.parser')

        def find_lyrics_div(tag):
            for elem in tag(text=lambda text: isinstance(text, Comment)):    
                if "Usage of azlyrics.com content by any third-party lyrics provider is prohibited" in elem:
                    return True
            return False

        lyrics_div = soup.find(find_lyrics_div)

        if lyrics_div:

            for elem in lyrics_div(text=lambda text: isinstance(text, Comment)):
                elem.extract()

            lyrics = '\n'.join(line.strip() for line in lyrics_div.stripped_strings)

            return lyrics

        else:
            return "Lyrics div not found"
        
    else:
        print("No Lyrics Found")
        return None

In [7]:
@logger
def clean_lyrics(raw_text):

    start_index = raw_text.find("\nSearch")
    if start_index == -1:
        return "Start marker not found"


    end_index = raw_text.find("\nSubmit Corrections")
    if end_index == -1:
        return "End marker not found"

    lyrics = raw_text[start_index + len("\nSearch"):end_index].strip()
    
    cleaned_lyrics = re.sub(r'\[.*?(Chorus|Intro|Verse|Bridge|Outro|Hook|Pre-Chorus|Interlude|Refrain).*?\]', '', lyrics, flags=re.IGNORECASE)
    
    lines = cleaned_lyrics.split('\n')[3:]
    cleaned_lyrics = '\n'.join(lines)

    return cleaned_lyrics.strip()

In [8]:
@logger
def request_handler(url, AGENT_LIST, sess): 
    tries = 0
    
    while tries < 3:
        print("Try:", tries + 1)
        HEADERS = {"User-Agent": random.choice(AGENT_LIST)}

        try:

            response = send_request(url, HEADERS, sess)
            return response
        
        except Exception as e:
            print(e)
            pass 
    
        tries += 1
    
    return None

In [9]:
@logger
def send_request(url, HEADERS):
    with TorRequests() as tor_requests:
        with tor_requests.get_session() as sess:
            
            print(sess.get("http://httpbin.org/ip").json())
            html_content = sess.get(url, headers=HEADERS, timeout=5)

            print(HEADERS["User-Agent"])
                        
    return html_content

In [10]:
@logger
def send_request(url, HEADERS, sess):
    global request_counter
    print(sess.get("http://httpbin.org/ip").json())
    html_content = sess.get(url, headers=HEADERS, timeout=5)
    print(HEADERS["User-Agent"])
    
    request_counter += 1

    return html_content

In [11]:
def timer_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {end_time - start_time} seconds to run.")
        return result
    return wrapper

In [12]:
@timer_decorator
def scrape_az_lyrics(alphabet, AGENT_LIST):
    counter = 0
    global request_counter
    lyric_df = pd.DataFrame(columns=["artist_url", "song_url", "lyrics", "letter"])

    with TorRequests() as tor_requests:
        with tor_requests.get_session() as sess:

            for letter in alphabet:
                letter_url = create_letter_url(letter)
                artist_urls = get_artist_urls(letter_url, AGENT_LIST, sess)

                if request_counter >= 3:
                            sess.close()  
                            request_counter = 0
                            print("Session Refreshed")
                
                for artist_url in artist_urls:
                    song_urls = get_song_urls(artist_url, AGENT_LIST, sess)
                    
                    if request_counter >= 3:
                            sess.close()  
                            request_counter = 0
                            print("Session Refreshed")
                    
                    for song_url in song_urls:
                        raw_html = get_lyrics_from_url(song_url, AGENT_LIST, sess)
                        clean_html = clean_lyrics(raw_html)
                        new_row_df = pd.DataFrame([{"artist_url": artist_url,
                                                    "song_url": song_url,
                                                    "lyrics": clean_html,
                                                    "letter": letter}])

                        lyric_df = pd.concat([lyric_df, new_row_df], ignore_index=True)

                        print(request_counter)

                        if request_counter >= :
                            sess.close()  
                            request_counter = 0
                            print("Session Refreshed")

                        counter += 1
                        
                        if counter >= 10:
                            return lyric_df
    return lyric_df

In [13]:
# https://developers.whatismybrowser.com/useragents/explore/operating_system_name/mac-os-x/
AGENT_LIST = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:43.0) Gecko/20100101 Firefox/43.0 SeaMonkey/8650",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:31.0) Gecko/20100101 Firefox/30.0 TenFourFox/7477",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR12; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR14; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:17.0) Gecko/17.0 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:38.0) Gecko/20100101 Firefox/38.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR21; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR30; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR27; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:17.0) Gecko/20130328 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:31.0) Gecko/20100101 Firefox/31.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:17.0) Gecko/20130105 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR2; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:17.0) Gecko/20130328 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:10.0.9) Gecko/20121011 Firefox/10.0.9 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR8; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:10.0.6) Gecko/20120714 Firefox/10.0.6 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR6; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR10; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR7; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR22; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:17.0) Gecko/20130805 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR12; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR8; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR23; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:17.0) Gecko/20131114 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.6; FPR31; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR18; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR23; rv:45.0) Gecko/20100101 SVT/1.0.1 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:38.0) Gecko/20100101 Firefox/38.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR32; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR17; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR4; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR3; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR7; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR21; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR32; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:17.0) Gecko/20130308 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR6; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:17.0) Gecko/20121128 Firefox/17.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR27; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR13; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR13; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR3; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR26; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR29; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450",
"Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR5; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450"
]

In [14]:
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '19']

In [15]:
request_counter = 0
scrape_az_lyrics(alphabet, AGENT_LIST)

INFO:torpy.cache_storage:Loading cached NetworkStatusDocument from TorCacheDirStorage: /Users/cep4u/Library/Application Support/torpy/network_status
INFO:torpy.cache_storage:Loading cached DirKeyCertificateList from TorCacheDirStorage: /Users/cep4u/Library/Application Support/torpy/dir_key_certificates
INFO:torpy.guard:Connecting to guard node 162.19.137.206:443 (alexej; Tor 0.4.7.13)... (TorClient)
INFO:root:Running 'create_letter_url'.
INFO:root:The 'create_letter_url' function returned.
INFO:root:Running 'get_artist_urls'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.
INFO:torpy.circuit:Creating new circuit #80000001 with 162.19.137.206:443 (alexej; Tor 0.4.7.13) router...
INFO:torpy.documents.network_status:Getting descriptor for 162.19.137.206:443 (alexej; Tor 0.4.7.13)...
INFO:torpy.guard:Connecting to guard node 185.243.218.204:10443 (bauruine; Tor 0.4.7.13)... (Internal dir client)


Try: 1


INFO:torpy.circuit:Creating new circuit #80000002 with 185.243.218.204:10443 (bauruine; Tor 0.4.7.13) router...
INFO:torpy.circuit:Building 0 hops circuit...
INFO:torpy.stream:Stream #1: creating attached to #80000002 circuit...
INFO:torpy.stream:Stream #1: connecting to hsdir
INFO:torpy.stream:Stream #1: connected (remote ip '')
INFO:torpy.stream:Stream #1: remote disconnected (reason = DONE)
INFO:torpy.stream:Stream #1: closing (state = Disconnected)...
INFO:torpy.consesus:Got descriptor
INFO:torpy.circuit:Building 3 hops circuit...
INFO:torpy.circuit:Extending the circuit #80000001 with 188.214.30.66:443 (pleaseDonateWhonix; Tor 0.4.7.13)...
INFO:torpy.documents.network_status:Getting descriptor for 188.214.30.66:443 (pleaseDonateWhonix; Tor 0.4.7.13)...
INFO:torpy.stream:Stream #2: creating attached to #80000002 circuit...
INFO:torpy.stream:Stream #2: connecting to hsdir
INFO:torpy.stream:Stream #2: connected (remote ip '')
INFO:torpy.stream:Stream #2: remote disconnected (reason =

{'origin': '192.42.116.216'}


INFO:torpy.circuit:Building 3 hops circuit...
INFO:torpy.circuit:Extending the circuit #80000004 with 192.42.116.198:9000 (NTH8R1; Tor 0.4.7.13)...
INFO:torpy.documents.network_status:Getting descriptor for 192.42.116.198:9000 (NTH8R1; Tor 0.4.7.13)...
INFO:torpy.stream:Stream #7: creating attached to #80000002 circuit...
INFO:torpy.stream:Stream #7: connecting to hsdir
INFO:torpy.stream:Stream #7: connected (remote ip '')
INFO:torpy.stream:Stream #7: remote disconnected (reason = DONE)
INFO:torpy.stream:Stream #7: closing (state = Disconnected)...
INFO:torpy.consesus:Got descriptor
INFO:torpy.circuit:Extending the circuit #80000004 with 152.32.131.3:58045 (utopianswim; Tor 0.4.7.13)...
INFO:torpy.documents.network_status:Getting descriptor for 152.32.131.3:58045 (utopianswim; Tor 0.4.7.13)...
INFO:torpy.stream:Stream #8: creating attached to #80000002 circuit...
INFO:torpy.stream:Stream #8: connecting to hsdir
INFO:torpy.stream:Stream #8: connected (remote ip '')
INFO:torpy.stream:Str

Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR13; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
    <meta name="robots" content="noarchive">
    <title>Artists A at AZLyrics</title>

    <link rel="canonical" href="https://www.azlyrics.com/a.html" />
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
    <link rel="stylesheet" href="/local/az.css">

    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
    <!--[if lt IE 9]>
      <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
      <script src="https://oss.maxcdn.com/respond/1.4

{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_song_urls' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:17.0) Gecko/20130328 Firefox/17.0 TenFourFox/7450
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<meta name="description" content="a1 lyrics - 86 song lyrics sorted by album, including &quot;Forever In Love&quot;, &quot;Everytime&quot;, &quot;Heaven By Your Side&quot;."> 
<meta name="keywords" content="a1, a1 lyrics, discography, albums, songs">
<meta name="robots" content="noarchive">
<title>a1 Lyrics</title>

<link rel="canonical" href="https://www.azlyrics.com/a/a1.html" />
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
<link rel="stylesheet" href="/local/az.css">

<!-- HTML5 shim and Respond.js for IE8 support of HTML5 

{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:torpy.stream:Stream #9: closing (state = Connected)...
INFO:torpy.stream:Stream #6: closing (state = Connected)...
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.
INFO:torpy.stream:Stream #10: creating attached to #80000003 circuit...
INFO:torpy.stream:Stream #10: connecting to ('httpbin.org', 80)


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR3; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
3
Session Refreshed
https://www.azlyrics.com/lyrics/a1/bethefirsttobelieve.html
Try: 1


INFO:torpy.stream:Stream #10: connected (remote ip '54.210.149.139')
INFO:torpy.stream:Stream #11: creating attached to #80000004 circuit...
INFO:torpy.stream:Stream #11: connecting to ('www.azlyrics.com', 443)


{'origin': '192.42.116.216'}


INFO:torpy.stream:Stream #11: connected (remote ip '5.79.76.225')
INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR3; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
1
https://www.azlyrics.com/lyrics/a1/summertimeofourlives.html
Try: 1
{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR7; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
2
https://www.azlyrics.com/lyrics/a1/readyornot.html
Try: 1
{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:torpy.stream:Stream #11: closing (state = Connected)...
INFO:torpy.stream:Stream #10: closing (state = Connected)...
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.
INFO:torpy.stream:Stream #12: creating attached to #80000003 circuit...
INFO:torpy.stream:Stream #12: connecting to ('httpbin.org', 80)


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR27; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
3
Session Refreshed
https://www.azlyrics.com/lyrics/a1/everytime.html
Try: 1


INFO:torpy.stream:Stream #12: connected (remote ip '54.210.149.139')
INFO:torpy.stream:Stream #13: creating attached to #80000004 circuit...
INFO:torpy.stream:Stream #13: connecting to ('www.azlyrics.com', 443)


{'origin': '192.42.116.216'}


INFO:torpy.stream:Stream #13: connected (remote ip '5.79.76.225')
INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR17; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
1
https://www.azlyrics.com/lyrics/a1/ifonly.html
Try: 1
{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:17.0) Gecko/20130105 Firefox/17.0 TenFourFox/7450
2
https://www.azlyrics.com/lyrics/a1/heyyou.html
Try: 1
{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:torpy.stream:Stream #13: closing (state = Connected)...
INFO:torpy.stream:Stream #12: closing (state = Connected)...
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.
INFO:torpy.stream:Stream #14: creating attached to #80000003 circuit...
INFO:torpy.stream:Stream #14: connecting to ('httpbin.org', 80)


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; rv:17.0) Gecko/20131114 Firefox/17.0 TenFourFox/7450
3
Session Refreshed
https://www.azlyrics.com/lyrics/a1/likearose.html
Try: 1


INFO:torpy.stream:Stream #14: connected (remote ip '54.210.149.139')
INFO:torpy.stream:Stream #15: creating attached to #80000004 circuit...
INFO:torpy.stream:Stream #15: connecting to ('www.azlyrics.com', 443)


{'origin': '192.42.116.216'}


INFO:torpy.stream:Stream #15: connected (remote ip '5.79.76.225')
INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4; FPR6; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
1
https://www.azlyrics.com/lyrics/a1/walkingintherain.html
Try: 1
{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:root:Running 'get_lyrics_from_url'.
INFO:root:Running 'request_handler'.
INFO:root:Running 'send_request'.


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; FPR3; rv:45.0) Gecko/20100101 Firefox/45.0 TenFourFox/7450
2
https://www.azlyrics.com/lyrics/a1/stillaround.html
Try: 1
{'origin': '192.42.116.216'}


INFO:root:The 'send_request' function returned.
INFO:root:The 'request_handler' function returned.
INFO:root:The 'get_lyrics_from_url' function returned.
INFO:root:Running 'clean_lyrics'.
INFO:root:The 'clean_lyrics' function returned.
INFO:torpy.stream:Stream #15: closing (state = Connected)...
INFO:torpy.stream:Stream #14: closing (state = Connected)...
INFO:torpy.guard:Closing guard connections (TorClient)...
INFO:torpy.guard:Destroy circuit #80000003
INFO:torpy.guard:Destroy circuit #80000004
INFO:torpy.guard:Closing guard connections (Internal dir client)...
INFO:torpy.guard:Destroy circuit #80000002


Mozilla/5.0 (Macintosh; PPC Mac OS X 10.5; rv:10.0.6) Gecko/20120714 Firefox/10.0.6 TenFourFox/7450
3
Session Refreshed
scrape_az_lyrics took 65.25445199012756 seconds to run.


Unnamed: 0,artist_url,song_url,lyrics,letter
0,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/foreverinlo...,Love leads to laughter\nLove leads to pain\nWi...,a
1,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/bethefirstt...,Just one on one\nThat's the way we do it baby\...,a
2,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/summertimeo...,Summertime of our lives\nSummertime of our liv...,a
3,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/readyornot....,Could you love a boy like me tonight?\nCome on...,a
4,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/everytime.html,"Lately, I'm not who I used to be\nSomeone's co...",a
5,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/ifonly.html,I said I'd never leave you\nI said that I woul...,a
6,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/heyyou.html,"Hey you, looking around at your friends\nYou s...",a
7,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/likearose.html,And as I look into your eyes\nI see an angel i...,a
8,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/walkinginth...,Sitting by the window\nSinging songs of love\n...,a
9,https://www.azlyrics.com/a/a1.html,https://www.azlyrics.com/lyrics/a1/stillaround...,In this life of uncertainty\nThere are people ...,a


In [16]:
lyric_df

NameError: name 'lyric_df' is not defined

In [None]:
url = 'https://www.azlyrics.com/y.html'

In [None]:
get_song_urls('https://www.azlyrics.com/19/100gecs.html')