# 1. Identify the data source, requirements

    Import libraries

In [None]:
import pandas as pd

# library helper
# run: importnb-install from Conda before using
from importnb import Notebook
with Notebook(): 
    import Utility

# custom helper class (from jupyter notebook)
helper = Utility.Helper()

from bs4 import BeautifulSoup
import requests

import re

from string import ascii_lowercase

from datetime import datetime

# progress bars for long running functions
from tqdm import tqdm

import numpy as np

In [None]:
# Pick up edits made to the Utility notebook without restarting the kernel.
from importlib import reload

with Notebook():
    # Only reload when this notebook is being run directly.
    if __name__ == '__main__':
        reload(Utility)

    Define constants and control variables

In [None]:
# Base URL that the relative song URLs stored in the lyrics file are appended to.
LYRICS_URL = 'https://www.lyrics.com'

# Directory and file name of the lyrics CSV that is read and later overwritten.
DATA_PATH = '../../data/'
LYRICS_FILENAME = 'lyrics.csv'

In [None]:
# Load the lyrics table from disk and preview its first rows.
lyrics_path = DATA_PATH + LYRICS_FILENAME
artist_df = pd.read_csv(lyrics_path)

artist_df.head()

In [None]:
# Summarise how much of the lyrics file has been scraped so far.
lyrics_col = artist_df['song_lyrics']
lyrics_missing = lyrics_col.isnull().sum()
sl_not_found = (lyrics_col == 'Not found').sum()

print("Length of lyrics file: ", len(artist_df))
print("Song Lyrics N/A (error):", lyrics_missing)
print("Song Lyrics not found", sl_not_found)

# NOTE(review): 'Not found' rows are not null, so this difference can go
# negative -- confirm that this is the intended "not scraped" figure.
print("Song Lyrics not scraped:", lyrics_missing - sl_not_found)

# A runtime equal to 0 is counted as "not scraped".
runtime_unscraped = (artist_df['song_runtime'] == 0).sum()
print("Song Runtime not scraped:", runtime_unscraped)

year_missing = artist_df['song_year'].isnull().sum()
print("Song Year not scraped:", year_missing)

In [None]:
# Convert the song_lyrics column to str; every other column keeps its dtype.
lyrics_dtype = {'song_lyrics': 'str'}
artist_df = artist_df.astype(lyrics_dtype)

# Display the resulting column dtypes to confirm the cast.
artist_df.dtypes

# 2. Establish connection, collect sample data, identify data format

In [None]:
# Fetch a single song page to inspect how the runtime and lyrics are laid out.
i = 0

song_url = LYRICS_URL + artist_df['song_URL'][i]
# A timeout keeps a stalled connection from blocking the notebook indefinitely.
song_html = requests.get(song_url, timeout=30).text
song_soup = BeautifulSoup(song_html, 'html.parser')

# The runtime is read from the <dd> that follows the first 'dd-margin' <dd>
# inside the 'lyric-details' block.
details_div = song_soup.find('div', class_=re.compile('lyric-details'))
first_dd = details_div.find('dd', class_=re.compile('dd-margin'))
song_runtime = first_dd.find_next('dd').text

print('Song runtime:', song_runtime)

# The lyrics are read from the page's <pre id="lyric-body-text"> element.
song_lyrics = song_soup.find('pre', id='lyric-body-text').text

# Show the lyrics line by line, with carriage returns removed.
re.sub('\r', '', song_lyrics).split('\n')

In [None]:
# Display the table's dimensions as (rows, columns).
artist_df.shape

# 3. Create function for bulk collection, transformation

In [None]:
# Bulk-scrape runtime, year and lyrics for rows curr..stop-1 of artist_df.
# NOTE(review): `curr` and `stop` are not defined in the visible cells; they
# must be set before this cell is run.

# Compiled once here instead of on every iteration.
_DETAILS_CLASS = re.compile('lyric-details')
_CLOCK_CLASS = re.compile('fa-clock')

# Placeholder stored in song_runtime when the page shows no usable runtime.
_NO_RUNTIME = datetime.strptime('0:00', '%M:%S').time()

# Marker stored in song_runtime when the runtime is 30 minutes or longer.
_RUNTIME_TOO_LONG = -1


def _parse_runtime(text):
    """Convert a scraped '[H:]MM:SS' string into a song_runtime value.

    Returns a ``datetime.time`` for runtimes under 30 minutes,
    ``_RUNTIME_TOO_LONG`` for runtimes of 30 minutes or more (including any
    value with a non-zero hours field), and ``None`` when the text is not a
    recognisable runtime.
    """
    try:
        # Too few fields or a non-numeric field both raise ValueError here.
        *hours, minutes, seconds = (int(part) for part in text.split(':'))
        if any(hours) or minutes >= 30:
            return _RUNTIME_TOO_LONG
        return datetime.strptime(f'{minutes}:{seconds}', '%M:%S').time()
    except ValueError:
        return None


for i in tqdm(range(curr, stop)):
    song_url = LYRICS_URL + artist_df.at[i, 'song_URL']
    # A timeout keeps one stalled connection from hanging the whole run.
    song_html = requests.get(song_url, timeout=30).text
    song_soup = BeautifulSoup(song_html, 'html.parser')

    # Lyric details, and the runtime within them, are not always present, so
    # store 0:00 first and overwrite it only when a runtime is found.
    artist_df.at[i, 'song_runtime'] = _NO_RUNTIME

    song_details = song_soup.find('div', class_=_DETAILS_CLASS)

    # The runtime is most consistently next to the clock icon
    # <i class="far fa-clock">.
    song_clock = None
    if song_details is not None:
        song_clock = song_details.find('i', class_=_CLOCK_CLASS)

    if song_clock is not None:
        runtime_tag = song_clock.parent.find_next_sibling()
        runtime = None
        if runtime_tag is not None:
            runtime = _parse_runtime(runtime_tag.text)

        if runtime == _RUNTIME_TOO_LONG:
            # Mark the row and skip the year and lyrics for this entry.
            artist_df.at[i, 'song_runtime'] = _RUNTIME_TOO_LONG
            continue

        if runtime is not None:
            # Save the runtime as a datetime.time value.
            artist_df.at[i, 'song_runtime'] = runtime

        # The song year, when present, sits just before the clock's parent.
        year_tag = song_clock.parent.find_previous_sibling()
        if year_tag is not None:
            try:
                artist_df.at[i, 'song_year'] = int(year_tag.text)
            except ValueError:
                # The preceding element is not a year; leave song_year as is.
                pass

    song_lyrics = song_soup.find('pre', id='lyric-body-text')

    if song_lyrics is None:
        artist_df.at[i, 'song_lyrics'] = 'Not found'
    else:
        # Flatten the lyrics onto one line: each '\r\n' becomes a space.
        artist_df.at[i, 'song_lyrics'] = song_lyrics.text.replace('\r\n', ' ')

# 4. Save data for later analysis

In [None]:
# Write the table back to the lyrics CSV, leaving out the index column.
lyrics_path = DATA_PATH + LYRICS_FILENAME
artist_df.to_csv(lyrics_path, index=False)

In [None]:
# Work around "UnicodeEncodeError: 'utf-8' codec can't encode characters in
# position 2046-2047: surrogates not allowed" by removing every run of
# non-ASCII characters from the lyrics.
# NOTE(review): this binds a second name to the same DataFrame rather than
# making a copy, so it also reflects the cleanup below. Use .copy() here if a
# backup of the unmodified data was intended.
artist_save = artist_df

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that is chained assignment and does not update the
# DataFrame when pandas Copy-on-Write is active.
artist_df['song_lyrics'] = artist_df['song_lyrics'].replace({r'[^\x00-\x7F]+': ''}, regex=True)