# Steps in Creating a Data Science Project

1. Defining the problem
**Business problem description:**
**Technical problem description:**

2. Getting the data
Setup environment
    Import libraries
    Define constants and control variables    

3. Exploratory data analysis

4. Preparing the data for Machine Learning Algorithms
Data cleaning
Feature engineering
preprocessing

5. Creating and evaluating multiple machine learning models

6. Tuning and selecting final models

7. Presenting findings and/or solutions

8. Launch, monitoring, and maintenance of system

# 1. Defining the problem

    **Business problem description:**

    **Technical problem description:**

# 2. Getting the data

# Setup environment

    # Import libraries

In [45]:
# library helper
# run: importnb-install from Conda before using
from importnb import Notebook
with Notebook(): 
    import Utility

# custom helper class (from jupyter notebook)
helper = Utility.Helper()

from bs4 import BeautifulSoup
import requests

import re

from string import ascii_lowercase

# progress bars for long running functions
import tqdm

Class 'Helper' v1.3 has been loaded


In [None]:
# reload changes in Jupyter notebooks
from importlib import reload
with Notebook(): __name__ == '__main__' and reload(Utility)

    # Define constants and control variables

In [60]:
DATA_PATH = '../../data/'

LYRICS_URL = 'https://www.lyrics.com/'

    Data for this project will be collected via Web Scraper using BeautifulSoup, with content taken from lyrics.com

    Grab the list of artists, by first letter or number, container looks like:
    
        <div id ="content-body" class="row">
        ...
            <div class="tdata-ext">
                <table class="tdata">
                ...
                    <tbody>
                        <tr><td class="tal qx"><strong><a href="artist/A-Band-Called-David/788850">A Band Called David</a></strong></td><td class="tal qx">7</td><td class="tal qx"><a href="artist-fans/788850">2</a></td></tr>
                    
    therefore we can use the anchor tag <a> to grab both artist URL and name

In [50]:
pattern = re.compile("artist-fans")

artists_dict = {}

# wrap function in progress bar, len(alphabet) + 1
for char in tqdm.tqdm(ascii_lowercase):

    # providing an integer to the URL will force the page to display up to that number of artists
    artists_url = LYRICS_URL + "artists/" + char + "/99999"
    artists_html = requests.get(artists_url).text
    artists_soup = BeautifulSoup(artists_html, 'html5lib')

    artists_a = artists_soup.find('div', id='content-body').find('table', class_=re.compile("tdata")).find('tbody').find_all('a')

    # there are links to number of fans by artists, we'll filter these out
    temp_dict = {artists.text: artists.get("href") for artists in artists_a if not re.match(pattern, artists.get("href"))}

    # join dictionaries
    artists_dict = {**artists_dict, **temp_dict} 

# loop once more for artists starting with a number (all entries are stored at url 0)  
artists_url = LYRICS_URL + "artists/0/99999"
artists_html = requests.get(artist_url).text
artists_soup = BeautifulSoup(artist_html, 'html5lib')

artists_links = artists_soup.find('div', id='content-body').find('table', class_=re.compile("tdata")).find('tbody').find_all('a')

# there are links to number of fans by artists, we'll filter these out
temp_dict = {artists.text: artists.get("href") for artists in artists_a if not re.match(pattern, artists.get("href"))}

# join dictionaries
artists_dict = {**artists_dict, **temp_dict} 

SyntaxError: EOL while scanning string literal (Temp/ipykernel_26372/4247979173.py, line 9)

In [49]:
artists_dict

{'A B': 'artist/A-B/472398',
 'A Bad Think': 'artist/A-Bad-Think/2137849593',
 'A Baffled Republic': 'artist/A-Baffled-Republic/2137849643',
 'A Banca 021': 'artist/A-Banca-021/2137850524',
 'A Band Called "O"': 'artist/A-Band-Called-%22O%22/19641',
 'A Band Called David': 'artist/A-Band-Called-David/788850',
 'A Band of Bees': 'artist/A-Band-of-Bees/526494',
 'A Band of Bitches': 'artist/A-Band-of-Bitches/2715023',
 'A Band Of Boys': 'artist/A-Band-Of-Boys/2137852109',
 'A Banjo Frolic': 'artist/A-Banjo-Frolic/2216388',
 'A Beautiful End': 'artist/A-Beautiful-End/2137855403',
 'A Black and White Movie': 'artist/A-Black-and-White-Movie/2391959',
 'A Blue Ocean Dream': 'artist/A-Blue-Ocean-Dream/706753',
 'A Boogie': 'artist/A-Boogie/3411424',
 'A Boogie wit da Hoodie': 'artist/A-Boogie-wit-da-Hoodie/3229023',
 'A Boy': 'artist/A-Boy/1659681',
 'A Boy and His Kite': 'artist/A-Boy-and-His-Kite/2714702',
 'A Brazuca': 'artist/A-Brazuca/2137846875',
 'A Breach of Silence': 'artist/A-Breach

    For each artist, grab the songs, saving results to a dataframe

In [89]:
lyrics_columns = ['Artist Name', 'Artist URL', 'Song Title', 'Song URL', 'Song Lyrics']

artist_df = pd.DataFrame(columns = lyrics_columns, dtype = str)
artist_df

Unnamed: 0,Artist Name,Artist URL,Song Title,Song URL,Song Lyrics


In [90]:
for key in tqdm.tqdm(artists_dict):

    artist_name = key
    artist_subdir = artists_dict[key]
    artist_url = LYRICS_URL + artist_subdir
    artist_html = requests.get(artist_url).text
    artist_soup = BeautifulSoup(artist_html, 'html5lib')
    
    for i in range(len(artist_soup.find_all('table', class_=re.compile("tdata")))):

        artist_link = artist_soup.find_all('table', class_=re.compile("tdata"))[i].find('tbody').a

        lst = []

        # 'Artist Name', 'Artist URL', 'Song Title', 'Song URL', 'Song Lyrics'
        lst.append(artist_name)
        lst.append(artist_subdir)
        lst.append(artist_link.text)
        lst.append(artist_link.get('href'))
        lst.append('')

        artist_df = artist_df.append(pd.Series(lst, index = lyrics_columns), ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████| 128917/128917 [63:22:45<00:00,  1.77s/it]


In [91]:
artist_df.to_csv(DATA_PATH + "artist_scrape_no_lyrics.csv", index=False)

In [92]:
artist_df

Unnamed: 0,Artist Name,Artist URL,Song Title,Song URL,Song Lyrics
0,A B,artist/A-B/472398,Con el Tic Tac del Reloj,/lyric/3455846/A+B/Con+el+Tic+Tac+del+Reloj,
1,A Bad Think,artist/A-Bad-Think/2137849593,Now You Know,/lyric/36417131/A+Bad+Think/Now+You+Know,
2,A Baffled Republic,artist/A-Baffled-Republic/2137849643,Bad Boys (Move in Silence),/lyric/2262594/A+Baffled+Republic/Bad+Boys+%28...,
3,A Banca 021,artist/A-Banca-021/2137850524,Cor de Mel,/lyric/37798632/A+Banca+021/Cor+de+Mel,
4,"A Band Called ""O""",artist/A-Band-Called-%22O%22/19641,Sleeping,/lyric/637199/A+Band+Called+%22O%22/Sleeping,
...,...,...,...,...,...
1352583,ZZ Ward,artist/ZZ-Ward/2633167,Hold My Heart,/lyric/35830531/ZZ+Ward/Hold+My+Heart,
1352584,ZZ Ward,artist/ZZ-Ward/2633167,Move Like U Stole It [Paul Oakenfold Remix - I...,/lyric/29960072/ZZ+Ward/Move+Like+U+Stole+It+%...,
1352585,ZZ Ward,artist/ZZ-Ward/2633167,Cannonball,/lyric/35448627/ZZ+Ward/Cannonball,
1352586,ZZ Ward,artist/ZZ-Ward/2633167,Cryin Wolf,/lyric/36049908/ZZ+Ward/Cryin+Wolf,
