In [9]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import yaml
import time
import re
import urllib.parse
from lyricsgenius import Genius
import os
import unittest


## get artists from playlists

In [601]:

def get_spotify_keys():
    with open('keys/spotify_keys.yml', 'r') as file:
        spotify_keys = yaml.safe_load(file)

    return spotify_keys['id'], spotify_keys['secret']

def get_spotify_access(CLIENT_ID, CLIENT_SECRET):
    AUTH_URL = 'https://accounts.spotify.com/api/token'

    # POST
    auth_response = requests.post(AUTH_URL, {
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    })

    # convert the response to JSON
    auth_response_data = auth_response.json()
    print('Spotify ACCESS_TOKEN obtained. Token expires in', auth_response_data['expires_in'], 'seconds')

    # save the access token
    access_token = auth_response_data['access_token']

    return access_token

def call_spotify_api(url, headers):
    response = requests.get(url, headers=headers)

    if (response.status_code == 200):
        time.sleep(1)
        return response.json()
    elif (response.status_code == 401):
        print('[ERROR] Response status code 401 - waiting 30 sec')
        time.sleep(30)
        return None


def get_spotify_playlist(headers, content_uri):

    response_size = -1
    index = 0

    data = {'artist':[], 'id':[]}

    while (response_size!=0):

        url = 'https://api.spotify.com/v1/playlists/' + content_uri + '/tracks?limit=50&offset='+str(index)
        json_content = call_spotify_api(url, headers)

        if (json_content != None):
            items = json_content['items']
            response_size = len(items)

            for item in items:
                info = item['track']['artists'][0]

                data['artist'] += [info['name']]
                data['id'] += [info['id']]

            index += 50
            time.sleep(1)

    return data


def get_spotify_content(get_content_function, access_token, content_uri):

    headers = {
        'Authorization': 'Bearer {token}'.format(token=access_token)
    }
    content = get_content_function(headers, content_uri)

    return content


In [602]:

CLIENT_ID, CLIENT_SECRET = get_spotify_keys()
access_token = get_spotify_access(CLIENT_ID, CLIENT_SECRET)
# access_token = 'BQBzRnezTrbBvw-fgcIAjXQffqcMgTUBpcvbtVAV-z_Fp8FHeDiZFcrJ9PDCm9l2qB-hBU_t4CcGN7dlzOCP12DrrDnKq68K3ZwS34ECxEdQE9rve88'
# access_token = 'BQCKebW_cdw_YOjpH41ELgbraH3xo9zO3i-FKCvGn5Uq3NJV7D7T0KJvEJM5y91mRYAb1uhN34P2-w5gYbVtv5doY7z35khQq-FyeLQAZpG90ys380U'


# REMEMBER: these playlists are live and some change every couple of days or hours
spotify_playlists = {
    'Industrial Metal': '37i9dQZF1DX29LQDcJ6Xy7',
    'INDUSTRIAL METAL': '0I4fAd7K0zIHBJYNw9fSuv',
    'Heavy Metal': '37i9dQZF1DX9qNs32fujYe',
    'Metal Mix': '37i9dQZF1EQpgT26jgbgRI',
    'Aggressive Heavy Metal Mix': '37i9dQZF1EIetewBshGEPK',
    'Death Metal Mix': '37i9dQZF1EIf78r65WuXwA',
    'Death Metal Melodico': '5LXjHUJXPJIW71ySYppK5J',
    'Death Metal': '2vivknVOeJD7BUYnnuztrE',
    'Hard Rock': '37i9dQZF1DX1X7WV84927n',
    'Hard Rock / Metal': '1GXRoQWlxTNQiMNkOe7RqA',
    'Hard Rock Mix': '37i9dQZF1EIehdyB47Vd7I',
    'Metalcore Mix': '37i9dQZF1EIgtj4OvJCT7Q',
    '2023 Metalcore Playlist': '7IUlbEWRYOKeTZKmjBcRgX',
    'Modern Metalcore': '4ge2kKhU0ryYD1BWN1CX2T',
    'Deathcore': '37i9dQZF1DX1cJWWyylDuw',
    'Melodic Metal Mix': '37i9dQZF1EId4LkhIN52c3',
    'Epic and Melodic': '37i9dQZF1DX37bXS7EGI3f',
    'Melodic Death Metal Mix': '37i9dQZF1EIfs512qHK0fg',
    'Grindcore Mix': '37i9dQZF1EIgFHBMi7n4aZ',
    'Power Metal Mix': '37i9dQZF1EIfUrKSfi4vkq',
    'power metal': '6uD6LqbKgMn036cfvniRO6',
    'BLACK METAL': '37i9dQZF1EIdrDO1pClEMb',
    'black metal classics': '688iTCqxHbpNbBuWplfa17',
    'Nu Metal Era': '37i9dQZF1DXcfZ6moR6J0G',
    'Nu Metal Mix': '37i9dQZF1EIdT6waU1nlDF',
    'Nu Metal Hits': '37i9dQZF1EIdT6waU1nlDF',
    'Ultimate Goth Metal': '1DR4lUIiCmTYWrxmMNSoyd',
    'Gothic Metal': '76PSrknbBdEiQxvoinpYAm',
    'Groove Metal Mix': '37i9dQZF1EIcCL8b99YRCA',
    'Ultimate Groove Metal': '24y2slE56YDOTR2t4Zr1lR',
    'Rock Mix': '37i9dQZF1EQpj7X7UK8OOF',
    'Best of Rock 2000': '37i9dQZF1DX6rsDrBNGuWW',
    'Pop Hits 2000s - 2023': '6mtYuOxzl58vSGnEDtZ9uB',
    'Pop Hits 2023': '5TDtuKDbOhrfW7C58XnriZ',
    'Pop Mix': '37i9dQZF1EQncLwOalG3K7'
}


Spotify ACCESS_TOKEN obtained. Token expires in 3600 seconds


In [603]:

results_dict = {}

for playlist in spotify_playlists:
    print('processing:', playlist)
    content_uri = spotify_playlists[playlist]
    content = get_spotify_content(get_spotify_playlist, access_token, content_uri)

    results_dict[playlist] = content
print('DONE...')


processing: Industrial Metal
processing: INDUSTRIAL METAL
processing: Heavy Metal
processing: Metal Mix
processing: Aggressive Heavy Metal Mix
processing: Death Metal Mix
processing: Death Metal Melodico
processing: Death Metal
processing: Hard Rock
processing: Hard Rock / Metal
processing: Hard Rock Mix
processing: Metalcore Mix
processing: 2023 Metalcore Playlist
processing: Modern Metalcore
processing: Deathcore
processing: Melodic Metal Mix
processing: Epic and Melodic
processing: Melodic Death Metal Mix
processing: Grindcore Mix
processing: Power Metal Mix
processing: power metal
processing: BLACK METAL
processing: black metal classics
processing: Nu Metal Era
processing: Nu Metal Mix
processing: Nu Metal Hits
processing: Ultimate Goth Metal
processing: Gothic Metal
processing: Groove Metal Mix
processing: Ultimate Groove Metal
processing: Rock Mix
processing: Best of Rock 2000
processing: Pop Hits 2000s - 2023
processing: Pop Hits 2023
processing: Pop Mix
DONE...


In [604]:

artists_df = pd.DataFrame.from_dict({})

for results in results_dict:
    tmp_df = pd.DataFrame.from_dict(results_dict[results])
    artists_df = pd.concat([artists_df, tmp_df])

print(len(artists_df))
artists_df = artists_df.drop_duplicates()
artists_df = artists_df.dropna()
print(len(artists_df))

# artists_df.to_csv('artists.csv', sep=';', index=False)
artists_df.head(10)


4244
1139


Unnamed: 0,artist,id
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP
4,Dope,7fWgqc4HJi3pcHhK8hKg2p
5,Filter,01WjpKiWVNurV5hjIadB8C
6,Static-X,7JDSHlDdVTo7aZKdQZ53Vf
7,Orgy,4uYwLU7k03RCQSRXGtQGg0
9,Skillet,49bzE5vRBRIota4qeHtQM8
10,Nine Inch Nails,0X380XXQSNBYuleKzav5UO


## get kaggle data

In [605]:

kaggle_spotify_file = "input_data/dataset.csv"
kaggle_youtube_file = "input_data/Spotify_Youtube.csv"

kaggle_spot_df = pd.read_csv(kaggle_spotify_file)
kaggle_spot_df = kaggle_spot_df[['artists', 'track_genre']]

kaggle_spot_df = kaggle_spot_df.groupby(['artists']).apply(lambda x: x['track_genre'].unique())
kaggle_spot_df = kaggle_spot_df.reset_index(name='track_genre')

kaggle_spot_df = kaggle_spot_df.set_index(['track_genre']).apply(lambda x: x.str.split(',').explode())
kaggle_spot_df = kaggle_spot_df.reset_index()

kaggle_yout_df = pd.read_csv(kaggle_youtube_file)
kaggle_yout_df = kaggle_yout_df

kaggle_tracks_df = kaggle_yout_df.merge(kaggle_spot_df, left_on='Artist', right_on='artists', how='inner')
kaggle_tracks_df = kaggle_tracks_df[['Artist', 'Url_spotify', 'Track', 'track_genre', 'Uri', 'Stream', 'Views', 'Likes', 'Comments', 'Description']]
kaggle_tracks_df.head()


Unnamed: 0,Artist,Url_spotify,Track,track_genre,Uri,Stream,Views,Likes,Comments,Description
0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,"[alternative, hip-hop]",spotify:track:0d28khcov6AiegSCpG5TuT,1040235000.0,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...
1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,"[alternative, hip-hop]",spotify:track:1foMv2HQwfQ2vntFf9HFeG,310083700.0,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...
2,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,New Gold (feat. Tame Impala and Bootie Brown),"[alternative, hip-hop]",spotify:track:64dLd6rVqDLtkXFYrEUHIU,63063470.0,8435055.0,282142.0,7399.0,Gorillaz - New Gold ft. Tame Impala & Bootie B...
3,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,On Melancholy Hill,"[alternative, hip-hop]",spotify:track:0q6LuUqGLUiCPP1cbdwFs3,434663600.0,211754952.0,1788577.0,55229.0,Follow Gorillaz online:\nhttp://gorillaz.com \...
4,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Clint Eastwood,"[alternative, hip-hop]",spotify:track:7yMiX7n9SBvadzox8T5jzT,617259700.0,618480958.0,6197318.0,155930.0,The official music video for Gorillaz - Clint ...


In [606]:

def contains_similar_genre(values, genres):
    for value in values:
        for genre in genres:
            if (value in genre):
                return True
    return False

genre_filter = kaggle_tracks_df['track_genre'].apply(lambda x: contains_similar_genre(x, ['metal', 'rock', 'pop']))
kaggle_tracks_df = kaggle_tracks_df[genre_filter]

kaggle_tracks_df.head()


Unnamed: 0,Artist,Url_spotify,Track,track_genre,Uri,Stream,Views,Likes,Comments,Description
10,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Californication,"[alt-rock, alternative, funk, metal, rock]",spotify:track:48UPSzbZjgc449aqz8bxox,1055738000.0,1018811000.0,4394471.0,121452.0,Watch the official music video for Californica...
11,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Under the Bridge,"[alt-rock, alternative, funk, metal, rock]",spotify:track:3d9DChrdc6BOeFsbrZ3Is0,1061751000.0,246687700.0,1213572.0,32761.0,Watch the official music video for Under The B...
12,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Can't Stop,"[alt-rock, alternative, funk, metal, rock]",spotify:track:3ZOEytgrvLwQaqXreDs2Jx,866465000.0,336635800.0,1740224.0,32573.0,Watch the official music video for Can't Stop ...
13,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Scar Tissue,"[alt-rock, alternative, funk, metal, rock]",spotify:track:1G391cbiT3v3Cywg8T7DM1,613838700.0,435121500.0,1890900.0,37069.0,Watch the official music video for Scar Tissue...
14,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Otherside,"[alt-rock, alternative, funk, metal, rock]",spotify:track:64BbK9SFKH2jk86U3dGj2P,732774500.0,673528700.0,3140356.0,60091.0,Watch the official music video for Otherside b...


In [607]:

kaggle_tracks_df['Url_spotify'] = kaggle_tracks_df['Url_spotify'].str.split('/').str[-1]
kaggle_tracks_df['Uri'] = kaggle_tracks_df['Uri'].str.split(':').str[-1]

cols = list(kaggle_tracks_df)
for col in cols:
    kaggle_tracks_df.rename(columns={col:col.lower()}, inplace=True)

kaggle_tracks_df.rename(columns={'url_spotify':'id'}, inplace=True)
kaggle_tracks_df.rename(columns={'uri':'track_id'}, inplace=True)


In [608]:

tmp_df = artists_df.copy()
tmp_df = pd.concat([tmp_df, kaggle_tracks_df[['artist', 'id']]])

print('spotify dataframe:\t\t\t\t', len(artists_df), 'artists')
print('kaggle dataframe:\t\t\t\t', len(kaggle_tracks_df), 'artists')
print('concatenated dataframe:\t\t\t\t', len(tmp_df), 'artists')

artists_df = tmp_df.drop_duplicates()
print('concatenated dataframe (drop duplicates):\t', len(artists_df), 'artists')


spotify dataframe:				 1139 artists
kaggle dataframe:				 2859 artists
concatenated dataframe:				 3998 artists
concatenated dataframe (drop duplicates):	 1299 artists


## get artist tracks

In [614]:

def get_text_between(text, start_str, end_str):

    i_start_list = [m.start() for m in re.finditer(start_str, text)]
    i_end_list = [m.start() for m in re.finditer(end_str, text)]

    results = []

    try:
        for i in range(len(i_start_list)):
            if (i >= len(i_end_list)):
                break
            i_start = i_start_list[i] + len(start_str)
            i_end = i_end_list[i]
            results += [text[i_start:i_end]]
        return results

    except Exception as e:
        print('Something went wrong:', e)
        return []


def get_artist_contents(soup):

    # Get monthly listeners
    monthly_listeners_div = soup.find('div', {'data-encore-id': 'type'})
    if (monthly_listeners_div != None):
        monthly_listeners = monthly_listeners_div.text
    else:
        monthly_listeners = None

    # Get top tracks
    start_str = '<span class="ListRowTitle__LineClamp-sc-1xe2if1-0 jjpOuK">'
    end_str = '</span></p></span>'
    top_tracks = get_text_between(str(soup), start_str, end_str)

    # Get top track listens
    start_str = 'data-encore-id="type" id="listrow-subtitle-track-spotify:track:'
    end_str = '</p></div><div class="Areas__InteractiveArea-sc-8gfrea-0 Areas__TrailingSlot-sc-8gfrea-7 bJSfgC jpzxju">'
    top_listens = get_text_between(str(soup), start_str, end_str)

    # Get top track links
    start_str = '"RowMouseLink-sc-hvecl1-0 fKlqHK" data-testid="mouse-only-link"></div><div id="onClickHinttrack-spotify:track:'
    end_str = '" style="display:none"></div><div class="Areas__HeaderSideArea-sc-8gfrea-1'
    top_ids = get_text_between(str(soup), start_str, end_str)
    top_ids = [tl[0:-2] for tl in top_ids]

    # Get description
    start_str = '<span class="Type__TypeElement-sc-goli3j-0 bGROfl G_f5DJd2sgHWeto5cwbi" data-encore-id="type">'
    end_str = '</span></p></div></div></div>'
    desc = get_text_between(str(soup), start_str, end_str)

    end_str = '</span><button aria-expanded="false" class="UhoFLV9F6uYQvi0m6ptf">'
    desc_exand = get_text_between(str(soup), start_str, end_str)

    if (len(desc_exand) > len(desc)):
        desc = desc_exand

    # Create output dict
    content_dict = {
        'monthly_listeners': monthly_listeners,
        'top_tracks': top_tracks,
        'top_listens': top_listens,
        'top_ids': top_ids,
        'desc': desc
    }

    if (
        (
            len(content_dict['top_tracks'])==
            len(content_dict['top_listens'])==
            len(content_dict['top_ids'])
        ) and (
            monthly_listeners != None
        )
    ):
        return content_dict
    else:
        return None


def process_page(url, function):

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    headers = ()
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    return function(soup)
    


In [615]:

artist_scrape_dict = {}

print('Fetching artists\' page information. This could take a few minutes...')
for i in range(len(artists_df)):

    if (i%100==0):
        print(i, 'of', len(artists_df), 'artists\' pages processed...' )

    artist = artists_df.iloc[i]
    artist_name = artist['artist']
    artist_id = artist['id']

    artist_url = 'https://open.spotify.com/artist/'+artist_id
    content_dict = process_page(artist_url, get_artist_contents)

    if (content_dict != None):
        artist_scrape_dict[artist_name] = content_dict

    time.sleep(1)
print('DONE...')


Fetching artists' page information. This could take a few minutes...
0 of 1299 artists' pages processed...
100 of 1299 artists' pages processed...
200 of 1299 artists' pages processed...
300 of 1299 artists' pages processed...
400 of 1299 artists' pages processed...
500 of 1299 artists' pages processed...
600 of 1299 artists' pages processed...
700 of 1299 artists' pages processed...
800 of 1299 artists' pages processed...
900 of 1299 artists' pages processed...
1000 of 1299 artists' pages processed...
1100 of 1299 artists' pages processed...
1200 of 1299 artists' pages processed...
DONE...


In [616]:

track_info_dict = {
    'artist':[],
    'track':[],
    'track_id':[],
    'track_listens':[]
}

artist_info_dict = {
    'artist':[],
    'monthly_listeners':[],
    'description':[]
}

for artist in artist_scrape_dict:
    data = artist_scrape_dict[artist]

    # Append to artist dictionary
    artist_info_dict['artist'] += [artist]
    artist_info_dict['monthly_listeners'] += [data['monthly_listeners']]

    desc = data['desc']
    if (len(desc) == 0):
        desc = None
    else:
        desc = desc[0]
    artist_info_dict['description'] += [desc]

    # Append to track dictionary
    for track in data['top_tracks']:
        track_info_dict['artist'] += [artist]
        track_info_dict['track'] += [track]

    for track_id in data['top_ids']:
        track_info_dict['track_id'] += [track_id]

    for track_listens in data['top_listens']:
        track_info_dict['track_listens'] += [track_listens]


In [620]:

tracks_df = pd.DataFrame.from_dict(track_info_dict)
# tracks_df.to_csv('tracks_df.csv', sep=';', index=False)
tracks_df.head()


Unnamed: 0,artist,track,track_id,track_listens
0,Rob Zombie,Dragula,6Nm8h73ycDG2saCnZV8poF,"6Nm8h73ycDG2saCnZV8poF-0"">331,646,321"
1,Rob Zombie,Living Dead Girl,10V8XpuyMoEcSMfM79WDET,"10V8XpuyMoEcSMfM79WDET-1"">130,584,774"
2,Rob Zombie,Superbeast,4eNnMvUrSu2TRpySjVC0Pv,"4eNnMvUrSu2TRpySjVC0Pv-2"">91,715,788"
3,Rob Zombie,Feel So Numb,6KnNDix4Owr4vNmsPhLmxD,"6KnNDix4Owr4vNmsPhLmxD-3"">54,587,702"
4,Rob Zombie,The Satanic Rites of Blacula,1JeGX4cixemJfBJl6XGQES,"1JeGX4cixemJfBJl6XGQES-4"">10,284,300"


In [623]:

artists_info_df = pd.DataFrame.from_dict(artist_info_dict)
# artists_info_df.to_csv('artist_info.csv', sep=';', index=False)
artists_info_df.head()


Unnamed: 0,artist,monthly_listeners,description
0,Rob Zombie,"5,758,128 monthly listeners",The longtime frontman for '90s industrial supe...
1,Marilyn Manson,"6,735,459 monthly listeners",Controversial rock frontman Marilyn Manson bec...
2,Fear Factory,"974,109 monthly listeners",One can’t overstate the size of the Fear Facto...
3,Rammstein,"10,661,005 monthly listeners",Rammstein are one of rock’s most individual an...
4,Dope,"1,734,951 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!


In [624]:

artists_df = artists_df.merge(artists_info_df, on='artist')
artists_df.head()


Unnamed: 0,artist,id,monthly_listeners,description
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5,"5,758,128 monthly listeners",The longtime frontman for '90s industrial supe...
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj,"6,735,459 monthly listeners",Controversial rock frontman Marilyn Manson bec...
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX,"974,109 monthly listeners",One can’t overstate the size of the Fear Facto...
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP,"10,661,005 monthly listeners",Rammstein are one of rock’s most individual an...
4,Dope,7fWgqc4HJi3pcHhK8hKg2p,"1,734,951 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!


## Get track features

In [628]:

def get_spotify_tracks(headers, content_uri):

    batch_size = 50
    batch_index = 0

    info_dict = {
        'track_id':[],
        'popularity':[],
        'artist_id':[]
    }
    feat_dict = {
        'track_id':[],
        'danceability':[],
        'energy':[],
        'key':[],
        'loudness':[],
        'mode':[],
        'speechiness':[],
        'acousticness':[],
        'instrumentalness':[],
        'liveness':[],
        'valence':[],
        'tempo':[],
        'time_signature':[]
    }

    print('Processing tracks. This could take a few minutes...')
    while True:

        if (batch_index%1000 == 0):
            print('\ttracks processed:', batch_index, 'of', len(content_uri))

        tracks_ids = content_uri[batch_index:batch_index+batch_size]

        info_url = 'https://api.spotify.com/v1/tracks?ids=' + '%2C'.join(tracks_ids)
        json_track_info = call_spotify_api(info_url, headers)

        features_url = 'https://api.spotify.com/v1/audio-features?ids='+ '%2C'.join(tracks_ids)
        json_track_feat = call_spotify_api(features_url, headers)

        if (json_track_info != None and json_track_feat != None):
            for track in json_track_info['tracks']:
                info_dict['track_id'] += [track['id']]
                info_dict['popularity'] += [track['popularity']]
                info_dict['artist_id'] += [track['artists'][0]['id']]

            for track in json_track_feat['audio_features']:
                feat_dict['track_id'] += [track['id']]
                feat_dict['danceability'] += [track['danceability']]
                feat_dict['energy'] += [track['energy']]
                feat_dict['key'] += [track['key']]
                feat_dict['loudness'] += [track['loudness']]
                feat_dict['mode'] += [track['mode']]
                feat_dict['speechiness'] += [track['speechiness']]
                feat_dict['acousticness'] += [track['acousticness']]
                feat_dict['instrumentalness'] += [track['instrumentalness']]
                feat_dict['liveness'] += [track['liveness']]
                feat_dict['valence'] += [track['valence']]
                feat_dict['tempo'] += [track['tempo']]
                feat_dict['time_signature'] += [track['time_signature']]

            batch_index += batch_size
        else:
            break

        time.sleep(1)

    print('DONE...')
    return info_dict, feat_dict


In [629]:

tracks_ids = pd.concat([tracks_df['track_id'], kaggle_tracks_df['track_id']])
tracks_ids = tracks_ids.drop_duplicates()
tracks_ids_list = tracks_ids.tolist()

access_token = get_spotify_access(CLIENT_ID, CLIENT_SECRET)
info_dict, feat_dict = get_spotify_content(get_spotify_tracks, access_token, tracks_ids_list)


Spotify ACCESS_TOKEN obtained. Token expires in 3600 seconds
Processing tracks. This could take a few minutes...
	tracks processed: 0 of 8079
	tracks processed: 1000 of 8079
	tracks processed: 2000 of 8079
	tracks processed: 3000 of 8079
	tracks processed: 4000 of 8079
	tracks processed: 5000 of 8079
	tracks processed: 6000 of 8079
	tracks processed: 7000 of 8079
	tracks processed: 8000 of 8079
DONE...


In [630]:

track_info_df = pd.DataFrame.from_dict(info_dict)
track_feat_df = pd.DataFrame.from_dict(feat_dict)
track_details_df = track_info_df.merge(track_feat_df, on='track_id')
track_details_df.head()


Unnamed: 0,track_id,popularity,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Nm8h73ycDG2saCnZV8poF,80,3HVdAiMNjYrQIKlOGxoGh5,0.591,0.963,0,-6.489,0,0.0531,8.7e-05,0.000107,0.0781,0.609,125.03,4
1,10V8XpuyMoEcSMfM79WDET,72,3HVdAiMNjYrQIKlOGxoGh5,0.518,0.948,6,-5.58,0,0.0683,0.00504,0.104,0.228,0.55,103.025,4
2,4eNnMvUrSu2TRpySjVC0Pv,68,3HVdAiMNjYrQIKlOGxoGh5,0.518,0.975,8,-4.425,1,0.0575,0.000136,0.827,0.42,0.32,154.059,4
3,6KnNDix4Owr4vNmsPhLmxD,66,3HVdAiMNjYrQIKlOGxoGh5,0.566,0.889,8,-5.22,1,0.0426,4.3e-05,0.00907,0.292,0.141,127.045,4
4,1JeGX4cixemJfBJl6XGQES,64,3HVdAiMNjYrQIKlOGxoGh5,0.48,0.993,0,-6.402,0,0.0862,0.000339,5e-05,0.3,0.589,154.991,4


In [641]:

spotify_detail_df = tracks_df.merge(track_details_df, on='track_id', how='inner')
kaggle_detail_df = kaggle_tracks_df.merge(track_details_df, on='track_id', how='inner')
kaggle_detail_df.drop(columns=['track_genre', 'id', 'description'], inplace=True)
kaggle_detail_df.rename(columns={'stream': 'track_listens'}, inplace=True)


In [642]:

print('Spotify scraped tracks dataframe:', len(spotify_detail_df))
spotify_detail_df.head(3)


Spotify scraped tracks dataframe: 6374


Unnamed: 0,artist,track,track_id,track_listens,popularity,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Rob Zombie,Dragula,6Nm8h73ycDG2saCnZV8poF,"6Nm8h73ycDG2saCnZV8poF-0"">331,646,321",80,3HVdAiMNjYrQIKlOGxoGh5,0.591,0.963,0,-6.489,0,0.0531,8.7e-05,0.000107,0.0781,0.609,125.03,4
1,Rob Zombie,Living Dead Girl,10V8XpuyMoEcSMfM79WDET,"10V8XpuyMoEcSMfM79WDET-1"">130,584,774",72,3HVdAiMNjYrQIKlOGxoGh5,0.518,0.948,6,-5.58,0,0.0683,0.00504,0.104,0.228,0.55,103.025,4
2,Rob Zombie,Superbeast,4eNnMvUrSu2TRpySjVC0Pv,"4eNnMvUrSu2TRpySjVC0Pv-2"">91,715,788",68,3HVdAiMNjYrQIKlOGxoGh5,0.518,0.975,8,-4.425,1,0.0575,0.000136,0.827,0.42,0.32,154.059,4


In [643]:

print('Kaggle tracks dataframe:', len(kaggle_detail_df))
kaggle_detail_df.head(3)


Kaggle tracks dataframe: 2859


Unnamed: 0,artist,track,track_id,track_listens,views,likes,comments,popularity,artist_id,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Red Hot Chili Peppers,Californication,48UPSzbZjgc449aqz8bxox,1055738000.0,1018811000.0,4394471.0,121452.0,85,0L8ExT028jH3ddEcZwqJJ5,0.592,...,9,-2.788,0,0.027,0.0021,0.00165,0.127,0.328,96.483,4
1,Red Hot Chili Peppers,Under the Bridge,3d9DChrdc6BOeFsbrZ3Is0,1061751000.0,246687700.0,1213572.0,32761.0,84,0L8ExT028jH3ddEcZwqJJ5,0.559,...,4,-13.496,1,0.0459,0.0576,0.000105,0.141,0.458,84.581,4
2,Red Hot Chili Peppers,Can't Stop,3ZOEytgrvLwQaqXreDs2Jx,866465000.0,336635800.0,1740224.0,32573.0,84,0L8ExT028jH3ddEcZwqJJ5,0.618,...,9,-3.442,1,0.0456,0.0179,0.0,0.167,0.875,91.455,4


## Get tracks' lyrics

In [722]:

tracks_names_df = pd.concat([spotify_detail_df[['track', 'artist']], kaggle_detail_df[['track', 'artist']]])
tracks_names_df = tracks_names.drop_duplicates()


In [724]:

# https://docs.genius.com/#/getting-started-h1
# https://medium.com/analytics-vidhya/how-to-scrape-song-lyrics-a-gentle-python-tutorial-5b1d4ab351d2
with open('keys/genius_key.yml', 'r') as file:
    genius_key = yaml.safe_load(file)

GENIUS_KEY = genius_key['key']
genius = Genius(GENIUS_KEY, verbose=False)

tracks_lyrics_dict = {
    'artist':[],
    'track':[],
    'lyrics':[]
}

print('Getting tracks\' lyrics. This could take a few hours...')
for i in range(len(tracks_names_df)):

    row = tracks_names_df.iloc[i]

    artist_name = row['artist']
    track_name = row['track']

    if (i%500==0):
        print(i, 'of', len(tracks_names_df), 'tracks processed...')

    # https://github.com/johnwmillr/LyricsGenius/issues/121
    while True:
        retries = 0
        try:
            song = genius.search_song(track_name, artist_name, get_full_info=False)
            break
        except:
            retries += 1
            pass

        if (retries >= 30):
            song=None
            break

    if (song != None):
        track_lyrics = song.lyrics
    else:
        track_lyrics = ''

    tracks_lyrics_dict['artist'] += [artist_name]
    tracks_lyrics_dict['track'] += [track_name]
    tracks_lyrics_dict['lyrics'] += [track_lyrics]
    time.sleep(0.5)

print('DONE...')


Getting tracks' lyrics. This could take a few hours...
0 of 8065 tracks processed...
500 of 8065 tracks processed...
1000 of 8065 tracks processed...
1500 of 8065 tracks processed...
2000 of 8065 tracks processed...
2500 of 8065 tracks processed...
3000 of 8065 tracks processed...
3500 of 8065 tracks processed...
4000 of 8065 tracks processed...
4500 of 8065 tracks processed...
5000 of 8065 tracks processed...
5500 of 8065 tracks processed...
6000 of 8065 tracks processed...
6500 of 8065 tracks processed...
7000 of 8065 tracks processed...
7500 of 8065 tracks processed...
8000 of 8065 tracks processed...
DONE...


In [725]:

tracks_lyrics_df = pd.DataFrame.from_dict(tracks_lyrics_dict)
tracks_lyrics_df.head()
# tracks_lyrics_df.to_csv('tracks_lyrics_df.csv', sep=';', index=False)


Unnamed: 0,artist,track,lyrics
0,Rob Zombie,Dragula,44 ContributorsDragula Lyrics[Sample]\nSuperst...
1,Rob Zombie,Living Dead Girl,29 ContributorsLiving Dead Girl Lyrics[Intro]\...
2,Rob Zombie,Superbeast,16 ContributorsSuperbeast Lyrics(Verse 1)\nShr...
3,Rob Zombie,Feel So Numb,14 ContributorsFeel So Numb Lyrics[Intro]\nPro...
4,Rob Zombie,The Satanic Rites of Blacula,4 ContributorsThe Satanic Rites of Blacula Lyr...


In [727]:

spotify_df = spotify_detail_df.merge(tracks_lyrics_df, on=['artist', 'track'], how='inner')
kaggle_df = kaggle_detail_df.merge(tracks_lyrics_df, on=['artist', 'track'], how='inner')


## Get band genres

In [648]:

def get_spotify_genres(headers, content_uri):

    batch_size = 50
    batch_index = 0

    artist_dict = {
        'id':[],
        'followers':[],
        'genres':[]
    }

    print('Fetching artists\' genres. This could take a few minutes...')
    while True:
        artists_ids = content_uri[batch_index:batch_index+batch_size]

        info_url = 'https://api.spotify.com/v1/artists?ids=' + '%2C'.join(artists_ids)
        json_artist_info = call_spotify_api(info_url, headers)

        if (json_artist_info != None):
            for artist in json_artist_info['artists']:
                artist_dict['id'] += [artist['id']]
                artist_dict['followers'] += [artist['followers']['total']]
                artist_dict['genres'] += [artist['genres']]
            batch_index += batch_size
        else:
            break

        time.sleep(1)

    print('DONE...')
    return artist_dict


In [649]:

artists_ids_df = artists_df['id']
artists_ids_df.drop_duplicates(inplace=True)
artists_ids_list = artists_ids_df.to_list()


In [650]:

access_token = get_spotify_access(CLIENT_ID, CLIENT_SECRET)
artists_info_dict = get_spotify_content(get_spotify_genres, access_token, artists_ids_list)


Spotify ACCESS_TOKEN obtained. Token expires in 3600 seconds
Fetching artists' genres. This could take a few minutes...
DONE...


In [653]:

artists_ids_df = pd.DataFrame.from_dict(artists_info_dict)
artists_ids_df.head()


Unnamed: 0,id,followers,genres
0,3HVdAiMNjYrQIKlOGxoGh5,2645445,"[alternative metal, hard rock, industrial meta..."
1,2VYQTNDsvvKN9wmU5W7xpj,4076257,"[alternative metal, hard rock, industrial, ind..."
2,74Hj7BmnUXyx2udrIEIKwX,632576,"[alternative metal, groove metal, industrial m..."
3,6wWVKhxIU2cEi0K81v7HvP,8618111,"[german metal, industrial, industrial metal, i..."
4,7fWgqc4HJi3pcHhK8hKg2p,751681,"[alternative metal, industrial metal, nu metal]"


In [654]:

artists_df = artists_df.merge(artists_ids_df, on='id')
artists_df.head()


Unnamed: 0,artist,id,monthly_listeners,description,followers,genres
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5,"5,758,128 monthly listeners",The longtime frontman for '90s industrial supe...,2645445,"[alternative metal, hard rock, industrial meta..."
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj,"6,735,459 monthly listeners",Controversial rock frontman Marilyn Manson bec...,4076257,"[alternative metal, hard rock, industrial, ind..."
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX,"974,109 monthly listeners",One can’t overstate the size of the Fear Facto...,632576,"[alternative metal, groove metal, industrial m..."
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP,"10,661,005 monthly listeners",Rammstein are one of rock’s most individual an...,8618111,"[german metal, industrial, industrial metal, i..."
4,Dope,7fWgqc4HJi3pcHhK8hKg2p,"1,734,951 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!,751681,"[alternative metal, industrial metal, nu metal]"


## Get artists' wikipedia information

In [662]:

# https://bobbyhadz.com/blog/python-print-string-with-special-characters#:~:text=Use%20the%20repr()%20function,representation%20of%20the%20provided%20object.
# https://medium.com/geekculture/web-scraping-tables-in-python-using-beautiful-soup-8bbc31c5803e

def get_wiki_info(info_table, is_band):

    info_dict={
        'years_active': 'not_on_wikipedia',
        'origin': 'not_on_wikipedia'
    }

    info_found=False

    for row in info_table.tbody.find_all('tr'):
        if (row.th != None):
            row_name = row.th.text.replace('\xa0', ' ')

            # Get years active
            if (row_name in ['Years active', 'Years']):
                info_found = True
                row_val = row.td.text
                info_dict['years_active'] = row_val
                    

            # Get origin
            if (row_name in ['Origin', 'Born']):
                info_found = True
                row_val = row.td.text
                info_dict['origin'] = row_val

    if (info_found==False):
        return None
    else:
        return info_dict


def get_artist_wiki(soup):

    band_infobox = 'infobox vcard plainlist'
    artist_infobox = 'infobox biography vcard'

    # Get band info
    info_table = soup.find('table', {'class': band_infobox})
    if (info_table != None):
        info_dict = get_wiki_info(info_table, is_band=True)
        return info_dict

    # If not a band get artist info
    info_table = soup.find('table', {'class': artist_infobox})
    if (info_table != None):
        info_dict = get_wiki_info(info_table, is_band=False)
        return info_dict

    # Otherwise return nothing
    return None


def extract_wikipedia_data(artists):

    wiki_pages_found = 0
    artists_wiki_dict = {
        'artist':[],
        'years_active':[],
        'origin':[]
    }

    print('Fetching artists\' wikipedia information. This could take a few minutes...')
    for i in range(len(artists)):
        artist = artists[i]

        if (i%200==0):
            print('Finished processing', i, 'of', len(artists), 'artists\' wiki pages...')
            print('\t-', wiki_pages_found, 'of', i, 'wikipedia pages found')

        artist_search = artist.replace(' ', '_')

        url = 'https://en.wikipedia.org/wiki/'+artist_search
        wiki_info_dict = process_page(url, get_artist_wiki)

        if (wiki_info_dict==None):
            url = 'https://en.wikipedia.org/wiki/'+artist_search+'_(band)'
            wiki_info_dict = process_page(url, get_artist_wiki)

            if (wiki_info_dict==None):

                for rep_str in ['Of', 'The']:
                    artist_search = artist_search.replace(rep_str, rep_str.lower())
                url = 'https://en.wikipedia.org/wiki/'+artist_search
                wiki_info_dict = process_page(url, get_artist_wiki)

                if (wiki_info_dict==None):
                    url = 'https://en.wikipedia.org/wiki/'+urllib.parse.quote(artist_search)
                    wiki_info_dict = process_page(url, get_artist_wiki)

                    if (wiki_info_dict==None):
                        wiki_info_dict=None

        artists_wiki_dict['artist'] += [artist]
        if (wiki_info_dict!=None):
            artists_wiki_dict['years_active'] += [wiki_info_dict['years_active']]
            artists_wiki_dict['origin'] += [wiki_info_dict['origin']]
            wiki_pages_found += 1
        else:
            artists_wiki_dict['years_active'] += ['']
            artists_wiki_dict['origin'] += ['']

        time.sleep(0.5)

    print('DONE...')
    return artists_wiki_dict


In [664]:

artists_list = artists_df['artist'].tolist()
artists_wiki_dict = extract_wikipedia_data(artists_list)


Fetching artists' wikipedia information. This could take a few minutes...
Finished processing 0 of 1284 artists' wiki pages...
	- 0 of 0 wikipedia pages found
Finished processing 200 of 1284 artists' wiki pages...
	- 166 of 200 wikipedia pages found
Finished processing 400 of 1284 artists' wiki pages...
	- 304 of 400 wikipedia pages found
Finished processing 600 of 1284 artists' wiki pages...
	- 426 of 600 wikipedia pages found
Finished processing 800 of 1284 artists' wiki pages...
	- 571 of 800 wikipedia pages found
Finished processing 1000 of 1284 artists' wiki pages...
	- 743 of 1000 wikipedia pages found
Finished processing 1200 of 1284 artists' wiki pages...
	- 902 of 1200 wikipedia pages found
DONE...


In [671]:

artists_wiki_df = pd.DataFrame.from_dict(artists_wiki_dict)
artists_df = artists_df.merge(artists_wiki_df, on='artist')
artists_df.head()


Unnamed: 0,artist,id,monthly_listeners,description,followers,genres,years_active,origin
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5,"5,758,128 monthly listeners",The longtime frontman for '90s industrial supe...,2645445,"[alternative metal, hard rock, industrial meta...",1985–present,Robert Bartleh Cummings (1965-01-12) January 1...
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj,"6,735,459 monthly listeners",Controversial rock frontman Marilyn Manson bec...,4076257,"[alternative metal, hard rock, industrial, ind...",1989–present,"(1969-01-05) January 5, 1969 (age 54)Canton, ..."
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX,"974,109 monthly listeners",One can’t overstate the size of the Fear Facto...,632576,"[alternative metal, groove metal, industrial m...",\n1989–2006\n2009–present\n,"Los Angeles, California, U.S."
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP,"10,661,005 monthly listeners",Rammstein are one of rock’s most individual an...,8618111,"[german metal, industrial, industrial metal, i...",1994–present,"Berlin, Germany"
4,Dope,7fWgqc4HJi3pcHhK8hKg2p,"1,734,951 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!,751681,"[alternative metal, industrial metal, nu metal]",1997–present,"New York City, U.S."


## Write dataframes

In [736]:

try:
    os.mkdir(os.getcwd()+'/intermediate_data')
except:
    pass

artists_df.to_csv('intermediate_data/artists.csv', sep=';', index=False)
spotify_df.to_csv('intermediate_data/spotify.csv', sep=';', index=False)
kaggle_df.to_csv('intermediate_data/kaggle.csv', sep=';', index=False)


## Read dataframes

In [8]:

artists_df = pd.read_csv('intermediate_data/artists.csv', sep=';')
spotify_df = pd.read_csv('intermediate_data/spotify.csv', sep=';')
kaggle_df = pd.read_csv('intermediate_data/kaggle.csv', sep=';')



## Cleaning Data

In [195]:

def clean_col_monthly_listeners(input_df):
    data_df = input_df.copy()

    data_df = data_df[data_df['monthly_listeners'] != '']
    data_df = data_df[data_df['monthly_listeners'].isnull()==False]

    listeners_series = data_df['monthly_listeners']
    numbers_str_series = listeners_series.str.split(' ').str[0]
    numbers_int_series = numbers_str_series.str.replace(',', '').astype(int)
    data_df['monthly_listeners'] = numbers_int_series

    return data_df

def clean_artists_data(input):

    if (type(input)==pd.DataFrame):

        data_df = input.copy()

        # Get columns
        cols_input = list(data_df)
        cols_required = [
            'artist', 'id', 'monthly_listeners', 'description',
            'followers', 'genres', 'years_active', 'origin'
        ]

        # Sort columns
        cols_input.sort()
        cols_required.sort()

        # Check input length and columns
        if (len(data_df)!=0 and cols_input==cols_required):

            data_df = clean_col_monthly_listeners(data_df)
            return data_df

    return None



In [200]:

mock_df = pd.DataFrame.from_dict(
    {
        'artist':           ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
        'id':               ['1', '2', '3', '4', '5', '6', '7'],
        'monthly_listeners':['5,758,128 monthly listeners', '758,128 monthly listeners', '8,128 monthly listeners', '128 monthly listeners', '123', '', None],
        'description':      ['Some description...', '', None, '\nThis description has some \tescape characters\n\n.', '', '', ''],
        'followers':        ['', '', '', '', '', '', ''],
        'genres':           ['', '', '', '', '', '', ''],
        'years_active':     ['', '', '', '', '', '', ''],
        'origin':           ['', '', '', '', '', '', ''],
    }
)


print('> artists_df:\t\t', len(artists_df))
print('> cleaned artists_df:\t', len(clean_artists_data(artists_df)))

clean_artists_data(mock_df).head()



> artists_df:		 1284
> cleaned artists_df:	 1284


Unnamed: 0,artist,id,monthly_listeners,description,followers,genres,years_active,origin
0,a,1,5758128,Some description...,,,,
1,b,2,758128,,,,,
2,c,3,8128,,,,,
3,d,4,128,\nThis description has some \tescape character...,,,,
4,e,5,123,,,,,


In [147]:

class cleanArtistsDataTests(unittest.TestCase):

    def test_1_call(self):
        self.assertIsNotNone(clean_artists_data)

    def test_2_none(self):
        self.assertIsNone(clean_artists_data(None))
    
    def test_3_empty(self):
        empty_df = pd.DataFrame.from_dict({})
        self.assertIsNone(clean_artists_data(empty_df))

    def test_4_wrong_cols(self):
        missing_cols_df = pd.DataFrame.from_dict(
            mock_df.drop(columns=['artist'])
        )
        self.assertIsNone(clean_artists_data(missing_cols_df))

    def test_5_column_order(self):
        cols = list(mock_df)
        cols += [cols[0]]
        del cols[0]
        wrong_order_df = pd.DataFrame.from_dict(mock_df[cols])
        self.assertIsNotNone(clean_artists_data(wrong_order_df))

    def test_5_run(self):
        results_df = clean_artists_data(mock_df)
        self.assertIsNotNone(results_df)

unittest.main(argv=[''], exit=False,verbosity=2)


test_1_call (__main__.cleanArtistsDataTests) ... ok
test_2_none (__main__.cleanArtistsDataTests) ... ok
test_3_empty (__main__.cleanArtistsDataTests) ... ok
test_4_wrong_cols (__main__.cleanArtistsDataTests) ... ok
test_5_column_order (__main__.cleanArtistsDataTests) ... ok
test_5_run (__main__.cleanArtistsDataTests) ... 

<< you made it to the code >>
<< you made it to the code >>


ok

----------------------------------------------------------------------
Ran 6 tests in 0.005s

OK


<unittest.main.TestProgram at 0x1223ccac0>

In [12]:


artists_df.head()




Unnamed: 0,artist,id,monthly_listeners,description,followers,genres,years_active,origin
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5,"5,758,128 monthly listeners",The longtime frontman for '90s industrial supe...,2645445,"['alternative metal', 'hard rock', 'industrial...",1985–present,Robert Bartleh Cummings (1965-01-12) January 1...
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj,"6,735,459 monthly listeners",Controversial rock frontman Marilyn Manson bec...,4076257,"['alternative metal', 'hard rock', 'industrial...",1989–present,"(1969-01-05) January 5, 1969 (age 54)Canton, ..."
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX,"974,109 monthly listeners",One can’t overstate the size of the Fear Facto...,632576,"['alternative metal', 'groove metal', 'industr...",\n1989–2006\n2009–present\n,"Los Angeles, California, U.S."
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP,"10,661,005 monthly listeners",Rammstein are one of rock’s most individual an...,8618111,"['german metal', 'industrial', 'industrial met...",1994–present,"Berlin, Germany"
4,Dope,7fWgqc4HJi3pcHhK8hKg2p,"1,734,951 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!,751681,"['alternative metal', 'industrial metal', 'nu ...",1997–present,"New York City, U.S."


In [None]:
# https://stackoverflow.com/questions/61064454/beautifulsoup-is-unable-to-extract-all-html