In [487]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import yaml
import time
import re
import urllib.parse


## get artists from playlists

In [142]:

def get_spotify_keys():
    with open('spotify_keys.yml', 'r') as file:
        spotify_keys = yaml.safe_load(file)

    return spotify_keys['id'], spotify_keys['secret']

def get_spotify_access(CLIENT_ID, CLIENT_SECRET):
    AUTH_URL = 'https://accounts.spotify.com/api/token'

    # POST
    auth_response = requests.post(AUTH_URL, {
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    })

    # convert the response to JSON
    auth_response_data = auth_response.json()
    print('Spotify ACCESS_TOKEN obtained. Token expires in', auth_response_data['expires_in'], 'seconds')

    # save the access token
    access_token = auth_response_data['access_token']

    return access_token

def call_spotify_api(url, headers):
    response = requests.get(url, headers=headers)

    if (response.status_code == 200):
        time.sleep(1)
        return response.json()
    elif (response.status_code == 401):
        print('[ERROR] Response status code 401 - waiting 30 sec')
        time.sleep(30)
        return None


def get_spotify_playlist(headers, content_uri):

    response_size = -1
    index = 0

    data = {'artist':[], 'id':[]}

    while (response_size!=0):

        url = 'https://api.spotify.com/v1/playlists/' + content_uri + '/tracks?limit=50&offset='+str(index)
        json_content = call_spotify_api(url, headers)

        if (json_content != None):
            items = json_content['items']
            response_size = len(items)

            for item in items:
                info = item['track']['artists'][0]

                data['artist'] += [info['name']]
                data['id'] += [info['id']]

            index += 50
            time.sleep(1)

    return data


def get_spotify_content(get_content_function, access_token, content_uri):

    headers = {
        'Authorization': 'Bearer {token}'.format(token=access_token)
    }
    content = get_content_function(headers, content_uri)

    return content


In [5]:

CLIENT_ID, CLIENT_SECRET = get_spotify_keys()
access_token = get_spotify_access(CLIENT_ID, CLIENT_SECRET)
# access_token = 'BQBzRnezTrbBvw-fgcIAjXQffqcMgTUBpcvbtVAV-z_Fp8FHeDiZFcrJ9PDCm9l2qB-hBU_t4CcGN7dlzOCP12DrrDnKq68K3ZwS34ECxEdQE9rve88'
# access_token = 'BQCKebW_cdw_YOjpH41ELgbraH3xo9zO3i-FKCvGn5Uq3NJV7D7T0KJvEJM5y91mRYAb1uhN34P2-w5gYbVtv5doY7z35khQq-FyeLQAZpG90ys380U'


# REMEMBER: these playlists are live and some change every couple of days or hours
spotify_playlists = {
    'Industrial Metal': '37i9dQZF1DX29LQDcJ6Xy7',
    'INDUSTRIAL METAL': '0I4fAd7K0zIHBJYNw9fSuv',
    'Heavy Metal': '37i9dQZF1DX9qNs32fujYe',
    'Metal Mix': '37i9dQZF1EQpgT26jgbgRI',
    'Aggressive Heavy Metal Mix': '37i9dQZF1EIetewBshGEPK',
    'Death Metal Mix': '37i9dQZF1EIf78r65WuXwA',
    'Death Metal Melodico': '5LXjHUJXPJIW71ySYppK5J',
    'Death Metal': '2vivknVOeJD7BUYnnuztrE',
    'Hard Rock': '37i9dQZF1DX1X7WV84927n',
    'Hard Rock / Metal': '1GXRoQWlxTNQiMNkOe7RqA',
    'Hard Rock Mix': '37i9dQZF1EIehdyB47Vd7I',
    'Metalcore Mix': '37i9dQZF1EIgtj4OvJCT7Q',
    '2023 Metalcore Playlist': '7IUlbEWRYOKeTZKmjBcRgX',
    'Modern Metalcore': '4ge2kKhU0ryYD1BWN1CX2T',
    'Deathcore': '37i9dQZF1DX1cJWWyylDuw',
    'Melodic Metal Mix': '37i9dQZF1EId4LkhIN52c3',
    'Epic and Melodic': '37i9dQZF1DX37bXS7EGI3f',
    'Melodic Death Metal Mix': '37i9dQZF1EIfs512qHK0fg',
    'Grindcore Mix': '37i9dQZF1EIgFHBMi7n4aZ',
    'Power Metal Mix': '37i9dQZF1EIfUrKSfi4vkq',
    'power metal': '6uD6LqbKgMn036cfvniRO6',
    'BLACK METAL': '37i9dQZF1EIdrDO1pClEMb',
    'black metal classics': '688iTCqxHbpNbBuWplfa17',
    'Nu Metal Era': '37i9dQZF1DXcfZ6moR6J0G',
    'Nu Metal Mix': '37i9dQZF1EIdT6waU1nlDF',
    'Nu Metal Hits': '37i9dQZF1EIdT6waU1nlDF',
    'Ultimate Goth Metal': '1DR4lUIiCmTYWrxmMNSoyd',
    'Gothic Metal': '76PSrknbBdEiQxvoinpYAm',
    'Groove Metal Mix': '37i9dQZF1EIcCL8b99YRCA',
    'Ultimate Groove Metal': '24y2slE56YDOTR2t4Zr1lR',
    'Rock Mix': '37i9dQZF1EQpj7X7UK8OOF',
    'Best of Rock 2000': '37i9dQZF1DX6rsDrBNGuWW',
    'Pop Hits 2000s - 2023': '6mtYuOxzl58vSGnEDtZ9uB',
    'Pop Hits 2023': '5TDtuKDbOhrfW7C58XnriZ',
    'Pop Mix': '37i9dQZF1EQncLwOalG3K7'
}


Spotify ACCESS_TOKEN obtained. Token expires in 3600 seconds


In [6]:

results_dict = {}

for playlist in spotify_playlists:
    print('processing:', playlist)
    content_uri = spotify_playlists[playlist]
    content = get_spotify_content(get_spotify_playlist, access_token, content_uri)

    results_dict[playlist] = content
print('DONE...')


processing: Industrial Metal
processing: INDUSTRIAL METAL
processing: Heavy Metal
processing: Metal Mix
processing: Aggressive Heavy Metal Mix
processing: Death Metal Mix
processing: Death Metal Melodico
processing: Death Metal
processing: Hard Rock
processing: Hard Rock / Metal
processing: Hard Rock Mix
processing: Metalcore Mix
processing: 2023 Metalcore Playlist
processing: Modern Metalcore
processing: Deathcore
processing: Melodic Metal Mix
processing: Epic and Melodic
processing: Melodic Death Metal Mix
processing: Grindcore Mix
processing: Power Metal Mix
processing: power metal
processing: BLACK METAL
processing: black metal classics
processing: Nu Metal Era
processing: Nu Metal Mix
processing: Nu Metal Hits
processing: Ultimate Goth Metal
processing: Gothic Metal
processing: Groove Metal Mix
processing: Ultimate Groove Metal
processing: Rock Mix
processing: Best of Rock 2000
processing: Pop Hits 2000s - 2023
processing: Pop Hits 2023
processing: Pop Mix
DONE...


In [7]:

artists_df = pd.DataFrame.from_dict({})

for results in results_dict:
    tmp_df = pd.DataFrame.from_dict(results_dict[results])
    artists_df = pd.concat([artists_df, tmp_df])

print(len(artists_df))
artists_df = artists_df.drop_duplicates()
artists_df = artists_df.dropna()
print(len(artists_df))

# artists_df.to_csv('artists.csv', sep=';', index=False)
artists_df.head(10)


4235
1143


Unnamed: 0,artist,id
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP
4,Dope,7fWgqc4HJi3pcHhK8hKg2p
5,Filter,01WjpKiWVNurV5hjIadB8C
6,Static-X,7JDSHlDdVTo7aZKdQZ53Vf
7,Orgy,4uYwLU7k03RCQSRXGtQGg0
8,Skillet,49bzE5vRBRIota4qeHtQM8
9,Nine Inch Nails,0X380XXQSNBYuleKzav5UO


## get kaggle data

In [8]:
kaggle_spot_df = pd.read_csv("dataset.csv")
kaggle_spot_df = kaggle_spot_df[['artists', 'track_genre']]

kaggle_spot_df = kaggle_spot_df.groupby(['artists']).apply(lambda x: x['track_genre'].unique())
kaggle_spot_df = kaggle_spot_df.reset_index(name='track_genre')

kaggle_spot_df = kaggle_spot_df.set_index(['track_genre']).apply(lambda x: x.str.split(',').explode())
kaggle_spot_df = kaggle_spot_df.reset_index()

kaggle_yout_df = pd.read_csv("Spotify_Youtube.csv")
kaggle_yout_df = kaggle_yout_df

kaggle_tracks_df = kaggle_yout_df.merge(kaggle_spot_df, left_on='Artist', right_on='artists', how='inner')
kaggle_tracks_df = kaggle_tracks_df[['Artist', 'Url_spotify', 'Track', 'track_genre', 'Uri', 'Stream', 'Views', 'Likes', 'Comments', 'Description']]
kaggle_tracks_df.head()


Unnamed: 0,Artist,Url_spotify,Track,track_genre,Uri,Stream,Views,Likes,Comments,Description
0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,"[alternative, hip-hop]",spotify:track:0d28khcov6AiegSCpG5TuT,1040235000.0,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...
1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,"[alternative, hip-hop]",spotify:track:1foMv2HQwfQ2vntFf9HFeG,310083700.0,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...
2,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,New Gold (feat. Tame Impala and Bootie Brown),"[alternative, hip-hop]",spotify:track:64dLd6rVqDLtkXFYrEUHIU,63063470.0,8435055.0,282142.0,7399.0,Gorillaz - New Gold ft. Tame Impala & Bootie B...
3,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,On Melancholy Hill,"[alternative, hip-hop]",spotify:track:0q6LuUqGLUiCPP1cbdwFs3,434663600.0,211754952.0,1788577.0,55229.0,Follow Gorillaz online:\nhttp://gorillaz.com \...
4,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Clint Eastwood,"[alternative, hip-hop]",spotify:track:7yMiX7n9SBvadzox8T5jzT,617259700.0,618480958.0,6197318.0,155930.0,The official music video for Gorillaz - Clint ...


In [9]:

def contains_similar_genre(values, genres):
    for value in values:
        for genre in genres:
            if (value in genre):
                return True
    return False

genre_filter = kaggle_tracks_df['track_genre'].apply(lambda x: contains_similar_genre(x, ['metal', 'rock', 'pop']))
kaggle_tracks_df = kaggle_tracks_df[genre_filter]

kaggle_tracks_df.head()


Unnamed: 0,Artist,Url_spotify,Track,track_genre,Uri,Stream,Views,Likes,Comments,Description
10,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Californication,"[alt-rock, alternative, funk, metal, rock]",spotify:track:48UPSzbZjgc449aqz8bxox,1055738000.0,1018811000.0,4394471.0,121452.0,Watch the official music video for Californica...
11,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Under the Bridge,"[alt-rock, alternative, funk, metal, rock]",spotify:track:3d9DChrdc6BOeFsbrZ3Is0,1061751000.0,246687700.0,1213572.0,32761.0,Watch the official music video for Under The B...
12,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Can't Stop,"[alt-rock, alternative, funk, metal, rock]",spotify:track:3ZOEytgrvLwQaqXreDs2Jx,866465000.0,336635800.0,1740224.0,32573.0,Watch the official music video for Can't Stop ...
13,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Scar Tissue,"[alt-rock, alternative, funk, metal, rock]",spotify:track:1G391cbiT3v3Cywg8T7DM1,613838700.0,435121500.0,1890900.0,37069.0,Watch the official music video for Scar Tissue...
14,Red Hot Chili Peppers,https://open.spotify.com/artist/0L8ExT028jH3dd...,Otherside,"[alt-rock, alternative, funk, metal, rock]",spotify:track:64BbK9SFKH2jk86U3dGj2P,732774500.0,673528700.0,3140356.0,60091.0,Watch the official music video for Otherside b...


In [10]:

kaggle_tracks_df['Url_spotify'] = kaggle_tracks_df['Url_spotify'].str.split('/').str[-1]
kaggle_tracks_df['Uri'] = kaggle_tracks_df['Uri'].str.split(':').str[-1]

cols = list(kaggle_tracks_df)
for col in cols:
    kaggle_tracks_df.rename(columns={col:col.lower()}, inplace=True)

kaggle_tracks_df.rename(columns={'url_spotify':'id'}, inplace=True)
kaggle_tracks_df.rename(columns={'uri':'track_id'}, inplace=True)


In [11]:

tmp_df = artists_df.copy()
tmp_df = pd.concat([tmp_df, kaggle_tracks_df[['artist', 'id']]])

print('spotify dataframe:\t\t\t\t', len(artists_df), 'artists')
print('kaggle dataframe:\t\t\t\t', len(kaggle_tracks_df), 'artists')
print('concatenated dataframe:\t\t\t\t', len(tmp_df), 'artists')

artists_df = tmp_df.drop_duplicates()
print('concatenated dataframe (drop duplicates):\t', len(artists_df), 'artists')


spotify dataframe:				 1143 artists
kaggle dataframe:				 2859 artists
concatenated dataframe:				 4002 artists
concatenated dataframe (drop duplicates):	 1302 artists


## get artist tracks

In [245]:

def get_text_between(text, start_str, end_str):

    i_start_list = [m.start() for m in re.finditer(start_str, text)]
    i_end_list = [m.start() for m in re.finditer(end_str, text)]

    results = []

    try:
        for i in range(len(i_start_list)):
            if (i >= len(i_end_list)):
                break
            i_start = i_start_list[i] + len(start_str)
            i_end = i_end_list[i]
            results += [text[i_start:i_end]]
        return results

    except Exception as e:
        print('Something went wrong:', e)
        return []


def get_artist_contents(soup):

    # Get monthly listeners
    monthly_listeners = soup.find('div', {'data-encore-id': 'type'}).text

    # Get top tracks
    start_str = '<span class="ListRowTitle__LineClamp-sc-1xe2if1-0 jjpOuK">'
    end_str = '</span></p></span>'
    top_tracks = get_text_between(str(soup), start_str, end_str)

    # Get top track listens
    start_str = '<span class="ListRowDetails__LineClamp-sc-sozu4l-0 hoTVKD">'
    end_str = '</span></p></div>'
    top_listens = get_text_between(str(soup), start_str, end_str)

    # Get top track links
    start_str = 'ListRowTitle__ListRowType-sc-1xe2if1-1 fkzPZI" data-encore-id="type" id="listrow-title-track-spotify:track:'
    end_str = '"><span class="ListRowTitle__LineClamp-sc-1xe2if1-0 jjpOuK">'
    top_ids = get_text_between(str(soup), start_str, end_str)
    top_ids = [tl[0:-2] for tl in top_ids]

    # Get description
    start_str = '<span class="Type__TypeElement-sc-goli3j-0 bGROfl G_f5DJd2sgHWeto5cwbi" data-encore-id="type">'
    end_str = '</span></p></div></div></div>'
    desc = get_text_between(str(soup), start_str, end_str)

    end_str = '</span><button aria-expanded="false" class="UhoFLV9F6uYQvi0m6ptf">'
    desc_exand = get_text_between(str(soup), start_str, end_str)

    if (len(desc_exand) > len(desc)):
        desc = desc_exand

    # Create output dict
    content_dict = {
        'monthly_listeners': monthly_listeners,
        'top_tracks': top_tracks,
        'top_listens': top_listens,
        'top_ids': top_ids,
        'desc': desc
    }

    return content_dict


def process_page(url, function):

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    headers = ()
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = function(soup)

    return results

# def process_page(artist_id):
#     url = 'https://open.spotify.com/artist/'+artist_id

#     headers = {
#         'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
#         'Accept-Language': 'en-US, en;q=0.5'
#     }

#     headers = ()
#     response = requests.get(url, headers=headers)
#     soup = BeautifulSoup(response.text, 'html.parser')

#     content_dict = get_artist_contents(soup)

#     return content_dict


In [13]:

artist_scrape_dict = {}

print('Fetching artists\' page information. This could take a few minutes...')
for i in range(len(artists_df)):

    if (i%100==0):
        print(i, 'of', len(artists_df), 'artists\' pages processed...' )

    artist = artists_df.iloc[i]
    artist_name = artist['artist']
    artist_id = artist['id']

    artist_url = 'https://open.spotify.com/artist/'+artist_id
    content_dict = process_page(artist_url, get_artist_contents)
    artist_scrape_dict[artist_name] = content_dict

    time.sleep(1)
print('DONE...')


Processing 1 of 1302  - artist id: 3HVdAiMNjYrQIKlOGxoGh5
Processing 2 of 1302  - artist id: 2VYQTNDsvvKN9wmU5W7xpj
Processing 3 of 1302  - artist id: 74Hj7BmnUXyx2udrIEIKwX
Processing 4 of 1302  - artist id: 6wWVKhxIU2cEi0K81v7HvP
Processing 5 of 1302  - artist id: 7fWgqc4HJi3pcHhK8hKg2p
Processing 6 of 1302  - artist id: 01WjpKiWVNurV5hjIadB8C
Processing 7 of 1302  - artist id: 7JDSHlDdVTo7aZKdQZ53Vf
Processing 8 of 1302  - artist id: 4uYwLU7k03RCQSRXGtQGg0
Processing 9 of 1302  - artist id: 49bzE5vRBRIota4qeHtQM8
Processing 10 of 1302  - artist id: 0X380XXQSNBYuleKzav5UO
Processing 11 of 1302  - artist id: 6MwPCCR936cYfM1dLsGVnl
Processing 12 of 1302  - artist id: 69k6uTSZMPLpSnhmLCiKxQ
Processing 13 of 1302  - artist id: 65A714FqhSPjoFZeffQbTv
Processing 14 of 1302  - artist id: 5imUS9dQyCbAjUEJJ9QyWC
Processing 15 of 1302  - artist id: 1DXylZlWbVvlckNqwvjTEt
Processing 16 of 1302  - artist id: 2mZITUvfEwrKlksoGpHTsM
Processing 17 of 1302  - artist id: 4BKyei61gtyDFxlKhcvBJJ
Proces

Processing 140 of 1302  - artist id: 28hJdGN1Awf7u3ifk2lVkg
Processing 141 of 1302  - artist id: 3pulcT2wt7FEG10lQlqDJL
Processing 142 of 1302  - artist id: 3BVkDHWRvLJEyKdvhLbjsq
Processing 143 of 1302  - artist id: 3JysSUOyfVs1UQ0UaESheP
Processing 144 of 1302  - artist id: 6HZr7Fs2VfV1PYHIwo8Ylc
Processing 145 of 1302  - artist id: 278ZYwGhdK6QTzE3MFePnP
Processing 146 of 1302  - artist id: 4CzUzn54Cp9TQr6a7JIlMZ
Processing 147 of 1302  - artist id: 0lVlNsuGaOr9vMHCZIAKMt
Processing 148 of 1302  - artist id: 6BD4lgmnh4vy6kkCaZRDWt
Processing 149 of 1302  - artist id: 568ZhdwyaiCyOGJRtNYhWf
Processing 150 of 1302  - artist id: 6bu7CtcOMWcS0BMq7snHW6
Processing 151 of 1302  - artist id: 0GDGKpJFhVpcjIGF8N6Ewt
Processing 152 of 1302  - artist id: 1Yox196W7bzVNZI7RBaPnf
Processing 153 of 1302  - artist id: 1Dvfqq39HxvCJ3GvfeIFuT
Processing 154 of 1302  - artist id: 6qO6LhD6FuXK5e2PtfAIMz
Processing 155 of 1302  - artist id: 159qqlGwzE04xyqpfAwRLo
Processing 156 of 1302  - artist id: 17M

Processing 277 of 1302  - artist id: 6ldp4tSRbPDs5VA2w9KXmD
Processing 278 of 1302  - artist id: 3Oc3JnQKIW7ZNPXS0yf6QV
Processing 279 of 1302  - artist id: 3BM0EaYmkKWuPmmHFUTQHv
Processing 280 of 1302  - artist id: 0cBiR10Zh0R2zJJmcxhNFS
Processing 281 of 1302  - artist id: 52xuvlUvnxqH0xzxGPKXSu
Processing 282 of 1302  - artist id: 3uQtxIjBgrF7vGOt54useD
Processing 283 of 1302  - artist id: 1MK0sGeyTNkbefYGj673e9
Processing 284 of 1302  - artist id: 0Ops4tGcYRQyhnic18QUcu
Processing 285 of 1302  - artist id: 1kTUJy3zL57iEANLB9FpIA
Processing 286 of 1302  - artist id: 4LT73i2bqTazQQkmODMaYf
Processing 287 of 1302  - artist id: 5tqZo146ewkPULPWNGPdq3
Processing 288 of 1302  - artist id: 2qvK5iLLoDV5mJeAvh5uka
Processing 289 of 1302  - artist id: 6L77qXFnXb7Tac7xzPa0Y7
Processing 290 of 1302  - artist id: 4uPWvwgl5qwMLqyUSZPhnv
Processing 291 of 1302  - artist id: 4xTDPgk4jHCF0qui3dH6BS
Processing 292 of 1302  - artist id: 7u12AuhJ5AaJIgZAZe0US8
Processing 293 of 1302  - artist id: 7ko

Processing 414 of 1302  - artist id: 0OCgRH2JnfJo0nNDJzEG0Q
Processing 415 of 1302  - artist id: 6SLAMfhOi7UJI0fMztaK0m
Processing 416 of 1302  - artist id: 1YEEbHKA7t5pYoNnjtHSzP
Processing 417 of 1302  - artist id: 4lk8d90mj4w8a7rwkGPznt
Processing 418 of 1302  - artist id: 1dfeR4HaWDbWqFHLkxsg1d
Processing 419 of 1302  - artist id: 3cfPaG0svAOypkJc1t3rdr
Processing 420 of 1302  - artist id: 5xUf6j4upBrXZPg6AI4MRK
Processing 421 of 1302  - artist id: 0zfT626RwO6zN3RDYeRit5
Processing 422 of 1302  - artist id: 1iTlOqIrZy8DlvCPJY2sjS
Processing 423 of 1302  - artist id: 2pH3wEn4eYlMMIIQyKPbVR
Processing 424 of 1302  - artist id: 0spHbv2fw49lDMkbOAdaqX
Processing 425 of 1302  - artist id: 2cnMpRsOVqtPMfq7YiFE6K
Processing 426 of 1302  - artist id: 711MCceyCBcFnzjGY4Q7Un
Processing 427 of 1302  - artist id: 36QJpDe2go2KgaRleHCDTp
Processing 428 of 1302  - artist id: 21ysNsPzHdqYN2fQ75ZswG
Processing 429 of 1302  - artist id: 7Ey4PD4MYsKc5I2dolUwbH
Processing 430 of 1302  - artist id: 1WR

Processing 551 of 1302  - artist id: 4jBjMnUHg8VCv9HM7KKbFd
Processing 552 of 1302  - artist id: 0DlST2L7efoM5Lb0uxG3Tx
Processing 553 of 1302  - artist id: 0Iv7bsODzqbmMkC6pylYYN
Processing 554 of 1302  - artist id: 0PNbGkwlV3farYnmoojYAY
Processing 555 of 1302  - artist id: 1GkXxneFQE4d5YTsrbN6Ya
Processing 556 of 1302  - artist id: 2DynE7m1BMVl4hQMvCXXq0
Processing 557 of 1302  - artist id: 0sxW2BJTemkPy8tDeO1s0t
Processing 558 of 1302  - artist id: 4tDkeVxH0CSkNiLVrsYmQs
Processing 559 of 1302  - artist id: 2X2KBI2OrNMci6TDQAXCA6
Processing 560 of 1302  - artist id: 7F9ZL4TJNr8AoU0UUQX8ih
Processing 561 of 1302  - artist id: 6NPqqqYcR7tAEHL4ORm6pQ
Processing 562 of 1302  - artist id: 1IeBTpaPY7xQtxqbxXa9qC
Processing 563 of 1302  - artist id: 4ZwENj9UHL3ujCD3k7DfNH
Processing 564 of 1302  - artist id: 6EFV3PmaXblKwNbvpkGv9l
Processing 565 of 1302  - artist id: 48zUWAXpgEXfpttz23pCNQ
Processing 566 of 1302  - artist id: 2By8ec9DQOnN2aiiFyrQ82
Processing 567 of 1302  - artist id: 1GH

Processing 688 of 1302  - artist id: 018gIUaP08hROTOiVdiEQ3
Processing 689 of 1302  - artist id: 7Hrplpu94RU1vSclRROPGS
Processing 690 of 1302  - artist id: 12lmG2LvSHtm6yxioeq7g7
Processing 691 of 1302  - artist id: 1pA13AkImcaEJTUpmAYoWP
Processing 692 of 1302  - artist id: 7MQo05KpBayRxPm9hkTmXZ
Processing 693 of 1302  - artist id: 49skVfbc2WfqgSgsGJBaz1
Processing 694 of 1302  - artist id: 4BPucEo3UudByoX0w5jxE3
Processing 695 of 1302  - artist id: 5G1MlYyOsiG56mjlsZHsLC
Processing 696 of 1302  - artist id: 1uPIYn2IU0IuPWSZ58kzyM
Processing 697 of 1302  - artist id: 77VZhI79f6lzshSx5YNrcw
Processing 698 of 1302  - artist id: 1kLo8SDhyjCNEgsPMtRIuc
Processing 699 of 1302  - artist id: 7L6u6TyhjuwubrcojPeNgf
Processing 700 of 1302  - artist id: 7kWnE981vITXDnAD2cZmCV
Processing 701 of 1302  - artist id: 4ZISAmHmQUDCpv8xydqeKG
Processing 702 of 1302  - artist id: 72unU2j2vnNWFiITlvx2nv
Processing 703 of 1302  - artist id: 6FfZaHz07OsknWNdtdan5R
Processing 704 of 1302  - artist id: 2hb

Processing 825 of 1302  - artist id: 13ltbymjg9upz4wOoF5TTs
Processing 826 of 1302  - artist id: 1TX9g1uSl0B0DQIE9lBqmU
Processing 827 of 1302  - artist id: 4UAUIsAgagejBNiobLmeSR
Processing 828 of 1302  - artist id: 5jQPKWBobu6cJwGSbbqgCk
Processing 829 of 1302  - artist id: 7iMWWdRNiCJwGOGEIYr02z
Processing 830 of 1302  - artist id: 0Z1UczcSjwKNuv4HgdjH3b
Processing 831 of 1302  - artist id: 3hLbhY1LYGckGGlhUTteTQ
Processing 832 of 1302  - artist id: 1mBXJUab9jbcHhYxMuWbxO
Processing 833 of 1302  - artist id: 2XboXFS9ENxbne9aajZlAc
Processing 834 of 1302  - artist id: 6s5biOReFi1Oe9GQOG0nsL
Processing 835 of 1302  - artist id: 2FBDDo06NZR5B7qOzmMxq9
Processing 836 of 1302  - artist id: 0ZXKT0FCsLWkSLCjoBJgBX
Processing 837 of 1302  - artist id: 7t2RUEJpH75zDwWj37hfnC
Processing 838 of 1302  - artist id: 2EOmvmVtYMTgn45w9mWUTV
Processing 839 of 1302  - artist id: 2P6YT8t4urGUSESUOiSnDl
Processing 840 of 1302  - artist id: 1ZaJhNBAhJ3HjPsWiB9sDc
Processing 841 of 1302  - artist id: 6WX

Processing 962 of 1302  - artist id: 6dJeKm76NjfXBNTpHmOhfO
Processing 963 of 1302  - artist id: 6KImCVD70vtIoJWnq6nGn3
Processing 964 of 1302  - artist id: 1HY2Jd0NmPuamShAr6KMms
Processing 965 of 1302  - artist id: 7tYKF4w9nC0nq9CsPZTHyP
Processing 966 of 1302  - artist id: 6LqNN22kT3074XbTVUrhzX
Processing 967 of 1302  - artist id: 6VuMaDnrHyPL1p4EHjYLi7
Processing 968 of 1302  - artist id: 3YQKmKGau1PzlVlkL1iodx
Processing 969 of 1302  - artist id: 69GGBxA162lTqCwzJG5jLp
Processing 970 of 1302  - artist id: 1Xyo4u8uXC1ZmMpatF05PJ
Processing 971 of 1302  - artist id: 74XFHRwlV6OrjEM0A2NCMF
Processing 972 of 1302  - artist id: 7qmpXeNz2ojlMl2EEfkeLs
Processing 973 of 1302  - artist id: 6S2OmqARrzebs0tKUEyXyp
Processing 974 of 1302  - artist id: 4GJ6xDCF5jaUqD6avOuQT6
Processing 975 of 1302  - artist id: 26dSoYclwsYLMAKD3tpOr4
Processing 976 of 1302  - artist id: 6AMd49uBDJfhf30Ak2QR5s
Processing 977 of 1302  - artist id: 1l8Fu6IkuTP0U5QetQJ5Xt
Processing 978 of 1302  - artist id: 5KK

Processing 1097 of 1302  - artist id: 17PKqjvXIlUPIx2TrwnVnb
Processing 1098 of 1302  - artist id: 4sTQVOfp9vEMCemLw50sbu
Processing 1099 of 1302  - artist id: 6aJRwrwfOffz7RMh2Cfm8e
Processing 1100 of 1302  - artist id: 1l7ZsJRRS8wlW3WfJfPfNS
Processing 1101 of 1302  - artist id: 4kI8Ie27vjvonwaB2ePh8T
Processing 1102 of 1302  - artist id: 4NHQUGzhtTLFvgF5SZesLK
Processing 1103 of 1302  - artist id: 738wLrAtLtCtFOLvQBXOXp
Processing 1104 of 1302  - artist id: 0X2BH1fck6amBIoJhDVmmJ
Processing 1105 of 1302  - artist id: 0C8ZW7ezQVs4URX5aX7Kqx
Processing 1106 of 1302  - artist id: 5p7f24Rk5HkUZsaS3BLG5F
Processing 1107 of 1302  - artist id: 1PbBg2aYjWLKRk84zJK15x
Processing 1108 of 1302  - artist id: 23fqKkggKUBHNkbKtXEls4
Processing 1109 of 1302  - artist id: 0RpddSzUHfncUWNJXKOsjy
Processing 1110 of 1302  - artist id: 4GNC7GD6oZMSxPGyXy4MNB
Processing 1111 of 1302  - artist id: 0RAWgVjsAEhbXPhqPGqd8n
Processing 1112 of 1302  - artist id: 37tjt3cGt6FQPLu6IsorqP
Processing 1113 of 1302 

Processing 1232 of 1302  - artist id: 0zOcE3mg9nS6l3yxt1Y0bK
Processing 1233 of 1302  - artist id: 3yMmYEklQ7gLOZXEFNd3xr
Processing 1234 of 1302  - artist id: 26bcq2nyj5GB7uRr558iQg
Processing 1235 of 1302  - artist id: 7Ln80lUS6He07XvHI8qqHH
Processing 1236 of 1302  - artist id: 2oSONSC9zQ4UonDKnLqksx
Processing 1237 of 1302  - artist id: 1EowJ1WwkMzkCkRomFhui7
Processing 1238 of 1302  - artist id: 6IRouO5mvvfcyxtPDKMYFN
Processing 1239 of 1302  - artist id: 1GLtl8uqKmnyCWxHmw9tL4
Processing 1240 of 1302  - artist id: 3yY2gUcIsjMr8hjo51PoJ8
Processing 1241 of 1302  - artist id: 26T3LtbuGT1Fu9m0eRq5X3
Processing 1242 of 1302  - artist id: 46gyXjRIvN1NL1eCB8GBxo
Processing 1243 of 1302  - artist id: 1wmoF73GhlylxwAbn9YGYr
Processing 1244 of 1302  - artist id: 4IKVDbCSBTxBeAsMKjAuTs
Processing 1245 of 1302  - artist id: 7k73EtZwoPs516ZxE72KsO
Processing 1246 of 1302  - artist id: 07QEuhtrNmmZ0zEcqE9SF6
Processing 1247 of 1302  - artist id: 3AQRLZ9PuTAozP28Skbq8V
Processing 1248 of 1302 

In [14]:

track_info_dict = {
    'artist':[],
    'track':[],
    'track_id':[],
    'track_listens':[]
}

artist_info_dict = {
    'artist':[],
    'monthly_listeners':[],
    'description':[]
}

for artist in artist_scrape_dict:
    data = artist_scrape_dict[artist]

    # Append to artist dictionary
    artist_info_dict['artist'] += [artist]
    artist_info_dict['monthly_listeners'] += [data['monthly_listeners']]

    desc = data['desc']
    if (len(desc) == 0):
        desc = None
    else:
        desc = desc[0]
    artist_info_dict['description'] += [desc]

    # Append to track dictionary
    for track in data['top_tracks']:
        track_info_dict['artist'] += [artist]
        track_info_dict['track'] += [track]

    for track_id in data['top_ids']:
        track_info_dict['track_id'] += [track_id]

    for track_listens in data['top_listens']:
        track_info_dict['track_listens'] += [track_listens]


In [15]:

tracks_df = pd.DataFrame.from_dict(track_info_dict)
# tracks_df.to_csv('tracks_df.csv', sep=';', index=False)
tracks_df.head()


Unnamed: 0,artist,track,track_id,track_listens
0,Rob Zombie,Dragula,6Nm8h73ycDG2saCnZV8poF,328982835
1,Rob Zombie,Living Dead Girl,10V8XpuyMoEcSMfM79WDET,129699990
2,Rob Zombie,Superbeast,4eNnMvUrSu2TRpySjVC0Pv,91179723
3,Rob Zombie,Feel So Numb,6KnNDix4Owr4vNmsPhLmxD,54192259
4,Rob Zombie,The Satanic Rites of Blacula,1JeGX4cixemJfBJl6XGQES,10018196


In [16]:

artists_info_df = pd.DataFrame.from_dict(artist_info_dict)
# artists_info_df.to_csv('artist_info.csv', sep=';', index=False)
artists_info_df.head()


Unnamed: 0,artist,monthly_listeners,description
0,Rob Zombie,"5,775,464 monthly listeners",The longtime frontman for '90s industrial supe...
1,Marilyn Manson,"6,689,377 monthly listeners",Controversial rock frontman Marilyn Manson bec...
2,Fear Factory,"990,866 monthly listeners",One can’t overstate the size of the Fear Facto...
3,Rammstein,"10,788,915 monthly listeners",Rammstein are one of rock’s most individual an...
4,Dope,"1,728,116 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!


In [17]:

artists_df = artists_df.merge(artists_info_df, on='artist')
artists_df.head()


Unnamed: 0,artist,id,monthly_listeners,description
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5,"5,775,464 monthly listeners",The longtime frontman for '90s industrial supe...
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj,"6,689,377 monthly listeners",Controversial rock frontman Marilyn Manson bec...
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX,"990,866 monthly listeners",One can’t overstate the size of the Fear Facto...
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP,"10,788,915 monthly listeners",Rammstein are one of rock’s most individual an...
4,Dope,7fWgqc4HJi3pcHhK8hKg2p,"1,728,116 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!


## Get track features

In [46]:

def get_spotify_tracks(headers, content_uri):

    batch_size = 50
    batch_index = 0

    info_dict = {
        'track_id':[],
        'popularity':[],
        'artist_id':[]
    }
    feat_dict = {
        'track_id':[],
        'danceability':[],
        'energy':[],
        'key':[],
        'loudness':[],
        'mode':[],
        'speechiness':[],
        'acousticness':[],
        'instrumentalness':[],
        'liveness':[],
        'valence':[],
        'tempo':[],
        'time_signature':[]
    }

    print('Processing tracks. This could take a few minutes...')
    while True:

        if (batch_index%1000 == 0):
            print('\ttracks processed:', batch_index, 'of', len(content_uri))

        tracks_ids = content_uri[batch_index:batch_index+batch_size]

        info_url = 'https://api.spotify.com/v1/tracks?ids=' + '%2C'.join(tracks_ids)
        json_track_info = call_spotify_api(info_url, headers)

        features_url = 'https://api.spotify.com/v1/audio-features?ids='+ '%2C'.join(tracks_ids)
        json_track_feat = call_spotify_api(features_url, headers)

        if (json_track_info != None and json_track_feat != None):
            for track in json_track_info['tracks']:
                info_dict['track_id'] += [track['id']]
                info_dict['popularity'] += [track['popularity']]
                info_dict['artist_id'] += [track['artists'][0]['id']]

            for track in json_track_feat['audio_features']:
                feat_dict['track_id'] += [track['id']]
                feat_dict['danceability'] += [track['danceability']]
                feat_dict['energy'] += [track['energy']]
                feat_dict['key'] += [track['key']]
                feat_dict['loudness'] += [track['loudness']]
                feat_dict['mode'] += [track['mode']]
                feat_dict['speechiness'] += [track['speechiness']]
                feat_dict['acousticness'] += [track['acousticness']]
                feat_dict['instrumentalness'] += [track['instrumentalness']]
                feat_dict['liveness'] += [track['liveness']]
                feat_dict['valence'] += [track['valence']]
                feat_dict['tempo'] += [track['tempo']]
                feat_dict['time_signature'] += [track['time_signature']]

            batch_index += batch_size
        else:
            break

        time.sleep(1)

    print('DONE...')
    return info_dict, feat_dict


In [47]:

tracks_ids = pd.concat([tracks_df['track_id'], kaggle_tracks_df['track_id']])
tracks_ids = tracks_ids.drop_duplicates()
tracks_ids_list = tracks_ids.tolist()

access_token = get_spotify_access(CLIENT_ID, CLIENT_SECRET)
info_dict, feat_dict = get_spotify_content(get_spotify_tracks, access_token, tracks_ids_list)


Spotify ACCESS_TOKEN obtained. Token expires in 3600 seconds
Processing tracks. This could take a few minutes...
	tracks processed: 0 of 8167
	tracks processed: 1000 of 8167
	tracks processed: 2000 of 8167
	tracks processed: 3000 of 8167
	tracks processed: 4000 of 8167
	tracks processed: 5000 of 8167
	tracks processed: 6000 of 8167
	tracks processed: 7000 of 8167
	tracks processed: 8000 of 8167
DONE...


In [64]:

track_info_df = pd.DataFrame.from_dict(info_dict)
track_feat_df = pd.DataFrame.from_dict(feat_dict)
track_details_df = track_info_df.merge(track_feat_df, on='track_id')
track_details_df.head()


Unnamed: 0,track_id,popularity,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Nm8h73ycDG2saCnZV8poF,80,3HVdAiMNjYrQIKlOGxoGh5,0.591,0.963,0,-6.489,0,0.0531,8.7e-05,0.000107,0.0781,0.609,125.03,4
1,10V8XpuyMoEcSMfM79WDET,72,3HVdAiMNjYrQIKlOGxoGh5,0.518,0.948,6,-5.58,0,0.0683,0.00504,0.104,0.228,0.55,103.025,4
2,4eNnMvUrSu2TRpySjVC0Pv,68,3HVdAiMNjYrQIKlOGxoGh5,0.518,0.975,8,-4.425,1,0.0575,0.000136,0.827,0.42,0.32,154.059,4
3,6KnNDix4Owr4vNmsPhLmxD,66,3HVdAiMNjYrQIKlOGxoGh5,0.566,0.889,8,-5.22,1,0.0426,4.3e-05,0.00907,0.292,0.141,127.045,4
4,1JeGX4cixemJfBJl6XGQES,64,3HVdAiMNjYrQIKlOGxoGh5,0.48,0.993,0,-6.402,0,0.0862,0.000339,5e-05,0.3,0.589,154.991,4


In [74]:

spotify_detail_df = tracks_df.merge(track_details_df, on='track_id', how='inner')
kaggle_detail_df = kaggle_tracks_df.merge(track_details_df, on='track_id', how='inner')
kaggle_detail_df.drop(columns=['track_genre', 'id'], inplace=True)
kaggle_detail_df.rename(columns={'stream': 'track_listens'}, inplace=True)

print(len(track_info_df))
print(len(track_details_df))
print('--')
print(len(tracks_df))
print(len(spotify_detail_df))
print('--')
print(len(kaggle_tracks_df))
print(len(kaggle_detail_df))


8167
8167
--
6466
6466
--
2859
2859


## Get lyrics

In [None]:

from lyricsgenius import Genius


In [547]:

tracks_names_df = pd.concat([spotify_detail_df[['track', 'artist']], kaggle_detail_df[['track', 'artist']]])
tracks_names_df = tracks_names.drop_duplicates()


In [548]:

# https://docs.genius.com/#/getting-started-h1
# https://medium.com/analytics-vidhya/how-to-scrape-song-lyrics-a-gentle-python-tutorial-5b1d4ab351d2

with open('genius_key.yml', 'r') as file:
    genius_key = yaml.safe_load(file)

GENIUS_KEY = genius_key['key']
genius = Genius(GENIUS_KEY, verbose=False)


In [552]:

tracks_lyrics_dict = {
    'artist':[],
    'track':[],
    'lyrics':[]
}

print('Getting tracks\' lyrics. This could take a few hours...')
for i in range(len(tracks_names_df)):

    row = tracks_names_df.iloc[i]

    artist_name = row['artist']
    track_name = row['track']

    if (i%500==0):
        print(i, 'of', len(tracks_names_df), 'tracks processed...')

    # https://github.com/johnwmillr/LyricsGenius/issues/121
    while True:
        retries = 0
        try:
            song = genius.search_song(track_name, artist_name, get_full_info=False)
            break
        except:
            retries += 1
            pass

        if (retries >= 30):
            song=None
            break

    if (song != None):
        track_lyrics = song.lyrics
    else:
        track_lyrics = ''

    tracks_lyrics_dict['artist'] += [artist_name]
    tracks_lyrics_dict['track'] += [track_name]
    tracks_lyrics_dict['lyrics'] += [track_lyrics]
    time.sleep(1)

print('DONE...')


Getting tracks' lyrics. This could take a few hours...
0 of 7623 tracks processed...
100 of 7623 tracks processed...
200 of 7623 tracks processed...
300 of 7623 tracks processed...
400 of 7623 tracks processed...
500 of 7623 tracks processed...
600 of 7623 tracks processed...
700 of 7623 tracks processed...
800 of 7623 tracks processed...
900 of 7623 tracks processed...
1000 of 7623 tracks processed...
1100 of 7623 tracks processed...
1200 of 7623 tracks processed...
1300 of 7623 tracks processed...
1400 of 7623 tracks processed...
1500 of 7623 tracks processed...
1600 of 7623 tracks processed...
1700 of 7623 tracks processed...
1800 of 7623 tracks processed...
1900 of 7623 tracks processed...
2000 of 7623 tracks processed...
2100 of 7623 tracks processed...
2200 of 7623 tracks processed...
2300 of 7623 tracks processed...
2400 of 7623 tracks processed...
2500 of 7623 tracks processed...
2600 of 7623 tracks processed...
2700 of 7623 tracks processed...
2800 of 7623 tracks processed...


In [576]:

tracks_lyrics_df = pd.DataFrame.from_dict(tracks_lyrics_dict)
tracks_lyrics_df.head()
# tracks_lyrics_df.to_csv('tracks_lyrics_df.csv', sep=';', index=False)


Unnamed: 0,artist,track,lyrics
0,Rob Zombie,Dragula,44 ContributorsDragula Lyrics[Sample]\nSuperst...
1,Rob Zombie,Living Dead Girl,29 ContributorsLiving Dead Girl Lyrics[Intro]\...
2,Rob Zombie,Superbeast,16 ContributorsSuperbeast Lyrics(Verse 1)\nShr...
3,Rob Zombie,Feel So Numb,14 ContributorsFeel So Numb Lyrics[Intro]\nPro...
4,Rob Zombie,The Satanic Rites of Blacula,4 ContributorsThe Satanic Rites of Blacula Lyr...


In [592]:

spotify_df = spotify_detail_df.merge(tracks_lyrics_df, on=['artist', 'track'], how='inner')
kaggle_df = kaggle_detail_df.merge(tracks_lyrics_df, on=['artist', 'track'], how='inner')


## Temp. write dataframes

In [599]:

import os

try:
    os.mkdir(os.getcwd()+'/tmp_data')
except:
    pass

artists_df.to_csv('tmp_data/artists.csv', sep=';', index=False)
spotify_df.to_csv('tmp_data/spotify.csv', sep=';', index=False)
kaggle_df.to_csv('tmp_data/kaggle.csv', sep=';', index=False)
tracks_lyrics_df.to_csv('tmp_data/lyrics.csv', sep=';', index=False)


## Get band genres

In [146]:

def get_spotify_genres(headers, content_uri):

    batch_size = 50
    batch_index = 0

    artist_dict = {
        'id':[],
        'followers':[],
        'genres':[]
    }

    print('Fetching artists\' genres. This could take a few minutes...')
    while True:
        artists_ids = content_uri[batch_index:batch_index+batch_size]

        info_url = 'https://api.spotify.com/v1/artists?ids=' + '%2C'.join(artists_ids)
        json_artist_info = call_spotify_api(info_url, headers)

        if (json_artist_info != None):
            for artist in json_artist_info['artists']:
                artist_dict['id'] += [artist['id']]
                artist_dict['followers'] += [artist['followers']['total']]
                artist_dict['genres'] += [artist['genres']]
            batch_index += batch_size
        else:
            break

        time.sleep(1)

    print('DONE...')
    return artist_dict


In [139]:

artists_ids_df = artists_df['id']
artists_ids_df.drop_duplicates(inplace=True)
artists_ids_list = artists_ids_df.to_list()


In [147]:

# access_token = get_spotify_access(CLIENT_ID, CLIENT_SECRET)
artists_info_dict = get_spotify_content(get_spotify_genres, access_token, artists_ids_list)


Fetching artists' genres. This could take a few minutes...
DONE...


In [155]:

artists_ids_df = pd.DataFrame.from_dict(artists_info_dict)
artists_ids_df.head()


Unnamed: 0,id,followers,genres
0,3HVdAiMNjYrQIKlOGxoGh5,2640333,"[alternative metal, hard rock, industrial meta..."
1,2VYQTNDsvvKN9wmU5W7xpj,4071591,"[alternative metal, hard rock, industrial, ind..."
2,74Hj7BmnUXyx2udrIEIKwX,631750,"[alternative metal, groove metal, industrial m..."
3,6wWVKhxIU2cEi0K81v7HvP,8600807,"[german metal, industrial, industrial metal, i..."
4,7fWgqc4HJi3pcHhK8hKg2p,750590,"[alternative metal, industrial metal, nu metal]"


In [156]:

artists_df = artists_df.merge(artists_ids_df, on='id')
artists_df.head()


Unnamed: 0,artist,id,monthly_listeners,description,followers,genres
0,Rob Zombie,3HVdAiMNjYrQIKlOGxoGh5,"5,775,464 monthly listeners",The longtime frontman for '90s industrial supe...,2640333,"[alternative metal, hard rock, industrial meta..."
1,Marilyn Manson,2VYQTNDsvvKN9wmU5W7xpj,"6,689,377 monthly listeners",Controversial rock frontman Marilyn Manson bec...,4071591,"[alternative metal, hard rock, industrial, ind..."
2,Fear Factory,74Hj7BmnUXyx2udrIEIKwX,"990,866 monthly listeners",One can’t overstate the size of the Fear Facto...,631750,"[alternative metal, groove metal, industrial m..."
3,Rammstein,6wWVKhxIU2cEi0K81v7HvP,"10,788,915 monthly listeners",Rammstein are one of rock’s most individual an...,8600807,"[german metal, industrial, industrial metal, i..."
4,Dope,7fWgqc4HJi3pcHhK8hKg2p,"1,728,116 monthly listeners",New album 'Blood Money Part Zer0' out 2.24.23!,750590,"[alternative metal, industrial metal, nu metal]"


## Get artists' wikipedia information

In [515]:

# https://bobbyhadz.com/blog/python-print-string-with-special-characters#:~:text=Use%20the%20repr()%20function,representation%20of%20the%20provided%20object.
# https://medium.com/geekculture/web-scraping-tables-in-python-using-beautiful-soup-8bbc31c5803e

def get_wiki_info(info_table, is_band):

    info_dict={
        'years_active': 'not_on_wikipedia',
        'origin': 'not_on_wikipedia'
    }

    info_found=False

    for row in info_table.tbody.find_all('tr'):
        if (row.th != None):
            row_name = row.th.text.replace('\xa0', ' ')

            # Get years active
            if (row_name in ['Years active', 'Years']):
                info_found = True
                row_val = row.td.text
                info_dict['years_active'] = row_val
                    

            # Get origin
            if (row_name in ['Origin', 'Born']):
                info_found = True
                row_val = row.td.text
                info_dict['origin'] = row_val

    if (info_found==False):
        return None
    else:
        return info_dict


def get_artist_wiki(soup):

    band_infobox = 'infobox vcard plainlist'
    artist_infobox = 'infobox biography vcard'

    # Get band info
    info_table = soup.find('table', {'class': band_infobox})
    if (info_table != None):
        info_dict = get_wiki_info(info_table, is_band=True)
        return info_dict

    # If not a band get artist info
    info_table = soup.find('table', {'class': artist_infobox})
    if (info_table != None):
        info_dict = get_wiki_info(info_table, is_band=False)
        return info_dict

    # Otherwise return nothing
    return None


def extract_wikipedia_data(artists):

    wiki_pages_found = 0
    artists_wiki_dict = {
        'artist':[],
        'years_active':[],
        'origin':[]
    }

    print('Fetching artists\' wikipedia information. This could take a few minutes...')
    for i in range(len(artists)):
        artist = artists[i]

        if (i%10==0):
            print('Finished processing', i, 'of', len(artists), 'artists\' wiki pages...')
            print('\t-', wiki_pages_found, 'of', i, 'wikipedia pages found')

        artist_search = artist.replace(' ', '_')
        print('\t-', artist_search)

        url = 'https://en.wikipedia.org/wiki/'+artist_search
        wiki_info_dict = process_page(url, get_artist_wiki)

        if (wiki_info_dict==None):
            url = 'https://en.wikipedia.org/wiki/'+artist_search+'_(band)'
            wiki_info_dict = process_page(url, get_artist_wiki)

            if (wiki_info_dict==None):

                for rep_str in ['Of', 'The']:
                    artist_search = artist_search.replace(rep_str, rep_str.lower())
                url = 'https://en.wikipedia.org/wiki/'+artist_search
                wiki_info_dict = process_page(url, get_artist_wiki)

                if (wiki_info_dict==None):
                    url = 'https://en.wikipedia.org/wiki/'+urllib.parse.quote(artist_search)
                    wiki_info_dict = process_page(url, get_artist_wiki)

                    if (wiki_info_dict==None):
                        print('\t\t[ERROR] still found no results...')
                        wiki_info_dict=None

        # -------------------------
#         replace_strings = {'':'', ' ':'_', 'Of':'of', 'The': 'the'}
#         addition_strings = ['', '_(band)']
#         locals_strings = ['en', 'de', 'ru']
#         results_found = False

#         while (results_found==False):
            
#             print(artist)
#             artist_replace = artist

#             # Replace parts of artist string
#             for rep_str in replace_strings:
#                 artist_replace = artist_replace.replace(rep_str, replace_strings[rep_str])

#                 # Add substring to artist string
#                 for add_str in addition_strings:
#                     artist_add = artist_replace+add_str

#                     # Try different locals
#                     for loc_str in locals_strings:

#                         # Try without urllib string parsing
#                         url = 'https://'+loc_str+'.wikipedia.org/wiki/'+artist_add
# #                         print('\t-', url)
#                         wiki_info_dict = process_page(url, get_artist_wiki)
#                         if (wiki_info_dict!=None):
#                             results_found = True
#                             wiki_pages_found += 1
#                             break

#                         # Try with urllib string parsing
#                         url = 'https://'+loc_str+'.wikipedia.org/wiki/'+urllib.parse.quote(artist_add)
# #                         print('\t-', url)
#                         wiki_info_dict = process_page(url, get_artist_wiki)
#                         if (wiki_info_dict!=None):
#                             results_found = True
#                             wiki_pages_found += 1
#                             break

#                     if (wiki_info_dict!=None):
#                         break
#                 if (wiki_info_dict!=None):
#                     break

#         if (results_found==None):
#             print('\t\t[ERROR] could not find results...')
#             wiki_info_dict=None
        # -------------------------

        artists_wiki_dict['artist'] += [artist]
        if (wiki_info_dict!=None):
            artists_wiki_dict['years_active'] += [wiki_info_dict['years_active']]
            artists_wiki_dict['origin'] += [wiki_info_dict['origin']]
            wiki_pages_found += 1
        else:
            artists_wiki_dict['years_active'] += ['']
            artists_wiki_dict['origin'] += ['']

        time.sleep(0.5)

    print('DONE...')
    return artists_wiki_dict


In [516]:

artists_list = artists_df['artist'].tolist()
artists_wiki_dict = extract_wikipedia_data(artists_list)


Fetching artists' wikipedia information. This could take a few minutes...
Finished processing 0 of 1302 artists' wiki pages...
0 of 0 wikipedia pages found
	- Rob_Zombie
	- Marilyn_Manson
	- Fear_Factory
	- Rammstein
	- Dope
	- Filter
	- Static-X
	- Orgy
	- Skillet
	- Nine_Inch_Nails
Finished processing 10 of 1302 artists' wiki pages...
10 of 10 wikipedia pages found
	- Motionless_In_White
	- Coal_Chamber
	- Lindemann
	- Powerman_5000
	- Ministry
	- OOMPH!
	- Celldweller
	- 3TEETH
	- Gothminister
	- Stabbing_Westward
Finished processing 20 of 1302 artists' wiki pages...
20 of 20 wikipedia pages found
	- Erdling
	- SKYND
	- KMFDM
	- Megaherz
	- Skinny_Puppy
	- Killing_Joke
	- Emigrate
	- Turmion_Kätilöt
	- Combichrist
	- Gravity_Kills
Finished processing 30 of 1302 artists' wiki pages...
30 of 30 wikipedia pages found
	- HEALTH
	- Ost+Front
		[ERROR] still found no results...
	- Machines_Of_Loving_Grace
	- Blue_Stahli
	- My_Life_With_The_Thrill_Kill_Kult
	- NA_CHUI
		[ERROR] still found

	- Heathen_Foray
		[ERROR] still found no results...
	- Uada
		[ERROR] still found no results...
	- Fleshgod_Apocalypse
	- Finntroll
	- In_Mourning
	- HAVAMAL
		[ERROR] still found no results...
	- Dark_Oath
		[ERROR] still found no results...
	- Kreator
	- Whispered
		[ERROR] still found no results...
Finished processing 280 of 1302 artists' wiki pages...
228 of 280 wikipedia pages found
	- Omnium_Gatherum
	- Vorna
		[ERROR] still found no results...
	- Behemoth
	- Folkheim
		[ERROR] still found no results...
	- Misery_Index
	- Old_Man's_Child
	- Verikalpa
		[ERROR] still found no results...
	- The_Halo_Effect
	- Before_The_Dawn
		[ERROR] still found no results...
	- Deals_Death
		[ERROR] still found no results...
Finished processing 290 of 1302 artists' wiki pages...
233 of 290 wikipedia pages found
	- The_Black_Dahlia_Murder
	- Metalocalypse:_Dethklok
	- Kataklysm
	- Ingested
	- Cattle_Decapitation
	- Cerebral_Bore
	- The_Agony_Scene
	- Acrania
	- Trepalium
		[ERROR] still found no 

KeyboardInterrupt: 

In [None]:

artists_wiki_df = pd.DataFrame.from_dict(artists_wiki_dict)
artists_wiki_df.head()


## Get fans-liked data

In [246]:

def get_artist_name(soup_text, index_start):

    str_index = 0
    closed = True

    while True:

        c = soup_text[index_start+str_index]

        if (c=='<' and closed==True):
            closed=False
        elif (c=='>' and closed==False):
            closed=True
        elif (c!='<' and closed==True):

            artist_name = ''
            while (c != '<'):
                artist_name += c
                str_index += 1
                c = soup_text[index_start+str_index]
            break

        str_index += 1

    index_end = index_start + str_index

    return artist_name, index_end


def get_artist_id(soup_text):

    href_index = soup_text.find('href="/artist/')+len('href="/artist/')
    index = 0

    while True:
        c = soup_text[href_index+index]
        if (c=='"'):
            break
        index += 1

    artist_id = soup_text[href_index:href_index+index]
    return artist_id


def get_fans_liked_artists(soup):

    monthly_listeners = soup.find('div', {'data-encore-id': 'type'}).text

    soup_text = str(soup)
    index_start = soup_text.find('Fans also like')+len('Fans also like')

    fans_liked_dict = {}
    artists_found = 0

    while (artists_found<5):
        artist_name, index_end = get_artist_name(soup_text, index_start)
        artist_id = get_artist_id(soup_text[index_start:index_end])

        fans_liked_dict[artist_name] = artist_id

        artists_found += 1
        index_start = index_end

    return monthly_listeners, fans_liked_dict


In [298]:

def get_fans_recursive(artists_dict, depth):

    if (depth<=max_depth):

        counter = 0

        for artist_name in artists_dict:

            if (depth==0):
                counter+=1
                num_artists = len(artists_dict)
                if (num_artists%counter==50):
                    print(counter, 'of', num_artists, 'processed...')

            if (artist_name not in artists_found_dict):
                artist_id = artists_dict[artist_name]

#                 print(depth*'\t'+'scraping', artist_name, 'fans')

                artist_url = 'https://open.spotify.com/artist/'+artist_id
                monthly_listeners, fans_liked_dict = process_page(artist_url, get_fans_liked_artists)

                artists_found_dict[artist_name] = {
                    'monthly_listeners': monthly_listeners,
                    'fans_liked': fans_liked_dict
                }

                get_fans_recursive(fans_liked_dict, depth+1)
#             else:
#                 print(depth*'\t'+artist_name, 'already processed')

            time.sleep(0.5)

def scrape_fans_liked(artist_id_df):

    # https://www.tutorialspoint.com/how-to-check-the-execution-time-of-python-script#:~:text=In%20python%2C%20we%20have%20the,of%20the%20given%20code%20block.

    artists_dict={}
    for index, row in artist_id_df.iterrows():
        artist_name = row['artist']
        artist_id = row['id']
        artists_dict[artist_name]=artist_id

    print('Fetching artists\' "fans also liked" artists. This could take a few hours...')
    start_time = time.time()

    get_fans_recursive(artists_dict, 0)

    end_time = time.time()
    t = end_time - start_time
    h = int(t/3600)
    m = int((t-h*3600)/60)
    s = int((t-h*3600-m*60))
    print('DONE... execution time:', h, 'hours', m, 'minutes and', s, 'seconds')


artists_found_dict = {}
max_depth = 5

scrape_fans_liked(artists_df[['artist', 'id']])


Fetching artists' "fans also liked" artists. This could take a few hours...
DONE... execution time: 0 hours 0 minutes and 55 seconds


In [289]:
artists_found_dict

{'Rob Zombie': {'monthly_listeners': '5,754,233 monthly listeners',
  'fans_liked': {'Static-X': '7JDSHlDdVTo7aZKdQZ53Vf',
   'Powerman 5000': '5imUS9dQyCbAjUEJJ9QyWC',
   'White Zombie': '0CF71zaDOJWCynIkW9bSK8',
   'Dope': '7fWgqc4HJi3pcHhK8hKg2p',
   'Coal Chamber': '69k6uTSZMPLpSnhmLCiKxQ'}},
 'Static-X': {'monthly_listeners': '1,719,643 monthly listeners',
  'fans_liked': {'Mudvayne': '2Pfv2w8a20xzC7Dr7QXRqM',
   'Mushroomhead': '18absyD7lQaXUDBXnyzU8M',
   'Dope': '7fWgqc4HJi3pcHhK8hKg2p',
   'Powerman 5000': '5imUS9dQyCbAjUEJJ9QyWC',
   'Coal Chamber': '69k6uTSZMPLpSnhmLCiKxQ'}},
 'Mudvayne': {'monthly_listeners': '2,458,713 monthly listeners',
  'fans_liked': {'Static-X': '7JDSHlDdVTo7aZKdQZ53Vf',
   'American Head Charge': '6Ig4qybKXgMN2FLSM7GKau',
   'Mushroomhead': '18absyD7lQaXUDBXnyzU8M',
   'Spineshank': '6fmbbxNvgHkglIakp1Wrv4',
   'Ill Niño': '1xJ6l1VXgGuyZ0uhu27caF'}},
 'American Head Charge': {'monthly_listeners': '97,970 monthly listeners',
  'fans_liked': {'Spinesha

In [None]:

# Todo:
# Scrape Lyrics
# Do full run until just before fans-liked part
# Save artist and track dataframes (kaggle and spotify)


In [None]:
# https://stackoverflow.com/questions/61064454/beautifulsoup-is-unable-to-extract-all-html