In [1]:
import os
import re
import time
from difflib import SequenceMatcher

import pandas as pd
import requests
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm

In [2]:
# Spotify API credentials are read from the environment so that secrets are
# never stored in (or committed with) the notebook. Export them before
# starting the kernel:
#   SPOTIPY_CLIENT_ID=<client id>  SPOTIPY_CLIENT_SECRET=<client secret>
# NOTE(review): credentials that were previously hard-coded in this cell
# should be considered leaked and rotated in the Spotify developer dashboard.
APP_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
APP_CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")
if not (APP_CLIENT_ID and APP_CLIENT_SECRET):
    raise RuntimeError(
        "Spotify credentials are missing: set the SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET environment variables before running this notebook."
    )

# folder receiving the intermediate crawl results (trailing slash is required,
# file names are appended to it by plain string concatenation)
CRAWL_FOLDER = 'crawl/spotify/'

# shared Spotify Web API client used by every cell below
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=APP_CLIENT_ID, client_secret=APP_CLIENT_SECRET))

In [3]:
# load the artist list; the first CSV column becomes the index and the
# remaining column holds the artist name
with open('data/data_artists.csv', mode='r', encoding='utf-8') as csv_handle:
    artists_df = pd.read_csv(filepath_or_buffer=csv_handle, index_col=0)

artists_df

Unnamed: 0,artist_name
/artists/3684861-chk-chk-chk,!!! (Chk Chk Chk)
/artists/694091-kadebostan,!!!Kadebostan!!!
/artists/5929284-brothers-in-bamako,"""Brothers In Bamako"""
/artists/8704378-floyd-division-the-austrian-pink-floyd-tribute-band,"""Floyd Division"" The Austrian Pink Floyd Tribu..."
/artists/7172094-jazzetcetera-stewy-von-wattenwyl-group-feat-lisette-spinnler,"""Jazz...etcetera"" Stewy von Wattenwyl Group, ..."
...,...
/artists/4301-and-you-will-know-us-by-the-trail-of-dead,…And You Will Know Us by the Trail of Dead
/artists/1099500-euuroshima,€urOshima
/artists/9573874-belize-it,≈ Belize ≈
/artists/8889739-luo-chai-cao-yuan-wwww-prairie-wwww,落差草原 WWWW / Prairie WWWW


In [4]:
# get the similarity between two strings

def similar(a, b):
    """Return the difflib similarity ratio of `a` and `b`.

    The ratio is a float between 0.0 (nothing in common) and 1.0 (identical
    sequences). The comparison is case-sensitive and uses no junk filter.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    ratio = matcher.ratio()
    return ratio

In [5]:
def save_temp_data(data_df):
    """Write an intermediate snapshot of the matching results to disk.

    The frame is written (index included) as UTF-8 CSV to
    CRAWL_FOLDER + 'temp_data_artists_matched.csv', overwriting any previous
    snapshot, and the file name is printed as a progress marker.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame to persist.
    """
    csv_filename = CRAWL_FOLDER + 'temp_data_artists_matched.csv'

    # the crawl folder does not exist on a fresh checkout; create it so the
    # first checkpoint does not fail after minutes of crawling
    target_dir = os.path.dirname(csv_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    data_df.to_csv(csv_filename, encoding='utf-8')
    print(csv_filename)

In [None]:
# Match every artist name against the Spotify catalogue and keep the best
# candidate (highest name similarity, ties broken by popularity).

START_ROW = 24400   # positional offset of the first artist to process (use 0 for a full crawl)
SAVE_EVERY = 100    # write a checkpoint to disk after this many processed artists

counter = 0
artists_matched_dict = dict()  # never filled in this cell; kept in case other cells reference it
artists_matched_df = artists_df.copy()
artists_to_match = artists_df.iloc[START_ROW:]

try:
    for counter, (idx, row) in enumerate(tqdm(artists_to_match.iterrows(), total=len(artists_to_match)), start=1):
        artist_name = row['artist_name']

        # read_csv turns names such as "N/A" or "NULL" into NaN (a float);
        # those cannot be lower-cased or searched, so leave the row unmatched
        if not isinstance(artist_name, str):
            continue

        query_string = artist_name.lower()
        results = sp.search(q=query_string, type='artist', limit=50, market='NO')

        artist_query_list = [
            {
                'spotify_name': result['name'],
                'spotify_id': result['id'],
                'spotify_similarity': round(similar(result['name'], artist_name), 2),
                'spotify_popularity': result['popularity'],
            }
            for result in results['artists']['items']
        ]

        if artist_query_list:
            # max() returns the first maximal candidate, i.e. the same element
            # a stable descending sort would place first
            artist_match = max(artist_query_list, key=lambda k: (k['spotify_similarity'], k['spotify_popularity']))
            for key, value in artist_match.items():
                artists_matched_df.loc[idx, key] = value

        # stay well below the API rate limit
        time.sleep(1)

        if counter % SAVE_EVERY == 0:
            save_temp_data(artists_matched_df)
finally:
    # always persist what has been collected: covers the last partial batch
    # as well as an interruption or API error in the middle of the crawl
    save_temp_data(artists_matched_df)

artists_matched_df

100it [01:49,  1.11s/it]

crawl/spotify/temp_data_artists_matched.csv


166it [03:00,  1.08s/it]

In [62]:
# reload the last checkpoint written by save_temp_data (first column is the index)
with open(CRAWL_FOLDER + 'temp_data_artists_matched.csv', mode='r', encoding='utf-8') as checkpoint_handle:
    artists_matched_df = pd.read_csv(filepath_or_buffer=checkpoint_handle, index_col=0)

artists_matched_df.iloc[17300:]

Unnamed: 0,artist_name,spotify_name,spotify_id,spotify_similarity,spotify_popularity
/artists/5673729-mara-simpson,Mara Simpson,Mara Simpson,1qn02YVZeKMegZimHpELHH,1.0,5.0
/artists/171669-mara-tremblay,Mara Tremblay,Mara Tremblay,1tt3YXVP4AltaStku4rpf3,1.0,25.0
/artists/190281-maral-salmassi,Maral Salmassi,Maral Salmassi,6HIYk8vTuSUfGb5nL69pA8,1.0,3.0
/artists/4911773-marama,Marama,Marama,4GepMkTgrIZECoCC55vqjW,1.0,59.0
/artists/9899404-marama-tribe,Marama Tribe,,,,
...,...,...,...,...,...
/artists/4301-and-you-will-know-us-by-the-trail-of-dead,…And You Will Know Us by the Trail of Dead,,,,
/artists/1099500-euuroshima,€urOshima,,,,
/artists/9573874-belize-it,≈ Belize ≈,,,,
/artists/8889739-luo-chai-cao-yuan-wwww-prairie-wwww,落差草原 WWWW / Prairie WWWW,,,,


In [64]:
# get monthly number of listeners of an artist by scraping Spotify website (data not available through API)

# "<number><K|M> monthly listeners", e.g. "775 monthly listeners" or "16.2K monthly listeners"
_MONTHLY_LISTENERS_RE = re.compile(
    r'([0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?)\s*([KM]?)\s+monthly listeners', re.IGNORECASE)
# fallback when the phrase above is absent: first number with an optional K/M suffix
_FIRST_NUMBER_RE = re.compile(r'([0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?)([KM]?)')
_SUFFIX_MULTIPLIERS = {'': 1, 'K': 1000, 'M': 1000000}


def _parse_listener_count(description):
    """Extract a listener count from a description text; return None if it holds no number."""
    match = _MONTHLY_LISTENERS_RE.search(description) or _FIRST_NUMBER_RE.search(description)
    if match is None:
        return None
    number, suffix = match.groups()
    # commas are treated as thousands separators -- TODO confirm against a live page
    value = float(number.replace(',', '')) * _SUFFIX_MULTIPLIERS[suffix.upper()]
    # round instead of truncating: float products such as 32.3 * 1000 can land
    # just below the intended integer
    return round(value)


def get_spotify_listeners(artist_url, timeout=10):
    """Return the number of monthly listeners shown on a Spotify artist page.

    The page is downloaded and the count is read from the `content` of its
    `og:description` meta tag. The number directly preceding
    "monthly listeners" is preferred, so digits in the artist name are not
    mistaken for the count; if that phrase is absent the first number in the
    description is used. "K" and "M" suffixes are expanded to thousands and
    millions.

    Parameters
    ----------
    artist_url : str
        URL of the artist page.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10).

    Returns
    -------
    int
        Monthly listeners.

    Raises
    ------
    ValueError
        If the page has no usable `og:description` number.
    """
    response = requests.get(artist_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    meta_tag = soup.find('meta', {'property': 'og:description'})
    description = (meta_tag.get('content') or '') if meta_tag is not None else ''

    listeners_nbr = _parse_listener_count(description)
    if listeners_nbr is None:
        raise ValueError('no listener count found at {} (HTTP status {})'.format(
            artist_url, getattr(response, 'status_code', 'unknown')))

    return listeners_nbr

In [79]:
# get spotify related artists for an artist

def get_spotify_related_artists(spotify_id):
    """Return up to ten artists related to `spotify_id`, most popular first.

    The related artists are fetched through the module-level Spotify client
    `sp`. Each returned entry is a dict with the keys 'spotify_id',
    'spotify_name' and 'spotify_popularity'. Artists with equal popularity
    keep the order in which the API returned them.
    """
    def _summarise(entry):
        # keep only the fields that are stored in the artists table
        name, artist_id, popularity = entry['name'], entry['id'], entry['popularity']
        return {
            'spotify_id': artist_id,
            'spotify_name': name,
            'spotify_popularity': popularity,
        }

    response = sp.artist_related_artists(spotify_id)
    candidates = [_summarise(entry) for entry in response['artists']]

    # most popular first (stable, so ties keep the API order), then the top 10
    candidates.sort(key=lambda entry: entry['spotify_popularity'], reverse=True)
    return candidates[:10]

In [80]:
# get spotify data for each artist: followers, monthly listeners, genres and related artists

for idx, row in tqdm(artists_matched_df.iloc[17300:17310].iterrows()):
    spotify_id = row['spotify_id']

    # artists without a Spotify match have nothing to look up
    if pd.isna(spotify_id):
        continue

    artist_info = sp.artist(spotify_id)

    profile_url = artist_info['external_urls']['spotify']
    follower_total = artist_info['followers']['total']
    genres = artist_info['genres']
    monthly_listeners = get_spotify_listeners(profile_url)

    artists_matched_df.loc[idx, 'spotify_followers'] = follower_total
    artists_matched_df.loc[idx, 'spotify_listeners'] = monthly_listeners

    # one column per genre: spotify_genre_1, spotify_genre_2, ...
    for position, genre in enumerate(genres, start=1):
        artists_matched_df.loc[idx, 'spotify_genre_' + str(position)] = genre

    # one id/name column pair per related artist, numbered from 1
    for position, related_artist in enumerate(get_spotify_related_artists(spotify_id), start=1):
        column_suffix = str(position)
        artists_matched_df.loc[idx, 'spotify_related_id_' + column_suffix] = related_artist['spotify_id']
        artists_matched_df.loc[idx, 'spotify_related_name_' + column_suffix] = related_artist['spotify_name']

    # pause between artists to go easy on the API and the website
    time.sleep(1)

artists_matched_df.iloc[17300:]

10it [00:14,  1.45s/it]


Unnamed: 0,artist_name,spotify_name,spotify_id,spotify_similarity,spotify_popularity,spotify_followers,spotify_listeners,spotify_genre_1,spotify_genre_2,spotify_genre_3,...,spotify_related_id_6,spotify_related_name_6,spotify_related_id_7,spotify_related_name_7,spotify_related_id_8,spotify_related_name_8,spotify_related_id_9,spotify_related_name_9,spotify_related_id_10,spotify_related_name_10
/artists/5673729-mara-simpson,Mara Simpson,Mara Simpson,1qn02YVZeKMegZimHpELHH,1.0,5.0,1646.0,775.0,,,,...,2KJ2NKrQxiHYA7GlYl6cjs,Anadel,2wHuSfI5SnRFuRsaCKJRL3,Venus and the Moon,1U1l0zn1Ks7bwNrYtx1DGI,Benedict Benjamin,6cTbm24LHutNXmqf6f4JJZ,Shoecraft,1c1Eft7z1a5uQdJ2iwSct7,Victoria Lord
/artists/171669-mara-tremblay,Mara Tremblay,Mara Tremblay,1tt3YXVP4AltaStku4rpf3,1.0,25.0,7922.0,16200.0,auteur-compositeur-interprete quebecois,chanson quebecois,indie quebecois,...,2IAG8mcBKZoIc5VbcPgNNc,Marie-Pierre Arthur,2fshYb8TCzSwt9m8NC4Ttu,Damien Robitaille,6CVur2iw4ExLNmsSGiATZN,Michel Rivard,0Fk3xExGqwbRwLyWUjUl8V,Yann Perreau,7HK83pzwHsZqiGchCqtMuD,Dumas
/artists/190281-maral-salmassi,Maral Salmassi,Maral Salmassi,6HIYk8vTuSUfGb5nL69pA8,1.0,3.0,326.0,363.0,,,,...,2bYg3DwtzQ5LWztZHwbWb3,Mixhell,2IA2rAfPyDDhqqcu6UaSOM,Don Rimini,0rpDUfXxUvzGqZXcKaN89T,Kissy Sell Out,4Ya8HodTVxFH3j4wsAVhRb,Steed Lord,3p9uPDkIZ0mYtY84NDwtRo,We Are Terrorists
/artists/4911773-marama,Marama,Marama,4GepMkTgrIZECoCC55vqjW,1.0,59.0,919637.0,941500.0,cumbia pop,,,...,76iFMHDqONZusQlTu2ckKQ,Los Nota Lokos,4PdggFNYwGfjRfkdG5OfES,#TocoParaVos,6nEgkeR03q2qtKZmrVq100,El Villano,0J65S0gB0D1gDEd0hK196k,Dame 5,4YBAOrBF9vBB9inOLtpRzp,Mano Arriba
/artists/9899404-marama-tribe,Marama Tribe,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/artists/4301-and-you-will-know-us-by-the-trail-of-dead,…And You Will Know Us by the Trail of Dead,,,,,,,,,,...,,,,,,,,,,
/artists/1099500-euuroshima,€urOshima,,,,,,,,,,...,,,,,,,,,,
/artists/9573874-belize-it,≈ Belize ≈,,,,,,,,,,...,,,,,,,,,,
/artists/8889739-luo-chai-cao-yuan-wwww-prairie-wwww,落差草原 WWWW / Prairie WWWW,,,,,,,,,,...,,,,,,,,,,
