# Extract genres of the artists in the Spotify Chart dataset

In [1]:
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import sys
from ast import literal_eval
import signal

from datetime import datetime

import pickle

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import os
from time import sleep  

In [2]:
# Spotify API credentials, one entry per registered application. The two lists
# are parallel: SPOTIPY_CLIENT_SECRET[i] belongs to SPOTIPY_CLIENT_ID[i].
# Fill them in locally and never commit real values.
SPOTIPY_CLIENT_ID=[]
SPOTIPY_CLIENT_SECRET=[]

n_spotify_clients=len(SPOTIPY_CLIENT_ID)
if n_spotify_clients == 0 or n_spotify_clients != len(SPOTIPY_CLIENT_SECRET):
    # Fail with an actionable message instead of an IndexError a few lines below.
    raise ValueError(
        "Fill SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET with the same, non-zero "
        "number of credentials before running this notebook."
    )

# Start from the fifth credential pair when it exists; wrap around otherwise so
# the index is always valid for the configured number of clients.
client_index = 4 % n_spotify_clients
client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID[client_index],
                                                      client_secret=SPOTIPY_CLIENT_SECRET[client_index])
# requests_timeout is the per-request HTTP timeout, in seconds.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout= 5)

In [3]:
# List the files sitting directly inside data/generated/spotify; only the first
# level yielded by os.walk is used, so sub-folders are not descended into.
spotify_dir = os.path.join('data', 'generated', 'spotify')
filenames = list(next(os.walk(spotify_dir), (None, None, []))[2])

# Parse every file; the "artists" column is stored as a Python literal and is
# turned back into a list of (artist_id, artist_name) tuples by literal_eval.
df_lst = [
    pd.read_csv(os.path.join(spotify_dir, name), index_col=0, converters={"artists": literal_eval})
    for name in filenames
]

# Stack all files and renumber the rows from 0.
tracks_df = pd.concat(df_lst, axis=0).reset_index(drop=True)
tracks_df

Unnamed: 0,track_id,artists,release_date,release_date_precision
0,6VaErRcSQJ9QHZRfrVignI,"[(3YC5DVJDjyazvB8hxJSybr, KUUMAA)]",2017-05-19,day
1,3c21L9S8LuEg8w0Jhf9H5e,"[(66wqVYADY665LeMxs8nlk3, Cor), (5erZiBCKPxe9F...",2020-09-18,day
2,4wrrmptapkv6hZmxh5TQGu,"[(3meJIgRw7YleJrmbpbJK6S, Die drei ???)]",2020-01-31,day
3,1lkE9wHvir22dGjU48gZbs,"[(3u2Sz4K3PFfalhDU0vSHT3, Eva Weel Skram)]",2021-01-15,day
4,2Jn6Aiv9uOkNcPa1kHNCpc,"[(6eBpe3hqpzEJPqh8bREqI6, Albe)]",2021-10-11,day
...,...,...,...,...
128045,0iM2vXFebAeChethPC6w5b,"[(24HI9hevLjIQtj7xp2CeHs, Peer Tasi), (0iJF1SR...",2018-08-20,day
128046,7IE989Ftunp6k2LdMsFC5r,"[(5JZ7CnR6gTvEMKX4g70Amv, Lauv)]",2020-01-16,day
128047,1Cx58u8RNrWl9YoHl0cv2D,"[(215ciYy7TIYnGRim0I1nJ3, JUNIOR CALLY)]",2019-09-06,day
128048,3pFPh0MelVO3CKThgbUo8s,"[(5dXlc7MnpaTeUIsHLVe3n4, Coez)]",2017-05-05,day


## Get songs' genres

### Recover the songs whose artists have already been processed.

In [4]:
# Track ids handled by a previous run are persisted as a pickled list; start
# from an empty list when no such file exists yet.
# NOTE(review): pickle.load executes arbitrary code - only load files you wrote yourself.
process_songs_lst_path = os.path.join('data', 'processed_songs_V2.pkl')
processed_songs = []
if os.path.exists(process_songs_lst_path):
    with open(process_songs_lst_path, 'rb') as pickle_file:
        processed_songs = pickle.load(pickle_file)
len(processed_songs)

0

In [5]:
# Keep only the tracks whose id is not in the list recovered from the previous run.
already_processed_mask = tracks_df['track_id'].isin(processed_songs)
tracks_to_process = tracks_df.loc[~already_processed_mask]
tracks_to_process

Unnamed: 0,track_id,artists,release_date,release_date_precision
0,6VaErRcSQJ9QHZRfrVignI,"[(3YC5DVJDjyazvB8hxJSybr, KUUMAA)]",2017-05-19,day
1,3c21L9S8LuEg8w0Jhf9H5e,"[(66wqVYADY665LeMxs8nlk3, Cor), (5erZiBCKPxe9F...",2020-09-18,day
2,4wrrmptapkv6hZmxh5TQGu,"[(3meJIgRw7YleJrmbpbJK6S, Die drei ???)]",2020-01-31,day
3,1lkE9wHvir22dGjU48gZbs,"[(3u2Sz4K3PFfalhDU0vSHT3, Eva Weel Skram)]",2021-01-15,day
4,2Jn6Aiv9uOkNcPa1kHNCpc,"[(6eBpe3hqpzEJPqh8bREqI6, Albe)]",2021-10-11,day
...,...,...,...,...
128045,0iM2vXFebAeChethPC6w5b,"[(24HI9hevLjIQtj7xp2CeHs, Peer Tasi), (0iJF1SR...",2018-08-20,day
128046,7IE989Ftunp6k2LdMsFC5r,"[(5JZ7CnR6gTvEMKX4g70Amv, Lauv)]",2020-01-16,day
128047,1Cx58u8RNrWl9YoHl0cv2D,"[(215ciYy7TIYnGRim0I1nJ3, JUNIOR CALLY)]",2019-09-06,day
128048,3pFPh0MelVO3CKThgbUo8s,"[(5dXlc7MnpaTeUIsHLVe3n4, Coez)]",2017-05-05,day


### Recover the artists that already have genres

#### Spotify chart artists

In [6]:
# Artists crawled earlier live in a CSV indexed by its first column; the index
# values become the initial list of already-processed artists.
artist_df_path = os.path.join('data', 'artists_genres_2017_2022.csv')
if os.path.exists(artist_df_path):
    artists_genres_df = pd.read_csv(artist_df_path, index_col=0)
    processed_artists = list(artists_genres_df.index)
else:
    # Nothing crawled yet: no frame and an empty list.
    artists_genres_df = None
    processed_artists = []
print(len(processed_artists))

20479


#### LFM-2b artists (stored on disk under `LMF_2b`)

In [7]:
# Files sitting directly inside the artist-info folder (sub-folders are ignored).
lmf_2b_artist_dir = os.path.join('data', 'generated', 'LMF_2b', 'artist_info_V2')
filenames = list(next(os.walk(lmf_2b_artist_dir), (None, None, []))[2])

if len(filenames)>0:
    df_lst = [pd.read_csv(os.path.join(lmf_2b_artist_dir, name), index_col=0) for name in filenames]
    processed_lmf_2b_artists_df = pd.concat(df_lst, axis=0)

    # Read the ids BEFORE resetting the index: reset_index(drop=True) discards
    # the index read from the CSVs, and the previous code then appended the
    # integers 0..N-1 to processed_artists instead of artist ids.
    # NOTE(review): assumes the first CSV column holds the Spotify artist id, as
    # in the files written by save_artist_genres - confirm for the LMF_2b files.
    lmf_2b_artist_ids = list(processed_lmf_2b_artists_df.index)

    processed_lmf_2b_artists_df = processed_lmf_2b_artists_df.reset_index(drop=True)
    processed_artists = processed_artists + lmf_2b_artist_ids

print(len(processed_artists))

266479


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Pre-process genres

In [8]:
# Genres accepted as-is: the seed list returned by the Spotify client, extended
# in place with a few hand-picked extra labels.
extra_genres = ['rap', 'pop rock', 'trap', 'ranchera', 'corrido', 'cumbia']
all_genres = sp.recommendation_genre_seeds()['genres']
all_genres.extend(extra_genres)
all_genres

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'bossanova',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'holidays',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie',
 'indie-pop',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metal-misc',
 'metalcore',
 'minimal-techno',
 'movies',
 'mpb',
 'new-age',
 'new-release',
 'opera',
 'pagode',

In [9]:
# Alias table: maps a label found in the raw artist genres (key) to the label
# used in all_genres (value). Labels without an entry are left untouched by
# the lookups that use this table.
gender_converter = dict([
    ('r&b', 'r-n-b'),
    ('soundtrack', 'soundtracks'),
    ('ost', 'soundtracks'),
    ('urbano', 'hip-hop'),
    ('britpop', 'pop'),
    ('orchestra', 'soundtracks'),
])

In [10]:
# Report how many tracks remain after removing the ones handled by a previous run.
print("Total songs to be crawled {}".format(len(tracks_to_process)))

Total songs to be crawled 128050


In [11]:
def time_out_handler(signum, frame):
    """SIGALRM callback: log the timeout, then raise to abort whatever is running.

    The arguments are the ones the signal machinery passes to every handler
    (signal number and current stack frame); neither is used here.

    Raises:
        Exception: always, with the message "end of time".
    """
    print("Spotify call timeout!")
    raise Exception("end of time")


# Register the handler for the timeout: from now on signal.alarm(n) interrupts
# the running code with the exception above after n seconds.
signal.signal(signal.SIGALRM, time_out_handler)

<Handlers.SIG_DFL: 0>

In [12]:
# Display the frame of tracks still to crawl as a last visual check before the crawl starts.
tracks_to_process

Unnamed: 0,track_id,artists,release_date,release_date_precision
0,6VaErRcSQJ9QHZRfrVignI,"[(3YC5DVJDjyazvB8hxJSybr, KUUMAA)]",2017-05-19,day
1,3c21L9S8LuEg8w0Jhf9H5e,"[(66wqVYADY665LeMxs8nlk3, Cor), (5erZiBCKPxe9F...",2020-09-18,day
2,4wrrmptapkv6hZmxh5TQGu,"[(3meJIgRw7YleJrmbpbJK6S, Die drei ???)]",2020-01-31,day
3,1lkE9wHvir22dGjU48gZbs,"[(3u2Sz4K3PFfalhDU0vSHT3, Eva Weel Skram)]",2021-01-15,day
4,2Jn6Aiv9uOkNcPa1kHNCpc,"[(6eBpe3hqpzEJPqh8bREqI6, Albe)]",2021-10-11,day
...,...,...,...,...
128045,0iM2vXFebAeChethPC6w5b,"[(24HI9hevLjIQtj7xp2CeHs, Peer Tasi), (0iJF1SR...",2018-08-20,day
128046,7IE989Ftunp6k2LdMsFC5r,"[(5JZ7CnR6gTvEMKX4g70Amv, Lauv)]",2020-01-16,day
128047,1Cx58u8RNrWl9YoHl0cv2D,"[(215ciYy7TIYnGRim0I1nJ3, JUNIOR CALLY)]",2019-09-06,day
128048,3pFPh0MelVO3CKThgbUo8s,"[(5dXlc7MnpaTeUIsHLVe3n4, Coez)]",2017-05-05,day


In [13]:
def save_artist_genres(artist_genres, processed_songs):
    """Write a batch of crawled artists to a timestamped CSV and persist the processed-song list.

    Args:
        artist_genres: dict mapping an artist id to a list laid out as
            [artist_name, popularity, followers, genre, genre, ...]; the
            number of trailing genres may differ between artists.
        processed_songs: list of track ids, pickled to the module-level
            ``process_songs_lst_path``.

    Side effects:
        Creates data/generated/spotify/artist_info_V2 if needed, writes
        ``01c_spotify_artist_info_<YYYYmmdd_HHMMSS>.csv`` into it and
        overwrites the pickle at ``process_songs_lst_path``.
    """
    # Use the argument: the previous body read the global `artists_genres` and
    # only worked because callers happened to pass that very object.
    artists_genres_df = pd.DataFrame.from_dict(artist_genres, orient='index')

    rename_cols = {0: 'artist_name', 1: 'popularity', 2: 'followers'}
    # NOTE(review): `i` is the absolute column position, so the first genre
    # column is named genre_4. Left unchanged to stay compatible with the
    # files already written by earlier runs.
    for i in artists_genres_df.columns[3:]:
        rename_cols[i] = 'genre_{}'.format(i+1)

    artists_genres_df = artists_genres_df.rename(columns=rename_cols)

    # A fresh kernel on a fresh checkout has no output folder yet.
    output_dir = os.path.join('data', 'generated', 'spotify', 'artist_info_V2')
    os.makedirs(output_dir, exist_ok=True)

    dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
    artists_genres_df.to_csv(os.path.join(output_dir, f'01c_spotify_artist_info_{dt_string}.csv'))

    with open(process_songs_lst_path, 'wb') as f:
        pickle.dump(processed_songs, f)

In [14]:
# Number of tracks the crawl loop below will iterate over.
len(tracks_to_process)

128050

In [15]:
# artist_id -> [artist_name, popularity, followers, genre, genre, ...] for the
# artists fetched since the last save.
artists_genres = {}

# Spotify's "several artists" request accepts at most 50 ids, so a batch must
# never be larger than this.
query_size = 50
artist_query = []

# Set views used only for membership tests: processed_artists holds hundreds of
# thousands of entries and scanning that list for every artist is needlessly slow.
known_artist_ids = set(processed_artists)
queued_artist_ids = set()
genre_vocabulary = set(all_genres)

# Tracks whose artists are all either known or queued, but whose queued artists
# have not been fetched yet. They are moved to processed_songs only once the
# queue has been emptied, so a resumed run never skips a track whose artists
# were still waiting when the previous run stopped.
pending_songs = []


def _build_artist_row(artist_info):
    """Return [name, popularity, followers, *genres] for one artist payload.

    Genres are normalised ('... hip hop ...' -> 'hip-hop', leading 'j-'
    stripped, aliases from gender_converter applied) and kept only when they
    belong to all_genres; multi-word genres are additionally matched word by
    word. When nothing matches, the raw Spotify genres are stored instead.
    """
    raw_genres = artist_info['genres']

    artist_genres = ['hip-hop' if 'hip hop' in g else g for g in raw_genres]
    artist_genres += [g[2:] if g.startswith('j-') else g for g in raw_genres]
    artist_genres = [gender_converter.get(g, g) for g in artist_genres]
    filtered_genres = [g for g in artist_genres if g in genre_vocabulary]

    # Second chance for multi-word genres: try each word on its own.
    split_genres = [word for g in artist_genres if ' ' in g for word in g.split(' ')]
    split_genres = [gender_converter.get(g, g) for g in split_genres]
    filtered_genres += [g for g in split_genres if g in genre_vocabulary]

    row = [artist_info['name'], artist_info['popularity'], artist_info['followers']['total']]
    if not filtered_genres:
        return row + list(raw_genres)
    # sorted() makes the genre column order reproducible between runs.
    return row + sorted(set(filtered_genres))


def _query_next_batch():
    """Fetch the first `query_size` queued artists; return True on success.

    On failure the partial results are saved, the next Spotify client is
    selected and the queue is left untouched so the same ids are retried later.
    """
    global sp, client_index

    batch = artist_query[:query_size]

    signal.alarm(10)  # Fire an exception after 10 seconds
    try:
        try:
            artist_info_lst = sp.artists(batch)
        finally:
            # Always disarm the alarm before doing anything else, so it cannot
            # go off while results are being saved or parsed.
            signal.alarm(0)
    except Exception as err:  # not a bare except: Ctrl-C must still stop the crawl
        print(f"Spotify query failed: {err!r}")
        if len(artists_genres)>0:
            print("Saving intermediate results...", end='')
            save_artist_genres(artists_genres, processed_songs)
            print('DONE')
        else:
            print("No intermediate results to save")

        # Everything fetched so far is on disk now; start a new in-memory batch.
        artists_genres.clear()

        # Refresh client to use for the extraction
        client_index = (client_index + 1) % n_spotify_clients
        print(f"Using new client  #{client_index}/{n_spotify_clients}")
        client_credentials_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID[client_index],
                                                              client_secret=SPOTIPY_CLIENT_SECRET[client_index])
        sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout= 5)
        return False

    # The request went through: drop exactly the ids that were sent.
    del artist_query[:query_size]
    queued_artist_ids.difference_update(batch)

    for artist_info in (artist_info_lst or {}).get('artists') or []:
        if artist_info is None:
            # No payload for this id; nothing to store.
            continue
        try:
            artist_code = artist_info['id']
            artists_genres[artist_code] = _build_artist_row(artist_info)
        except Exception as err:
            # One malformed payload must not discard the rest of the batch.
            print(f"Error retrieving data from artist {artist_info.get('id')}: {err!r}")
            continue
        processed_artists.append(artist_code)
        known_artist_ids.add(artist_code)

    if not artist_query:
        # Nothing is waiting any more, so every pending track is fully covered.
        processed_songs.extend(pending_songs)
        pending_songs.clear()
    return True


for _, song_info in tqdm(tracks_to_process.iterrows(), total=len(tracks_to_process)):
    for artist_id, artist_name in song_info['artists']:
        if artist_id not in known_artist_ids and artist_id not in queued_artist_ids:
            artist_query.append(artist_id)
            queued_artist_ids.add(artist_id)

        # Normally one request; after earlier failures the queue can hold more
        # than one batch. Stop on failure and retry on the next artist.
        while len(artist_query) >= query_size:
            if not _query_next_batch():
                break

    pending_songs.append(song_info['track_id'])

# Flush the artists left in the queue (fewer than query_size unless requests
# failed near the end); previously they were silently dropped.
while artist_query:
    if not _query_next_batch():
        break

if not artist_query:
    # Tracks seen after the last request had only known artists.
    processed_songs.extend(pending_songs)
    pending_songs.clear()

0it [00:00, ?it/s]

In [16]:
# Write the artists still held in artists_genres to a new timestamped CSV and persist the processed-song list.
save_artist_genres(artists_genres, processed_songs)

In [17]:
# End-of-run marker: reaching this cell means every cell above finished.
print("That's all folks!")

That's all folks!
