### Set up spotipy (Spotify Web API client)

In [3]:
# Import and install packages
!pip install spotipy
!pip install tqdm
import os
import random
import time
from collections import deque

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import scipy
import spotipy
from spotipy.exceptions import SpotifyException
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm


# Configure the Spotify Web API client (client-credentials flow).
#
# SECURITY: API credentials should not be committed to source control. They
# are read from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment
# variables when those are set. The literals below are kept only as a
# backward-compatible fallback and should be rotated and then removed.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID") or "a8ab203894844651964d98d08fd9a714"
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET") or "e0c3fef273a7409793a1760d7bf9e350"

auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(auth_manager=auth_manager)


# ------- IMPLEMENT HERE ANY AUXILIARY FUNCTIONS NEEDED ------- #
def add_node(graph, name, id):
    """Add one artist to the graph, with attributes fetched from Spotify.

    :param graph: graph object that receives the node (via ``graph.add_node``).
    :param name: artist name, used as the node key.
    :param id: Spotify artist id used to fetch the artist record.
    """
    artist = sp.artist(id)

    # Genres are flattened into a single string in which every genre is
    # followed by a comma (presumably so the attribute can be written to
    # GraphML, which cannot store lists - TODO confirm).
    genre_text = ''.join(label + ',' for label in artist['genres'])

    attributes = {
        'id': artist['id'],
        'followers': artist['followers']['total'],
        'popularity': artist['popularity'],
        'genres': genre_text,
    }
    graph.add_node(name, **attributes)


def explore_node(graph, name='', id=''):
    """Crawl one artist: add it and its related artists to the graph.

    The artist may be identified by name, by Spotify id, or both; whichever
    one is missing is resolved through the Spotify API.

    :param graph: graph updated in place (nodes via ``add_node``, edges via
        ``graph.add_edge``).
    :param name: artist name (used as node key); looked up from ``id`` when empty.
    :param id: Spotify artist id; looked up from ``name`` when empty.
    :return: list with the names of the related artists, in response order.
    :raises ValueError: if neither ``name`` nor ``id`` is given, or if the
        name search yields no artist.
    """
    if not name and not id:
        raise ValueError("explore_node needs an artist name or a Spotify id")

    if not id:
        id = search_artist(name)
        if id is None:
            raise ValueError("No Spotify artist found for name {!r}".format(name))
    elif not name:
        name = sp.artist(id)['name']

    related = sp.artist_related_artists(id)['artists']
    add_node(graph, name, id)

    neighbours = []
    for artist in related:
        # Use the id delivered with the related-artists response instead of
        # searching again by name: a name search costs an extra request, can
        # resolve to a different artist with the same name, and can find
        # nothing at all.
        add_node(graph, artist['name'], artist['id'])
        graph.add_edge(name, artist['name'])
        neighbours.append(artist['name'])

    return neighbours
# --------------- END OF AUXILIARY FUNCTIONS ------------------ #

def search_artist(artist_name: str) -> str:
    """
    Look up an artist on Spotify by name and return the id of the top hit.

    :param artist_name: name to search for.
    :return: spotify artist id of the first search result; None when the
        search returns no artists.
    """
    # ------- IMPLEMENT HERE THE BODY OF THE FUNCTION ------- #
    # Only the single best-ranked match is requested.
    response = sp.search(artist_name, type='artist', limit=1)
    hits = response['artists']['items']
    if hits == []:
        return None
    return hits[0]['id']


def crawler(seed: str, max_nodes_to_crawl: int, strategy: str = "BFS", out_filename: str = "g.graphml") -> nx.DiGraph:
    """
    Crawl the Spotify artist graph, following related artists.

    The seed artist is always crawled first and counts towards
    ``max_nodes_to_crawl``. Crawling stops when that many artists have been
    crawled or when there is nothing left to explore. The resulting graph is
    written to disk as GraphML and drawn with ``nx.draw_spring``.

    :param seed: starting artist id.
    :param max_nodes_to_crawl: maximum number of nodes to crawl.
    :param strategy: BFS or DFS.
    :param out_filename: name of the graphml output file; ".graphml" is
        appended when the name does not already end with it.
    :return: networkx directed graph.
    :raises ValueError: if ``strategy`` is neither "BFS" nor "DFS".
    """
    # Validate up front: an unknown strategy would otherwise never consume
    # the frontier and loop forever.
    if strategy not in ("BFS", "DFS"):
        raise ValueError("strategy must be 'BFS' or 'DFS', got {!r}".format(strategy))

    Graph = nx.DiGraph()

    # Resolve the seed's name here so it can be marked as visited; otherwise
    # it is crawled a second time as soon as a neighbour links back to it.
    seed_name = sp.artist(seed)['name']
    frontier = deque(explore_node(Graph, name=seed_name, id=seed))
    visited = {seed_name}
    n_nodes = 1

    with tqdm(total=max_nodes_to_crawl, desc='Crawling artists', ncols=75) as pbar:
        pbar.update(1)  # the seed has already been crawled

        while n_nodes < max_nodes_to_crawl and frontier:
            current = frontier.popleft()
            if current in visited:
                # Skip duplicates without risking a pop from an empty frontier.
                continue
            visited.add(current)

            # Every frontier entry was added to the graph together with its
            # Spotify id, so reuse that id rather than searching by name.
            known_id = Graph.nodes.get(current, {}).get('id') or ''
            neighbours = explore_node(Graph, name=current, id=known_id)

            if strategy == "BFS":
                # FIFO: newly found artists wait behind the existing frontier.
                frontier.extend(neighbours)
            else:
                # LIFO: newly found artists go to the front, keeping their order.
                frontier.extendleft(reversed(neighbours))

            n_nodes += 1
            pbar.update(1)

    # Avoid producing "g.graphml.graphml" when the extension is already there.
    if out_filename.endswith(".graphml"):
        target = out_filename
    else:
        target = out_filename + ".graphml"
    nx.write_graphml(Graph, target)
    nx.draw_spring(Graph)
    return Graph

def _track_row(artist_id, artist_name, track, features, feature_names):
    """Build one CSV row (a list) for a track; missing audio features become None."""
    features = features or {}
    album = track['album']
    return (
        [artist_id, artist_name,
         track['id'], track['name'], track['popularity'],
         track['duration_ms'], track['explicit']]
        + [features.get(key) for key in feature_names]
        + [album['id'], album['name'], album['release_date']]
    )


def get_track_data(graphs: list, out_filename: str) -> pd.DataFrame:
    """
    Get track data for each artist in the graphs.

    For every distinct artist (by Spotify id) the top tracks and their audio
    features are fetched and appended to the CSV file, one batch per artist,
    so rows already written stay on disk if a later request fails. An artist
    that appears in several graphs is processed only once.

    :param graphs: a list of graphs with artists as nodes; every node must
        carry an 'id' attribute holding the Spotify artist id.
    :param out_filename: name of the csv output file (overwritten).
    :return: pandas dataframe with track data, as read back from the csv file.
    """
    feature_names = ['danceability', 'energy', 'loudness', 'speechiness',
                     'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo']
    columns = (['artist_id', 'artist_name', 'track_id', 'track_name',
                'track_popularity', 'duration_ms', 'explicit']
               + feature_names
               + ['album_id', 'album_name', 'album_release_date'])

    # Start from a header-only file so rows can be appended incrementally.
    pd.DataFrame(columns=columns).to_csv(out_filename, index=False)

    # artist id -> node name; the first occurrence wins, so artists shared by
    # several graphs are neither fetched nor written twice.
    artists = {}
    for graph in graphs:
        for node, attrs in graph.nodes(data=True):
            artists.setdefault(attrs['id'], node)

    # The bar counts artists, so it advances once per artist (not per track).
    with tqdm(total=len(artists), desc='Processing artists', ncols=75) as pbar:
        for artist_id, artist_name in artists.items():
            tracks = sp.artist_top_tracks(artist_id)['tracks']

            if tracks:
                # One batched request for all tracks of the artist; entries
                # may be None when Spotify has no features for a track.
                track_ids = [track['id'] for track in tracks]
                features = sp.audio_features(track_ids) or [None] * len(tracks)

                # Album id, name and release date are already embedded in each
                # track object, so no per-track album request is needed.
                rows = [
                    _track_row(artist_id, artist_name, track, feats, feature_names)
                    for track, feats in zip(tracks, features)
                ]
                pd.DataFrame(rows, columns=columns).to_csv(
                    out_filename, mode='a', header=False, index=False)

            pbar.update(1)

    # Load the saved data so the returned frame matches what is on disk.
    return pd.read_csv(out_filename)


if __name__ == "__main__":
    # ------- IMPLEMENT HERE THE MAIN FOR THIS SESSION ------- #
    # Resolve the seed artists for this session to Spotify ids.
    artist_name = "Drake"  # Replace with the name of the artist you want to search for
    seed_artist_id_Drake = search_artist(artist_name)
    artist_name = "French Montana"
    seed_artist_id_FM = search_artist(artist_name)

    # search_artist returns None when nothing matches; fail early with a
    # clear message instead of deep inside the crawler.
    if seed_artist_id_Drake is None or seed_artist_id_FM is None:
        raise RuntimeError("Could not resolve the seed artists on Spotify")

    # Exercises

    # Crawl the Spotify artist graph using BFS, starting with Drake
    gB = crawler(seed_artist_id_Drake, 200, 'BFS', 'gB')

    # Crawl the Spotify artist graph using DFS, starting with Drake
    gD = crawler(seed_artist_id_Drake, 200, 'DFS', 'gD')

    # Obtain the dataframe with information about the most popular songs of the crawled artists
    D = get_track_data([gB, gD], 'data_session-1.csv')

    # Crawl the Spotify artist graph using BFS, starting with French Montana
    hB = crawler(seed_artist_id_FM, 200, 'BFS', 'hB')

    # Crawl the Spotify artist graph using BFS, starting with the last artist
    # in gD's node list.
    last_crawled_artist_from_gD = list(gD.nodes())[-1]
    print(last_crawled_artist_from_gD)
    # The node already stores the artist's Spotify id; reuse it rather than
    # searching by name again, and fall back to a search only if it is missing.
    seed_artist_id = (gD.nodes[last_crawled_artist_from_gD].get('id')
                      or search_artist(last_crawled_artist_from_gD))
    fB = crawler(seed_artist_id, 200, 'BFS', 'fB')
    # ------------------- END OF MAIN ------------------------ #

    