## IMPORTS & SETTINGS
This section imports the required libraries, prints their versions, and loads the configuration (including the Spotify credentials) from `settings.yml`.

In [None]:
import yaml
import tqdm
import spotipy
import skimage
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from tqdm.notebook import tqdm as t_
from skimage import io
from spotipy.oauth2 import SpotifyClientCredentials

# PACKAGE VERSIONS
# One line per library, in the form "<Label> version: <version>".
# spotipy is left out: it does not appear to expose a `__version__` attribute.
print(
    "\n".join(
        f"{label} version: {module.__version__}"
        for label, module in (
            ("TQDM", tqdm),
            ("Skimage", skimage),
            ("Matplotlib", matplotlib),
            ("Pandas", pd),
        )
    )
)

# GET CONFIG
# The parsed YAML ends up in `cfg`; later cells read cfg["spotify_creds"].
config_file = "settings.yml"
with open(config_file) as ymlfile:
    cfg = yaml.safe_load(ymlfile)

In [None]:
# SPOTIFY AUTH
# Build the client-credentials manager from the "spotify_creds" section of the
# config, then hand it to the spotipy client used by every later cell (`sp`).
spotify_creds = cfg["spotify_creds"]
client_credentials = SpotifyClientCredentials(
    client_id=spotify_creds["client_id"],
    client_secret=spotify_creds["client_secret"],
)
sp = spotipy.Spotify(auth_manager=client_credentials)

## HELPER FUNCTIONS
These helpers wrap the Spotify lookups (finding an artist by name, scraping an artist's releases and tracks) so the cells below stay readable and free of repetition.

In [None]:
def search_artist(artist_name, sp):
    """Look up an artist by name, show their picture and return the first hit.

    Args:
        artist_name: Name used in the ``artist:<name>`` search query.
        sp: spotipy client; only its ``search`` method is used.

    Returns:
        The first artist dict from the search results.

    Raises:
        ValueError: If the search returns no artists.
    """
    # GET START ARTIST
    results = sp.search(q=f"artist:{artist_name}", type="artist")["artists"]
    items = results["items"]

    if not items:
        raise ValueError(f"No artist found with name: {artist_name}")

    if len(items) > 1:
        # NOTE(review): the first search hit is used; the message assumes the
        # API orders hits by popularity -- confirm against the Spotify docs.
        print("Multiple artists found, picking the most popular one:\n")

    artist = items[0]
    print(f"Artist ID: {artist['id']}")

    # Some artists have no images at all; indexing [0] would raise IndexError,
    # so only draw the picture when one is available.
    images = artist.get("images") or []
    if images:
        plt.figure(figsize=(4, 4))
        plt.title(f"Artist: {artist['name']}")
        plt.imshow(io.imread(images[0]["url"]))
        plt.axis("off")
        plt.show()
    else:
        print(f"No image available for artist: {artist['name']}")

    return artist

In [None]:
def _collect_pages(fetch_page, page_size=50):
    """Return the items of every page served by ``fetch_page(limit=, offset=)``.

    Paging always starts at offset 0 and stops at the first response whose
    ``"next"`` entry is falsy.
    """
    items = []
    offset = 0
    while True:
        response = fetch_page(limit=page_size, offset=offset)
        items.extend(response["items"])
        if not response["next"]:
            return items
        offset += page_size


def _chunks(values, size):
    """Yield consecutive slices of ``values`` holding at most ``size`` items."""
    for start in range(0, len(values), size):
        yield values[start:start + size]


def scrape_artist(id_, sp):
    """Scrape the tracks of one artist together with everyone credited on them.

    Args:
        id_: Spotify ID of the artist to scrape.
        sp: spotipy client; ``artist_albums``, ``album_tracks``, ``artists``
            and ``audio_features`` are called on it.

    Returns:
        A tuple ``(artist_track, artists, tracks)`` of DataFrames:

        * ``artist_track`` -- one row per (track_id, artist_id) credit.
        * ``artists`` -- one row per credited artist with ``id``, ``name``,
          ``followers``, ``genres`` and ``popularity``.
        * ``tracks`` -- one row per track with ``id``, ``name`` and its audio
          features; tracks without audio features are dropped by the merge.

        Three empty DataFrames are returned when no tracks are found.
    """
    artist_columns = ["id", "followers", "genres", "popularity"]
    feature_columns = [
        "id",
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "duration_ms",
        "time_signature",
    ]

    # FIND ALL POSSIBLE RELEASES
    releases = _collect_pages(
        lambda limit, offset: sp.artist_albums(id_, limit=limit, offset=offset)
    )

    # FILTER OUT RELEASES ON ARTIST ID
    clean_releases = [
        release
        for release in releases
        if any(artist["id"] == id_ for artist in release["artists"])
    ]

    # GET ALL TRACKS FROM ALL RELEASES
    # Paging restarts at offset 0 for every release; carrying the offset over
    # from a long release would silently skip tracks of the following ones.
    tracks = []
    for release in clean_releases:
        album_id = release["id"]
        tracks.extend(
            _collect_pages(
                lambda limit, offset: sp.album_tracks(album_id, limit=limit, offset=offset)
            )
        )

    # EARLY RETURN IF NO TRACKS
    if not tracks:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # REMOVE DUPLICATES (first track with a given name wins)
    seen_names = set()
    unique_tracks = []
    for track in tracks:
        if track["name"] not in seen_names:
            seen_names.add(track["name"])
            unique_tracks.append(track)
    tracks = unique_tracks

    # GET ALL ARTISTS IN ALL TRACKS
    artist_track = pd.DataFrame(
        [{"track_id": track["id"], "artist_id": artist["id"]} for track in tracks for artist in track["artists"]]
    )

    # GET ALL ARTIST NAMES
    artists = pd.DataFrame(
        [{"id": artist["id"], "name": artist["name"]} for track in tracks for artist in track["artists"]]
    )
    artists = artists.drop_duplicates().reset_index(drop=True)

    # The several-artists endpoint accepts at most 50 IDs per request.
    artist_records = []
    for id_batch in _chunks(artists["id"].tolist(), 50):
        batch = sp.artists(id_batch)["artists"]
        artist_records.extend(record for record in batch if record is not None)
    artists_info = pd.DataFrame(artist_records, columns=artist_columns)
    artists_info["followers"] = artists_info["followers"].apply(lambda x: x["total"])

    artists = pd.merge(artists, artists_info, how="outer", on="id")

    # GET INFO ON ALL THE TRACKS
    tracks = pd.DataFrame(tracks)
    tracks = tracks.loc[:, ["id", "name"]]

    # The audio-features endpoint accepts at most 100 IDs per request and may
    # answer with None for individual tracks; those entries are skipped.
    feature_records = []
    for id_batch in _chunks(tracks["id"].tolist(), 100):
        batch = sp.audio_features(id_batch) or []
        feature_records.extend(record for record in batch if record is not None)
    tracks_info = pd.DataFrame(feature_records, columns=feature_columns)

    tracks = pd.merge(tracks, tracks_info, on="id")

    return artist_track, artists, tracks

## FUN
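Here we do the actual work: look up the starting artist, scrape their releases and tracks, and then expand to the artists credited on those tracks, up to `connection_depth` levels deep.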

In [None]:
# PARAMS
# Name of the artist the scrape starts from; passed to search_artist.
start_artist_name = "Tristam"
# Number of expansion rounds the scraping loop runs over credited artists.
connection_depth = 1

In [None]:
# Resolve the start artist's name to the artist dict returned by the search
# (raises ValueError when nothing is found).
start_artist = search_artist(start_artist_name, sp)

In [None]:
# SCRAPED DATA FROM INITIAL ARTIST
# Preview the first rows of each of the three frames, in the order returned.
artist_track, artists, tracks = scrape_artist(start_artist["id"], sp)
for frame in (artist_track, artists, tracks):
    display(frame.head())

In [None]:
# IDs that have already been scraped; starts with just the initial artist.
searched_artists = [start_artist["id"]]
# IDs of every artist credited on the initial artist's tracks: the candidates
# for the first expansion round.
artists_to_search = artists["id"].tolist()

In [None]:
# Expand outwards from the start artist, one level of credited artists per
# pass, appending everything scraped to artist_track / artists / tracks.

# Keep the visited IDs in a set for the whole loop. Rebinding a list to a set
# after the first pass made the following pass fail on `.extend`, so any
# connection_depth above 1 crashed.
searched_artists = set(searched_artists)

for i in range(connection_depth):
    print(f"Scraping artists at depth: {i+1}")
    new_artists = []
    depth_artists_to_search = set(artists_to_search) - searched_artists

    for artist_id in t_(depth_artists_to_search):
        temp_artist_track, temp_artists, temp_tracks = scrape_artist(artist_id, sp)

        # scrape_artist returns empty frames for artists without tracks.
        if len(temp_tracks) == 0:
            continue

        # Concatenate per artist so partial results survive an interrupted run.
        artist_track = pd.concat([artist_track, temp_artist_track])
        artists = pd.concat([artists, temp_artists])
        tracks = pd.concat([tracks, temp_tracks])

        new_artists.extend(temp_artists["id"].tolist())

    searched_artists.update(depth_artists_to_search)
    print(f"Artists searched: {len(searched_artists)}")
    # Next round: artists discovered in this round that are still unvisited.
    artists_to_search = set(new_artists) - searched_artists