### Load libraries

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd
from pandas import json_normalize

from tqdm import tqdm
import time 
import json 

from random import randint
import time 

import pickle 

### Connect with Spotify API using spotipy library

In [None]:
def connect_spotipy():
    with open("secrets.txt","r") as f:
        string = f.read()
    
    secrets_dict={}
    
    for line in string.split('\n'):
        if len(line) > 0:
            #print(line.split(':'))
            secrets_dict[line.split(':')[0]]=line.split(':')[1].strip()
            

    #Initialize SpotiPy with user credentials
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['clientid'],
                                                            client_secret=secrets_dict['clientsecret']))
    return sp

In [None]:
sp = connect_spotipy()

### Get spotify playlists by categories

In [None]:
def get_playlists_by_categories():
    
    # get categories 
    categories = sp.categories(limit = 50)['categories']['items']
    

    # get playlist by categories
    top_playlists_by_genre = {}
    for category in tqdm(categories, desc = 'Fetching top playlists for each category.'):
        try:
            top_playlists_by_genre[category['name']] = sp.category_playlists(category_id=category['id'], limit=15)['playlists']['items']
            time.sleep(0.5)  # Introducing a half-second delay between each call
        except spotipy.SpotifyException as e:
            print(f"Error fetching playlists for category {category['name']}: {e}")
    all_playlist_ids = [playlist['id'] for genre in top_playlists_by_genre.values() for playlist in genre]
    
    return all_playlist_ids

In [None]:
top_playlist_ids = get_playlists_by_categories()

### Get tracks from all playlists
- get all tracks
- remove duplicates 

In [None]:
def get_all_tracks(playlist_ids):
    all_tracks = []
    
    # go trough all playlists and get tracks
    for playlist_id in tqdm(playlist_ids):
        tracks = get_tracks_from_playlist(playlist_id)
        all_tracks.extend(tracks) # add tracks to total list 

    final_tracks = json_normalize(all_tracks) # json normalize data 
    final_tracks.dropna(subset='track.artists', inplace = True) # drop null values
    final_tracks = final_tracks.drop_duplicates(subset=["track.id"]) # drop duplicates
    final_tracks ['artist_dict'] = final_tracks ['track.artists'].apply(list_to_dict)
    
    final_tracks.to_pickle('temp_data/tracks_from_playlists.pkl')
    return 

def get_tracks_from_playlist(playlist_id):
    offset = 0
    tracks = []
    while True:
        response = sp.playlist_tracks(playlist_id, offset=offset)
        tracks.extend(response['items'])
        if response['next'] is None:
            break
        offset += 100
        time.sleep(0.5)
    return tracks

def list_to_dict(x):
    return {i: x[i] for i in range(len(x))}

In [None]:
get_all_tracks(top_playlist_ids)

### Delete unnecessary columns 

In [None]:
tracks = pd.read_pickle('temp_data/tracks_from_playlists.pkl')

In [None]:
tracks.head(2).T

In [None]:
columns_to_keep = ['track.id', 'track.name', 'track.external_urls.spotify','artist_dict', 'track.album.id', 'track.album.name', 'track.album.images', 'track.album.release_date' ,'track.popularity']
tracks = tracks[columns_to_keep]

display(tracks.head(5))
tracks.to_pickle('temp_data/final_tracks.pkl')


### Extract artists to get further songs

In [None]:

tracks = pd.read_pickle('temp_data/final_tracks.pkl')

In [None]:
def expand_list_dict(row):
    df = pd.DataFrame.from_dict(row['artist_dict'], orient='index')
    df['song_id'] = row['track.id']
    return df

In [None]:
tqdm.pandas()
tracks['artists_dfs'] = tracks.progress_apply(expand_list_dict, axis=1)

In [None]:
# create new dataframe with columns to keep
artist_df = pd.DataFrame(columns=['external_urls', 'href', 'id', 'name', 'type', 'uri'])

# extract data for each artist df
for val in tqdm(tracks['artists_dfs']):
    #display(val)
    artist_df = pd.concat([artist_df, val], axis=0)
    

In [None]:
artist_df.drop_duplicates(subset='id', inplace=True)
artist_df.to_pickle("temp_data/artist_df.pkl")

### Get top 10 songs from each artist

In [None]:
artist_df = pd.read_pickle('temp_data/artist_df.pkl')

In [None]:
# extract artist IDs
artist_list = artist_df['id'].to_list()

In [None]:
# split artist list into smaller slices (50) for separate APIs calls

artist_dict = {}
slice_size = 50

for i in range(0, len(artist_list), slice_size):
    temp = artist_list[i:i+slice_size]
    artist_dict[i] = temp
    

In [None]:
# print(f"Search top songs for artist_list number = {}")

top10_songs = []
counter = 0

for key in artist_dict:
    # print(artist_dict[key])
    counter += 1
    print(f'Start finding top tracks: {counter}/{len(artist_dict)} - current len {len(top10_songs)}')
    for item in tqdm(artist_dict[key]):
        try:
            response = sp.artist_top_tracks(item)
            time.sleep(randint(1,3000)/1000)
            top10_songs = top10_songs + response["tracks"]
        except spotipy.SpotifyException as e:
            print(f"Error fetching songs for artist id {item}: {e}")
    
    with open('temp_data/top10_tracks.json', 'w') as file:
        json.dump(top10_songs, file)




### import new songs and combine with old track dataset

In [None]:
final_tracks = pd.read_pickle('temp_data/final_tracks.pkl')
final_tracks.head(2).T

In [None]:
with open('temp_data/top10_tracks.json', 'r') as file:
    song_update = json.load(file)
    


In [None]:
update = json_normalize(song_update)

### Get Audio Features and add to current dataframe


In [None]:
final_tracks = pd.read_pickle('temp_data/final_tracks.pkl')
final_tracks.head(2).T

In [None]:
def get_audio_features(df):

    chunks = [(i, i+100) for i in range(0, len(df), 100)]
    audio_features_list = []

    for chunk in tqdm(chunks):
        id_list100 = df['track.id'][chunk[0]:chunk[1]]
        audio_features_list = audio_features_list + sp.audio_features(id_list100)
        time.sleep(randint(1,3000)/1000)
    len(audio_features_list)
    audio_feature_df = json_normalize(audio_features_list )
    
    return audio_feature_df

In [None]:
audio_features =  get_audio_features(final_tracks)

In [None]:
audio_features.to_pickle('temp_data/audio_features_final_tracks.pkl')

In [None]:
with open ('temp_data/audio_features_final_tracks.pkl', 'br') as file:
    audio_features = pd.read_pickle(file)

with open ('temp_data/final_tracks.pkl', 'br') as file:
    final_tracks = pd.read_pickle(file)

final_tracks = final_tracks.reset_index(drop=True)
 
dataset_for_model = pd.concat([final_tracks, audio_features], axis = 1)
dataset_for_model

### Extract the artists dict

In [136]:
def expand_list_dict(row):
    df = pd.DataFrame.from_dict(row['artist_dict'], orient='index')
    df['song_id'] = row['id']
    return df

dataset_for_model['artists_dfs'] = dataset_for_model.apply(expand_list_dict, axis=1)
type(dataset_for_model['artists_dfs'][0])



pandas.core.frame.DataFrame

In [137]:
dataset_for_model.head(2).T

Unnamed: 0,0,1
track.id,31nfdEooLEq7dn3UMcIeB5,0gMTEHzNIyvxikxyUFFJxO
track.name,Vois sur ton chemin - Techno Mix,Summertime
track.external_urls.spotify,https://open.spotify.com/track/31nfdEooLEq7dn3...,https://open.spotify.com/track/0gMTEHzNIyvxikx...
artist_dict,{0: {'external_urls': {'spotify': 'https://ope...,{0: {'external_urls': {'spotify': 'https://ope...
track.album.id,79Cyc8GRWnLyjdJSMyJ0dB,3d5e4tp3t5zTGu44NM88LQ
track.album.name,Vois sur ton chemin (Techno Mix),Summertime
track.album.images,"[{'height': 640, 'url': 'https://i.scdn.co/ima...","[{'height': 640, 'url': 'https://i.scdn.co/ima..."
track.album.release_date,2023-08-04,2023-08-10
track.popularity,85.0,81.0
danceability,0.634,0.756


In [138]:
artist_df = pd.DataFrame(columns=['external_urls', 'href', 'id', 'name', 'type', 'uri', 'song_id'])
for val in tqdm(dataset_for_model['artists_dfs']):
    #display(val)
    artist_df = pd.concat([artist_df, val], axis=0)
    
artist_df


 85%|███████████████████████████████████████████████████████████████▋           | 39017/45957 [02:15<00:24, 288.43it/s]


KeyboardInterrupt: 

In [None]:
final_tracks_artist_split = pd.merge(left=dataset_for_model,
                    right=artist_df,
                    how='inner',
                    left_on='track.id',
                    right_on='song_id')

final_tracks_artist_split.to_pickle('temp_data/final_tracks_artist_split.pkl')

In [135]:
final_tracks_artist_split

NameError: name 'final_tracks_artist_split' is not defined

### Get further information about artist to create improved clusters 

In [None]:
artist_df = pd.read_pickle('temp_data/artist_df.pkl')

# extract artist IDs
artist_list = artist_df['id'].to_list()

# split artist list into smaller slices (50) for separate APIs calls

artist_dict = {}
slice_size = 50

for i in range(0, len(artist_list), slice_size):
    temp = artist_list[i:i+slice_size]
    artist_dict[i] = temp

In [None]:
from random import randint
import time 

artist_genre = []

for key in tqdm(artist_dict):
    try:
        response = sp.artists(artist_dict[key])
        time.sleep(randint(1,3000)/1000)
        artist_genre = artist_genre + response["artists"]
    except spotipy.SpotifyException as e:
        print(f"Error fetching genre for artist{artist}: {e}")
        


In [None]:

with open('temp_data/artist_information.pkl', 'wb') as file:
    pickle.dump(artist_genre, file)

In [140]:
with open('temp_data/artist_information.pkl', 'rb') as file:
    artist_inf = pickle.load(file)
    
with open('temp_data/final_tracks_artist_split.pkl', 'rb') as file:
    final_tracks = pickle.load(file)

In [144]:
final_tracks.head(2).T

Unnamed: 0,0,1
track.id,31nfdEooLEq7dn3UMcIeB5,0gMTEHzNIyvxikxyUFFJxO
track.name,Vois sur ton chemin - Techno Mix,Summertime
track.external_urls.spotify,https://open.spotify.com/track/31nfdEooLEq7dn3...,https://open.spotify.com/track/0gMTEHzNIyvxikx...
artist_dict,{0: {'external_urls': {'spotify': 'https://ope...,{0: {'external_urls': {'spotify': 'https://ope...
track.album.id,79Cyc8GRWnLyjdJSMyJ0dB,3d5e4tp3t5zTGu44NM88LQ
track.album.name,Vois sur ton chemin (Techno Mix),Summertime
track.album.images,"[{'height': 640, 'url': 'https://i.scdn.co/ima...","[{'height': 640, 'url': 'https://i.scdn.co/ima..."
track.album.release_date,2023-08-04,2023-08-10
track.popularity,85.0,81.0
danceability,0.634,0.756


In [141]:
artist_df = json_normalize(artist_inf)

In [142]:
columns_to_keep = ['id', 'images', 'name','popularity','external_urls.spotify','followers.total']
rename_columns = ['artist_id', 'artist_images', 'artist_name','artist_popularity','artist_link','artist_follower']
artist_df = artist_df[columns_to_keep]
artist_df.columns = rename_columns

In [143]:
artist_df

Unnamed: 0,artist_id,artist_images,artist_name,artist_popularity,artist_link,artist_follower
0,1r43wW70tnGUauQYvY5w48,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",BENNETT,63,https://open.spotify.com/artist/1r43wW70tnGUau...,32511
1,4lDiJcOJ2GLCK6p9q5BgfK,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Kontra K,70,https://open.spotify.com/artist/4lDiJcOJ2GLCK6...,2209647
2,00FQb4jTyendYWaN8pK0wa,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Lana Del Rey,89,https://open.spotify.com/artist/00FQb4jTyendYW...,27641500
3,5cj0lLjcoR7YOSnhnX0Po5,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Doja Cat,86,https://open.spotify.com/artist/5cj0lLjcoR7YOS...,26209652
4,2HIP5sNZWkrPgSBWRk8XpA,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",SIRA,68,https://open.spotify.com/artist/2HIP5sNZWkrPgS...,17945
...,...,...,...,...,...,...
30237,26F0RxUxwCswnFxX3gecaD,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Global Publishers Canada Inc.,13,https://open.spotify.com/artist/26F0RxUxwCswnF...,5098
30238,3sp7NtPgyjKKqYobNw8q6B,[],Complete Language Lessons,0,https://open.spotify.com/artist/3sp7NtPgyjKKqY...,2914
30239,2424G4yH7tJBPlbAoiVmc3,"[{'height': 640, 'url': 'https://i.scdn.co/ima...","Penton Overseas, Inc.",14,https://open.spotify.com/artist/2424G4yH7tJBPl...,5057
30240,4m6Gg5tLbuoiMNnEFOgtcK,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Stephane Husar,6,https://open.spotify.com/artist/4m6Gg5tLbuoiMN...,553


In [145]:
final_tracks_model = pd.merge(left=final_tracks,
                    right=artist_df,
                    how='inner',
                    left_on='id_y',
                    right_on='artist_id')
final_tracks_model.to_pickle('temp_data/model_data_uncleaned.pkl')

In [147]:
final_tracks_model.to_pickle('temp_data/raw_model_tracks.pkl')