https://developer.spotify.com/documentation/web-api/tutorials/getting-started

In [None]:
# Uncomment and run these lines once if the packages are not installed yet
# !pip install python-dotenv
# !pip install spotipy

Importiamo le librerie necessarie

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os
import pandas as pd
import time
import numpy as np
import json

Colleghiamoci all'API di Spotify

In [None]:
# Load environment variables via python-dotenv, then read the two Spotify credentials
load_dotenv()
SPOTIPY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
SPOTIPY_CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")

# Build the client-credentials auth manager and the API client used by every later cell
auth_manager = SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
# URNs of the artists we want to look at
artist_urns = [
    'spotify:artist:6XYvaoDGE0VmRt83Jss9Sn',
    'spotify:artist:1Dt1UKLtrJIW1xxRBejjos',
]
# Ask Spotify for the information about the first artist and show the raw response
artist = sp.artist(artist_urns[0])
print(artist)

In [None]:
# Ask Spotify for the top tracks of the first artist in the list and show the raw response
artist_top_tracks = sp.artist_top_tracks(artist_urns[0])
print(artist_top_tracks)

In [None]:
# Inspect what was returned: the container type, its keys,
# the type stored under 'tracks', and the type of one element of it
top_tracks = artist_top_tracks['tracks']
print(type(artist_top_tracks))
print(artist_top_tracks.keys())
print(type(top_tracks))
print(type(top_tracks[0]))

In [None]:
# Look at the full content of the first element of the 'tracks' list
print(artist_top_tracks['tracks'][0])

In [None]:
# Pull a few fields out of the first top track: its name, the name of its album,
# and the URL of the first image listed for that album
first_track = artist_top_tracks['tracks'][0]
first_track_album = first_track['album']
print(first_track['name'])
print(first_track_album['name'])
print(first_track_album['images'][0]['url'])

### Album

In [None]:
# Request the first artist's releases of type 'album' (limit=5, starting at offset 0)
artist_albums = sp.artist_albums(artist_urns[0], album_type='album', limit=5, offset=0)
album_items = artist_albums["items"]

# Structure of the response: top-level keys, container type of 'items', number of entries
print(artist_albums.keys())
print(type(album_items))
print(len(album_items))

# One album entry in full, followed by the fields it exposes
first_album_item = album_items[0]
print(first_album_item)
print(first_album_item.keys())


In [None]:
# Collect the name and the URI of every album entry in the response
album_items = artist_albums['items']
artist_album_names = [item['name'] for item in album_items]
artist_album_uris = [item['uri'] for item in album_items]

# Put both lists side by side in a pandas DataFrame and display it
df = pd.DataFrame({'album': artist_album_names, 'uri': artist_album_uris})
df

# Costruiamo un dataset
Guardiamo gli album di un artista

In [None]:
# Show which fields an album entry exposes
print(artist_albums["items"][0].keys())
# Two ways to obtain the artist name:
# (1) from the first artist listed on the first album entry ...
artist_name = artist_albums['items'][0]['artists'][0]['name']
# ... (2) from the artist lookup itself. This assignment overwrites the value from (1),
# so only the result of (2) is printed below.
artist_name = sp.artist(artist_urns[0])['name']
print(artist_name)

In [None]:
# Build the nested dataset: artist name -> album name -> album details.
# Each album starts with an empty 'tracks' list that a later cell fills in.
artist_urns = [
    'spotify:artist:6XYvaoDGE0VmRt83Jss9Sn',
    'spotify:artist:1Dt1UKLtrJIW1xxRBejjos',
]
music_dataset = {}
for artist_urn in artist_urns:
    artist_name = sp.artist(artist_urn)['name']
    artist_albums = sp.artist_albums(artist_urn, album_type='album', limit=5, offset=0)
    # Albums are keyed by name, so a repeated album name keeps only the last entry
    music_dataset[artist_name] = {
        album_item['name']: {
            'release_date': album_item['release_date'],
            'tracks': [],
            'album_uri': album_item['uri'],
        }
        for album_item in artist_albums['items']
    }
print(music_dataset)

### Extract all the songs from every album
Next would be to loop through each album to extract key album track data.

In [None]:
# Fetch the tracks of the first album URI collected earlier (limit=25, offset=0)
# and show the top-level keys of the response
album_tracks = sp.album_tracks(artist_album_uris[0], limit=25, offset=0)
print(album_tracks.keys())

In [None]:
# Look at the first track of the album: the fields it exposes, then its name
first_album_track = album_tracks['items'][0]
print(first_album_track.keys())
print(first_album_track['name'])

In [None]:
# Print the name of every track returned for the album
for track_item in album_tracks['items']:
    print(track_item['name'])

In [None]:
# The top-level keys of the dataset are the artist names
artist_list = [artist_key for artist_key in music_dataset]
print(artist_list)

In [None]:
# Fill in the 'tracks' list of every album in the dataset.
#
# Compared with requesting one page and appending to the existing list:
#   * the track list is rebuilt from scratch for each album, so re-running this
#     cell does not append the same tracks a second time;
#   * every result page is followed, so an album with more than 25 tracks is
#     not silently cut off after the first page.
for artist in list(music_dataset):
    for album in list(music_dataset[artist]):
        album_uri = music_dataset[artist][album]['album_uri']
        album_tracks = sp.album_tracks(album_uri, limit=25, offset=0)
        album_items = list(album_tracks['items'])
        # Keep requesting pages for as long as the response points to a next one
        while album_tracks.get('next'):
            album_tracks = sp.next(album_tracks)
            album_items.extend(album_tracks['items'])
        music_dataset[artist][album]['tracks'] = [
            {
                'track_name': item['name'],
                'track_uri': item['uri'],
                'track_duration': item['duration_ms'],
                'track_number': item['track_number'],
            }
            for item in album_items
        ]

In [None]:
# Spot-check one album entry of the dataset.
# NOTE(review): the artist and album names are hard-coded; this lookup raises KeyError
# if that album is not among the albums fetched earlier (only 5 per artist are requested).
print(music_dataset["King Gizzard & The Lizard Wizard"]["Changes"])

Ora aggiungiamo alle canzoni le caratteristiche sonore

In [None]:
# Gather the URIs of all tracks stored for one album of the dataset
track_list = [
    track_entry['track_uri']
    for track_entry in music_dataset["The Blaze"]["JUNGLE"]["tracks"]
]


In [None]:
# Request the audio features for the whole list of track URIs in one call;
# as the last expression of the cell, the returned value is displayed
sp.audio_features(tracks=track_list)

Ora aggiungiamo le caratteristiche al nostro dataset

In [None]:
# Attach the audio features to every track of the dataset under the 'features' key.
#
# The features are requested in batches (a list of track URIs per call) instead of
# one call per track, which cuts the number of HTTP requests sent to Spotify.
# NOTE(review): 100 is assumed to be the maximum number of ids the endpoint accepts
# per request - confirm against the Spotify Web API reference.
AUDIO_FEATURES_BATCH_SIZE = 100

for artist in list(music_dataset):
    for album in list(music_dataset[artist]):
        album_track_entries = music_dataset[artist][album]['tracks']
        # An album without tracks produces no batches, so no request is sent for it
        for start in range(0, len(album_track_entries), AUDIO_FEATURES_BATCH_SIZE):
            batch = album_track_entries[start:start + AUDIO_FEATURES_BATCH_SIZE]
            batch_features = sp.audio_features([track['track_uri'] for track in batch])
            # Assumes the results come back in request order (TODO confirm);
            # whatever is returned for a track is stored as-is, including None
            for track, track_features in zip(batch, batch_features):
                track['features'] = track_features

In [None]:
# Print the first track of one album to inspect the fields stored for it
print(music_dataset["The Blaze"]["JUNGLE"]["tracks"][0])

Ora salviamo il dataset (dizionario) in un file JSON e poi lo trasformiamo in un dataframe

In [None]:
import json
import pickle

# Destination of the JSON export
file_path = 'music_dataset.json'

# Serialise the nested dataset dictionary to a JSON string
music_dataset_json = json.dumps(music_dataset)

# Write the JSON string to the file (opened in write mode)
with open(file_path, 'w') as json_file:
    json_file.write(music_dataset_json)

print("Music dataset saved as JSON successfully.")


In [None]:
# Print one album entry of the in-memory dataset
print(music_dataset["The Blaze"]["JUNGLE"])

In [None]:
# Reload the dataset from the JSON export and flatten it into one row per track
with open('music_dataset.json', 'r') as file:
    json_data = json.load(file)
music_dataset = json_data
rows = []

for artist, albums in json_data.items():
    for album_name, album_details in albums.items():
        release_date = album_details["release_date"]
        for track in album_details["tracks"]:
            # 'features' may be absent or null (stored as returned by the API);
            # fall back to an empty mapping so the track still yields a row
            # instead of the unpacking below raising TypeError
            track_features = track.get("features") or {}
            track_info = {
                "artist": artist,
                "album_name": album_name,
                "album_release_date": release_date,
                "track_name": track["track_name"],
                # Unpack every audio feature into its own column
                **track_features
            }
            rows.append(track_info)

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(rows)

# Save the DataFrame as a pickle file
df.to_pickle('music_dataset.pkl')

# Display the first few rows of the DataFrame to verify
df.head()

Alternativa all'unpacking del dizionario

In [None]:
# Alternative to dictionary unpacking: copy each audio feature into the row by name.
# Features copied verbatim, in column order
AUDIO_FEATURE_COLUMNS = (
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    # Add more features as needed
)

rows = []

for artist, albums in json_data.items():
    for album_name, album_details in albums.items():
        release_date = album_details["release_date"]
        for track in album_details["tracks"]:
            # 'features' may be absent or null; use an empty mapping so the track
            # still yields a row (with empty feature values) instead of raising
            features = track.get("features") or {}
            track_info = {
                "artist": artist,
                "album_name": album_name,
                "album_release_date": release_date,
                "track_name": track["track_name"],
            }
            for column in AUDIO_FEATURE_COLUMNS:
                track_info[column] = features.get(column)
            # Duration is converted from milliseconds to minutes
            duration_ms = features.get("duration_ms")
            track_info["duration_min"] = None if duration_ms is None else duration_ms / 60000
            track_info["time_signature"] = features.get("time_signature")
            rows.append(track_info)

# Convert the list of dictionaries into a DataFrame
df_alternative = pd.DataFrame(rows)

# Display the first few rows of the DataFrame to verify
df_alternative.head()


# Two small dictionaries that share the key "b"
dizionario_base = dict(a=1, b=2)
nuove_coppie = dict(b=3, c=4)

# Merge them with ** unpacking: for a repeated key the right-most dictionary wins
dizionario_combinato = {
    **dizionario_base,
    **nuove_coppie,
}
print(dizionario_combinato)  # Output: {'a': 1, 'b': 3, 'c': 4}


### Extract audio features for each song
To store the audio features of each album track, and to append that data to the list representing all the tracks of that album, we add additional key-value pairs.

Acousticness : A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.

Danceability : Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

Energy : Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.

Instrumentalness: Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.

Liveness: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.

Loudness: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.

Speechiness: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.

Valence: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

Tempo: The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.

Now let's loop through the albums, extracting the audio features. For this we will add a random delay every few albums to avoid sending too many requests to Spotify's API.

# EDA ora

In [None]:
!pip install matplotlib
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style
sns.set(style="whitegrid")

# 2x2 grid of histograms (kde=True), one panel per audio feature
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Distribution of Audio Features')

# One entry per panel: (grid position, data to plot, bar colour, panel title)
panels = [
    ((0, 0), df['danceability'], "skyblue", 'Danceability'),
    ((0, 1), df['energy'], "olive", 'Energy'),
    ((1, 0), df['tempo'], "gold", 'Tempo'),
    ((1, 1), df['duration_ms'] / 60000, "teal", 'Duration (Minutes)'),  # Convert ms to minutes
]
for position, series, colour, title in panels:
    panel_ax = axes[position]
    sns.histplot(series, ax=panel_ax, kde=True, color=colour)
    panel_ax.set_title(title)

# Leave room at the top of the figure for the overall title
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# Pearson correlation between a selection of audio features, drawn as an annotated heatmap
selected_features = [
    'danceability',
    'energy',
    'loudness',
    'tempo',
    'valence',
    'duration_ms',
]
corr_matrix = df.loc[:, selected_features].corr(method='pearson')

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
heatmap_ax.set_title('Correlation Matrix of Audio Features')
plt.show()


In [None]:
# Average of the selected audio features for each artist, with 'artist' kept as a regular column
averaged_columns = ['danceability', 'energy', 'tempo']
mean_features_per_artist = (
    df.groupby('artist')[averaged_columns]
    .mean()
    .reset_index()
)

mean_features_per_artist