This file will produce 2 items.

- 1 - A playlist from spotify with complete song information and metrics
- 2 - A balanced dataset to train ML models on (predicting if a song belongs to a playlist or not)

#### Loading in necessary packages

In [1]:
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.24.0-py3-none-any.whl.metadata (4.9 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.2.0-py3-none-any.whl.metadata (9.1 kB)
Downloading spotipy-2.24.0-py3-none-any.whl (30 kB)
Downloading redis-5.2.0-py3-none-any.whl (261 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: redis, spotipy
Successfully installed redis-5.2.0 spotipy-2.24.0


In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
client_id =  '01b87eb4c04640c89b463737e159089a'
client_secret = 'd6d7f6663f1e4d00ad4611960f283b0c'
import requests
import base64

def get_access_token(client_id, client_secret):
    auth_url = 'https://accounts.spotify.com/api/token'
    auth_header = {
        'Authorization': f'Basic {base64.b64encode((client_id + ":" + client_secret).encode()).decode()}'
    }
    auth_data = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret
    }

    auth_response = requests.post(auth_url, data=auth_data, headers=auth_header)
    auth_response_data = auth_response.json()
    access_token = auth_response_data['access_token']
    return access_token

access_token = get_access_token(client_id, client_secret)
print(access_token)

BQDlx2oXwgnllPdpLmvpPK40N5sUIBiHehss2m4QD8K5V3bCELno3MQ_ayve_Ec4xUuUmyeRFWCNVD7mTQhOO7Q1zrHUp-OHK_1ZBuCX2_FVf6anxco


#### Fetching data from a playlist on spotify

In [4]:
# getting track ids of a playlist

import requests

def get_playlist_items(playlist_id):
    # Spotify API endpoint for playlist tracks
    playlist_url = f'https://api.spotify.com/v1/playlists/{playlist_id}/tracks'

    headers = {
        'Authorization': f'Bearer {access_token}'
    }

    tracks = []
    params = {
        'limit': 100,  # Spotify's API limit for number of items per request
        'offset': 0,    # Offset for pagination
        'market':'US'
    }

    while True:
        response = requests.get(playlist_url, headers=headers, params=params)

        if response.status_code == 200:
            data = response.json()
            tracks.extend(item['track']['id'] for item in data['items'] if item['track'] is not None)  # Append tracks to the list

            # Check if more pages of data are available
            if data['next']:
                params['offset'] += params['limit']  # Update offset to fetch the next page
            else:
                break  # No more data to fetch, exit the loop
        else:
            print(f"Error retrieving playlist details for ID {playlist_id}: {response.status_code}")
            return None

    return tracks

In [5]:
# getting track information from a list of track ids
import time
import requests
import pandas as pd

def get_tracks_info_with_audio_features(track_ids):
    # Spotify API endpoints
    track_url = 'https://api.spotify.com/v1/tracks'
    audio_features_url = 'https://api.spotify.com/v1/audio-features'

    headers = {
        'Authorization': f'Bearer {access_token}'
    }

    # To store all track information with audio features
    tracks_with_features = []

    # Process track IDs in batches of 50 (Spotify's API limit)
    for i in range(0, len(track_ids), 50):
        batch = track_ids[i:i + 50]

        # Get track metadata
        track_params = {'ids': ','.join(batch)}
        track_response = requests.get(track_url, headers=headers, params=track_params)

        # Get audio features
        audio_features_params = {'ids': ','.join(batch)}
        audio_features_response = requests.get(audio_features_url, headers=headers, params=audio_features_params)

        # Check if both responses are successful
        if track_response.status_code == 200 and audio_features_response.status_code == 200:
            track_data = track_response.json()['tracks']
            audio_features_data = audio_features_response.json()['audio_features']

            # Combine track metadata and audio features
            for track, features in zip(track_data, audio_features_data):
                if track and features:  # Check that both exist
                    combined_data = {
                        'track_id': track['id'],
                        'track_name': track['name'],
                        'artist_name': [artist['name'] for artist in track['artists']],
                        'popularity': track['popularity'],
                        'album': track['album']['name'],
                        'danceability': features['danceability'],
                        'energy': features['energy'],
                        'key': features['key'],
                        'loudness': features['loudness'],
                        'mode': features['mode'],
                        'speechiness': features['speechiness'],
                        'acousticness': features['acousticness'],
                        'instrumentalness': features['instrumentalness'],
                        'liveness': features['liveness'],
                        'valence': features['valence'],
                        'tempo': features['tempo'],
                        'type': features['type'],
                        'duration_ms': track['duration_ms'],
                        'time_signature': features['time_signature'],
                        'release_date': track['album']['release_date'],
                    }
                    tracks_with_features.append(combined_data)
        else:
            print(f"Error retrieving data for batch starting with ID {batch[0]}: track status {track_response.status_code}, audio features status {audio_features_response.status_code}")
            return None
        time.sleep(3)

    return pd.DataFrame(tracks_with_features)


In [6]:
def get_playlist_df(playlist_id):
  playlist_tracks = get_playlist_items(playlist_id)
  playlist_df = get_tracks_info_with_audio_features(playlist_tracks)
  return playlist_df

#### Running code with a rock playlist

#### Exporting playlist data as a csv

In [None]:
def df_to_csv(df,name):
  df.to_csv(str(name)+'.csv', index=False)

In [None]:
df_to_csv(playlist_df,'playlist_df')

#### Creating a training dataset for ML models

In [None]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = '/content/drive/My Drive/Analytics Project/Data/Complete Data/FINAL_RAW_DF.csv'

song_df = pd.read_csv(file_path)

##### randomly sampling random spotify songs from song_df and combining them with playlist df

In [None]:
import numpy as np

shuffled_indices = np.random.permutation(song_df.index)
shuffled_song_df = song_df.loc[shuffled_indices].reset_index(drop=True)
data = pd.concat([shuffled_song_df[:playlist_df.shape[0]], playlist_df], axis=0)
data

playlist_track_ids = playlist_df['track_id'].tolist()
data['in_playlist'] = data['track_id'].apply(lambda x: 1 if x in playlist_track_ids else 0)
data['release_date'] = data['release_date'].astype('datetime64[ns]')
data['release_date'] = data['release_date'].dt.year

In [None]:
# shuffling indices
shuffled_data = data.sample(frac=1).reset_index(drop=True)
shuffled_data


Unnamed: 0,track_id,track_name,artist_name,popularity,album,uri,danceability,energy,key,loudness,...,valence,tempo,type,id,track_href,analysis_url,duration_ms,time_signature,release_date,in_playlist
0,2rBHnIxbhkMGLpqmsNX91M,Bombtrack,['Rage Against The Machine'],64,Rage Against The Machine - XX (20th Anniversar...,spotify:track:2rBHnIxbhkMGLpqmsNX91M,0.459,0.926,4,-3.415,...,0.569,151.534,audio_features,2rBHnIxbhkMGLpqmsNX91M,https://api.spotify.com/v1/tracks/2rBHnIxbhkMG...,https://api.spotify.com/v1/audio-analysis/2rBH...,243453,4,1992,0
1,2d4e45fmUnguxh6yqC7gNT,Dirty Deeds Done Dirt Cheap,[AC/DC],71,Dirty Deeds Done Dirt Cheap,,0.668,0.906,11,-4.881,...,0.507,135.653,audio_features,,,,231933,4,1976,1
2,3lfmqF0ULXRHlWxBeaHo3t,Hit That,['The Offspring'],64,Splinter,spotify:track:3lfmqF0ULXRHlWxBeaHo3t,0.704,0.809,4,-3.045,...,0.962,131.952,audio_features,3lfmqF0ULXRHlWxBeaHo3t,https://api.spotify.com/v1/tracks/3lfmqF0ULXRH...,https://api.spotify.com/v1/audio-analysis/3lfm...,169413,4,2003,0
3,6YUUQapDEqBF1yP3iHlsut,Insomnia,['Craig David'],61,Rewind - The Collection,spotify:track:6YUUQapDEqBF1yP3iHlsut,0.620,0.884,1,-4.127,...,0.594,125.390,audio_features,6YUUQapDEqBF1yP3iHlsut,https://api.spotify.com/v1/tracks/6YUUQapDEqBF...,https://api.spotify.com/v1/audio-analysis/6YUU...,207307,4,2017,0
4,5NDyPVjcjK0hw2sUjjWFIO,Soldier of Fortune,['Deep Purple'],39,Stormbringer,spotify:track:5NDyPVjcjK0hw2sUjjWFIO,0.546,0.274,2,-15.070,...,0.228,66.809,audio_features,5NDyPVjcjK0hw2sUjjWFIO,https://api.spotify.com/v1/tracks/5NDyPVjcjK0h...,https://api.spotify.com/v1/audio-analysis/5NDy...,193653,4,1974,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1411,6MGRweoLPBAaPcXTMpMWsA,Coco (feat. DaBaby),"['24kGoldn', 'DaBaby']",57,El Dorado,spotify:track:6MGRweoLPBAaPcXTMpMWsA,0.697,0.783,7,-4.356,...,0.528,98.068,audio_features,6MGRweoLPBAaPcXTMpMWsA,https://api.spotify.com/v1/tracks/6MGRweoLPBAa...,https://api.spotify.com/v1/audio-analysis/6MGR...,142000,4,2021,0
1412,2QrXzOqLCVDRZHkToA0tSR,Never Go Back,['Dennis Lloyd'],57,Never Go Back,spotify:track:2QrXzOqLCVDRZHkToA0tSR,0.421,0.653,9,-4.850,...,0.733,78.600,audio_features,2QrXzOqLCVDRZHkToA0tSR,https://api.spotify.com/v1/tracks/2QrXzOqLCVDR...,https://api.spotify.com/v1/audio-analysis/2QrX...,175424,4,2019,0
1413,1Y372uxsCkKqNclj2ercap,17 Años,['Los Ángeles Azules'],68,Una Lluvia De Rosas,spotify:track:1Y372uxsCkKqNclj2ercap,0.738,0.483,0,-9.097,...,0.774,90.941,audio_features,1Y372uxsCkKqNclj2ercap,https://api.spotify.com/v1/tracks/1Y372uxsCkKq...,https://api.spotify.com/v1/audio-analysis/1Y37...,181307,4,1999,0
1414,3OGOwjwdae1OulCD4DK1ic,Ojo X Ojo,['Kenia OS'],60,Pink Aura,spotify:track:3OGOwjwdae1OulCD4DK1ic,0.847,0.758,1,-5.265,...,0.619,113.975,audio_features,3OGOwjwdae1OulCD4DK1ic,https://api.spotify.com/v1/tracks/3OGOwjwdae1O...,https://api.spotify.com/v1/audio-analysis/3OGO...,180373,4,2024,0


In [None]:
df_to_csv(shuffled_data,'training_data_df')

# Retrieving Multiple datasets, and preprocessing them for the ML models.

In [None]:
playlist_ids_to_retrieve = [('5rRvWEETOsUk0tyhZ30cCw', 'random')]
#https://open.spotify.com/playlist/5d6iujEAP0Vvgmt4KWbB4K?si=wHgX7uUPTj22veFJoOJEfg
#https://open.spotify.com/playlist/7qpKRykChs7H1VyDQ80CUI?si=os6kztNxRfyXVL2FhuXrZQ
#https://open.spotify.com/playlist/6vFhflGD63oXEcdhNmYHCA?si=d5b14783b9d04340

for (id, name) in playlist_ids_to_retrieve:
    print("Creating dataset for playlist ", name)
    playlist_df = get_playlist_df(id)

    shuffled_indices = np.random.permutation(song_df.index)
    shuffled_song_df = song_df.loc[shuffled_indices].reset_index(drop=True)
    data = pd.concat([shuffled_song_df[:playlist_df.shape[0]], playlist_df], axis=0)

    playlist_track_ids = playlist_df['track_id'].tolist()
    data['in_playlist'] = data['track_id'].apply(lambda x: 1 if x in playlist_track_ids else 0)
    data['release_date'] = data['release_date'].astype('datetime64[ns]')
    data['release_date'] = data['release_date'].dt.year

    data = data.sample(frac=1).reset_index(drop=True)

    df_to_csv(data,name)
    print("Dataset for playlist ", name, " created")

Creating dataset for playlist  random
Dataset for playlist  random  created


In [None]:
truly_random_df = song_df.sample(200)
truly_random_df['release_date'] = truly_random_df['release_date'].astype('datetime64[ns]')
truly_random_df['release_date'] = truly_random_df['release_date'].dt.year

half = len(truly_random_df) // 2
truly_random_df['in_playlist'] = [1] * half + [0] * (len(truly_random_df) - half)

# Shuffle the DataFrame
truly_random_df = truly_random_df.sample(frac=1).reset_index(drop=True)

df_to_csv(truly_random_df,'truly_random_df')

Getting Devraj's playlist