# Feature Extraction from Spotify

In this notebook, we fetch Spotify audio features for the tracks in the MSD genre dataset and merge them with the features we extracted earlier.

In [18]:
import spotipy
import requests
import spotipy.util as util
import pprint, json

The client credentials and authorization scope needed to obtain a token are defined below:

In [16]:
import os

# Spotify API credentials are read from the environment so that secrets never
# live in the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the
# variable names spotipy itself looks up when no explicit value is supplied.
# Export both before starting the kernel; any credentials that were previously
# committed in this cell should be treated as leaked and rotated.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')

# Space-separated authorization scopes requested for the user token.
scope = 'user-library-read playlist-modify user-read-private'

## Getting authorization token

In [14]:
def get_token(username, scope, client_id, client_secret, redirect_uri='http://localhost/'):
    """Request a Spotify user token through ``util.prompt_for_user_token``.

    Args:
        username: Spotify user name the token is requested for.
        scope: Space-separated authorization scopes.
        client_id: Client id of the registered Spotify application.
        client_secret: Client secret of the registered Spotify application.
        redirect_uri: Redirect URI handed to the authorization prompt.
            Defaults to ``'http://localhost/'``, the value this notebook
            has always used.

    Returns:
        Whatever ``util.prompt_for_user_token`` returns for these arguments.
    """
    return util.prompt_for_user_token(
        username,
        scope,
        client_id,
        client_secret,
        redirect_uri=redirect_uri,
    )

## Searching for a track by its name and artist

In [112]:
def get_track_id_by_name_and_artist(token, track_name, artist_name):
    """Look up a track on the Spotify search endpoint and return its id.

    Args:
        token: Bearer token sent in the ``authorization`` header.
        track_name: Track title used in the ``track:`` part of the query.
        artist_name: Artist name used in the ``artist:`` part of the query.

    Returns:
        The ``id`` of the first track in the search result, or a sentinel:
        ``-2`` when the HTTP response is not OK, ``-1`` when the search
        yields no track.
    """
    response = requests.get(
        'https://api.spotify.com/v1/search',
        headers={'authorization': "Bearer " + token},
        params={'q': 'track:' + track_name + ' artist:' + artist_name, 'type': 'track'},
    )
    if not response.ok:
        return -2
    tracks = response.json()["tracks"]
    # Guard on the item list as well as on the reported total: indexing an
    # empty list would raise IndexError instead of reporting "not found".
    items = tracks.get("items") or []
    if tracks["total"] == 0 or not items:
        return -1
    return items[0]["id"]

## Getting audio features of tracks

In [9]:
def get_audio_features(sp, track_id_list):
    """Return ``sp.audio_features`` for the given list of track ids.

    Args:
        sp: Client object exposing an ``audio_features(tracks=...)`` method.
        track_id_list: Track ids forwarded as the ``tracks`` argument.
    """
    audio_feature_records = sp.audio_features(tracks=track_id_list)
    return audio_feature_records

Now that we have defined the functions we need, we are ready to go! Let's get a token.

In [117]:
# Spotify account the authorization prompt is issued for.
username = '11131118133'

# Obtain a user token, then build the spotipy client authenticated with it.
token = get_token(
    username=username,
    scope=scope,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(auth=token)

## Getting other features

Let's import the necessary packages and define the features we want to take from Spotify. We will request the audio features from Spotify using each track's Spotify id.

In [96]:
import pandas as pd
import numpy as np

# Column names of the collected table: the MSD track id first, followed by the
# keys that are picked out of each Spotify audio-features record.
feature_list = [
    "track_id_msd",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "id",
]
print("Size of feature list: ", str(len(feature_list)))

Size of feature list:  13


Let's fetch the audio features for the tracks in our dataset.

In [103]:
# Number of columns in each collected audio-feature row.
size_of_audio_features = len(feature_list)

# Load the previously extracted MSD feature table.
features = pd.read_csv('../msd_extra_features.csv')

    

In [98]:
# Display the column names of the loaded `features` table.
features.columns

Index(['Unnamed: 0', 'genre', 'track_id', 'artist_name', 'title', 'loudness',
       'tempo', 'time_signature', 'key', 'mode', 'duration', 'avg_timbre1',
       'avg_timbre2', 'avg_timbre3', 'avg_timbre4', 'avg_timbre5',
       'avg_timbre6', 'avg_timbre7', 'avg_timbre8', 'avg_timbre9',
       'avg_timbre10', 'avg_timbre11', 'avg_timbre12', 'var_timbre1',
       'var_timbre2', 'var_timbre3', 'var_timbre4', 'var_timbre5',
       'var_timbre6', 'var_timbre7', 'var_timbre8', 'var_timbre9',
       'var_timbre10', 'var_timbre11', 'var_timbre12', '0', '1', '2', '3', '4',
       '5', '6', '7', '8', '9'],
      dtype='object')

In [118]:
# Rows whose index label is at or below this value are skipped (they were
# collected in an earlier session). Set to -1 to process the whole table.
RESUME_AFTER_INDEX = 15196
# A fresh token and client are requested whenever the row index is a multiple
# of this value.
TOKEN_REFRESH_EVERY = 1000

# On a fresh kernel start from a header-only array; when resuming in a live
# kernel keep appending to the rows already held in `audio_features`.
try:
    audio_features
except NameError:
    audio_features = np.array(feature_list)

not_found = 0
new_rows = []  # rows gathered in this run; stacked onto `audio_features` once
try:
    for index, row in features.iterrows():
        if index <= RESUME_AFTER_INDEX:
            continue
        if index % TOKEN_REFRESH_EVERY == 0:
            token = get_token(username, scope, CLIENT_ID, CLIENT_SECRET)
            sp = spotipy.Spotify(auth=token)
            print("Not found:" + str(not_found))
        track_id_msd = row.track_id
        track_name = row.title
        artist_name = row.artist_name
        print("Index\t" + str(index) + "\tTrack\t" + track_name + "\tArtist\t" + artist_name)
        track_id = get_track_id_by_name_and_artist(token, track_name, artist_name)
        if track_id == -2:
            # The search request was rejected: refresh the token AND the client
            # built from it, then retry once.
            token = get_token(username, scope, CLIENT_ID, CLIENT_SECRET)
            sp = spotipy.Spotify(auth=token)
            track_id = get_track_id_by_name_and_artist(token, track_name, artist_name)

        track_features = None
        # Both sentinels (-1: no match, -2: request still failing) mean there is
        # no Spotify id to query; never pass them on as if they were track ids.
        if track_id not in (-1, -2):
            features_in_json = get_audio_features(sp, [track_id])
            track_features = features_in_json[0] if features_in_json else None

        if track_features is None:
            print("Track not found "+ track_name + "\t" +artist_name)
            not_found += 1
            continue

        # Pick values by name so every value lands under its own column header,
        # independent of the key order of the API response.
        new_rows.append([track_id_msd] + [track_features[name] for name in feature_list[1:]])
finally:
    # Stack once (also when the loop is interrupted) so partial progress is
    # kept without re-copying the whole array on every row.
    if new_rows:
        audio_features = np.vstack([audio_features, new_rows])

print(not_found)


Index	15197	Track	Funk Factory (Remastered Single  Version)	Artist	Wilson Pickett
Index	15198	Track	In The Midnight Hour	Artist	Wilson Pickett
Index	15199	Track	I Found A True Love (LP Version)	Artist	Wilson Pickett
Track not found I Found A True Love (LP Version)	Wilson Pickett
Index	15200	Track	I'm A Midnight Mover (LP Version)	Artist	Wilson Pickett
Track not found I'm A Midnight Mover (LP Version)	Wilson Pickett
Index	15201	Track	Everybody Needs Somebody To Love (LP Version)	Artist	Wilson Pickett
Track not found Everybody Needs Somebody To Love (LP Version)	Wilson Pickett
Index	15202	Track	Knock On Wood (LP Version)	Artist	Wilson Pickett
Track not found Knock On Wood (LP Version)	Wilson Pickett
Index	15203	Track	Ooh Poo Pah Doo (LP Version)	Artist	Wilson Pickett
Track not found Ooh Poo Pah Doo (LP Version)	Wilson Pickett
Index	15204	Track	It's Too Late (LP Version)	Artist	Wilson Pickett
Track not found It's Too Late (LP Version)	Wilson Pickett
Index	15205	Track	She's Lookin' Good (L

In [148]:
# Row 0 of `audio_features` holds the column names; the remaining rows are data.
audio_features_frame = pd.DataFrame(data=audio_features[1:], columns=audio_features[0])

# The key column is written as "track_id_msd". The former
# `rename(columns={"track_id_msd": "track_id"})` call discarded its result, so
# the file has always carried this name, and the later
# `set_index("track_id_msd")` cell reads a column of that name.
audio_features_frame.to_csv("audio_features.csv")


In [None]:
# NOTE(review): this is a bare reference to the function, so no merge is
# performed here; the merge step appears to be unfinished. TODO confirm.
pd.merge

In [144]:
# Tidy the table read from audio_features_2.csv: drop its "index" column, key
# the rows by MSD track id, and save the result.
# `drop(columns=...)` replaces the positional axis argument, which pandas 2.0
# no longer accepts. The `set_index` result is now kept instead of discarded,
# so the saved file is indexed by "track_id_msd" rather than by a row counter.
audio_features = (
    pd.read_csv('audio_features_2.csv', sep=",")
    .drop(columns=["index"])
    .set_index("track_id_msd")
)
audio_features.to_csv("audio_features_v3.csv")