# Spotipy Audio Features Extraction

Below is the code to extract audio features from each track in every playlist.

**Expected runtime:** 20 hours for 10k playlists

In [3]:
# import standard libraries
import os
import pandas as pd
import sys
import numpy as np
import scipy as sp
import sklearn as sk
import itertools
import math
import missingno as msno
import re
import requests
from bs4 import BeautifulSoup

# import model tools
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# import viz libraries
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt

# import Spotify API and credentials
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials



# Widen pandas' rendering limits so wide frames are shown in full
# (up to 1500 characters per line and up to 100 columns).
pd.options.display.width = 1500
pd.options.display.max_columns = 100

%matplotlib inline

In [4]:
# Setting up the Spotify API client.
#
# Credentials are read from the environment instead of being hardcoded in the
# notebook, so they never end up in version control or in shared copies.
# Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel.
# NOTE(review): a client secret was previously committed in this cell; it
# should be rotated in the Spotify developer dashboard.
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')

if not SPOTIPY_CLIENT_ID or not SPOTIPY_CLIENT_SECRET:
    # Fail early with an actionable message rather than on the first API call.
    raise RuntimeError(
        'Spotify credentials not found: set the SPOTIPY_CLIENT_ID and '
        'SPOTIPY_CLIENT_SECRET environment variables before running this notebook.'
    )

# requests_timeout / retries are unchanged from the original configuration.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
    ),
    requests_timeout=10,
    retries=7,
)


In [5]:
# get data
# The CSV location can be overridden with the PLAYLIST_DATA_SAMPLE_PATH
# environment variable so the notebook also runs on machines other than the
# one it was written on; when the variable is unset the original location is used.
DATA_SAMPLE_PATH = os.environ.get(
    'PLAYLIST_DATA_SAMPLE_PATH',
    'C:/Users/saul/CS109A/2021-CS109A/project/cs109-personal-notebook/data-sample.csv',
)

playlist_data_sample = pd.read_csv(DATA_SAMPLE_PATH)
playlist_data_sample.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,0,798,Rap,False,342798,1503705600,201,143,1,"[{'pos': 0, 'artist_name': 'Wiz Khalifa', 'tra...",32,45908012,93,
1,1,936,Workout,False,174936,1505952000,62,61,1,"[{'pos': 0, 'artist_name': 'PSY', 'track_uri':...",18,13274359,54,
2,2,331,apr,False,53331,1497398400,48,43,1,"[{'pos': 0, 'artist_name': 'TOKiMONSTA', 'trac...",10,12333516,42,
3,3,820,Yacht Rock,False,174820,1442102400,101,90,1,"[{'pos': 0, 'artist_name': 'Christopher Cross'...",2,25529025,77,
4,4,575,Good Vibes,False,916575,1509062400,214,196,1,"[{'pos': 0, 'artist_name': 'Kaleo', 'track_uri...",123,46241434,152,


In [6]:
# This function takes the following input:
#   playlist_info: an iterable with one entry per playlist; each entry is the string
#                  representation of a list of track dictionaries (each with a 'track_uri' key)
#
#   returns: playlist_df: DataFrame containing the summed and averaged audio features
#            (danceability, energy, valence) for each playlist

def _fetch_track_features(client, track_uri):
    """Return (danceability, energy, valence) for one track, or None if unavailable."""
    try:
        response = client.audio_features(track_uri)
    except Exception:
        # One retry, as before. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt through so a long-running extraction can be stopped.
        print('Timeout... trying again')
        response = client.audio_features(track_uri)

    # An empty/None response or a None entry means no features for this track.
    audio_features = response[0] if response else None
    if audio_features is None:
        return None

    try:
        # Read all three values before returning so a track is either fully
        # counted or fully skipped.
        return (
            audio_features['danceability'],
            audio_features['energy'],
            audio_features['valence'],
        )
    except (KeyError, TypeError):
        return None


def get_features(playlist_info, client=None):
    """Summarise Spotify audio features for each playlist.

    For every playlist, the danceability, energy and valence of its tracks are
    looked up through the Spotify client and reduced to a per-playlist sum and
    a per-playlist mean.

    Parameters
    ----------
    playlist_info : iterable
        One entry per playlist. Each entry is either a list of track
        dictionaries or the string representation of such a list (e.g. as read
        back from a CSV column). Every track dictionary must have a
        'track_uri' key.
    client : object, optional
        Object exposing ``audio_features(track_uri)`` that returns a list whose
        first element is the feature dictionary (or None). Defaults to the
        module-level ``spotify`` client.

    Returns
    -------
    pandas.DataFrame
        One row per playlist, in input order, with the columns
        'danceability', 'energy', 'valence' (sums over the playlist's tracks)
        and 'avg danceability', 'avg energy', 'avg valence' (means).
        Tracks without available features are skipped. A playlist with no
        usable tracks gets sums of 0 and NaN averages.
    """
    import ast  # local import: only needed to parse stringified playlists

    if client is None:
        client = spotify

    playlist_data = ['danceability', 'energy', 'valence', 'avg danceability', 'avg energy', 'avg valence']

    # Cache of track_uri -> (danceability, energy, valence) or None, shared
    # across playlists so each distinct track is requested only once.
    all_tracks_info = {}

    rows = []

    for playlist in playlist_info:
        if isinstance(playlist, str):
            # literal_eval only accepts Python literals, so text coming from a
            # data file cannot execute arbitrary code (unlike eval).
            playlist = ast.literal_eval(playlist)

        danceability = []
        energy = []
        valence = []

        for track in playlist:
            track_uri = track['track_uri']

            if track_uri not in all_tracks_info:
                all_tracks_info[track_uri] = _fetch_track_features(client, track_uri)

            track_feats = all_tracks_info[track_uri]
            if track_feats is None:
                continue

            danceability.append(track_feats[0])
            energy.append(track_feats[1])
            valence.append(track_feats[2])

        per_feature = (danceability, energy, valence)

        # sums of the audio features over the playlist
        totals = [sum(values) for values in per_feature]

        # average per track; NaN (without a NumPy warning) when nothing was collected
        averages = [np.mean(values) if values else np.nan for values in per_feature]

        rows.append(totals + averages)

    # Build the frame with its column names directly, so the values land under
    # the named columns instead of under separate integer-labelled ones.
    playlist_df = pd.DataFrame(rows, columns=playlist_data)

    return playlist_df


In [7]:
# Pull the per-playlist track listings out of the sample frame
playlists = playlist_data_sample.loc[:, 'tracks']

# Build the per-playlist audio-feature summary table
features = get_features(playlist_info=playlists)