## Getting data from the Spotify API to build features for prediction

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import spotipy
import spotipy.util as util
import json

# from functions import *
import pickle
import json
from datetime import datetime
from collections import Counter

In [2]:
# Read the Spotify API credentials (client_id / client_secret) from a JSON file kept
# outside the notebook. The file must be valid JSON, i.e. strings in double quotes,
# otherwise json.load raises.
with open('../credentials.json') as cred_file:
    credentials = json.load(cred_file)
# no explicit close(): the `with` block already closes the file on exit

In [3]:
# Client-credentials auth: build the credentials manager from the loaded credentials,
# then ask it for an access token to hand to the Spotify client.
# TODO: check how much access the default token allows and whether more is needed
# to do more interesting things.
token = util.oauth2.SpotifyClientCredentials(
    client_secret=credentials['client_secret'],
    client_id=credentials['client_id'],
)
cache_token = token.get_access_token()


In [4]:
# Spotify client object used for the API calls in this notebook, authenticated with the access token
sp = spotipy.Spotify(auth=cache_token)

### Loading in data and search queries to use the API

In [None]:
# Load the Billboard data, lower-case the two column names used for searching, and build
# one lower-cased "artist song" search string per row.
# NOTE(review): other cells read from '../data/...' — confirm this relative path is intended.
data = pd.read_csv('data/billboards.csv').rename(columns={'Artist': 'artist', 'Song': 'song'})
data['search_queries'] = (data['artist'] + ' ' + data['song']).str.lower()

In [None]:
def get_features(df):
    """
    Run one Spotify track search per entry in `df.search_queries` and collect the results.

    For each query the top hit (limit=1) is passed to `clean_search_results`, and the
    query string is stored on the returned dict under 'search_query'.

    This is a generator that yields exactly once: the complete list of result dicts.

    NOTE(review): `clean_search_results` is not defined in the visible notebook —
    presumably it came from the commented-out `from functions import *`; confirm.
    """
    collected = []
    for query in df.search_queries.values:
        response = sp.search(q=query, type='track', limit=1)
        record = clean_search_results(response['tracks']['items'])  # returns a dictionary
        record['search_query'] = query
        collected.append(record)
    yield list(collected)

In [None]:
# Run every search query first, then write the results in one go. Opening the file only
# after the API loop has finished means a failure mid-run cannot leave behind an empty or
# truncated output.pkl.
# get_features is a generator that yields one list, so results_list is a one-element list
# and results_list[0] holds the per-query result dicts.
results_list = list(get_features(data))
# NOTE(review): a later cell reads '../data/output.pkl' — confirm both paths point at the same file.
with open('output.pkl', 'wb') as out_file:
    pickle.dump(results_list[0], out_file)

In [None]:
# Number of result dicts collected (results_list[0] is the single list yielded by get_features)
len(results_list[0])

In [None]:
# Reload the raw search results, turn them into a DataFrame and save that DataFrame.
# Context managers close both files (and flush the written one) right away instead of
# leaving dangling handles for the garbage collector.
with open('../data/output.pkl', 'rb') as raw_file:
    test = pickle.load(raw_file)
test_df = pd.DataFrame(test)
with open('../data/spotify_api_search_results.pkl', 'wb') as df_file:
    pickle.dump(test_df, df_file)

In [None]:
# NTS: I need to look back and review what popularity is
# `with` closes the file handle that pickle.load(open(...)) used to leave open
with open('../data/spotify_api_search_results.pkl', 'rb') as results_file:
    output_data = pickle.load(results_file)

In [8]:
# NOTE(review): this loads 'search_results.pkl', not the 'spotify_api_search_results.pkl'
# written by the earlier cell — confirm which file is intended.
# `with` closes the file handle that pickle.load(open(...)) used to leave open
with open('../data/search_results.pkl', 'rb') as results_file:
    output_data = pickle.load(results_file)
output_data.head()

Unnamed: 0,artists,duration_ms,explicit,id,modified_query,name,popularity,search_query
0,"2 Chainz, Travis Scott",255560,True,1nX9KhK3Fff27SnrIor2Yb,,4 AM,72,2 chainz 4 am
1,"2 Chainz, Ty Dolla $ign, Trey Songz, Jhene Aiko",210200,True,6H0AwSQ20mo62jGlPGB8S6,,It's A Vibe,76,2 chainz it's a vibe
2,"2 Chainz, YG, Offset",234666,True,365wwIjijQdlRJEjUWTidq,,PROUD,61,2 chainz proud
3,"2 Chainz, Drake, Quavo",225893,True,5S1IUPueD0xE0vj4zU3nSf,,Bigger Than You (feat. Drake & Quavo),75,"2 chainz, drake bigger > you"
4,21 Savage,220306,True,2fQrGHiQOvpL9UgPvtYy6G,,Bank Account,83,21 savage bank account


In [9]:
# Check column dtypes and non-null counts of the loaded search results
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822 entries, 0 to 821
Data columns (total 8 columns):
artists           822 non-null object
duration_ms       822 non-null object
explicit          822 non-null object
id                822 non-null object
modified_query    13 non-null object
name              822 non-null object
popularity        822 non-null object
search_query      822 non-null object
dtypes: object(8)
memory usage: 51.5+ KB


In [10]:
# Drop the mostly-empty modified_query column (13 of 822 rows filled).
# Rebinding instead of inplace=True, plus errors='ignore', keeps this cell safe to re-run:
# the in-place version raises KeyError the second time because the column is already gone.
output_data = output_data.drop(columns=['modified_query'], errors='ignore')

In [11]:
def get_audio_features(id_value):
    """
    Look up Spotify's audio features for one track id.

    On success, returns the first entry of `sp.audio_features(id_value)` with its 'id'
    key set to `id_value`.

    If anything in the lookup raises, the error is printed and the placeholder
    {'id': 'missing'} is returned, so a bulk `.apply` over many ids keeps going.
    """
    try:
        results = sp.audio_features(id_value)[0]
        results['id'] = id_value
    except Exception as e:
        # Report the failure instead of swallowing it silently (same as get_audio_analysis);
        # the returned placeholder is unchanged.
        print(f'audio features lookup failed for {id_value!r}: {e}')
        results = {'id': 'missing'}
    return results

In [None]:
# Look up audio features for every track id, one get_audio_features call per row;
# the result is a Series holding one dict per track.
audio_features = output_data['id'].apply(get_audio_features)

In [34]:
# Peek at the first few per-track feature dicts
audio_features.head()

0    {'danceability': 0.796, 'energy': 0.5, 'key': ...
1    {'danceability': 0.822, 'energy': 0.505, 'key'...
2    {'danceability': 0.781, 'energy': 0.81, 'key':...
3    {'danceability': 0.888, 'energy': 0.515, 'key'...
4    {'danceability': 0.884, 'energy': 0.346, 'key'...
Name: id, dtype: object

In [36]:
# need to flatten it: turn the Series of per-track dicts into a plain list of dicts
audio_list = audio_features.tolist()
# show only a couple of entries; echoing the whole list floods the notebook output
audio_list[:2]

[{'danceability': 0.796,
  'energy': 0.5,
  'key': 1,
  'loudness': -7.21,
  'mode': 1,
  'speechiness': 0.425,
  'acousticness': 0.118,
  'instrumentalness': 0,
  'liveness': 0.155,
  'valence': 0.227,
  'tempo': 75.012,
  'type': 'audio_features',
  'id': '1nX9KhK3Fff27SnrIor2Yb',
  'uri': 'spotify:track:1nX9KhK3Fff27SnrIor2Yb',
  'track_href': 'https://api.spotify.com/v1/tracks/1nX9KhK3Fff27SnrIor2Yb',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1nX9KhK3Fff27SnrIor2Yb',
  'duration_ms': 255560,
  'time_signature': 4},
 {'danceability': 0.822,
  'energy': 0.505,
  'key': 7,
  'loudness': -7.384,
  'mode': 1,
  'speechiness': 0.147,
  'acousticness': 0.0317,
  'instrumentalness': 0.000911,
  'liveness': 0.114,
  'valence': 0.523,
  'tempo': 73.003,
  'type': 'audio_features',
  'id': '6H0AwSQ20mo62jGlPGB8S6',
  'uri': 'spotify:track:6H0AwSQ20mo62jGlPGB8S6',
  'track_href': 'https://api.spotify.com/v1/tracks/6H0AwSQ20mo62jGlPGB8S6',
  'analysis_url': 'https://api.spoti

In [39]:
# One row per track, one column per feature key.
# NOTE(review): this rebinds `audio_features` from the Series of dicts to a DataFrame, so
# re-running the flattening cell after this one no longer sees the Series of dicts.
audio_features = pd.DataFrame(audio_list)

In [40]:
# Persist the audio-features DataFrame; the `with` block guarantees the file is flushed and closed
with open('../data/audio_features.pkl', 'wb') as features_file:
    pickle.dump(audio_features, features_file)

NTS: it seems like the track method returns results that I already have stored, so I'm not going to bother with that right now. It might be something to look into later when I come back to the project.

In [14]:
def get_audio_analysis(song_id):
    """
    Fetch the track-level summary of Spotify's audio analysis for `song_id`.

    Only the fields listed in `wanted` are kept from the 'track' section of the response.
    If the lookup raises, the exception is printed and every wanted field is filled with
    an empty string instead. In both cases the returned dict also carries 'id' set to
    `song_id`.
    """
    wanted = [
        'num_samples', 'duration', 'sample_md5', 'offset_seconds',
        'window_seconds', 'analysis_sample_rate', 'analysis_channels',
        'end_of_fade_in', 'start_of_fade_out', 'loudness', 'tempo',
        'tempo_confidence', 'time_signature', 'time_signature_confidence',
        'key', 'key_confidence', 'mode', 'mode_confidence',
    ]
    try:
        track_section = sp.audio_analysis(song_id)['track']
        result = {}
        for field in track_section:
            if field in wanted:
                result[field] = track_section[field]
    except Exception as err:
        print(err)
        # keep the same set of keys so failed lookups still line up as DataFrame columns
        result = dict.fromkeys(wanted, '')
    result['id'] = song_id
    return result
        

In [None]:
## I think I also remember being able to get features on the artist too. I should explore what other things are available in the API.

In [15]:
# Fetch the audio analysis for every track id (one get_audio_analysis call per row), then
# wrap the resulting Series of dicts in a DataFrame: a single 'id' column with one dict per row.
analysis_series = output_data['id'].apply(get_audio_analysis)
analysis_df = pd.DataFrame.from_dict(analysis_series)
analysis_df.head()

Unnamed: 0,id
0,"{'num_samples': 5635098, 'duration': 255.56, '..."
1,"{'num_samples': 4634910, 'duration': 210.2, 's..."
2,"{'num_samples': 5174400, 'duration': 234.66667..."
3,"{'num_samples': 4980948, 'duration': 225.89333..."
4,"{'num_samples': 4857762, 'duration': 220.30667..."


In [30]:
# Each row of analysis_df holds a single dict in its only column; pull those dicts out into a list.
# (The old `flatten_values = flatten_list.extend(...)` always bound None, because
# list.extend returns None, so that name is dropped.)
flatten_list = [row[0] for row in analysis_df.values]
# show only a couple of entries rather than echoing the whole list
flatten_list[:2]

[{'num_samples': 5635098,
  'duration': 255.56,
  'sample_md5': '',
  'offset_seconds': 0,
  'window_seconds': 0,
  'analysis_sample_rate': 22050,
  'analysis_channels': 1,
  'end_of_fade_in': 0.0,
  'start_of_fade_out': 252.70857,
  'loudness': -7.21,
  'tempo': 75.012,
  'tempo_confidence': 0.58,
  'time_signature': 4,
  'time_signature_confidence': 1.0,
  'key': 1,
  'key_confidence': 0.271,
  'mode': 1,
  'mode_confidence': 0.516,
  'id': '1nX9KhK3Fff27SnrIor2Yb'},
 {'num_samples': 4634910,
  'duration': 210.2,
  'sample_md5': '',
  'offset_seconds': 0,
  'window_seconds': 0,
  'analysis_sample_rate': 22050,
  'analysis_channels': 1,
  'end_of_fade_in': 0.0,
  'start_of_fade_out': 198.87891,
  'loudness': -7.384,
  'tempo': 73.003,
  'tempo_confidence': 0.297,
  'time_signature': 4,
  'time_signature_confidence': 1.0,
  'key': 7,
  'key_confidence': 0.281,
  'mode': 1,
  'mode_confidence': 0.436,
  'id': '6H0AwSQ20mo62jGlPGB8S6'},
 {'num_samples': 5174400,
  'duration': 234.66667,


In [32]:
# Rebuild analysis_df from the flattened dicts: one column per analysis field, plus 'id'.
analysis_df = pd.DataFrame(flatten_list)

In [33]:
# Tag the output file with today's month and day (MM_DD). A single strftime call reads the
# clock once, so month and day cannot disagree if the cell happens to run across midnight.
query_date = datetime.today().strftime('%m_%d')
print(query_date)
# `with` makes sure the pickle file is flushed and closed once the dump finishes
with open(f'../data/analysis_query_{query_date}.pkl', 'wb') as analysis_file:
    pickle.dump(analysis_df, analysis_file)

04_27


### NTS: I want to get the release date of the songs too 