## Getting data from Spotify API to get features for prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import spotipy
import spotipy.util as util
import json

# from functions import *
import pickle
import json
from datetime import datetime
from collections import Counter
from fuzzywuzzy import fuzz

In [2]:
# reading in credentials necessary to use the API
# remember to save credentials in double quotes or else it gets mad at you
# (json.load only accepts double-quoted strings)
with open('../credentials.json') as credentials_file:
    credentials = json.load(credentials_file)
# no explicit close() needed: leaving the with-block closes the file

In [3]:
# Credentials manager built from the client_id / client_secret loaded above.
token = util.oauth2.SpotifyClientCredentials(client_id=credentials['client_id'],
                                             client_secret=credentials['client_secret'])
## creates an access token for you to do what you need to do
cache_token = token.get_access_token()
# probably need to check how much access it allows on default and if you need more access to do more interesting things

# need to create an object
# Give the client the credentials manager rather than the one-off token string:
# the manager can request a fresh token when the current one expires, which matters
# for the long per-song request loops later in this notebook.
sp = spotipy.Spotify(client_credentials_manager=token)

### Loading in data and search queries to use the API

In [4]:
# Load the Billboard data and switch the two text columns to lower-case names.
column_names = {'Artist': 'artist', 'Song': 'song'}
data = pd.read_csv('../data/billboards.csv').rename(columns=column_names)

# Search string for the API: "<artist> <song>", lower-cased.
artist_and_song = data['artist'] + ' ' + data['song']
data['search_queries'] = artist_and_song.str.lower()

In [5]:
data.head()

Unnamed: 0,artist,song,num_weeks,search_queries
0,2 Chainz,4 AM,12,2 chainz 4 am
1,2 Chainz,It's A Vibe,20,2 chainz it's a vibe
2,2 Chainz,Proud,2,2 chainz proud
3,2 Chainz x Gucci Mane x Quavo,Good Drank,15,2 chainz x gucci mane x quavo good drank
4,"2 Chainz, Drake",Bigger > You,2,"2 chainz, drake bigger > you"


In [6]:
def clean_search_results(results):
    """Flatten the first track of a list of search hits into a small dict.

    Only ``results[0]`` is read. The returned dict has the keys 'artists'
    (artist names joined with ', '), 'duration_ms', 'explicit', 'name', 'uri'
    and 'release_date' (taken from the hit's 'album' entry), in that order.

    Raises IndexError when ``results`` is empty and KeyError when the first
    hit lacks one of the fields read here.
    """
    top_hit = results[0]
    return {
        'artists': ', '.join([performer['name'] for performer in top_hit['artists']]),
        'duration_ms': top_hit['duration_ms'],
        'explicit': top_hit['explicit'],
        'name': top_hit['name'],
        'uri': top_hit['uri'],
        'release_date': top_hit['album']['release_date'],
    }

In [7]:
def get_features(df):
    """Search Spotify for every query in ``df.search_queries`` and keep the top hit.

    Each query is sent to ``sp.search`` as a track search limited to one result,
    and the hit is flattened with ``clean_search_results``. When the response
    cannot be cleaned (for example the search returned no items) the entry holds
    only the 'search_query' key.

    This is a generator that yields exactly once: a list with one dict per
    query, in input order. Callers unpack it with ``next(...)`` or
    ``list(...)[0]``.
    """
    results_list = []
    for query in df.search_queries.values:
        response = sp.search(q=query, type='track', limit=1)
        try:
            hits = response['tracks']['items']
            cleaned = clean_search_results(hits)  # returns a dictionary
        except Exception:
            # Best-effort: an unusable response becomes an empty record. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit stop the run.
            cleaned = {}
        cleaned['search_query'] = query
        results_list.append(cleaned)
    yield list(results_list)

```python
# Run all the searches BEFORE opening the output file: opening with 'wb' truncates
# it, so a failed API call would otherwise leave an empty pickle behind.
results_list = list(get_features(data))  # the generator yields one item: the full list of dicts
with open('../data/spotify_api_search_results.pkl', 'wb') as results_file:
    pickle.dump(results_list[0], results_file)
```

```python
# NOTE(review): test_df is not defined anywhere in the visible cells — confirm where it comes from.
# The with-block makes sure the file is flushed and closed after the dump.
with open('../data/spotify_api_search_results.pkl', 'wb') as results_file:
    pickle.dump(test_df, results_file)
```

In [13]:
# NOTE: unpickling can execute arbitrary code; only load pickle files this project wrote.
# The with-block closes the file handle instead of leaving it to the garbage collector.
with open('../data/spotify_api_search_results.pkl', 'rb') as results_file:
    output_data = pickle.load(results_file).dropna()  # drop searches that returned nothing
output_data.rename(columns={'key':'artists'}, inplace=True)
output_data.head()

Unnamed: 0,duration_ms,explicit,artists,name,release_date,search_query,uri
0,255560.0,True,"2 Chainz, Travis Scott",4 AM,2017-06-16,2 chainz 4 am,spotify:track:1nX9KhK3Fff27SnrIor2Yb
1,210200.0,True,"2 Chainz, Ty Dolla $ign, Trey Songz, Jhene Aiko",It's A Vibe,2017-06-16,2 chainz it's a vibe,spotify:track:6H0AwSQ20mo62jGlPGB8S6
2,234666.0,True,"2 Chainz, YG, Offset",PROUD,2018-02-08,2 chainz proud,spotify:track:365wwIjijQdlRJEjUWTidq
4,225893.0,True,"2 Chainz, Drake, Quavo",Bigger Than You (feat. Drake & Quavo),2018-06-15,"2 chainz, drake bigger > you",spotify:track:5S1IUPueD0xE0vj4zU3nSf
5,220306.0,True,21 Savage,Bank Account,2017-09-23,21 savage bank account,spotify:track:2fQrGHiQOvpL9UgPvtYy6G


In [15]:
# Guarded so the cell can be re-run: after the first run 'uri' has already been
# replaced by 'id' and reading output_data.uri would raise.
if 'uri' in output_data.columns:
    # URIs look like 'spotify:track:<id>'; keep only the part after 'track:'.
    uri = [track_uri.split('track:')[1] for track_uri in output_data.uri.values]

    output_data['id'] = uri
    output_data.drop(['uri'],axis=1, inplace=True)
output_data.head()

Unnamed: 0,duration_ms,explicit,artists,name,release_date,search_query,id
0,255560.0,True,"2 Chainz, Travis Scott",4 AM,2017-06-16,2 chainz 4 am,1nX9KhK3Fff27SnrIor2Yb
1,210200.0,True,"2 Chainz, Ty Dolla $ign, Trey Songz, Jhene Aiko",It's A Vibe,2017-06-16,2 chainz it's a vibe,6H0AwSQ20mo62jGlPGB8S6
2,234666.0,True,"2 Chainz, YG, Offset",PROUD,2018-02-08,2 chainz proud,365wwIjijQdlRJEjUWTidq
4,225893.0,True,"2 Chainz, Drake, Quavo",Bigger Than You (feat. Drake & Quavo),2018-06-15,"2 chainz, drake bigger > you",5S1IUPueD0xE0vj4zU3nSf
5,220306.0,True,21 Savage,Bank Account,2017-09-23,21 savage bank account,2fQrGHiQOvpL9UgPvtYy6G


### Combining the songs that I found manually

In [16]:
# NOTE: unpickling can execute arbitrary code; only load pickle files this project wrote.
# The with-block closes the file handle once the frame is loaded.
with open('../data/manually_found_songs.pkl', 'rb') as manual_songs_file:
    missing_data = pickle.load(manual_songs_file)
missing_data.head()

Unnamed: 0,search_query,artists,duration_ms,explicit,id,modified_query,name,popularity
0,2 chainz x gucci mane x quavo good drank,"2 Chainz, Gucci Mane, Quavo",222706.0,True,39pS70eeDvyCAF3t8NAlVV,quavo good drank,Good Drank,65.0
1,21 savage my choppa hate n****s,"21 Savage, Metro Boomin",148640.0,True,2D2w9943rsnJOGCrI4aMQp,21 savage my choppa,My Choppa Hate Niggas,67.0
2,becky g + natti natasha sin pijama,Marillo,191555.0,False,6ItVmJiq8rDwBKq026zSa3,becky g pijama,Como Becky G Sin Pijama,0.0
9,jacquees x dej loaf at the club,"Jacquees, DeJ Loaf",173053.0,False,0NqZ65jPelNB13gzsvH2Ma,dej loaf at the club,At The Club,67.0
10,kygo x selena gomez it ain't me,"Kygo, Selena Gomez",216586.0,False,677RjvAT2lpYjo1Whczjzx,kygo it aint me,It Ain't Me,50.0


In [17]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 809 entries, 0 to 827
Data columns (total 7 columns):
duration_ms     809 non-null float64
explicit        809 non-null object
artists         809 non-null object
name            809 non-null object
release_date    809 non-null object
search_query    809 non-null object
id              809 non-null object
dtypes: float64(1), object(6)
memory usage: 50.6+ KB


In [18]:
missing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 0 to 18
Data columns (total 8 columns):
search_query      13 non-null object
artists           13 non-null object
duration_ms       13 non-null float64
explicit          13 non-null object
id                13 non-null object
modified_query    13 non-null object
name              13 non-null object
popularity        13 non-null float64
dtypes: float64(2), object(6)
memory usage: 936.0+ bytes


In [20]:
# Stack the API results on top of the manually found songs, dropping the columns
# only one of the two frames carries ('uri' may already be gone, hence errors='ignore').
# sort=True makes the alphabetical column order explicit: the two frames list their
# columns in different orders, and the default for that case changes between pandas
# versions (the source of the FutureWarning this cell used to emit).
all_songs = pd.concat([output_data.drop('uri', axis=1, errors='ignore'),
           missing_data.drop(['modified_query','popularity'], axis=1)], axis=0, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [21]:
all_songs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 822 entries, 0 to 18
Data columns (total 7 columns):
artists         822 non-null object
duration_ms     822 non-null float64
explicit        822 non-null object
id              822 non-null object
name            822 non-null object
release_date    809 non-null object
search_query    822 non-null object
dtypes: float64(1), object(6)
memory usage: 51.4+ KB


In [22]:
# Persist the combined song list; index=False leaves the row labels out of the CSV.
all_songs.to_csv('../data/all_data_0504.csv', index=False)

In [23]:
def get_audio_features(id_value):
    """Fetch Spotify audio features for a single track id.

    Returns the first element of ``sp.audio_features(id_value)`` with its 'id'
    entry set to ``id_value``. If the request or the unpacking raises, the
    error is printed and ``{'id': 'missing'}`` is returned instead.
    """
    try:
        results = sp.audio_features(id_value)[0]
        results['id'] = id_value
    except Exception as e:
        # Report the failure, as get_audio_analysis does, instead of hiding it;
        # the id is included because the returned record does not carry it.
        print(f'{id_value}: {e}')
        # NOTE(review): the 'missing' sentinel discards which id failed. Kept
        # unchanged because later steps may filter on it — confirm before changing.
        results = {'id': 'missing'}
    return results

In [None]:
# One get_audio_features call (one sp.audio_features request) per song id;
# the result is a Series holding one dict per song.
audio_features = all_songs['id'].apply(get_audio_features)

In [None]:
audio_features.head()

In [None]:
len(audio_features)

In [None]:
# need to flatten it 
audio_list = []
audio_list.extend([value for value in audio_features.values])
audio_list

In [None]:
# Expand the list of per-song dicts into a DataFrame (dict keys become columns).
# NOTE: this rebinds audio_features from the Series produced by .apply to a
# DataFrame, so cells above that read audio_features see a different object if re-run.
audio_features = pd.DataFrame(audio_list)
audio_features.head()

In [None]:
# The with-block makes sure the pickle is flushed and the handle closed after the dump.
with open('../data/audio_features.pkl', 'wb') as audio_features_file:
    pickle.dump(audio_features, audio_features_file)

NTS: it seems like the track method returns results that I already have stored, so I'm not going to bother with that right now. It might be something to look into later when I come back to the project.

In [None]:
def get_audio_analysis(song_id):
    """
    Look up Spotify's audio analysis for one song id and keep its track-level summary.

    Returns a dict with those entries of the response's 'track' section whose
    names are in the whitelist below, plus 'id' set to song_id. If the request
    or the filtering raises, the exception is printed and every whitelisted
    field is set to an empty string instead.
    """
    wanted_fields = ['num_samples', 'duration', 'sample_md5', 'offset_seconds',
                     'window_seconds', 'analysis_sample_rate', 'analysis_channels',
                     'end_of_fade_in', 'start_of_fade_out', 'loudness', 'tempo',
                     'tempo_confidence', 'time_signature', 'time_signature_confidence',
                     'key', 'key_confidence', 'mode', 'mode_confidence']
    try:
        track_section = sp.audio_analysis(song_id)['track']
        summary = {}
        for field in track_section:
            if field in wanted_fields:
                summary[field] = track_section[field]
    except Exception as err:
        print(err)
        # Placeholder record so a failed lookup still has every expected field.
        summary = dict.fromkeys(wanted_fields, '')
    summary['id'] = song_id
    return summary
        

In [None]:
## I think I also remember being able to get features on the artist too. I should explore what other things are available in the API.

In [None]:
# One get_audio_analysis call per track id; each row of the result still holds a
# dict (the flatten step below reads values[0] from every row).
# NOTE(review): this iterates output_data['id'], not all_songs['id'], so the manually
# found songs merged into all_songs are not analysed here — confirm that is intended.
analysis_df = pd.DataFrame.from_dict(output_data['id'].apply(get_audio_analysis))
analysis_df.head()

In [None]:
# Pull the dict out of each row so the records can be expanded into columns.
# Built directly as a list: list.extend returns None, so assigning its result is meaningless.
flatten_list = [values[0] for values in analysis_df.values]
flatten_list[:5]  # preview only, rather than printing every record

In [None]:
# Rebuild analysis_df from the flattened dicts (dict keys become columns),
# replacing the one-dict-per-row frame of the same name.
analysis_df = pd.DataFrame(flatten_list)

In [None]:
# Tag the output file with today's date as zero-padded MM_DD, read from a single
# timestamp so month and day always belong to the same moment.
query_date = datetime.today().strftime('%m_%d')
print(query_date)
# The with-block makes sure the pickle is flushed and the handle closed after the dump.
with open(f'../data/analysis_query_{query_date}.pkl', 'wb') as analysis_file:
    pickle.dump(analysis_df, analysis_file)

### NTS: I want to get the release date of the songs too 