## Combining all the data that I've gotten from the Spotify API

In [1]:
import pandas as pd
import numpy as np

import json
import pickle
from glob import glob

In [2]:
# List every entry directly under ../data (glob returns paths in arbitrary order).
data_paths = glob('../data/*')
data_paths

['../data/audio_features.pkl',
 '../data/analysis_query_04_24.pkl',
 '../data/all_data.csv',
 '../data/artist_query_results.pkl',
 '../data/output.pkl',
 '../data/genres_encoded.pkl',
 '../data/billboards.csv',
 '../data/artists_df_dummied.pkl',
 '../data/spotify_api_search_results.pkl']

In [3]:
# Load the pickled search results and wrap them in a DataFrame.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that were produced locally / come from a trusted source.
search_results_path = '../data/output.pkl'
with open(search_results_path, 'rb') as pkl_file:
    search_results = pd.DataFrame(pickle.load(pkl_file))
search_results.head()

Unnamed: 0,artists,duration_ms,explicit,id,name,popularity,search_query
0,"2 Chainz, Travis Scott",255560.0,True,1nX9KhK3Fff27SnrIor2Yb,4 AM,72.0,2 chainz 4 am
1,"2 Chainz, Ty Dolla $ign, Trey Songz, Jhene Aiko",210200.0,True,6H0AwSQ20mo62jGlPGB8S6,It's A Vibe,76.0,2 chainz it's a vibe
2,"2 Chainz, YG, Offset",234666.0,True,365wwIjijQdlRJEjUWTidq,PROUD,61.0,2 chainz proud
3,,,,,,,2 chainz x gucci mane x quavo good drank
4,"2 Chainz, Drake, Quavo",225893.0,True,5S1IUPueD0xE0vj4zU3nSf,Bigger Than You (feat. Drake & Quavo),75.0,"2 chainz, drake bigger > you"


In [4]:
# Column dtypes and non-null counts for the search results.
search_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 7 columns):
artists         828 non-null object
duration_ms     828 non-null object
explicit        828 non-null object
id              828 non-null object
name            828 non-null object
popularity      828 non-null object
search_query    828 non-null object
dtypes: object(7)
memory usage: 45.4+ KB


In [5]:
# Load the pickled audio features and wrap them in a DataFrame.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that were produced locally / come from a trusted source.
audio_features_path = '../data/audio_features.pkl'
with open(audio_features_path, 'rb') as pkl_file:
    audio_features = pd.DataFrame(pickle.load(pkl_file))
audio_features.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,uri,valence
0,0.118,0.796,255560.0,0.5,1nX9KhK3Fff27SnrIor2Yb,0.0,1.0,0.155,-7.21,1.0,0.425,75.012,4.0,spotify:track:1nX9KhK3Fff27SnrIor2Yb,0.227
1,0.0317,0.822,210200.0,0.505,6H0AwSQ20mo62jGlPGB8S6,0.000911,7.0,0.114,-7.384,1.0,0.147,73.003,4.0,spotify:track:6H0AwSQ20mo62jGlPGB8S6,0.523
2,0.738,0.781,234667.0,0.81,365wwIjijQdlRJEjUWTidq,0.0,10.0,0.144,-4.301,0.0,0.356,173.949,4.0,spotify:track:365wwIjijQdlRJEjUWTidq,0.908
3,,,,,missing,,,,,,,,,,
4,0.247,0.888,225893.0,0.515,5S1IUPueD0xE0vj4zU3nSf,0.0,1.0,0.446,-6.246,1.0,0.271,141.009,4.0,spotify:track:5S1IUPueD0xE0vj4zU3nSf,0.264


In [6]:
# Column dtypes and non-null counts for the audio features.
audio_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 15 columns):
acousticness        809 non-null float64
danceability        809 non-null float64
duration_ms         809 non-null float64
energy              809 non-null float64
id                  828 non-null object
instrumentalness    809 non-null float64
key                 809 non-null float64
liveness            809 non-null float64
loudness            809 non-null float64
mode                809 non-null float64
speechiness         809 non-null float64
tempo               809 non-null float64
time_signature      809 non-null float64
uri                 809 non-null object
valence             809 non-null float64
dtypes: float64(13), object(2)
memory usage: 97.1+ KB


In [7]:
# Left join on the row index (not on `id`): row i of the search results is
# paired with row i of the audio features. Columns present in both frames
# get pandas' default `_x` / `_y` suffixes.
# NOTE(review): assumes both frames are in the same row order -- TODO confirm.
search_audio_features = search_results.merge(
    audio_features,
    how='left',
    left_index=True,
    right_index=True,
)

In [8]:
# How many songs could not be found through the search function?
# Counted as the rows whose `acousticness` value is missing after the merge.
search_audio_features['acousticness'].isna().sum()  # that's not too bad

19

In [9]:
# Load the pickled analysis query results.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that were produced locally / come from a trusted source.
analysis_path = '../data/analysis_query_04_24.pkl'
with open(analysis_path, 'rb') as pkl_file:
    analysis = pickle.load(pkl_file)

# Collect the first element of every row of `analysis.values`.
# `.values` is evaluated once and iterated directly rather than being
# re-evaluated and indexed on every loop step.
analysis_list = [row[0] for row in analysis.values]

In [10]:
# Build a DataFrame from the collected analysis records; note that this
# rebinds `analysis` from the raw unpickled object to the DataFrame.
analysis = pd.DataFrame(analysis_list)
analysis.head()

Unnamed: 0,analysis_channels,analysis_sample_rate,duration,end_of_fade_in,id,key,key_confidence,loudness,mode,mode_confidence,num_samples,offset_seconds,sample_md5,start_of_fade_out,tempo,tempo_confidence,time_signature,time_signature_confidence,window_seconds
0,1.0,22050.0,255.56,0.0,1nX9KhK3Fff27SnrIor2Yb,1.0,0.271,-7.21,1.0,0.516,5635098.0,0.0,,252.709,75.012,0.58,4.0,1.0,0.0
1,1.0,22050.0,210.2,0.0,6H0AwSQ20mo62jGlPGB8S6,7.0,0.281,-7.384,1.0,0.436,4634910.0,0.0,,198.879,73.003,0.297,4.0,1.0,0.0
2,1.0,22050.0,234.667,0.08689,365wwIjijQdlRJEjUWTidq,10.0,0.353,-4.301,0.0,0.499,5174400.0,0.0,,230.017,173.949,0.578,4.0,1.0,0.0
3,,,,,,,,,,,,,,,,,,,
4,1.0,22050.0,225.893,2.51356,5S1IUPueD0xE0vj4zU3nSf,1.0,0.802,-6.246,1.0,0.7,4980948.0,0.0,,221.884,141.009,0.753,4.0,0.971,0.0


In [11]:
# Column dtypes and non-null counts for the analysis frame.
analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 19 columns):
analysis_channels            828 non-null object
analysis_sample_rate         828 non-null object
duration                     828 non-null object
end_of_fade_in               828 non-null object
id                           828 non-null object
key                          828 non-null object
key_confidence               828 non-null object
loudness                     828 non-null object
mode                         828 non-null object
mode_confidence              828 non-null object
num_samples                  828 non-null object
offset_seconds               828 non-null object
sample_md5                   828 non-null object
start_of_fade_out            828 non-null object
tempo                        828 non-null object
tempo_confidence             828 non-null object
time_signature               828 non-null object
time_signature_confidence    828 non-null object
window_second

In [12]:
# Left join on the row index again: row i of the combined search/audio-feature
# frame is paired with row i of the analysis frame. Column names present in
# both frames get pandas' default `_x` / `_y` suffixes.
# NOTE(review): assumes both frames are in the same row order -- TODO confirm.
all_data = search_audio_features.merge(
    analysis,
    how='left',
    left_index=True,
    right_index=True,
)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 41 columns):
artists                      828 non-null object
duration_ms_x                828 non-null object
explicit                     828 non-null object
id_x                         828 non-null object
name                         828 non-null object
popularity                   828 non-null object
search_query                 828 non-null object
acousticness                 809 non-null float64
danceability                 809 non-null float64
duration_ms_y                809 non-null float64
energy                       809 non-null float64
id_y                         828 non-null object
instrumentalness             809 non-null float64
key_x                        809 non-null float64
liveness                     809 non-null float64
loudness_x                   809 non-null float64
mode_x                       809 non-null float64
speechiness                  809 non-null float64
tem

In [13]:
# Write the combined frame to CSV without the index column.
# This overwrites any existing ../data/all_data.csv.
all_data.to_csv('../data/all_data.csv', index=False)

NTS: I should also check the quality of the results from the search queries to make sure that I'm getting what I expect.

It's probably important to check the overlap between the ids that result from the search query and the ids of the audio features that were returned for them.

In [None]:
# Preview the first rows of the search results.
search_results.head()

In [None]:
# Column dtypes and non-null counts for the search results.
search_results.info()

In [None]:
# Preview the first rows of the audio features.
audio_features.head()

In [None]:
# Build the sets of distinct ids from each frame. Every id is cast to a
# string first, so missing values end up in the sets as their string form.
search_ids = search_results['id'].values.astype(str)
audio_feature_ids = audio_features['id'].values.astype(str)

search_set = set(search_ids)
audio_features_set = set(audio_feature_ids)

In [None]:
# Ids that appear in the search results but not in the audio features.
# (Despite the name, `main_list` is a set.)
main_list = search_set - audio_features_set
# Ids that appear in exactly one of the two sets.
search_set ^ audio_features_set

In [None]:
# Number of distinct (stringified) ids in the search results.
len(search_set)

In [None]:
# Spot check: is this one specific id present among the stringified audio_features ids?
'000xQL6tZNLJzIrtIgxqSl' in set(audio_features['id'].values.astype(str))