## Data Collection
In this notebook I will pull in data from Spotify. First I will load a JSON file containing one year of my listening history, which I requested from Spotify. Once I have this data, I will pull in various features from the Spotify API relating to each artist and track in my listening history. Lastly, I will collect a library of similar data points for thousands of tracks, which I can compare against a user's listening history to surface recommendations.

1- Spotify Listening History- 1 year listening history from Spotify
<br>2- Spotify API
<br>3- Kaggle Spotify Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Load the Spotify listening-history export (JSON) into a DataFrame.
streaming_df = pd.read_json('../data/StreamingHistory0.json')

In [3]:
# Add a constant column so that every row represents exactly one play.
# NOTE(review): presumably summed later to get play counts per artist/track — confirm.
streaming_df['count'] = 1
streaming_df.head(1)

Unnamed: 0,endTime,artistName,trackName,msPlayed,count
0,2021-10-26 23:02,Daniel Caesar,Transform (feat. Charlotte Day Wilson),277984,1


In [4]:
# Load the Kaggle Spotify track library (one row per track, with audio-feature columns).
kaggle_df = pd.read_csv('../data/data.csv')
kaggle_df.head(1)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954


In [5]:
kaggle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

### Extended Spotify Streaming History

In [6]:
# The extended streaming history is delivered as three JSON files; read each
# one and stack them into a single frame.
extended, extended1, extended2 = map(
    pd.read_json,
    ['../data/endsong_0.json', '../data/endsong_1.json', '../data/endsong_2.json'],
)
extended = pd.concat([extended, extended1, extended2])

In [7]:
# Stacking the files leaves repeated index labels; rebuild a clean 0..n-1 index
# and inspect the resulting schema.
extended = extended.reset_index(drop=True)
extended.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39734 entries, 0 to 39733
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ts                                 39734 non-null  object 
 1   username                           39734 non-null  int64  
 2   platform                           39734 non-null  object 
 3   ms_played                          39734 non-null  int64  
 4   conn_country                       39734 non-null  object 
 5   ip_addr_decrypted                  39734 non-null  object 
 6   user_agent_decrypted               38370 non-null  object 
 7   master_metadata_track_name         39716 non-null  object 
 8   master_metadata_album_artist_name  39716 non-null  object 
 9   master_metadata_album_album_name   39716 non-null  object 
 10  spotify_track_uri                  39716 non-null  object 
 11  episode_name                       18 non-null     obj

### Spotify API

In [15]:
# API credentials live in a local `config.py` (exposing `cid` and `secret`) so
# that the keys never appear in the notebook itself.
import spotipy
import config

from spotipy.oauth2 import SpotifyClientCredentials
# Build the credentials manager from the hidden keys, then create the shared
# `sp` client that every API helper in this notebook calls.
client_credentials_manager = SpotifyClientCredentials(client_id=config.cid, client_secret=config.secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [7]:
def artist_features(data):
    """Search Spotify for each artist name and collect basic artist metadata.

    NOTE: a later cell in this notebook redefines ``artist_features`` with a
    single-artist signature; whichever definition ran last is the live one.

    Parameters
    ----------
    data : iterable of str, or str
        Artist names to look up. A bare string is treated as ONE artist name
        instead of being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        One row per search hit (up to 10 hits per name) with the columns
        ``artist_id``, ``artistName``, ``genres``, ``artist_popularity`` and
        ``followers``. Empty (but with those columns) when nothing matched.
    """
    if isinstance(data, str):
        data = [data]

    columns = ['artist_id', 'artistName', 'genres', 'artist_popularity', 'followers']
    rows = []

    for name in data:
        # NOTE(review): the query keeps the original "artist: <name>" spelling
        # (with a space after the colon) — confirm it filters as intended.
        results = sp.search(q=f'artist: {name}', type='artist', limit=10)
        for hit in results['artists']['items']:
            rows.append({
                'artist_id': hit['id'],
                'artistName': hit['name'],
                'genres': hit['genres'],
                'artist_popularity': hit['popularity'],
                'followers': hit['followers']['total'],
            })

    # Building from records keeps every column the same length by construction.
    return pd.DataFrame(rows, columns=columns)


def track_names(data):
    """Resolve (artist, track) pairs to Spotify track ids via the search endpoint.

    Parameters
    ----------
    data : iterable of (str, str)
        Pairs of artist name and track title.

    Returns
    -------
    pandas.DataFrame
        Columns ``trackID`` and ``trackName`` holding the id and name of the
        single best search hit for each pair; pairs without a hit add no row.
    """
    found_ids = []
    found_names = []

    for artist, title in data:
        query = "artist:" + artist + " track:" + title
        response = sp.search(q=query, type="track", limit =1)
        for hit in response['tracks']['items']:
            found_names.append(hit['name'])
            found_ids.append(hit['id'])

    tracks = pd.DataFrame(found_ids, columns = ['trackID'])
    tracks['trackName'] = found_names
    return tracks

In [8]:
def track_features(data, batch_size=100):
    """Fetch Spotify audio features for a collection of track ids.

    Ids are requested in chunks instead of one request per track, which cuts
    the number of API calls. If a chunk fails, its ids are retried one by one
    so a single bad id only costs itself.

    Parameters
    ----------
    data : iterable of str, or str
        Spotify track ids. A bare string is treated as one id. Entries that
        are not non-empty strings (e.g. ``0`` / ``None`` placeholders) are
        skipped without calling the API.
    batch_size : int, default 100
        Ids per request, clamped to the range 1..100.

    Returns
    -------
    pandas.DataFrame
        One row per track that returned features, with ``trackID`` followed by
        the audio-feature columns. Tracks that could not be fetched are left
        out and reported through a single warning.
    """
    import warnings  # local import: keeps this cell independent of the import cell

    feature_keys = [
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'duration_ms', 'time_signature',
    ]
    columns = ['trackID'] + feature_keys

    requested = [data] if isinstance(data, str) else list(data)
    track_ids = [t for t in requested if isinstance(t, str) and t]
    skipped = len(requested) - len(track_ids)
    batch_size = max(1, min(int(batch_size), 100))

    rows = []
    for start in range(0, len(track_ids), batch_size):
        chunk = track_ids[start:start + batch_size]
        try:
            results = sp.audio_features(tracks=chunk) or []
        except Exception:
            # The whole request failed; fall back to one id at a time so that
            # only the offending ids are lost (best-effort, like the original).
            results = []
            for track_id in chunk:
                try:
                    results.extend(sp.audio_features(tracks=[track_id]) or [])
                except Exception:
                    skipped += 1

        for feats in results:
            if not feats:
                # Empty / None entry: no features came back for this id.
                skipped += 1
                continue
            try:
                row = {'trackID': feats['id']}
                row.update((k, feats[k]) for k in feature_keys)
            except KeyError:
                skipped += 1
                continue
            rows.append(row)

    if skipped:
        warnings.warn(f'track_features: no audio features for {skipped} of {len(requested)} input id(s)')

    return pd.DataFrame(rows, columns=columns)

In [8]:
# Smoke test: fetch audio features for two hard-coded track ids.
track_features(['5h5T5gtl1RqMxOuQ5O8rzs', '6z4uqDqO9LLdsW3i8GR9Li'])

Unnamed: 0,trackID,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,5h5T5gtl1RqMxOuQ5O8rzs,0.865,0.549,0,-10.083,1,0.244,0.0845,0.156,0.081,0.616,142.8,233300,4
1,6z4uqDqO9LLdsW3i8GR9Li,0.424,0.204,8,-12.952,0,0.0907,0.779,0.842,0.11,0.334,83.934,147632,4


In [16]:
# Split the unique Kaggle artist strings into 1000 batches and look up only the
# first batch as a trial run.
# NOTE(review): each `x` here is a single artist string, so this call matches
# the single-artist `artist_features` defined in the Kaggle section further
# down, not the iterable version defined above — confirm which was live.
batches = np.array_split(kaggle_df['artists'].unique(), 1000)
test = [artist_features(x) for x in batches[0]]

In [12]:
# Split the artist names from the listening history into 4 batches and check
# how many names land in the first one.
batches = np.array_split(streaming_df['artistName'], 4)
len(batches[0])

954

In [None]:
# Look up artist metadata in 4 batches. Only unique names are queried: repeated
# plays of the same artist would return the same hits and be dropped below anyway.
# This expects the iterable version of `artist_features` (one call per batch).
batches = np.array_split(streaming_df['artistName'].unique(), 4)
art_feat = [artist_features(x) for x in batches]

# Stack every batch result (however many there are) and keep the first hit
# for each artist name.
test = pd.concat(art_feat, axis = 0).drop_duplicates(subset = 'artistName')
test

In [None]:
# Attach the artist metadata to every play. The left join keeps plays whose
# artist was not found (their metadata columns are NaN).
final = pd.merge(streaming_df, test, how = "left", on = 'artistName')
final.shape

In [None]:
final.isnull().sum()

In [None]:
# Keep only the plays for which every column is populated.
final = final.dropna()

In [None]:
# Turn the history into (artistName, trackName) pairs, split the pairs into
# 4 chunks, and search Spotify for each chunk separately.
artist_track_pairs = list(final[['artistName', 'trackName']].itertuples(index=False, name=None))
batches = np.array_split(artist_track_pairs, 4)
tracks = list(map(track_names, batches))

In [None]:
# Stack every batch of search results (not just four hard-coded ones) and keep
# the first hit for each track name.
test_track = pd.concat(tracks, axis = 0).drop_duplicates(subset = 'trackName')
test_track

In [12]:
%store -r history

In [13]:
history.head(1)

Unnamed: 0,endTime,artistName,trackName,msPlayed,count,artist_id,genres,artist_popularity,followers,trackID,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2021-10-26 23:02,Daniel Caesar,Transform (feat. Charlotte Day Wilson),277984,1,20wkVLutqVOYrc0kxFs7rA,"[canadian contemporary r&b, pop, r&b]",76.0,3551511.0,1jQfgl9WRle7D8a3GXLwaD,...,-10.656,1.0,0.031,0.511,1.9e-05,0.256,0.348,68.963,280587.0,4.0


In [None]:
# Fetch audio features for the resolved track ids, four chunks at a time.
batches = np.array_split(test_track['trackID'], 4)
track_feat = list(map(track_features, batches))

In [None]:
# Stack every batch of audio features into one frame, regardless of how many
# batches were produced.
track_df = pd.concat(track_feat, axis = 0)
track_df

In [None]:
# Build the complete dataset: one row per track id with its name attached,
# then joined onto the artist + streaming history by track name.
track_df = (
    track_df
    .drop_duplicates(subset = 'trackID')
    .merge(test_track, how = 'left', on = 'trackID')
)

# Plays without matching track features are dropped from the final history.
history = pd.merge(final, track_df, how = "left", on = 'trackName').dropna()

In [None]:
%store history
%store kaggle_df

### Recommendations Library
I need to build a library of random tracks from the Spotify API to compare against my history and pick recommendations from.

<br> I will take two main approaches here: 
<br> 1) Pull random playlists from every Spotify category, then pull tracks from each of those playlists.
<br> 2) Identify artists related to my streaming history and pull top songs from these artists. 

#### Random Playlists & Tracks from Each Spotify Category

In [3]:
#create a function that grabs offsets of the playlist ids
def get_playlist_ids(row):
    """Collect playlist ids from one page of every Spotify browse category.

    Parameters
    ----------
    row : int
        Offset passed to ``sp.category_playlists``, i.e. which page of each
        category's playlists is read.

    Returns
    -------
    set
        Playlist ids found on that page across all categories. Empty slots are
        recorded with the placeholder ``0``; one placeholder is removed before
        de-duplication, so ``0`` is still in the result whenever more than one
        slot was empty. Downstream cells strip it, so this is kept as is.
    """
    # NOTE(review): the reason this category is excluded is not recorded — confirm.
    excluded_category = '0JQ5DAqbMKFRNXsIvgZF9A'
    slots_per_category = 20

    # Categories are read one at a time from offsets 1..54.
    category_id = []
    for i in range(1, 55):
        items = sp.categories(limit=1, offset=i)['categories']['items']
        if items:  # an offset past the last category returns no items
            category_id.append(items[0]['id'])

    # Guarded removal: the excluded category may not be in the response.
    if excluded_category in category_id:
        category_id.remove(excluded_category)

    cat_playlist = [
        sp.category_playlists(category_id=a, offset=row)['playlists']['items']
        for a in category_id
    ]

    playlist_ids = []
    for playlists in cat_playlist:
        for slot in range(slots_per_category):
            try:
                playlist_ids.append(playlists[slot]['id'])
            except (IndexError, TypeError, KeyError):
                # Short page, null entry, or entry without an id.
                playlist_ids.append(0)

    # Same as before (drop one placeholder), but without crashing when every
    # slot was filled and there is no placeholder to remove.
    if 0 in playlist_ids:
        playlist_ids.remove(0)

    return set(playlist_ids)

In [13]:
def get_track_ids(playlist_ids):
    """Return the track ids of every item in one Spotify playlist.

    Parameters
    ----------
    playlist_ids : str
        A single playlist id (despite the plural name); it is passed straight
        to ``sp.playlist_tracks``.

    Returns
    -------
    list
        One entry per playlist item, in playlist order: the track id, or the
        placeholder ``0`` for an item whose track data is missing. Only the
        malformed item is replaced; the remaining items are still collected.
    """
    results = sp.playlist_tracks(playlist_ids)
    items = results['items']

    # Follow the pagination links until the whole playlist has been read.
    while results['next']:
        results = sp.next(results)
        items.extend(results['items'])

    ids = []
    for item in items:
        # The try sits inside the loop so one bad item cannot discard the
        # tracks that come after it.
        try:
            ids.append(item['track']['id'])
        except (TypeError, KeyError):
            ids.append(0)

    return ids

In [4]:
# Read six pages of playlists per category; each value is the page offset
# handed to `get_playlist_ids`.
offset = [0,20,40,60,80,100]
func_ids = [get_playlist_ids(x) for x in offset]

In [5]:
# Merge the per-offset results into one de-duplicated set of playlist ids,
# however many offsets were fetched.
playlist_ids = set().union(*func_ids)

# Drop the `0` placeholder used for empty slots; discard() is a no-op when it
# is absent, so the cell can be re-run safely.
playlist_ids.discard(0)

In [6]:
len(playlist_ids)

2353

In [11]:
#playlist_ids

In [None]:
# Exclude one specific playlist from the library.
# NOTE(review): the reason for excluding it is not recorded — confirm.
# discard() avoids a KeyError when the id is absent or the cell is re-run.
playlist_ids.discard('37i9dQZF1DWVztgMIUG66M')

In [30]:
batches = np.array_split(list(playlist_ids), 5)

In [31]:
len(batches[0])

471

In [39]:
# The requests were not being processed in one go, so the playlists are handled
# in separate smaller batches (originally run at different times of day) to
# avoid rate limiting. Each result is a list of per-playlist track-id lists.
ids, ids1, ids2, ids3, ids4 = (
    [get_track_ids(playlist) for playlist in batch]
    for batch in batches[:5]
)

In [5]:
# Flatten each list of per-playlist track-id lists into one de-duplicated set.
# `ids` (the first batch) is flattened as well; it was previously left as a
# list of lists, which made the placeholder removal below fail.
ids = {track for playlist in ids for track in playlist}
ids1 = {track for playlist in ids1 for track in playlist}
ids2 = {track for playlist in ids2 for track in playlist}
ids3 = {track for playlist in ids3 for track in playlist}
ids4 = {track for playlist in ids4 for track in playlist}

# Drop the `0` placeholder recorded for unreadable playlist items. discard()
# does nothing when a batch happens to contain no placeholder.
for id_set in (ids, ids1, ids2, ids3, ids4):
    id_set.discard(0)

In [8]:
%store ids 
%store ids1 
%store ids2 
%store ids3 
%store ids4

Stored 'ids' (set)
Stored 'ids1' (set)
Stored 'ids2' (set)
Stored 'ids3' (set)
Stored 'ids4' (set)


In [15]:
%store -r ids 
%store -r ids1 
%store -r ids2 
%store -r ids3 
%store -r ids4

In [16]:
# Break every id set into 36 smaller batches so that the feature requests can
# be paced to stay under the rate limit.
ids_batches, ids1_batches, ids2_batches, ids3_batches, ids4_batches = (
    np.array_split(list(id_set), 36)
    for id_set in (ids, ids1, ids2, ids3, ids4)
)

In [43]:
# library_feats = []
# counter = 0

In [34]:
# Fetch audio features for batches 34-35 of `ids4_batches`, pausing 30 seconds
# before every second request to stay under the API rate limit.
# `library_feats` and `counter` are expected to exist already: they accumulate
# across manual runs of this cell (their initialisation is commented out in
# the cell just before this one).
for x in range(34,36):
    if counter >= 1:
        time.sleep(30)
        counter = 0  # assignment; the earlier `counter == 0` never reset the counter
    else:
        counter += 1
    library_feats.append(track_features(ids4_batches[x]))

In [12]:
len(library_feats)

178

In [11]:
#%store library_feats

In [11]:
%store -r library_feats

#### Get Top Songs from Artists Related to Streaming History 

In [20]:
%store -r history

In [17]:
#sp.artist_related_artists('20wkVLutqVOYrc0kxFs7rA')

In [18]:
def related_artists(artist_ids):
    """Look up the artists Spotify lists as related to one artist.

    Parameters
    ----------
    artist_ids : str
        A single Spotify artist id (despite the plural name); it is passed
        straight to ``sp.artist_related_artists``.

    Returns
    -------
    pandas.DataFrame
        One row per related artist with the columns ``artist_id``,
        ``artistName``, ``genre`` and ``followers``. Entries missing any of
        these fields are skipped, so every column always has the same length.
    """
    results = sp.artist_related_artists(artist_ids)

    rows = []
    for entry in results['artists']:
        try:
            rows.append({
                'artist_id': entry['id'],
                'artistName': entry['name'],
                'genre': entry['genres'],
                'followers': entry['followers']['total'],
            })
        except (KeyError, TypeError):
            # Malformed entry: skip it rather than leave the columns with
            # different lengths, which previously crashed frame construction.
            continue

    return pd.DataFrame(rows, columns=['artist_id', 'artistName', 'genre', 'followers'])

In [23]:
len(history['artist_id'].unique())

636

In [24]:
# One related-artists lookup per unique artist in the listening history.
rel_art = [related_artists(x) for x in history['artist_id'].unique()]

# Stack all results (no hard-coded count), keep one row per artist id and
# rebuild a clean index.
# NOTE: this rebinds `related_artists` from the lookup function to the result
# table, so the function cell must be re-run before this cell can run again.
# The name is kept because later cells store and display `related_artists`.
related_artists = (
    pd.concat(rel_art, axis = 0)
    .drop_duplicates(subset = 'artist_id')
    .reset_index(drop = True)
)

related_artists.head()

In [30]:
%store related_artists

Stored 'related_artists' (DataFrame)


In [32]:
related_artists

Unnamed: 0,artist_id,artistName,genre,followers
0,1A9o3Ljt67pFZ89YtPPL5X,Snoh Aalegra,"[alternative r&b, neo soul, r&b, scandinavian ...",881683.0
1,3Y7RZ31TRPVadSFVy1o8os,H.E.R.,"[pop, r&b, rap, urban contemporary]",5661164.0
2,30DhU7BDmF4PH0JVhu8ZRg,Sabrina Claudio,"[pop, r&b]",1443420.0
3,5aMIbwZQvP2MHPMVC5zCGj,ODIE,"[canadian contemporary r&b, indie r&b]",241438.0
4,3tlXnStJ1fFhdScmQeLpuG,Brent Faiyaz,"[dmv rap, hip hop, pop, r&b, rap]",2889860.0
...,...,...,...,...
5928,2YEnrpAWWaNRFumgde1lLH,Oden & Fatzo,[disco house],16982.0
5929,4pSMnAlD8JVEW3eZDuaQH8,Anish Kumar,[],6794.0
5930,1uF7AFfGahplhiaHEy9NNl,Loods,"[australian dance, disco house]",10598.0
5931,4W991QdgKWX4TO864ypInA,Eats Everything,"[deep disco house, house, raw techno]",80982.0


In [None]:
# search for all tracks from related artists and artists from the random generated playlist
# pull more tracks using offset from random playlist generator
# goal will be to get 500k tracks
# once I have 500k tracks ids pull the track features that are needed for modeling 

In [None]:
#sp.artist_albums('1A9o3Ljt67pFZ89YtPPL5X')['items'][2]['id']

In [None]:
#related_artists['artist_id'][1]

In [None]:
#sp.album_tracks('2OIMJ2Arm0dYpmWIfQOXTD')['items'][0]['name']

#### Pulling in the Genre and Followers for Kaggle Dataset

In [7]:
def artist_features(artist):
    """Look up the top Spotify search hit for one artist string.

    NOTE: this redefines the iterable-based ``artist_features`` from the
    Spotify API section; after this cell runs, calls use this single-artist
    signature.

    Parameters
    ----------
    artist : str
        The text to search for. It is also copied unchanged into the
        ``artists`` column so the result can be matched to its input.
        NOTE(review): the Kaggle ``artists`` values look like stringified
        lists (e.g. "['A', 'B']") and are searched verbatim — confirm.

    Returns
    -------
    pandas.DataFrame
        At most one row with the columns ``artist_id``, ``artists``,
        ``artistName``, ``genre`` and ``followers``; empty (with those
        columns) when the search returns nothing.
    """
    results = sp.search(q=f'artist: {artist}', type='artist', limit=1)

    rows = []
    for hit in results['artists']['items']:
        try:
            rows.append({
                'artist_id': hit['id'],
                'artists': artist,
                'artistName': hit['name'],
                'genre': hit['genres'],
                'followers': hit['followers']['total'],
            })
        except (KeyError, TypeError):
            # Malformed hit: skip it rather than leave the columns with
            # different lengths, which previously crashed frame construction.
            continue

    return pd.DataFrame(rows, columns=['artist_id', 'artists', 'artistName', 'genre', 'followers'])

In [8]:
kaggle_df['artists'][17]

"['Hector Berlioz', 'Arturo Toscanini']"

In [None]:
artist_features(kaggle_df['artists'][0])

In [81]:
# Pulling artist data in one go ran into issues, so the API requests are run
# in smaller batches. Artist strings of 200+ characters are filtered out
# because they are too long to be searched.
# 150 batches: later cells index batches 110-119 and 149, which do not exist
# with a 100-way split.
artist_batches = np.array_split(kaggle_df['artists'].loc[(kaggle_df['artists'].str.len() < 200)].unique(), 150)

In [142]:
len(artist_batches[149])

226

In [90]:
kaggle_artists = []

In [107]:
# Manual run for one batch (index 4): look up every artist string in it and
# append the list of per-artist frames to `kaggle_artists`.
kaggle_artists.append([artist_features(x) for x in artist_batches[4]])

In [None]:
# Process batches 110-119, pausing 30 seconds before each one to stay under
# the API rate limit. The earlier `counter` branching is removed: both branches
# slept and fetched identically, and `counter == 0` was a comparison with no
# effect.
for x in range(110,120):
    time.sleep(30)
    kaggle_artists.append([artist_features(a) for a in artist_batches[x]])

In [10]:
len(kaggle_artists)

50

In [14]:
len(kaggle_df['artists'].loc[(kaggle_df['artists'].str.len() < 200)].unique())

33951

In [128]:
%store kaggle_artists

Stored 'kaggle_artists' (list)


In [9]:
%store -r kaggle_artists