## Data Collection
In this notebook I will pull in data from spotify. First I will pull in a json file which includes 1 year of my listening history that I requested from Spotify. Once I have this data I will pull in various features from the spotify API relating to each artist and track in my listening history. Lastly I will collect a library of similar datapoints for thousands of tracks which I can use to compare to a users listening history to surface recommendations.

1- Spotify Listening History- 1 year listening history from Spotify
<br>2- Spotify API
<br>3- Kaggle Spotify Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
streaming_df = pd.read_json('../data/StreamingHistory0.json')

In [3]:
streaming_df['count'] = 1
streaming_df.head(10)

Unnamed: 0,endTime,artistName,trackName,msPlayed,count
0,2021-10-26 23:02,Daniel Caesar,Transform (feat. Charlotte Day Wilson),277984,1
1,2021-10-28 00:36,Nia Sultana,Coconut Water,25836,1
2,2021-10-28 00:36,Nia Sultana,Positions,146823,1
3,2021-10-28 00:40,Wizkid,Essence (feat. Tems),234590,1
4,2021-10-28 00:42,Nija,Finesse,129850,1
5,2021-10-28 00:45,Femme It Forward,What You Deserve,175702,1
6,2021-10-28 01:02,Bathe,Sundress,3100,1
7,2021-10-28 01:02,Tone Stith,Do I Ever (feat. Chris Brown),166817,1
8,2021-10-28 01:04,Sonder,Nobody But You,165440,1
9,2021-10-28 01:27,Adele,Easy On Me,120624,1


In [7]:
streaming_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3815 entries, 0 to 3814
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   endTime     3815 non-null   object
 1   artistName  3815 non-null   object
 2   trackName   3815 non-null   object
 3   msPlayed    3815 non-null   int64 
 4   count       3815 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 149.1+ KB


In [8]:
streaming_df.isnull().sum()

endTime       0
artistName    0
trackName     0
msPlayed      0
count         0
dtype: int64

In [9]:
streaming_df.describe()

Unnamed: 0,msPlayed,count
count,3815.0,3815.0
mean,153129.0,1.0
std,134488.8,0.0
min,0.0,1.0
25%,40089.5,1.0
50%,168387.0,1.0
75%,217990.0,1.0
max,3822727.0,1.0


In [10]:
streaming_df.groupby('trackName')['count'].count().sort_values(ascending=False).head(20)

trackName
CPR                                 89
Later                               86
Lose Control                        71
Coming Home                         71
Session 32                          62
One Right Now (with The Weeknd)     61
telepatía                           60
On It                               57
In the Air                          55
Hometown Glory                      55
You Can't Save Me                   52
Remember Me                         52
Essence (feat. Tems)                50
Bad Habit                           50
Positions                           47
Waiting For (feat. Jamila Woods)    46
Moment                              43
I Drink Wine                        40
Better Than                         30
Not Another Love Song               29
Name: count, dtype: int64

In [11]:
streaming_df.groupby('artistName')['count'].count().sort_values(ascending=False).head(20)

artistName
Adele               207
Summer Walker       197
Beyoncé             159
Leon Bridges        102
Pinegrove            91
L.A.B.               90
Jazmine Sullivan     89
emawk                88
MEDUZA               78
Steve Lacy           70
Kali Uchis           68
Free Nationals       68
Post Malone          63
Nia Sultana          59
Lake Street Dive     57
SiR                  57
Wizkid               54
UMI                  53
rum.gold             48
Matt Corby           48
Name: count, dtype: int64

In [12]:
#678 different artists
artists = streaming_df['artistName'].unique()
len(streaming_df['artistName'].unique())

678

In [None]:
#Pull artist ID's 
#Use Artist IDs to get the genre associated with each artist
#Pull other artists within a given genre 

In [13]:
#load kaggle data
kaggle_df = pd.read_csv('../data/data.csv')
kaggle_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [14]:
kaggle_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

### Spotify API

In [15]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
cid = "bec8ed6d2e4c4623869edd14b310b1eb"
secret = "dd390053697a4cad874554c022b5db03"
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [20]:
artist = []
artist_id = []
genres = []
artist_popularity= []
followers = []

In [24]:
for i in streaming_df['artistName'][3000:]:
    results = sp.search(q=f'artist: {i}', type='artist', limit=1)
    for i, t in enumerate(results['artists']['items']):
        artist.append(t['name'])
        artist_id.append(t['id'])
        genres.append(t['genres'])
        artist_popularity.append(t['popularity'])
        followers.append(t['followers']['total'])

In [25]:
check = []
for name in streaming_df['artistName']:
    if name not in artist:
        check.append(name)

In [26]:
len(check)

235

In [27]:
spotify_api = pd.DataFrame(artist)
spotify_api['artist_id'] = artist_id
spotify_api['genres'] = genres
spotify_api['artist_popularity'] = artist_popularity
spotify_api['followers'] = followers
spotify_api.rename(columns = {0:'artistName'}, inplace = True)
spotify_api.head(10)

Unnamed: 0,artistName,artist_id,genres,artist_popularity,followers
0,Daniel Caesar,20wkVLutqVOYrc0kxFs7rA,"[canadian contemporary r&b, pop, r&b]",76,3541677
1,Nia Sultana,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43,17892
2,Nia Sultana,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43,17892
3,Wizkid,3tVQdUvClmAT7URs9V3rsp,"[afro dancehall, afropop, azonto, nigerian hip...",76,2799272
4,Nija,7f9KxQWD88MZrSY6jc0zoW,"[alternative r&b, indie r&b, pop r&b, r&b]",48,29755
5,Femme It Forward,3rw1MKkbLTZw46AqC7gRpH,"[alternative r&b, indie r&b]",36,11517
6,Bathe,3BBN1P1JNw0sSdYEdBkOZK,"[alternative r&b, chill r&b, experimental r&b,...",41,9077
7,Tone Stith,756t7CBmWLNYsshVtS6P44,"[alternative r&b, chill r&b, indie r&b, pop r&...",52,160990
8,Sonder,2ICR2m4hOBPhaYiZB3rnLW,[r&b],66,457770
9,Adele,4dpARuHxo51G3z768sgnrY,"[british soul, pop, pop soul, uk pop]",84,43349077


In [27]:
#streaming_df.to_csv('../data/streaming_df.csv')
#spotify_api.to_csv('../data/spotify_api.csv')

In [28]:
name = []
id = []
genre = []
popularity = []
follower = []

for i in check:
    results = sp.search(q=f'artist: {i}', type='artist', limit=10)
    for i, t in enumerate(results['artists']['items']):
        name.append(t['name'])
        id.append(t['id'])
        genre.append(t['genres'])
        popularity.append(t['popularity'])
        follower.append(t['followers']['total'])

In [29]:
missing_api = pd.DataFrame(name)
missing_api['artist_id'] = id
missing_api['genres'] = genre
missing_api['artist_popularity'] = popularity
missing_api['followers'] = follower
missing_api.rename(columns = {0:'artistName'}, inplace = True)
missing_api.head(10)

Unnamed: 0,artistName,artist_id,genres,artist_popularity,followers
0,Sleeping With Sirens,3N8Hy6xQnQv1F1XCiyGQqA,"[metalcore, modern rock, pop emo, pop punk, ro...",65,2080896
1,SiR,3QTDHixorJelOLxoxcjqGx,"[alternative r&b, hip hop, indie soul, la pop,...",63,741363
2,Sirah,3oAazIwC0nAYkOKVQPUC38,[],54,17876
3,Sir Chloe,6rniTPs9zN26kYnkPdFl1U,"[indie pop, modern rock]",61,511188
4,Siqtruand,34nrdgWD7gZThWiwbFWsI3,[],31,11
5,Sir Neville Marriner,6NUhQz7eAEsZvjEHTKHux9,"[classical performance, orchestral performance]",63,27869
6,Sahir,0dqT9B1Xej71qvAo8uE4Uh,"[reggaeton flow, trap boricua, trap latino]",43,19271
7,Sir Simon Rattle,4GQwgdcDQwqtcHICjUNndp,"[choral, classical performance, orchestral per...",61,47389
8,Sire,6YRoP7nazDmTYCtvhz5fJU,[],38,3959
9,Sir Mix-A-Lot,3TQ9JTBI2n2hfo7aRONEYV,"[gangster rap, hip hop, hip house, old school ...",54,481854


In [30]:
check2 = []
for n in check:
    if n not in name:
        check2.append(n)

In [31]:
check2

['ELIZA',
 'FantasyPros - Fantasy Football Podcast',
 '2021 Wrapped',
 'BLEU',
 'The Joe Rogan Experience',
 'The Joe Rogan Experience',
 'The Pomp Podcast',
 'The Journal.',
 'The Mindset Mentor',
 'The Mindset Mentor',
 'The Mindset Mentor',
 'The Mindset Mentor',
 'The Mindset Mentor',
 'The Mindset Mentor',
 'The Mindset Mentor',
 'Focus/Study Music',
 'Girls']

In [32]:
spotify_api.drop_duplicates(subset = 'artist_id',inplace = True)
missing_api.drop_duplicates(subset = 'artistName', inplace = True)
spotify_df = pd.concat([spotify_api, missing_api])
spotify_df.drop_duplicates(subset = 'artistName', inplace = True)
spotify_df

Unnamed: 0,artistName,artist_id,genres,artist_popularity,followers
0,Daniel Caesar,20wkVLutqVOYrc0kxFs7rA,"[canadian contemporary r&b, pop, r&b]",76,3541677
1,Nia Sultana,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43,17892
3,Wizkid,3tVQdUvClmAT7URs9V3rsp,"[afro dancehall, afropop, azonto, nigerian hip...",76,2799272
4,Nija,7f9KxQWD88MZrSY6jc0zoW,"[alternative r&b, indie r&b, pop r&b, r&b]",48,29755
5,Femme It Forward,3rw1MKkbLTZw46AqC7gRpH,"[alternative r&b, indie r&b]",36,11517
...,...,...,...,...,...
2300,Indigo Girls,4wM29TDTr3HI0qFY3KoSFG,"[folk, lilith, singer-songwriter, women's music]",52,287829
2301,Original Broadway Cast of Mean Girls,6B1foTh2pK3K4MKQzYaKRu,[broadway],51,18169
2302,The Cheetah Girls,4ntkql3f3ect7NDRUJ7aAY,"[dance pop, disney, girl group, pop, post-teen...",47,716128
2303,Mary Jane Girls,7vRMMs8yrKf4PKUpUllMkr,"[disco, funk, post-disco, quiet storm]",45,234026


In [33]:
final = pd.merge(streaming_df, spotify_df, how = "left", on = ['artistName', 'artistName'])
final.shape

(3815, 9)

In [34]:
final.isnull().sum()

endTime               0
artistName            0
trackName             0
msPlayed              0
count                 0
artist_id            17
genres               17
artist_popularity    17
followers            17
dtype: int64

In [35]:
final.dropna(inplace= True)

In [36]:
final

Unnamed: 0,endTime,artistName,trackName,msPlayed,count,artist_id,genres,artist_popularity,followers
0,2021-10-26 23:02,Daniel Caesar,Transform (feat. Charlotte Day Wilson),277984,1,20wkVLutqVOYrc0kxFs7rA,"[canadian contemporary r&b, pop, r&b]",76.0,3541677.0
1,2021-10-28 00:36,Nia Sultana,Coconut Water,25836,1,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43.0,17892.0
2,2021-10-28 00:36,Nia Sultana,Positions,146823,1,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43.0,17892.0
3,2021-10-28 00:40,Wizkid,Essence (feat. Tems),234590,1,3tVQdUvClmAT7URs9V3rsp,"[afro dancehall, afropop, azonto, nigerian hip...",76.0,2799272.0
4,2021-10-28 00:42,Nija,Finesse,129850,1,7f9KxQWD88MZrSY6jc0zoW,"[alternative r&b, indie r&b, pop r&b, r&b]",48.0,29755.0
...,...,...,...,...,...,...,...,...,...
3810,2022-10-27 23:26,The Pussycat Dolls,Don't Cha,3130,1,6wPhSqRtPu1UhRCDX5yaDJ,"[dance pop, girl group, pop, post-teen pop, ur...",68.0,5240104.0
3811,2022-10-27 23:26,PAMÉ,Fresh Water,2030,1,5ZSOXLTnZcSjdVCIdjnq03,[],28.0,2111.0
3812,2022-10-27 23:26,KAYTRANADA,What You Need,16440,1,6qgnBH6iDM91ipVXv28OMu,"[escape room, lgbtq+ hip hop]",68.0,1084625.0
3813,2022-10-27 23:26,Lizzo,2 Be Loved (Am I Ready),2100,1,56oDRnqbIiwx4mymNEv7dS,"[escape room, minnesota hip hop, pop, trap queen]",79.0,5113566.0


In [39]:
trackName = []
trackID = []

In [43]:
for a,t in list(final[['artistName', 'trackName']][3000:].itertuples(index=False, name=None)):
    results = sp.search(q="artist:" + a + " track:" + t, type="track", limit =1)
    for i, t in enumerate(results['tracks']['items']):
        trackName.append(t['name'])
        trackID.append(t['id'])

In [44]:
len(trackName)

3723

In [45]:
track_df = pd.DataFrame(trackName, columns = ['trackName'])
track_df['id'] = trackID
track_df

Unnamed: 0,trackName,id
0,Transform (feat. Charlotte Day Wilson),1jQfgl9WRle7D8a3GXLwaD
1,Coconut Water,54I1dC6Ux5I4h7U44DmjgK
2,Positions,4rnriM1hFyeRrdRQSZd9uA
3,Essence (feat. Tems),5FG7Tl93LdH117jEKYl3Cm
4,Finesse,7ifz6wYLdZ1ujxzXSnBTvd
...,...,...
3718,Party (feat. J. Cole),7GjNPaxtLRJxPy1U6bLYrK
3719,Fresh Water,2q8EbgPUw6bCQjVyfGoytw
3720,What You Need,4O9t8Qq941SAzdGlex4noA
3721,2 Be Loved (Am I Ready),2rmwqU7yzTvzkiaRV53DpT


In [301]:
#next step I need the track get audio features (danceability, energy, acousticness, etc)
#I can pull this data using the search query/get track endpoint

In [59]:
# I am pulling the same 1k songs 10 times, I want to pull 10k different songs

In [46]:
len(pd.unique(trackName))

1464

In [47]:
len(final)

3798

In [48]:
danceability = []
energy = []
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
id = []
duration_ms = []
time_signature = []

In [52]:
for t in trackID[3000:]:
    try:
        results = sp.audio_features(tracks = t)
        danceability.append(results[0]['danceability'])
        energy.append(results[0]['energy'])
        key.append(results[0]['key'])
        loudness.append(results[0]['loudness'])
        mode.append(results[0]['mode'])
        speechiness.append(results[0]['speechiness'])
        acousticness.append(results[0]['acousticness'])
        instrumentalness.append(results[0]['instrumentalness'])
        liveness.append(results[0]['liveness'])
        valence.append(results[0]['valence'])
        tempo.append(results[0]['tempo'])
        id.append(results[0]['id'])
        duration_ms.append(results[0]['duration_ms'])
        time_signature.append(results[0]['time_signature'])
    except:
        pass

In [1]:
# features = [danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, id, duration_ms, time_signature]
# fet = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'id', 'duration_ms', 'time_signature']
# col_name = [f'{f}' for f in fet]

In [53]:
#creating track features dataset to be merged with streaming history
track_features = pd.DataFrame(id)
track_features.rename(columns = {0:'id'}, inplace = True)
track_features['danceability'] = danceability
track_features['energy'] = energy
track_features['key'] = key
track_features['loudness'] = loudness 
track_features['mode'] = mode 
track_features['speechiness'] = speechiness
track_features['acousticness'] = acousticness
track_features['instrumentalness'] = instrumentalness
track_features['liveness'] = liveness
track_features['valence'] = valence
track_features['tempo'] = tempo
track_features['duration_ms'] = duration_ms
track_features['time_signature'] = time_signature

In [54]:
track_features.sort_values(by = 'id')

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
1627,00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1,-4.806,1,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507,4
2460,00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1,-4.806,1,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507,4
2127,00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1,-4.806,1,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507,4
1557,00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1,-4.806,1,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507,4
2351,00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1,-4.806,1,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1925,7yBbV2k2S2uhaQc24NF2xt,0.680,0.825,8,-4.296,1,0.0702,0.00173,0.000122,0.2700,0.784,117.987,193533,4
3566,7ySbfLwdCwl1EM0zNCJZ38,0.568,0.495,1,-8.964,1,0.0299,0.35300,0.000000,0.0839,0.303,128.234,281067,4
1357,7ytR5pFWmSjzHJIeQkgog4,0.746,0.690,11,-7.956,1,0.1640,0.24700,0.000000,0.1010,0.497,89.977,181733,4
1982,7zFXmv6vqI4qOt4yGf3jYZ,0.658,0.294,4,-8.533,0,0.0321,0.42200,0.000052,0.0749,0.358,74.038,278180,4


In [55]:
len(final['trackName'].unique())

1522

In [56]:
track_features.drop_duplicates(subset = 'id', inplace = True)

In [57]:
track_features = pd.merge(track_features, track_df, how = 'left', on = 'id')

In [58]:
track_features.drop_duplicates(subset = 'trackName', inplace = True)

In [59]:
temp = pd.merge(final, track_features, how = "left", on = 'trackName')
temp

Unnamed: 0,endTime,artistName,trackName,msPlayed,count,artist_id,genres,artist_popularity,followers,id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2021-10-26 23:02,Daniel Caesar,Transform (feat. Charlotte Day Wilson),277984,1,20wkVLutqVOYrc0kxFs7rA,"[canadian contemporary r&b, pop, r&b]",76.0,3541677.0,1jQfgl9WRle7D8a3GXLwaD,...,-10.656,1.0,0.0310,0.5110,0.000019,0.2560,0.348,68.963,280587.0,4.0
1,2021-10-28 00:36,Nia Sultana,Coconut Water,25836,1,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43.0,17892.0,54I1dC6Ux5I4h7U44DmjgK,...,-9.945,1.0,0.2030,0.0876,0.000032,0.0828,0.696,84.992,180706.0,4.0
2,2021-10-28 00:36,Nia Sultana,Positions,146823,1,1L8An7RfJbMW7zBy2fE0Tz,"[alternative r&b, chill r&b, indie r&b]",43.0,17892.0,4rnriM1hFyeRrdRQSZd9uA,...,-7.714,1.0,0.0596,0.4890,0.001260,0.1090,0.321,84.975,146824.0,4.0
3,2021-10-28 00:40,Wizkid,Essence (feat. Tems),234590,1,3tVQdUvClmAT7URs9V3rsp,"[afro dancehall, afropop, azonto, nigerian hip...",76.0,2799272.0,5FG7Tl93LdH117jEKYl3Cm,...,-6.002,1.0,0.1130,0.0266,0.000009,0.6180,0.602,104.027,248040.0,4.0
4,2021-10-28 00:42,Nija,Finesse,129850,1,7f9KxQWD88MZrSY6jc0zoW,"[alternative r&b, indie r&b, pop r&b, r&b]",48.0,29755.0,7ifz6wYLdZ1ujxzXSnBTvd,...,-8.364,1.0,0.0475,0.1340,0.000026,0.2600,0.351,132.045,173760.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3793,2022-10-27 23:26,The Pussycat Dolls,Don't Cha,3130,1,6wPhSqRtPu1UhRCDX5yaDJ,"[dance pop, girl group, pop, post-teen pop, ur...",68.0,5240104.0,,...,,,,,,,,,,
3794,2022-10-27 23:26,PAMÉ,Fresh Water,2030,1,5ZSOXLTnZcSjdVCIdjnq03,[],28.0,2111.0,2q8EbgPUw6bCQjVyfGoytw,...,-6.318,0.0,0.2180,0.2300,0.007160,0.2040,0.834,101.996,157010.0,4.0
3795,2022-10-27 23:26,KAYTRANADA,What You Need,16440,1,6qgnBH6iDM91ipVXv28OMu,"[escape room, lgbtq+ hip hop]",68.0,1084625.0,4O9t8Qq941SAzdGlex4noA,...,-6.323,0.0,0.1310,0.0555,0.843000,0.0601,0.600,111.038,183787.0,4.0
3796,2022-10-27 23:26,Lizzo,2 Be Loved (Am I Ready),2100,1,56oDRnqbIiwx4mymNEv7dS,"[escape room, minnesota hip hop, pop, trap queen]",79.0,5113566.0,2rmwqU7yzTvzkiaRV53DpT,...,-4.111,1.0,0.1050,0.0922,0.000000,0.0817,0.915,155.932,187108.0,4.0


In [65]:
#investigate the null values
temp.isnull().sum()

endTime              0
artistName           0
trackName            0
msPlayed             0
count                0
artist_id            0
genres               0
artist_popularity    0
followers            0
id                   0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
dtype: int64

In [64]:
temp.dropna(inplace = True)

In [68]:
%store temp
%store kaggle_df

Stored 'temp' (DataFrame)
Stored 'kaggle_df' (DataFrame)


In [78]:
def get_the_playlists(date,offset):
    '''
    Get playlist ids for a particular day
    Inputs: date in ISO-8601 format
            offfset - page number
    Output: paired playlist ids with number of tracks
    '''
    return ([i['id'],i['snapshot_id'],i['tracks']['total']] for i in sp.featured_playlists(
        locale=None,
        country='US',
        timestamp=date,
        limit=50,
        offset=offset*50
    )['playlists']['items'])
    
def get_playlist_tracks(id_):
    '''
    Get the tracks for a spotify playlist.
    Input: playlist ID
    Output: list of song IDs
    '''
    return [i['track']['id'] for i in sp.user_playlist('spotify',id_)['tracks']['items']]

def get_song_features(ids):
    '''
    Get song features for each song in a list
    '''
    return pd.DataFrame(sp.audio_features(_)[0] for _ in ids)

def get_all_playlists():
    num_days = 1461 # that's 4 years
    start_date = datetime(2015,1,1,0,0)

    dates = ['T'.join(str(start_date + timedelta(hours=i*8)).split()) for i in range(int(num_days*3))]
    playlist_df = pd.DataFrame({0:[],1:[],2:[]})

    for date in tqdm_notebook(dates):
        while True:
            try:
                playlist_df = pd.concat([playlist_df,pd.DataFrame(list(get_the_playlists(date,0)))])
            except:
                time.sleep(30)
                continue
            break
    return playlist_df

def get_tune(search_string):
    attempts = 0
    while attempts < 3:
        try:
            return sp.search(user_df.iloc[0].search_string,type='track',limit=1)['tracks']['items'][0]['id']
        except:
            time.sleep(30)
            attempts += 1
            continue
        break
    return ''

In [302]:
#playlist_df = get_all_playlists()

In [77]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import seaborn as sns
from datetime import datetime, timedelta
from tqdm import tqdm, tqdm_notebook
import json
import time

datetime(2015,1,1,0,0)

datetime.datetime(2015, 1, 1, 0, 0)