In [None]:
!pip install spotipy numpy pandas matplotlib seaborn plotly chart_studio sklearn

# Importing Packages

In [None]:
import json
import os
import sys
import time
from math import log
from pprint import pprint

import requests
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
import plotly.figure_factory as ff
import chart_studio.tools as tls
import chart_studio.plotly as py

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Acquiring Personal Spotify Data

- Visit https://www.spotify.com/us/account/privacy/ -- log into Spotify account and scroll to the bottom and request data.
- Receive a downloadable zip file with listening data from Spotify's team in around 1-3 days.
- Move ***MyData*** Folder to desired directory.

## Load & extract streaming data from JSON


In [None]:
# Read both streaming-history files and join their records into one list.
history_paths = ('full-analysis/StreamingHistory0.json',
                 'full-analysis/StreamingHistory1.json')

history_parts = []
for history_path in history_paths:
    with open(history_path, encoding='utf8') as f:
        history_parts.append(json.load(f))

jsondata1, jsondata2 = history_parts
jsondata = jsondata1 + jsondata2

## Extract data for each listening instance

In [None]:
my_streaming = pd.DataFrame()


def extract_json_value(column_name, data):
    """Return the value stored under `column_name` for every record in `data`.

    `data` is iterated once and each item is indexed with `column_name`;
    the values are returned as a list in the same order.
    """
    values = []
    for record in data:
        values.append(record[column_name])
    return values

# One DataFrame column per field of a streaming instance:
#   track_name  <- trackName   (track name)
#   artist_name <- artistName  (artist name)
#   end_time    <- endTime     (timestamp at which the stream ended)
#   ms_played   <- msPlayed    (milliseconds listened in that instance)
streaming_fields = {
    'track_name': 'trackName',
    'artist_name': 'artistName',
    'end_time': 'endTime',
    'ms_played': 'msPlayed',
}

for column, json_key in streaming_fields.items():
    my_streaming[column] = extract_json_value(json_key, jsondata)

# Accessing the Spotify API

## Create an App

1) Go to https://developer.spotify.com/dashboard/applications

2) Create an App

3) Name your App 

4) Go to App --> Find Client_ID & Client Secret

## Retrieve token using account details

In [None]:
# Spotify username; override by setting the SPOTIFY_USERNAME environment variable.
username = os.environ.get('SPOTIFY_USERNAME', 'shahv1057')

# Client credentials are read from the environment so they are never stored in the notebook.
# Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET (the values found on the app's dashboard page)
# before starting Jupyter; a missing variable raises KeyError here.
# NOTE(review): the credentials that were previously hard-coded in this cell should be rotated.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

# This can be any localhost site
redirect_uri = 'http://localhost:1234/callback'

scope = 'user-read-recently-played'

# Running this cell will open a prompt at 'redirect_uri', click 'agree' to authorize and connect to API
token = util.prompt_for_user_token(username=username,
                                   scope=scope,
                                   client_id=client_id,
                                   client_secret=client_secret,
                                   redirect_uri=redirect_uri)


## Get the Spotify track ID for every track in the DataFrame


In [None]:
def get_id(track_name,artist, token):
    '''
    Input: Track Name, Artist Name, and API token
    Output: Spotify's unique Track ID for that track

    Sends "<track_name> <artist>" as the query to the Spotify search endpoint
    and returns the ID of the first track in the response.

    Returns None when the request fails (network error, timeout, non-2xx
    status), when the response is not the expected JSON, or when the search
    returns no tracks.
    '''

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + token,
    }

    trackandartist = track_name + " " + artist

    params = [
        # q is the search query parameter
        ('q', trackandartist),
        ('type', 'track'),
    ]

    try:
        response = requests.get('https://api.spotify.com/v1/search',
                                headers=headers, params=params, timeout=10)
        # Treat HTTP errors (expired token, rate limiting, ...) as a failed lookup.
        response.raise_for_status()
        payload = response.json()
        return payload['tracks']['items'][0]['id']

    # Only lookup failures are swallowed; KeyboardInterrupt and programming
    # errors propagate instead of being silently turned into None.
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        return None

In [None]:
# This may take several minutes, up to close to an hour depending on your listening history.
# A track that was streamed many times appears on many rows, so successful lookups are
# remembered per (track, artist) pair and each pair is searched only once.
track_id_cache = {}

def _lookup_track_id(row):
    """Return the track ID for one streaming row, reusing earlier successful lookups."""
    key = (row["track_name"], row["artist_name"])
    if key in track_id_cache:
        return track_id_cache[key]
    track_id = get_id(row["track_name"], row["artist_name"], token)
    # Failed lookups (None) are not cached, so a later row for the same track retries.
    if track_id is not None:
        track_id_cache[key] = track_id
    return track_id

my_streaming["track_id"] = my_streaming.apply(_lookup_track_id, axis=1)

## Acquire Spotify's audio feature data for all tracks in my_streaming Dataframe


In [None]:
trackid = list(my_streaming["track_id"].dropna().unique())

# Columns of the audio-feature table: the track ID followed by the features copied from the API
my_features_columns = [
    "track_id","energy","tempo","speechiness",
    "acousticness","instrumentalness","danceability",
    "loudness","valence"
    ]

# Authorize access to audio features
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager,requests_timeout=100)

# For each Track ID in my Spotify-provided listening history,
# collect Spotify's audio features as one record per track.
# Records are gathered in a list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
feature_rows = []
for track in trackid:
    print (track)
    features = sp.audio_features(tracks = [track])[0]
    # Tracks for which the API returns no features are skipped
    if features is not None:
        row = {"track_id": track}
        for feature_name in my_features_columns[1:]:
            row[feature_name] = features[feature_name]
        feature_rows.append(row)

# `columns=` keeps the expected layout even when no track produced a record
my_features = pd.DataFrame(feature_rows, columns=my_features_columns)

## Merge track and artist names to features dataframe

In [None]:
# Attach track and artist names to every feature row, then drop rows that are exact repeats.
track_names = my_streaming[['track_id','track_name','artist_name']]
my_features = my_features.merge(
    track_names,
    how="left",
    left_on="track_id",
    right_on="track_id",
).drop_duplicates()

## Add Album names

In [None]:
def acquire_album(track_id):
    """Return the album name of `track_id` as reported by `sp.track`."""
    return sp.track(track_id)['album']['name']

# The same track ID may appear on several rows of my_features, so each distinct ID
# is looked up once and the result is mapped back onto every row.
album_by_track = {track_id: acquire_album(track_id)
                  for track_id in my_features['track_id'].unique()}
my_features['album'] = my_features['track_id'].map(album_by_track)

# Drop an 'album' column left over from an earlier run of this cell; otherwise the
# merge would produce 'album_x' / 'album_y' instead of a single 'album' column.
my_streaming = (
    my_streaming
    .drop(columns='album', errors='ignore')
    .merge(my_features[['track_id','album']], how="left", left_on="track_id", right_on="track_id")
    .drop_duplicates()
)

## Pickle Dataframes

In [None]:
# Persist both tables so the API-dependent cells above do not have to be re-run.
for frame, pickle_path in ((my_streaming, 'my_streaming.pkl'),
                           (my_features, 'my_features.pkl')):
    frame.to_pickle(pickle_path)

# Interactive Data Analysis with Plotly

## Top Songs

In [None]:
# Tag every stream with its calendar month and convert listening time from ms to minutes
# (the column keeps the name 'ms_played' but holds minutes from here on).
songs = my_streaming.assign(
    month_year=pd.to_datetime(my_streaming['end_time']).dt.to_period('M'),
    ms_played=my_streaming['ms_played'] / 60000,
)

# The 20 track names with the most total minutes
top20songs = (
    songs.groupby('track_name')['ms_played']
    .sum()
    .sort_values(ascending=False)
    .nlargest(20)
)
songs = songs.loc[songs['track_name'].isin(top20songs.index)]

# Minutes per song / artist / month, with the month as text for use as a colour category
plotly_songs_df = (
    songs.groupby(['track_name','artist_name','month_year'])['ms_played']
    .sum()
    .reset_index()
    .assign(month_year=lambda df: df['month_year'].astype(str),
            ms_played=lambda df: df['ms_played'].round())
)

months_order = ['2019-06', '2019-07', '2019-08','2019-09','2019-10','2019-11','2019-12',
                '2020-01', '2020-02', '2020-03', '2020-04','2020-05', '2020-06']

colors = ["Black","#240011","#480020","#6D002E","#91003A","#B30046","#D10550","#EC0E5B","#E3416A","#DE7082","#DE999E","#E3BFBE","#ECDFDE"][::-1]
labels={"month_year": "Month",  "track_name": "Song", "ms_played": "Minutes Listening",'artist_name': 'Artist'}

# Stacked bars: one bar per song, one coloured segment per month
fig = px.bar(
    plotly_songs_df,
    x='track_name',
    y='ms_played',
    color='month_year',
    hover_data=['track_name','artist_name','month_year','ms_played'],
    labels=labels,
    title = '(Last 12 Months)',
    opacity=.8,
    category_orders={"month_year": months_order},
    color_discrete_sequence=colors,
)
fig.update_traces(marker_line_width=0,marker_line_color='black')
fig.show()

## Top Artists


In [None]:
# Tag every stream with its calendar month and convert listening time from ms to minutes
# (the column keeps the name 'ms_played' but holds minutes from here on).
artists = my_streaming.assign(
    month_year=pd.to_datetime(my_streaming['end_time']).dt.to_period('M'),
    ms_played=my_streaming['ms_played'] / 60000,
)

# The 20 artists with the most total minutes
top20artists = (
    artists.groupby('artist_name')['ms_played']
    .sum()
    .sort_values(ascending=False)
    .nlargest(20)
)
artists = artists.loc[artists['artist_name'].isin(top20artists.index)]

# Minutes per song / artist / month, with the month as text for use as a colour category
plotly_artists_df = (
    artists.groupby(['track_name','artist_name','month_year'])['ms_played']
    .sum()
    .reset_index()
    .assign(month_year=lambda df: df['month_year'].astype(str),
            ms_played=lambda df: df['ms_played'].round())
)

months_order = ['2019-06', '2019-07', '2019-08','2019-09','2019-10','2019-11','2019-12',
                '2020-01', '2020-02', '2020-03', '2020-04','2020-05', '2020-06']
colors = ['rgb(237,248,251)','rgb(237,248,251)','rgb(204,236,230)','rgb(204,236,230)',
          'rgb(153,216,201)','rgb(153,216,201)','rgb(102,194,164)','rgb(102,194,164)',
          'rgb(44,162,95)','rgb(44,162,95)','rgb(0,109,44)','rgb(0,109,44)','rgb(0,85,23)']

labels={"month_year": "Month",  "track_name": "Song", "ms_played": "Minutes Listening",'artist_name': 'Artist'}

# Stacked bars: one bar per artist, coloured by month
fig = px.bar(
    plotly_artists_df,
    x='artist_name',
    y='ms_played',
    color='month_year',
    hover_data=['track_name','artist_name','month_year','ms_played'],
    labels=labels,
    title = '(Last 12 Months)',
    opacity=.8,
    category_orders={"month_year": months_order},
    color_discrete_sequence=colors,
)
fig.update_traces(marker_line_width=0,marker_line_color='black')
fig.show()

## Top Albums


In [None]:
# Tag every stream with its calendar month and convert listening time from ms to minutes
# (the column keeps the name 'ms_played' but holds minutes from here on).
albums = my_streaming.assign(
    month_year=pd.to_datetime(my_streaming['end_time']).dt.to_period('M'),
    ms_played=my_streaming['ms_played'] / 60000,
)

# The 20 albums with the most total minutes
top20albums = (
    albums.groupby('album')['ms_played']
    .sum()
    .sort_values(ascending=False)
    .nlargest(20)
)
albums = albums.loc[albums['album'].isin(top20albums.index)]

# Minutes per album / song / artist / month, with the month as text for use as a colour category
plotly_albums_df = (
    albums.groupby(['album','track_name','artist_name','month_year'])['ms_played']
    .sum()
    .reset_index()
    .assign(month_year=lambda df: df['month_year'].astype(str),
            ms_played=lambda df: df['ms_played'].round())
)

months_order = ['2019-06', '2019-07', '2019-08','2019-09','2019-10','2019-11','2019-12',
                '2020-01', '2020-02', '2020-03', '2020-04','2020-05', '2020-06']
colors = ['rgb(254,240,217)','rgb(254,240,217)','rgb(253,212,158)','rgb(253,212,158)',
          'rgb(253,187,132)','rgb(253,187,132)','rgb(252,141,89)','rgb(252,141,89)',
          'rgb(227,74,51)','rgb(227,74,51)','rgb(179,0,0)','rgb(179,0,0)','rgb(110,0,0)']
labels={"month_year": "Month",  "track_name": "Song", "ms_played": "Minutes Listening",'artist_name': 'Artist','album':'Album'}

# Stacked bars: one bar per album, coloured by month
fig = px.bar(
    plotly_albums_df,
    x='album',
    y='ms_played',
    color='month_year',
    hover_data=['album','track_name','artist_name','month_year','ms_played'],
    labels=labels,
    title = '(Last 12 Months)',
    opacity=.8,
    category_orders={"month_year": months_order},
    color_discrete_sequence=colors,
)
fig.update_traces(marker_line_width=0,marker_line_color='black')
fig.show()

# Clustering Songs with the K-Means Algorithm

## Data preprocessing

### Filter for tracks listened to for 15+ min in last year

In [None]:
# Total minutes listened per track ID
minplayed = my_streaming.groupby('track_id')['ms_played'].sum().div(60000).reset_index()

# NOTE(review): the section heading says 15+ minutes but the cutoff used here is 13 — confirm which is intended.
listens15 = minplayed.loc[minplayed['ms_played'] > 13]

# Attach audio features and names to the retained tracks
song_prefs = listens15.merge(
    my_features,
    how="left",
    left_on="track_id",
    right_on="track_id",
).drop_duplicates()

### Create all-numeric numpy array, X

In [None]:
# Identifying columns are kept in song_prefs but excluded from the numeric matrix X
metadata_cols = ['track_id','track_name','artist_name','album']
audio_feature_cols = ['energy', 'tempo','speechiness', 'acousticness',
                      'instrumentalness', 'danceability', 'loudness', 'valence']

# Rows with any missing value in the selected columns are dropped
song_prefs = song_prefs[metadata_cols + audio_feature_cols].dropna()
X = song_prefs.drop(columns=metadata_cols)

### Scale numeric columns not between 0 and 1

In [None]:
# Rescale loudness and tempo to [0, 1]; the scaler is refit separately for each column.
scaler = MinMaxScaler()
for column in ('loudness', 'tempo'):
    X[column] = scaler.fit_transform(X[column].values.reshape(-1,1))

## Data analysis

### Correlations between audio features

In [None]:
# Pairwise correlations of the audio features, drawn with a diverging colour map
cmap = sns.diverging_palette(220, 10, as_cmap=True)
feature_corr = X.corr()
sns.heatmap(feature_corr, cmap=cmap)

### Audio features distributions

In [None]:
# One density curve per audio feature (histograms hidden)
group_labels = list(X.columns)
distplot_data = [list(X[label]) for label in group_labels]

fig = ff.create_distplot(
    distplot_data,
    group_labels,
    bin_size=.02,
    show_hist=False,
)
fig.update_layout({"template": 'plotly_white'})
fig.update_yaxes(range=[0, 7])
fig.show()

## Choosing Number of Mood Clusters for K-Means

In [None]:
# Fit K-Means for 1..14 clusters and record the inertia of each fit for the elbow plot.
# `n_jobs` is no longer passed: the argument was removed from KMeans in scikit-learn 1.0.
# `n_init=10` pins the long-standing default so results do not depend on the installed version.
inertia = {}
for n in range(1,15):
    kmeans = KMeans(n_clusters=n, random_state=1, n_init=10).fit(X.values)
    inertia[n] = kmeans.inertia_
cluster_num = list(inertia.keys())
inertia_vals = list(inertia.values())

In [None]:
# Inertia versus number of clusters
fig = go.Figure()
fig.add_trace(go.Scatter(x=cluster_num, y=inertia_vals, mode='lines+markers'))

# Red circle drawn around the fourth point of the sweep (index 3)
elbow_x = cluster_num[3]
elbow_y = inertia_vals[3]
elbow = [dict(type="circle",
              xref="x", yref="y",
              x0=elbow_x-.4, y0=elbow_y-5,
              x1=elbow_x+.4, y1=elbow_y+5,
              line=dict(color="Red"))]

# Two buttons: one clears the shapes, the other draws the elbow marker
shape_buttons = [
    dict(label="None", method="relayout", args=["shapes", []]),
    dict(label="Elbow", method="relayout", args=["shapes", elbow]),
]

fig.update_layout(
    xaxis_title="Number of Clusters",
    yaxis_title="Inertia",
    updatemenus=[dict(type="buttons", buttons=shape_buttons)],
)
config = {'displayModeBar': False}
fig.show(config=config)

## K-Means

In [None]:
n_clusters = 4
# `n_jobs` is no longer passed: the argument was removed from KMeans in scikit-learn 1.0.
# `n_init=10` pins the long-standing default so results do not depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, random_state=1, n_init=10).fit(X.values)
# Cluster index assigned to each row of X
y_kmeans = kmeans.predict(X.values)

## PCA - 2D

In [None]:
# Project the feature matrix onto its first two principal components
pca_2D = PCA(n_components=2)
principal_components_2D = pca_2D.fit_transform(X.values)
pc2D = pd.DataFrame(principal_components_2D, columns=['x', 'y'])

# Cluster labels as strings so plotly treats them as discrete categories
pc2D['label'] = [str(cluster) for cluster in y_kmeans]

fig = px.scatter(pc2D,
                 x='x',
                 y='y',
                 color='label',
                 color_discrete_map={'0':'purple','1':'blue','2':'green','3':'red'},
                 category_orders={"label": ["0", "1", "2", "3"]})
# The figure was previously built but never displayed
fig.show()

In [None]:
# Variance ratio of each 2D component, followed by their total
variance_ratio_2D = pca_2D.explained_variance_ratio_
print (variance_ratio_2D, sum(variance_ratio_2D))

## PCA - 3D

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Project the feature matrix onto its first three principal components
pca_3D = PCA(n_components=3)
principal_components_3D = pca_3D.fit_transform(X.values)
pc3D = pd.DataFrame(principal_components_3D, columns=['x', 'y', 'z'])

# Cluster labels as strings so plotly treats them as discrete categories
pc3D['label'] = [str(cluster) for cluster in y_kmeans]

fig = px.scatter_3d(pc3D,
                 x='x',
                 y='y',
                 z='z',
                 color='label',
                 color_discrete_map={'0':'purple','1':'blue','2':'green','3':'red'},
                 category_orders={"label": ["0", "1", "2", "3"]})
# The figure was previously built but never displayed
fig.show()

In [None]:
# Variance ratio of each 3D component, followed by their total
variance_ratio_3D = pca_3D.explained_variance_ratio_
print (variance_ratio_3D, sum(variance_ratio_3D))

# Defining Moods for each K-Means Cluster

## Number of songs in each cluster

In [None]:
# Record the cluster assigned to every song
song_prefs['label'] = y_kmeans

# Number of songs per cluster (computed once and reused for both axes)
cluster_sizes = song_prefs['label'].value_counts()

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(x=cluster_sizes.index, y=cluster_sizes, ax=ax)
ax.set_title('# of Songs in each Group')
ax.set_ylabel('')
# plt.show() rather than fig.show(): Figure.show() needs a GUI backend
plt.show()

## Scale all audio features to have [mean=0 & variance=1] for intuitive feature comparison

In [None]:
scaler = StandardScaler()
sns.set(font_scale=1.6,font='Times New Roman')

# Average only the audio-feature columns: song_prefs also holds text columns
# (track, artist, album), and pandas 2.x raises when asked for the mean of those.
cluster_means = song_prefs.groupby('label')[list(X.columns)].mean()

# Standardise each feature across clusters; transpose so rows are features and columns are clusters
ax = sns.heatmap(scaler.fit_transform(cluster_means).T,
                 cmap='coolwarm',
                 yticklabels=[x.capitalize() for x in list(X.columns)],
                 annot=True)
fig = ax.get_figure()
fig.set_size_inches(16, 8)

# Using a Random Forest Classifier to Analyze Cluster Accuracy

## Create training and test sets from data

In [None]:
# Standardise the features, then hold out 25% of the songs for testing
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X.values)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_standardized,
    y_kmeans,
    test_size =.25,
    random_state=1,
)

## Run Random Forest Classifier

In [None]:
# Train a 30-tree forest on the cluster labels and predict the held-out songs
clf = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=10,
).fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

## Audio Feature Importance

In [None]:
# Pair each rounded importance with its feature name. The names come from X.columns
# (the columns the classifier was trained on) rather than from `group_labels`,
# which is only defined in the distribution-plot cell.
importances = np.round(clf.feature_importances_, 2).tolist()
print ([(importance, feature) for importance, feature in zip(importances, X.columns)])

## Classification results

In [None]:
# NOTE(review): cluster 0..3 is labelled Hype/Angsty/Happy/Sad here, but the playlist
# cells further down use Sad/Happy/Angsty/Hype for the same indices — confirm which is correct.
moods = ['Hype','Angsty','Happy',"Sad"]

# Rows are true clusters, columns are predicted clusters. Integer counts with fmt='d'
# keep large cells readable (the default float format switches to scientific notation).
classification_matrix = np.zeros((len(moods), len(moods)), dtype=int)
for true_label, predicted_label in zip(ytest, ypred):
    classification_matrix[true_label, predicted_label] += 1

ax = sns.heatmap(classification_matrix,
                 cmap='Blues',
                 cbar=False,
                 annot=True,
                 fmt='d',
                 xticklabels = moods,
                 yticklabels= moods)
ax.set(xlabel='Preds', ylabel='True')
plt.show()

# Creating Mood-Based Spotify Playlists

## Authorize access to User playlists

In [None]:
from spotipy.oauth2 import SpotifyOAuth

# Request permission to modify the user's public playlists
scope = 'playlist-modify-public'

token = util.prompt_for_user_token(
    username=username,
    scope=scope,
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
)

# Replace the earlier client with one authorised through the OAuth flow
playlist_auth = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope,
    username=username,
)
sp = spotipy.Spotify(auth_manager=playlist_auth)

## Create dataset of songs based on minutes listened

In [None]:
# Keep tracks with more than `minutes_bar` total minutes of listening
minutes_bar = 10
minplayed = my_streaming.groupby('track_id')['ms_played'].sum().div(60000).reset_index()
listens_mins = minplayed.loc[minplayed['ms_played'] > minutes_bar]

In [None]:
# Attach audio features and names to the retained tracks, one fresh 0..n-1 index
song_prefs = (
    listens_mins
    .merge(my_features, how="left", left_on= "track_id", right_on="track_id")
    .drop_duplicates()
    .reset_index(drop=True)
)


In [None]:
# Keep tracks with more than `minutes_bar` total minutes of listening
minutes_bar = 10
minplayed = my_streaming.groupby('track_id')['ms_played'].sum().div(60000).reset_index()
listens_mins = minplayed.loc[minplayed['ms_played'] > minutes_bar]

# Attach audio features and names, then keep only complete rows
metadata_cols = ['track_id','track_name','artist_name','album']
audio_feature_cols = ['energy', 'tempo','speechiness', 'acousticness',
                      'instrumentalness', 'danceability', 'loudness', 'valence']
song_prefs = (
    listens_mins
    .merge(my_features, how="left", left_on= "track_id", right_on="track_id")
    .drop_duplicates()
    .reset_index(drop=True)
)
song_prefs = song_prefs[metadata_cols + audio_feature_cols].dropna()

X = song_prefs.drop(columns=metadata_cols)

# NOTE(review): the scaler is refit on this larger song set, so loudness and tempo are
# scaled with a different min/max than the data the K-Means model was fitted on — confirm this is intended.
scaler = MinMaxScaler()
for column in ('loudness', 'tempo'):
    X[column] = scaler.fit_transform(X[column].values.reshape(-1,1))

## Apply the fitted K-Means model to the new data

In [None]:
# Assign every song to a cluster of the already-fitted model and store the label
song_prefs['label'] = y_kmeans = kmeans.predict(X.values)

## Create new mood-specific playlists directly in Spotify

In [None]:
def create_mood_playlists(moods, df, num_clusters, playlist_length):
    '''
    Input: List of defined moods, features df, number of clusters, len of desired playlist
    Output: Spotify Playlist

    For every cluster index 0..num_clusters-1, creates a playlist named
    moods[index] for `username` and fills it with a random sample of that
    cluster's track IDs. A cluster with fewer than `playlist_length` distinct
    tracks contributes all of them; a cluster with no tracks is skipped.
    Uses the module-level `sp` client and `username`.
    '''
    for moodnum in range(num_clusters):
        # Distinct track IDs only, so the same song is never added twice
        mood_track_ids = df.loc[df.label == moodnum, 'track_id'].drop_duplicates()
        if mood_track_ids.empty:
            continue

        # sample() raises when asked for more items than exist, so cap the size
        sample_size = min(playlist_length, len(mood_track_ids))
        playlist_song_IDs = list(mood_track_ids.sample(sample_size))

        # Use the ID returned by the create call instead of assuming the new
        # playlist is the first entry of the user's playlist listing
        playlist = sp.user_playlist_create(username, moods[moodnum])
        sp.user_playlist_add_tracks(username, playlist['id'], playlist_song_IDs)
        
# Mood name for each cluster index 0..3
# NOTE(review): the classification-results cell labels the same indices
# Hype/Angsty/Happy/Sad — confirm which order is correct.
moods = ['Sad','Happy','Angsty',"Hype"]
num_clusters = 4
playlist_length = 20

create_mood_playlists(
    moods=moods,
    df=song_prefs,
    num_clusters=num_clusters,
    playlist_length=playlist_length,
)
        

In [None]:
# NOTE(review): this cell repeats the call made at the end of the previous cell;
# running both invokes create_mood_playlists a second time, creating another set of playlists.
moods = ['Sad','Happy','Angsty',"Hype"]
num_clusters = 4
playlist_length = 20

create_mood_playlists(moods, song_prefs, num_clusters, playlist_length)