In [217]:
import pandas as pd
import numpy as np
from sklearn import datasets # sklearn comes with some toy datasets to practise
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pickle

In [218]:
#Concat for clustering
not_hot_songs_df=pd.read_csv('/Users/devirughani/Desktop/IronHack/Week_6/Spotify_Midterm_Project/data/not_hot_songs_and_audio_features_df.csv')
hot_songs_df=pd.read_csv('/Users/devirughani/Desktop/IronHack/Week_6/Spotify_Midterm_Project/data/hot_songs_and_audio_features_df.csv')
#display(not_hot_songs_df.head())
hot_songs_df=hot_songs_df.rename(columns={"main_artist": "artist"})
#display(hot_songs_df.head())

all_songs_df=pd.concat([not_hot_songs_df,hot_songs_df], axis=0)
all_songs_df.to_csv("all_songs_and_audio_features_df.csv",index=False)
all_songs_df.head()
all_songs_df.nunique()

song_title          2963
artist              1793
danceability         762
energy               982
key                   12
loudness            2593
mode                   2
speechiness          897
acousticness        1642
instrumentalness    1578
liveness             939
valence             1005
tempo               2810
type                   1
id                  2887
uri                 2887
track_href          2887
analysis_url        2887
duration_ms         2755
time_signature         4
dtype: int64

In [219]:
all_songs_df['time_signature'].value_counts()

4    2686
3     265
5      33
1      31
Name: time_signature, dtype: int64

In [220]:
all_songs_df.shape

(3015, 20)

In [221]:
X=all_songs_df[['danceability','energy','acousticness','instrumentalness','liveness','valence']]
X

Unnamed: 0,danceability,energy,acousticness,instrumentalness,liveness,valence
0,0.737,0.399,0.171000,0.000000,0.214,0.438
1,0.451,0.831,0.062300,0.000000,0.185,0.410
2,0.773,0.859,0.085500,0.000180,0.914,0.813
3,0.238,0.943,0.000449,0.000004,0.102,0.181
4,0.844,0.262,0.680000,0.000000,0.111,0.274
...,...,...,...,...,...,...
95,0.877,0.599,0.044700,0.000000,0.110,0.305
96,0.941,0.514,0.153000,0.000002,0.104,0.573
97,0.415,0.830,0.014400,0.000000,0.140,0.485
98,0.748,0.524,0.414000,0.000000,0.111,0.661


## Scaling features ##

K-Means is a distance based algorithm: we need to scale / normalize!:

In [222]:
#Scaling dataset
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
filename = "/Users/devirughani/Desktop/IronHack/Week_6/Spotify_Midterm_Project/scalers/kmeans_scaler.pickle" # Path with filename
with open(filename, "wb") as file:
        pickle.dump(scaler,file)
X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns)
display(X.head())
print()
display(X_scaled_df.head())

Unnamed: 0,danceability,energy,acousticness,instrumentalness,liveness,valence
0,0.737,0.399,0.171,0.0,0.214,0.438
1,0.451,0.831,0.0623,0.0,0.185,0.41
2,0.773,0.859,0.0855,0.00018,0.914,0.813
3,0.238,0.943,0.000449,4e-06,0.102,0.181
4,0.844,0.262,0.68,0.0,0.111,0.274





Unnamed: 0,danceability,energy,acousticness,instrumentalness,liveness,valence
0,1.093395,-0.744918,-0.426663,-0.552902,0.08906,-0.137627
1,-0.494574,0.932528,-0.748977,-0.552902,-0.07973,-0.246503
2,1.293279,1.041252,-0.680185,-0.552338,4.163313,1.320546
3,-1.677222,1.367422,-0.932376,-0.55289,-0.56282,-1.136961
4,1.687495,-1.276886,1.082608,-0.552902,-0.510437,-0.775334


Now, all features will have the same weight.

In [223]:
K = range(2, 21)
inertia = []
silhouette = []

for k in K:
    print("Training a K-Means model with {} neighbours! ".format(k))
    print()
    kmeans = KMeans(n_clusters=k,
                    n_init=3,
                    max_iter=300,
                    random_state=1234,
                    verbose=1)
    kmeans.fit(X_scaled_df)
    filename = "/Users/devirughani/Desktop/IronHack/Week_6/Spotify_Midterm_Project/models/Kmeans_models/model_pickles" + str(k) + ".pickle" # Path with filename
    with open(filename, "wb") as file:
        pickle.dump(kmeans,file)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X_scaled_df, kmeans.predict(X_scaled_df)))


import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1,2,figsize=(16,8))
ax[0].plot(K, inertia, 'bx-')
ax[0].set_xlabel('k')
ax[0].set_ylabel('inertia')
ax[0].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[0].set_title('Elbow Method showing the optimal k')
ax[1].plot(K, silhouette, 'bx-')
ax[1].set_xlabel('k')
ax[1].set_ylabel('silhouette score')
ax[1].set_xticks(np.arange(min(K), max(K)+1, 1.0))
ax[1].set_title('Silhouette Method showing the optimal k')

Training a K-Means model with 2 neighbours! 

Initialization complete
Iteration 0, inertia 17448.4017837144
Iteration 1, inertia 13213.760239819854
Iteration 2, inertia 13014.135634455977
Iteration 3, inertia 12941.659354347672
Iteration 4, inertia 12906.708432435687
Iteration 5, inertia 12898.409532934162
Iteration 6, inertia 12896.560759219346
Iteration 7, inertia 12895.479891270723
Iteration 8, inertia 12895.235893427634
Converged at iteration 8: center shift 9.183460868033881e-05 within tolerance 0.00010000000000000018.
Initialization complete
Iteration 0, inertia 17308.624511561382
Iteration 1, inertia 12972.940283839593
Iteration 2, inertia 12903.053421431527
Iteration 3, inertia 12896.642931346425
Iteration 4, inertia 12895.576819133556
Iteration 5, inertia 12895.235893427636
Converged at iteration 5: center shift 9.183460868034236e-05 within tolerance 0.00010000000000000018.
Initialization complete
Iteration 0, inertia 18427.868897658573
Iteration 1, inertia 13010.237630141226


Training a K-Means model with 7 neighbours! 

Initialization complete
Iteration 0, inertia 9878.728468088451
Iteration 1, inertia 8043.067292834452
Iteration 2, inertia 7405.832221863388
Iteration 3, inertia 7126.440251009791
Iteration 4, inertia 6970.768600357498
Iteration 5, inertia 6904.787211459966
Iteration 6, inertia 6879.499287080245
Iteration 7, inertia 6860.859016431888
Iteration 8, inertia 6849.435798539342
Iteration 9, inertia 6840.227799289551
Iteration 10, inertia 6836.5674572212565
Iteration 11, inertia 6833.668850803808
Iteration 12, inertia 6829.768468593705
Iteration 13, inertia 6824.461336565775
Iteration 14, inertia 6819.462409581055
Iteration 15, inertia 6814.75119054778
Iteration 16, inertia 6811.218315210407
Iteration 17, inertia 6809.426075442174
Iteration 18, inertia 6807.860549110758
Iteration 19, inertia 6803.583154196995
Iteration 20, inertia 6801.787788899416
Iteration 21, inertia 6799.597195801696
Iteration 22, inertia 6797.456235583468
Iteration 23, inerti

Training a K-Means model with 10 neighbours! 

Initialization complete
Iteration 0, inertia 8326.692994522382
Iteration 1, inertia 6620.550339998966
Iteration 2, inertia 6333.54060615449
Iteration 3, inertia 6186.5349807259845
Iteration 4, inertia 6083.825752253294
Iteration 5, inertia 6013.341481758298
Iteration 6, inertia 5947.013940072564
Iteration 7, inertia 5902.659250991593
Iteration 8, inertia 5885.023319018673
Iteration 9, inertia 5879.971811365914
Iteration 10, inertia 5877.134912978425
Iteration 11, inertia 5874.320651949886
Iteration 12, inertia 5870.02775431213
Iteration 13, inertia 5866.453728153858
Iteration 14, inertia 5863.600861381392
Iteration 15, inertia 5858.4343605060285
Iteration 16, inertia 5852.624285440383
Iteration 17, inertia 5845.834640236121
Iteration 18, inertia 5840.250734847891
Iteration 19, inertia 5835.7252557111115
Iteration 20, inertia 5828.050900138643
Iteration 21, inertia 5816.620158032409
Iteration 22, inertia 5808.562107968926
Iteration 23, iner

Iteration 16, inertia 5532.045450465297
Iteration 17, inertia 5531.591127689905
Iteration 18, inertia 5531.533834112247
Converged at iteration 18: strict convergence.
Training a K-Means model with 13 neighbours! 

Initialization complete
Iteration 0, inertia 7139.162860414963
Iteration 1, inertia 5577.77051858608
Iteration 2, inertia 5361.854069929597
Iteration 3, inertia 5244.841870964905
Iteration 4, inertia 5169.633964138189
Iteration 5, inertia 5119.3830671286005
Iteration 6, inertia 5091.242240756937
Iteration 7, inertia 5078.31107350704
Iteration 8, inertia 5069.762669956837
Iteration 9, inertia 5062.630496239986
Iteration 10, inertia 5057.684646270007
Iteration 11, inertia 5048.402271000835
Iteration 12, inertia 5045.091013339774
Iteration 13, inertia 5042.832361729585
Iteration 14, inertia 5041.73760925016
Iteration 15, inertia 5040.867898922787
Iteration 16, inertia 5040.597961533411
Iteration 17, inertia 5040.46599372281
Iteration 18, inertia 5040.3800600382165
Converged at i

Iteration 27, inertia 4835.674361121701
Iteration 28, inertia 4835.357410812601
Iteration 29, inertia 4835.085343630773
Iteration 30, inertia 4834.460502951685
Iteration 31, inertia 4834.191048202394
Iteration 32, inertia 4834.039844240824
Iteration 33, inertia 4834.002543126961
Iteration 34, inertia 4833.962320386379
Converged at iteration 34: strict convergence.
Training a K-Means model with 16 neighbours! 

Initialization complete
Iteration 0, inertia 6336.167855328394
Iteration 1, inertia 4971.982089648297
Iteration 2, inertia 4747.252479226316
Iteration 3, inertia 4659.261558361237
Iteration 4, inertia 4620.257133661836
Iteration 5, inertia 4600.003803556472
Iteration 6, inertia 4589.369525238323
Iteration 7, inertia 4584.06461746104
Iteration 8, inertia 4579.726155720951
Iteration 9, inertia 4577.590599575532
Iteration 10, inertia 4576.152054404939
Iteration 11, inertia 4575.102755207638
Iteration 12, inertia 4569.47899994745
Iteration 13, inertia 4567.639824324627
Iteration 14, 

Training a K-Means model with 19 neighbours! 

Initialization complete
Iteration 0, inertia 5764.842663451263
Iteration 1, inertia 4584.556223039072
Iteration 2, inertia 4377.847264051032
Iteration 3, inertia 4306.966744972173
Iteration 4, inertia 4271.927158576781
Iteration 5, inertia 4253.851028878629
Iteration 6, inertia 4239.407627693035
Iteration 7, inertia 4227.307934538593
Iteration 8, inertia 4216.9325064945715
Iteration 9, inertia 4211.968017004965
Iteration 10, inertia 4209.664107205979
Iteration 11, inertia 4205.639204941902
Iteration 12, inertia 4201.182531574631
Iteration 13, inertia 4197.854230255781
Iteration 14, inertia 4194.860847295702
Iteration 15, inertia 4192.564964469427
Iteration 16, inertia 4190.1386908507
Iteration 17, inertia 4186.732380529277
Iteration 18, inertia 4185.4430528967005
Iteration 19, inertia 4184.764286595826
Iteration 20, inertia 4184.3456269909575
Iteration 21, inertia 4183.908069922425
Iteration 22, inertia 4183.709519115648
Iteration 23, iner

Text(0.5, 1.0, 'Silhouette Method showing the optimal k')

Error in callback <function flush_figures at 0x7fac72e8ff70> (for post_execute):


KeyboardInterrupt: 

Here, we confirm that there is no clear elbow. However, in the Silhouette there are peaks at 6,9,11, and 17. 

Likely that the prime number of groups is 8 or 11. 

## Loading the scaler and the best model ##

In [247]:
def load(filename = "filename.pickle"): 
    try: 
        with open(filename, "rb") as file: 
            return pickle.load(file) 
    except FileNotFoundError: 
        print("File not found!") 

In [248]:
scaler2 = load("/Users/devirughani/Desktop/IronHack/Week_6/Spotify_Midterm_Project/scalers/kmeans_scaler.pickle")
scaler2

StandardScaler()

In [249]:
best_model = load("/Users/devirughani/Desktop/IronHack/Week_6/Spotify_Midterm_Project/models/Kmeans_models/model_pickles11.pickle")

In [250]:
best_model

KMeans(n_clusters=11, n_init=3, random_state=1234, verbose=1)

## Applying the best model ##

In [251]:
best_model.fit(X_scaled_df)

Initialization complete
Iteration 0, inertia 8123.967336334067
Iteration 1, inertia 6323.568816769999
Iteration 2, inertia 6021.020973849859
Iteration 3, inertia 5865.369224249653
Iteration 4, inertia 5744.279259445395
Iteration 5, inertia 5668.760666345142
Iteration 6, inertia 5625.750948533099
Iteration 7, inertia 5586.199859049819
Iteration 8, inertia 5566.478244229876
Iteration 9, inertia 5556.299761024498
Iteration 10, inertia 5545.035006073891
Iteration 11, inertia 5537.825442085654
Iteration 12, inertia 5531.27741639557
Iteration 13, inertia 5523.024365639261
Iteration 14, inertia 5510.687106258921
Iteration 15, inertia 5495.804150461259
Iteration 16, inertia 5480.818687268221
Iteration 17, inertia 5467.795085917662
Iteration 18, inertia 5456.39755561872
Iteration 19, inertia 5448.954584880784
Iteration 20, inertia 5442.493563668936
Iteration 21, inertia 5430.200423595756
Iteration 22, inertia 5424.9201131422
Iteration 23, inertia 5421.876802823172
Iteration 24, inertia 5420.932

KMeans(n_clusters=11, n_init=3, random_state=1234, verbose=1)

In [252]:
clusters = best_model.predict(X_scaled_df)
clusters

array([ 2,  4,  6, ...,  4, 10,  4], dtype=int32)

In [253]:
pd.Series(clusters).value_counts().sort_index()

0     535
1     185
2     439
3     328
4     441
5     270
6     152
7     130
8     141
9     124
10    270
dtype: int64

In [254]:
X["cluster"] = clusters
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["cluster"] = clusters


Unnamed: 0,danceability,energy,acousticness,instrumentalness,liveness,valence,cluster
0,0.737,0.399,0.171,0.0,0.214,0.438,2
1,0.451,0.831,0.0623,0.0,0.185,0.41,4
2,0.773,0.859,0.0855,0.00018,0.914,0.813,6
3,0.238,0.943,0.000449,4e-06,0.102,0.181,4
4,0.844,0.262,0.68,0.0,0.111,0.274,10


In [255]:
song_descriptors=all_songs_df[['song_title','artist']]
all_clustered_songs_df=pd.concat([song_descriptors,X], axis=1)

In [256]:
all_clustered_songs_df.reset_index(inplace=True)
all_clustered_songs_df=all_clustered_songs_df.drop(columns=['index'])

In [234]:
all_clustered_songs_df.to_csv("all_all_clustered_songs_df.csv",index=False)

In [235]:
not_hot_clusters=X.iloc[:2915,-1:]

In [236]:
not_hot_songs_clustered_df=pd.concat([song_descriptors.iloc[:2915,:],X.iloc[:2915,-1:]], axis=1)
not_hot_songs_clustered_df.to_csv("all_all_clustered_songs_df.csv",index=False)

In [237]:
hot_songs_clustered_df=pd.concat([song_descriptors.iloc[2915:,:],X.iloc[2915:,-1:]], axis=1)
hot_songs_clustered_df.to_csv("all_all_clustered_songs_df.csv",index=False)

In [238]:
not_hot_songs_clustered_df.head()

Unnamed: 0,song_title,artist,cluster
0,Random Access Memories,Daft Punk,2
1,"Good Kid, M.A.A.d City",Kendrick Lamar,4
2,Thriller,Michael Jackson,6
3,The Dark Side Of The Moon,Pink Floyd,4
4,Rumours,Fleetwood Mac,10


In [239]:
all_clustered_songs_df

Unnamed: 0,song_title,artist,danceability,energy,acousticness,instrumentalness,liveness,valence,cluster
0,Random Access Memories,Daft Punk,0.737,0.399,0.171000,0.000000,0.214,0.438,2
1,"Good Kid, M.A.A.d City",Kendrick Lamar,0.451,0.831,0.062300,0.000000,0.185,0.410,4
2,Thriller,Michael Jackson,0.773,0.859,0.085500,0.000180,0.914,0.813,6
3,The Dark Side Of The Moon,Pink Floyd,0.238,0.943,0.000449,0.000004,0.102,0.181,4
4,Rumours,Fleetwood Mac,0.844,0.262,0.680000,0.000000,0.111,0.274,10
...,...,...,...,...,...,...,...,...,...
3010,Evil Twins,King Von,0.877,0.599,0.044700,0.000000,0.110,0.305,2
3011,Trust Nothing,King Von Featuring Moneybagg Yo,0.941,0.514,0.153000,0.000002,0.104,0.573,0
3012,Iffy,Chris Brown,0.415,0.830,0.014400,0.000000,0.140,0.485,4
3013,Closer,Saweetie Featuring H.E.R.,0.748,0.524,0.414000,0.000000,0.111,0.661,10


In [240]:
all_clustered_songs_df['org'] = "Not Hot"
all_clustered_songs_df.iloc[-100:,-1] = "Hot"

In [241]:
all_clustered_songs_df.tail()

Unnamed: 0,song_title,artist,danceability,energy,acousticness,instrumentalness,liveness,valence,cluster,org
3010,Evil Twins,King Von,0.877,0.599,0.0447,0.0,0.11,0.305,2,Hot
3011,Trust Nothing,King Von Featuring Moneybagg Yo,0.941,0.514,0.153,2e-06,0.104,0.573,0,Hot
3012,Iffy,Chris Brown,0.415,0.83,0.0144,0.0,0.14,0.485,4,Hot
3013,Closer,Saweetie Featuring H.E.R.,0.748,0.524,0.414,0.0,0.111,0.661,10,Hot
3014,When I'm Gone,Alesso / Katy Perry,0.53,0.768,0.00385,0.0,0.103,0.374,4,Hot


In [242]:
all_clustered_songs_df['id'] = all_songs_df['id'].values

In [243]:
my_list = [1,5,6,3,4]
my_list[-2:]

[3, 4]

In [244]:
all_clustered_songs_df.to_csv("all_clustered_songs_final_with_id_df.csv",index=False)

In [245]:
!ls

Hot_100.ipynb
K-Means Clustering and DBSCAN.ipynb
User Workflow.ipynb
all_all_clustered_songs_df.csv
all_clustered_songs_final_with_id_df.csv
all_songs_and_audio_features_df.csv
[34maudiofeatures[m[m
case-study-gnod.md
config.py
[34mdata[m[m
[34mmodels[m[m
not_hot_100_notebook.ipynb
requirements.txt
[34mscalers[m[m
using_spotipy_project_continued.ipynb


In [246]:
if (song_id in list(all_clustered_songs_df[all_clustered_songs_df['org'] == "Hot"]['id'].values)):
    rec = all_clustered_songs_df[(all_clustered_songs_df['org'] == "Hot") & all_clustered_songs_df['cluster'] == cluster)].sample(n=1)
    rec[['song_title','artist']]
else:
    

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (3561205747.py, line 2)

### DBSCAN ###

In [257]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN

In [258]:
dbscan_model_1 = DBSCAN(eps=0.30, min_samples=12)
yhat = dbscan_model_1.fit_predict(X)

In [259]:
db_clusters = yhat
db_clusters

array([0, 1, 9, ..., 1, 2, 1])

In [260]:
X['db_0.30_clusters'] = db_clusters

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['db_0.30_clusters'] = db_clusters


In [261]:
X.head()

Unnamed: 0,danceability,energy,acousticness,instrumentalness,liveness,valence,cluster,db_0.30_clusters
0,0.737,0.399,0.171,0.0,0.214,0.438,2,0
1,0.451,0.831,0.0623,0.0,0.185,0.41,4,1
2,0.773,0.859,0.0855,0.00018,0.914,0.813,6,9
3,0.238,0.943,0.000449,4e-06,0.102,0.181,4,1
4,0.844,0.262,0.68,0.0,0.111,0.274,10,2


In [290]:
%matplotlib inline
import matplotlib.pyplot as plt

In [303]:
# create scatter plot for samples from each cluster
for cluster in db_clusters:
# get row indexes for samples with this cluster
    #row_ix = np.where(yhat == cluster)
    row_ix=X[X['db_0.30_clusters'] == cluster].index
    # create scatter of these samples
    plt.scatter([row_ix, 0]),([row_ix, 1])

# show the plot
plt.title("Clusters detected by DBSCAN, blue dots are considered Noise")
plt.show()

TypeError: scatter() missing 1 required positional argument: 'y'

In [299]:
from scipy.spatial import distance_matrix
d = distance_matrix(X,X)
print(d)
print()

[[0.         2.2982299  9.89195791 ... 2.30687095 8.25440083 2.28546847]
 [2.2982299  0.         8.2945076  ... 0.10602552 6.1134792  0.14712716]
 [9.89195791 8.2945076  0.         ... 8.29703081 8.11718395 8.30207436]
 ...
 [2.30687095 0.10602552 8.29703081 ... 0.         6.11522871 0.17569947]
 [8.25440083 6.1134792  8.11718395 ... 6.11522871 0.         6.11209588]
 [2.28546847 0.14712716 8.30207436 ... 0.17569947 6.11209588 0.        ]]



In [280]:
d2 = np.sort(d)[:,1:]
print(d2)

[[ 0.10682345  0.11750745  0.11979149 ... 12.58406902 12.58452733
  12.5863147 ]
 [ 0.          0.06221616  0.06943987 ... 10.82575288 10.82978764
  10.83024201]
 [ 0.21017619  0.23878211  0.24026294 ... 11.31491837 11.32146344
  11.70428054]
 ...
 [ 0.0340657   0.06261374  0.06986725 ... 10.82906514 10.82930364
  10.83010836]
 [ 0.          0.13454817  0.13579764 ... 10.07143607 10.07267173
  10.4599494 ]
 [ 0.04652185  0.04652185  0.04836606 ... 10.82553265 10.82942998
  10.83235159]]
