In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Read the streamed-songs table from a CSV given by relative path,
# using pandas' pure-Python parser engine.
df = pd.read_csv('SpotifyMostStreamedSongs.csv', engine='python')

# Draw a 15-bin histogram of the danceability column on the current Axes.
axes = plt.gca()
axes.hist(df['danceability'], bins=15)

# Title and axis labels so the figure can be read on its own.
axes.set_title("Danceability Descriptives")
axes.set_xlabel("Danceability (%)")
axes.set_ylabel("# of Songs")

In [None]:
# Draw a 15-bin histogram of the energy column on the current Axes.
axes = plt.gca()
axes.hist(df['energy'], bins=15)

# Title and axis labels so the figure can be read on its own.
axes.set_title("Energy Descriptives")
axes.set_xlabel("Energy (%)")
axes.set_ylabel("# of Songs")

In [None]:
# Draw a 15-bin histogram of the acousticness column on the current Axes.
axes = plt.gca()
axes.hist(df['acousticness'], bins=15)

# Title and axis labels so the figure can be read on its own.
axes.set_title("Acousticness Descriptives")
axes.set_xlabel("Acousticness (%)")
axes.set_ylabel("# of Songs")

In [None]:
# Cluster the songs on seven audio-feature columns. The column order here
# fixes the column order of X_scaled and of kmeans.cluster_centers_.
X = df[['danceability', 'valence', 'energy', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']]

# Standardize the features before fitting so k-means runs on scaled values.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit 5 clusters with a fixed seed so the assignment is repeatable.
kmeans = KMeans(n_clusters = 5, random_state = 42)
kmeans.fit(X_scaled)

# Attach each song's cluster label to the frame (adds/overwrites 'Cluster').
df['Cluster'] = kmeans.labels_

# The fitted centres are in standardized units (the model saw X_scaled) and
# have one column per feature in X. Undo the scaling and name the columns so
# the overlay uses the same two features, in the same units, as the points.
centroids = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns = X.columns,
)

plt.figure(figsize = (10, 6))
sns.scatterplot(x = df['danceability'], y = df['acousticness'], hue = df['Cluster'], palette = 'viridis', s = 100)
# Previously this plotted columns 1 and 2 (valence, energy) of the scaled
# centres, which matched neither the axes nor their units.
plt.scatter(centroids['danceability'], centroids['acousticness'], s = 300, c = 'red', label = 'Centroids')
plt.title('Music Segments based on Danceability and Acousticness Percentages')
plt.xlabel("Danceability (%)")
plt.ylabel("Acousticness (%)")
plt.legend()
plt.show()

In [None]:
# Scatter of danceability vs. energy, coloured by the cluster labels stored in
# df['Cluster'], with the k-means centres overlaid.

# kmeans was fitted on standardized data, so its centres are in standardized
# units with one column per feature in X. Undo the scaling and name the
# columns so the overlay uses the same two features, in the same units, as
# the plotted points.
centroids = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns = X.columns,
)

plt.figure(figsize = (10, 6))
sns.scatterplot(x = df['danceability'], y = df['energy'], hue = df['Cluster'], palette = 'viridis', s = 100)
# Previously the x coordinate came from column 1 (valence) of the scaled
# centres instead of danceability, and both coordinates were unscaled.
plt.scatter(centroids['danceability'], centroids['energy'], s = 300, c = 'red', label = 'Centroids')
plt.title('Music Segments based on Danceability and Energy Percentages')
plt.xlabel("Danceability (%)")
plt.ylabel("Energy (%)")
plt.legend()
plt.show()