<a href="https://colab.research.google.com/github/ethankreuzer/projects/blob/main/MATH308_A3_Q5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans

In [2]:
# Load the songs table from 'songs.csv' (path is relative to the working directory).
songs_df = pd.read_csv('songs.csv')

In [3]:
# Use every column of songs_df as a feature (no column is dropped here).
# NOTE(review): if songs.csv ever gains an id/label column, select the
# feature columns explicitly before this step.
features = songs_df.to_numpy()

# Pairwise Euclidean distance between every pair of songs (n x n matrix).
distances = euclidean_distances(features)

# Two songs are connected in the graph when they are closer than this threshold.
DISTANCE_THRESHOLD = 1

# 0/1 adjacency matrix. A song's distance to itself is 0, so the diagonal is 1
# (self-loops); this also means every node has degree >= 1.
adjacency_matrix = np.where(distances < DISTANCE_THRESHOLD, 1, 0)


In [4]:
def cluster_songs(adjacency_matrix):
    """Split the nodes of a graph into two clusters by spectral bipartitioning.

    Builds the symmetric normalized Laplacian ``D^(-1/2) (D - A) D^(-1/2)``,
    takes the eigenvector belonging to its second-smallest eigenvalue,
    rescales it by ``D^(-1/2)`` and labels every node by the sign of the
    rescaled entry.

    Parameters
    ----------
    adjacency_matrix : array-like of shape (n, n)
        Adjacency (or weight) matrix ``A`` of the graph. It is expected to be
        symmetric because ``np.linalg.eigh`` is used. At least two nodes are
        needed, since the second eigenvector is selected.

    Returns
    -------
    numpy.ndarray of shape (n,)
        ``1`` where the rescaled eigenvector entry is ``>= 0`` and ``2``
        otherwise.

    Notes
    -----
    * An eigenvector is only defined up to sign, so the labels 1 and 2 may be
      swapped as a whole; the partition itself is what is meaningful.
    * Nodes with zero degree get a ``D^(-1/2)`` entry of 0 instead of causing
      a singular-matrix error. Their rescaled entry is 0, so they are
      labelled ``1``.
    """
    adjacency = np.asarray(adjacency_matrix, dtype=float)

    # Node degrees (row sums of A); D is the diagonal matrix of these values.
    degrees = adjacency.sum(axis=1)

    # D^(-1/2) kept as a vector: D is diagonal, so a dense matrix inverse is
    # unnecessary. Zero-degree entries stay 0 to avoid dividing by zero.
    inv_sqrt_degrees = np.zeros_like(degrees)
    has_neighbours = degrees > 0
    inv_sqrt_degrees[has_neighbours] = 1.0 / np.sqrt(degrees[has_neighbours])

    # Unnormalized Laplacian L = D - A, then scale row i and column j by
    # D^(-1/2)_i and D^(-1/2)_j to get D^(-1/2) L D^(-1/2).
    laplacian_matrix = np.diag(degrees) - adjacency
    normalized_laplacian = (
        inv_sqrt_degrees[:, np.newaxis] * laplacian_matrix * inv_sqrt_degrees[np.newaxis, :]
    )

    # eigh returns eigenvalues in ascending order, so column 1 is the
    # eigenvector of the second-smallest eigenvalue.
    _, eigenvectors = np.linalg.eigh(normalized_laplacian)
    second_smallest_eigenvector = eigenvectors[:, 1]

    # x = D^(-1/2) * v
    x = inv_sqrt_degrees * second_smallest_eigenvector

    # Indicator on the sign of x: non-negative -> cluster 1, negative -> cluster 2.
    return np.where(x >= 0, 1, 2)

# Label every song as cluster 1 or 2 using the spectral bipartition.
clusters = cluster_songs(adjacency_matrix)

# Preview the labels assigned to the leading five songs.
first_five_labels = clusters[:5]
print("Clusters of first 5 songs:", first_five_labels)


Clusters of first 5 songs: [2 1 2 2 1]


In [5]:
def calculate_mean_difference(features, clusters):
    """Return the per-feature mean of cluster 1 minus the per-feature mean of cluster 2.

    ``features`` holds one row per sample and ``clusters`` holds the label of
    each row. A column-wise mean is computed for every label present in
    ``clusters``; a ``KeyError`` is raised if label 1 or label 2 is absent.
    """
    # Column-wise mean of the rows belonging to each label that occurs.
    means_by_label = {
        label: np.mean(features[clusters == label], axis=0)
        for label in np.unique(clusters)
    }

    # Difference is taken as (cluster 1) - (cluster 2).
    return means_by_label[1] - means_by_label[2]


# Per-feature gap between the two clusters (cluster 1 mean minus cluster 2 mean).
mean_difference = calculate_mean_difference(features, clusters)

# Order the feature positions by |gap| ascending, then flip and keep the three largest.
ascending_by_gap = np.argsort(np.abs(mean_difference))
top_features_indices = ascending_by_gap[::-1][:3]
# No offset is applied: these positions are used as-is to index songs_df.columns.
top_features = top_features_indices

print("Top 3 features with the highest absolute mean difference:")
column_names = songs_df.columns
for feature_index in top_features:
    print("Feature", column_names[feature_index])


Top 3 features with the highest absolute mean difference:
Feature energy
Feature danceability
Feature pitches


In [None]:
# Show the raw per-feature differences of cluster means (cluster 1 minus cluster 2).
print(mean_difference)

[ 0.00139952  0.00588542 -0.02280919 -0.02044281 -0.02550022  0.6555892
 -0.03411942 -0.01463438  0.01366864  0.0753245 ]


Looking at these features, the largest difference in means is by far "energy" (about 0.66), with "danceability" second (about 0.08), but much smaller than "energy". It is very likely that this clustering is distinguishing between pop/hip-hop songs and other songs, as pop/hip-hop songs are usually high-energy, "catchy" songs that people dance to.