In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE  # used below; previously only imported by a later cell
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt

# Work on the first N rows of the scaled feature matrix only.
N = 100000
df_scaled_subset = df_scaled[:N]

# The estimator is bound to `tsne_model` rather than `tsne` because a helper
# function called `tsne` is defined further down in this notebook; sharing the
# name would make re-running this cell overwrite that helper.
tsne_model = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
    init='random'
)
tsne_results = tsne_model.fit_transform(df_scaled_subset)

# Evaluation loop: fit K-Means on the 2-D embedding for each candidate k and
# record two internal quality scores per k.
# NOTE(review): silhouette_score is evaluated on every embedded point here; if
# this is too slow, its `sample_size` / `random_state` parameters allow scoring
# on a random subsample (the reported values would then be estimates).
silhouette_scores = []
ch_scores = []
k_values = range(2, 15)

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(tsne_results)

    sil_score = silhouette_score(tsne_results, labels)
    ch_score = calinski_harabasz_score(tsne_results, labels)

    silhouette_scores.append(sil_score)
    ch_scores.append(ch_score)

    print(f"k={k}: Silhouette={sil_score:.4f}, CH Score={ch_score:.2f}")

# Side-by-side curves of both scores against k.
fig, (ax_sil, ax_ch) = plt.subplots(1, 2, figsize=(12, 5))

ax_sil.plot(k_values, silhouette_scores, marker='o')
ax_sil.set_title("Silhouette Score vs. Number of Clusters (t-SNE)")
ax_sil.set_xlabel("k")
ax_sil.set_ylabel("Silhouette Score")
ax_sil.grid(True)

ax_ch.plot(k_values, ch_scores, marker='o', color='orange')
ax_ch.set_title("Calinski-Harabasz Score vs. Number of Clusters (t-SNE)")
ax_ch.set_xlabel("k")
ax_ch.set_ylabel("CH Score")
ax_ch.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# KMeans on the PCA projection with k=10.
# The cluster count is written out explicitly: the bare name `k` is only the
# leftover loop variable of the k-selection cell (its last value, not 10), and
# it does not exist at all if that cell has not been run.
kmeans = KMeans(n_clusters=10, random_state=42)
df['cluster_pca'] = kmeans.fit_predict(df_pca)
df.head()

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Embed the first N scaled rows with t-SNE, then cluster the 2-D embedding.
N = 100000
df_scaled_subset = df_scaled[:N]

# Bound to `tsne_model`, not `tsne`: a helper function named `tsne` is defined
# later in this notebook and re-running this cell must not overwrite it.
tsne_model = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
    init='random'
)
tsne_results = tsne_model.fit_transform(df_scaled_subset)

kmeans_tsne = KMeans(n_clusters=7, random_state=42)
tsne_clusters = kmeans_tsne.fit_predict(tsne_results)

# Copy of the embedded rows with their t-SNE cluster label attached.
df_subset = df.iloc[:N].copy()
df_subset['cluster_tsne'] = tsne_clusters

# The plotting helpers further down read `df[cluster_column]`, including
# 'cluster_tsne', so the labels must also live on `df` itself. Rows beyond the
# embedded subset have no label and stay NaN (which makes the column float).
# The assignment is positional so it does not depend on `df`'s index labels.
df['cluster_tsne'] = float('nan')
df.iloc[:len(tsne_clusters), df.columns.get_loc('cluster_tsne')] = tsne_clusters

In [None]:
# Scatter of the 2-D t-SNE embedding, one point per embedded row, coloured by
# the K-Means label assigned to that row.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    tsne_results[:, 0],
    tsne_results[:, 1],
    c=tsne_clusters,
    cmap='viridis',
    alpha=0.6,
)
ax.set_title('Clustering of songs with t-SNE (on scaled features)')
ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')
plt.show()

In [None]:
def tsne(amount, features, cluster_column, ax=None):
    """Embed the first `amount` scaled rows with t-SNE and scatter-plot them.

    No clustering happens here: points are coloured with the values that are
    already stored in ``df[cluster_column]``.

    Parameters
    ----------
    amount
        Number of leading rows taken from ``df_scaled`` and from the cluster
        column; also shown in the plot title.
    features
        Columns of ``df_scaled`` to embed. When the first element is a ``str``
        the entries are treated as column names and resolved to positions via
        ``df_numerical.columns``; otherwise they are used as indices as-is.
        (Assumes ``df_scaled`` supports ``[rows, cols]`` indexing with the same
        column order as ``df_numerical`` - TODO confirm.)
    cluster_column
        Name of the ``df`` column that supplies the per-point colour values.
    ax
        Matplotlib Axes to draw on. A new 8x6 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    # Resolve feature names to column positions (indices pass through untouched).
    if isinstance(features[0], str):
        column_positions = [df_numerical.columns.get_loc(name) for name in features]
    else:
        column_positions = features
    scaled_slice = df_scaled[:amount, column_positions]

    # 2-D t-SNE embedding of the selected rows/columns.
    embedding = TSNE(n_components=2, perplexity=50, random_state=42).fit_transform(scaled_slice)

    # Colour values come from the existing cluster column of `df`.
    point_colors = df[cluster_column].values[:amount]

    if ax is None:
        _, ax = plt.subplots(figsize=(8, 6))

    # Numeric labels are mapped through a colormap; non-numeric values
    # (e.g. colour names) are handed to matplotlib directly.
    scatter_kwargs = {'c': point_colors, 'alpha': 0.6}
    if pd.api.types.is_numeric_dtype(df[cluster_column]):
        scatter_kwargs['cmap'] = 'viridis'
    ax.scatter(embedding[:, 0], embedding[:, 1], **scatter_kwargs)

    ax.set_title(f'{amount} samples ({cluster_column} from df)')
    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')

    return ax

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# NOTE(review): within this part of the notebook, `numerical_features` and
# `numerical_features_no_duration` are assigned in a cell that appears further
# down; that cell has to run before this one on a fresh kernel.

# One panel per (feature set, cluster column) combination:
# rows = cluster column, columns = with / without duration.
panels = [
    (numerical_features, 'cluster_pca', axes[0, 0]),
    (numerical_features_no_duration, 'cluster_pca', axes[0, 1]),
    (numerical_features, 'cluster_tsne', axes[1, 0]),
    (numerical_features_no_duration, 'cluster_tsne', axes[1, 1]),
]
for panel_features, panel_cluster_column, panel_ax in panels:
    tsne(1000, panel_features, panel_cluster_column, ax=panel_ax)

# Add column titles
col_titles = ["With Duration", "Without Duration"]
for ax, col_title in zip(axes[0, :], col_titles):
    ax.annotate(col_title, xy=(0.5, 1.1), xycoords='axes fraction',
                ha='center', va='center', fontsize=12, fontweight='bold')

# Adjust layout and render once. The original repeated this pair a second
# time, after the figure had already been shown.
plt.tight_layout()
plt.show()

In [None]:
def plot_voronoi(amount, features, cluster_column, ax=None):
    """Plot a t-SNE embedding with a Voronoi diagram of the cluster centres.

    The first `amount` scaled rows are embedded in 2-D with t-SNE. Cluster
    membership is read from the existing ``df[cluster_column]``; the centre of
    each cluster is the mean of its points in the embedding, and the Voronoi
    diagram is built from those centres. Points, centres and per-cluster
    density contours are drawn on top.

    Parameters
    ----------
    amount
        Number of leading rows taken from ``df_scaled`` and from the cluster
        column; also shown in the plot title.
    features
        Columns of ``df_scaled`` to embed. When the first element is a ``str``
        the entries are resolved to positions via ``df_numerical.columns``;
        otherwise they are used as indices as-is.
    cluster_column
        Name of the ``df`` column holding the pre-existing cluster labels.
    ax
        Matplotlib Axes to draw on. A new 8x6 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    # Resolve feature names to column positions (indices pass through untouched).
    given_as_names = isinstance(features[0], str)
    columns = [df_numerical.columns.get_loc(f) for f in features] if given_as_names else features
    scaled_slice = df_scaled[:amount, columns]

    # 2-D t-SNE embedding of the selected rows/columns.
    embedding = TSNE(n_components=2, perplexity=50, random_state=42).fit_transform(scaled_slice)

    # Pre-existing cluster labels for the same leading rows.
    labels = df[cluster_column].values[:amount]

    # Group embedded points by label; each centre is the mean of its group.
    cluster_ids = np.unique(labels)
    points_by_cluster = [embedding[labels == cluster_id] for cluster_id in cluster_ids]
    centroids = np.array([points.mean(axis=0) for points in points_by_cluster])

    # Voronoi diagram over the cluster centres.
    diagram = Voronoi(centroids)

    if ax is None:
        _, ax = plt.subplots(figsize=(8, 6))

    # Cell boundaries first, then the points, then the centres on top.
    voronoi_plot_2d(diagram, ax=ax, show_points=False, show_vertices=False, line_colors='orange')
    ax.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='viridis', alpha=0.5)
    ax.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50, marker='x')

    # Density contours per cluster; clusters with a single point are skipped.
    for points in points_by_cluster:
        if len(points) <= 1:
            continue
        sns.kdeplot(x=points[:, 0], y=points[:, 1], ax=ax, color='darkblue', alpha=0.3)

    ax.set_title(f"Voronoi para {amount} muestras ({cluster_column} preexistentes)")
    return ax

In [None]:
# Feature names passed to the plotting helpers, which look each name up in
# `df_numerical.columns`.
numerical_features = [
    "artist_id",
    "album_id",
    "duration_ms",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "time_signature",
]

# The same features in the same order, with "duration_ms" left out.
numerical_features_no_duration = [
    name for name in numerical_features if name != "duration_ms"
]
                

In [None]:
# Create the 2x2 layout: rows = cluster column, columns = feature set.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

grid_cluster_columns = ('cluster_pca', 'cluster_tsne')
grid_feature_sets = (numerical_features, numerical_features_no_duration)
for grid_row, grid_cluster_column in enumerate(grid_cluster_columns):
    for grid_col, grid_features in enumerate(grid_feature_sets):
        plot_voronoi(1000, grid_features, grid_cluster_column, ax=axes[grid_row, grid_col])

# Shared text styling for the figure-level labels.
label_style = dict(ha='center', va='center', fontsize=12, fontweight='bold')

# Add column titles
for x_pos, column_label in ((0.25, "With Duration"), (0.75, "Without Duration")):
    fig.text(x_pos, 0.92, column_label, **label_style)

# Add row labels
for y_pos, row_label in ((0.75, "PCA clusters"), (0.25, "TSNE clusters")):
    fig.text(0.05, y_pos, row_label, rotation='vertical', **label_style)

fig.tight_layout()
fig.subplots_adjust(top=0.85, left=0.1)  # Leave room for the figure-level labels
plt.show()