In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Números y Datos
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix

# Plot appearance: seaborn dark grid and small square figures.
# dpi 100 keeps rendering fast; a higher value such as 200 is sharper but slower.
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': [4, 4],
    'figure.dpi': 100,
})

# Seed NumPy's global random generator so random draws are repeatable.
seed = 3569
np.random.seed(seed)

# Integer code assigned to each genre name, numbered from 1 in listing order.
genre_config = {
    name: code
    for code, name in enumerate(
        [
            'jazz',
            'classical',
            'drum-and-bass',
            'death-metal',
            'ambient',
            'ska',
            'singer-songwriter',
            'opera',
            'trance',
        ],
        start=1,
    )
}

In [None]:
# Load the dataset used by the rest of the notebook from a local pickle file
# (the file name suggests it is already merged and preprocessed - TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
df_spotify = pd.read_pickle('sources/df_merged_posta_preprocesado.pickle')

In [None]:
# Preview the first ten rows of the loaded frame.
df_spotify.head(n=10)

In [None]:
# Genre label of every row, plus the same labels translated to the integer
# codes declared in genre_config (an unknown genre raises KeyError).
genre_real = df_spotify['genre']
genre_numeric = list(map(genre_config.__getitem__, genre_real))

In [None]:
# Names of the integer and float dtypes to keep: int16/32/64 then float16/32/64.
numerics = [
    f'{kind}{bits}'
    for kind in ('int', 'float')
    for bits in (16, 32, 64)
]
# Only the columns of df_spotify whose dtype is one of the numeric dtypes above.
df_numeric = df_spotify.select_dtypes(include=numerics)

In [None]:
def plot_conf_matrix(real, Z2, xlabels, description):
    """Draw the confusion matrix of ``real`` versus ``Z2`` as an annotated heatmap.

    The matrix is transposed before plotting, so the x axis corresponds to the
    real labels and the y axis to the labels in ``Z2``.

    Parameters
    ----------
    real : array-like
        Reference (true) label of every sample.
    Z2 : array-like
        Label assigned to every sample by the grouping being evaluated.
    xlabels : sequence
        Tick labels for the x axis (real classes). ``confusion_matrix`` orders
        classes in sorted order when no explicit ``labels`` are given, so
        ``xlabels`` must follow that same order to line up with the columns.
    description : str
        Text used as the y-axis label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    mat = confusion_matrix(real, Z2)
    sns.heatmap(
        mat.T,
        square=True,
        annot=True,
        fmt='d',
        cbar=False,
        xticklabels=xlabels,
        # One tick per class present in the matrix instead of a hard-coded 9,
        # so the function also works when the number of classes differs.
        yticklabels=range(mat.shape[0])
    )
    plt.xlabel('Real')
    plt.ylabel(description)
    plt.show()

In [None]:
# Distinct genre names, in order of first appearance in the data.
genres = genre_real.unique()

# Sanity check: the real genres against themselves give a purely diagonal
# matrix. confusion_matrix orders its classes in sorted order, so the tick
# labels are passed sorted as well; otherwise they could be attached to the
# wrong columns whenever the data is not already ordered by genre name.
plot_conf_matrix(
    genre_real.values,
    genre_real.values,
    sorted(genres),
    'Original'
)

In [None]:
# Uniform effect: coefficient of variation (population standard deviation
# divided by the mean) of the number of rows per genre.
clusters = df_spotify['genre'].value_counts()
print(clusters.std(ddof=0) / clusters.mean())

In [None]:
## Internal validation
from sklearn.metrics.pairwise import euclidean_distances

# Feature table: every column of the dataset except the genre label.
df = df_spotify.drop(columns=['genre'])

# Pairwise Euclidean distance between every pair of rows.
d = euclidean_distances(df, df)

In [None]:
import pylab
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

# Open a new figure.
fig = pylab.figure()

# Compute the dendrogram and draw it on the left-hand axes.
axdendro = fig.add_axes([0.09,0.1,0.2,0.8])
# linkage() interprets a 2-D array as raw observations, not as distances, so
# the square distance matrix is converted to condensed form first.
# checks=False skips the symmetry check, tolerating floating-point round-off.
Y = linkage(squareform(d, checks=False), method='centroid')
Z = dendrogram(Y, orientation='right', ax=axdendro)
axdendro.set_xticks([])
axdendro.set_yticks([])
axdendro.invert_xaxis()

# Plot the distance matrix with rows and columns in dendrogram leaf order.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])
index = Z['leaves']
# The reordered copy gets its own name so that `d` is left untouched and the
# cell produces the same result when it is re-run.
d_sorted = d[np.ix_(index, index)]
im = axmatrix.matshow(d_sorted, aspect='auto', origin='lower')
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Draw the colour bar on its own narrow axes at the right.
axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
pylab.colorbar(im, cax=axcolor)

In [None]:
# Hierarchically clustered heatmap of the matrix `d`.
# NOTE(review): clustermap clusters the rows/columns of its input as data
# vectors; if the intent is to cluster on the distances in `d` themselves, a
# precomputed row_linkage/col_linkage would be needed - confirm intent.
sns.clustermap(data=d)

In [None]:
from sklearn.metrics import  silhouette_score

# Mean silhouette coefficient over all rows (scikit-learn), using the genre
# column as the cluster assignment.
silhouette_avg = silhouette_score(X=df, labels=df_spotify['genre'])
silhouette_avg

In [None]:
from sklearn.metrics import  silhouette_samples

# Silhouette coefficient of each individual row (scikit-learn), using the
# genre column as the cluster assignment.
sample_silhouette_values = silhouette_samples(X=df, labels=df_spotify['genre'])
sample_silhouette_values

In [None]:
# Estimate the mean silhouette coefficient of each cluster (genre).
cluster_labels = df_spotify['genre']
clusters = np.unique(cluster_labels)

# One record per cluster: its label and the mean of its rows' silhouettes.
ith_cluster_silhouette_avg = [
    {
        "cluster": cluster,
        "silhouette_avg": sample_silhouette_values[cluster_labels == cluster].mean(),
    }
    for cluster in clusters
]
ith_cluster_silhouette_avg

In [None]:
# One HUSL colour per cluster, consumed in step with the clusters below.
palette = iter(sns.husl_palette(len(clusters)))

fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(18, 7)

# Blank rows left between consecutive cluster bands so they stay distinct.
band_gap = 10

# Silhouette coefficients range over [-1, 1]; the x axis starts at [-0.1, 1].
# NOTE(review): the ticks set at the end of this cell go down to -0.4, which
# may widen this range - confirm the intended limits.
ax1.set_xlim([-0.1, 1])

# Room for every sample plus one gap before, between and after the bands.
ax1.set_ylim([0, len(df_spotify) + (len(clusters) + 1) * band_gap])

y_lower = band_gap
for cluster, color in zip(clusters, palette):
    # Silhouette values of this cluster's rows, in ascending order.
    sorted_values = np.sort(sample_silhouette_values[cluster_labels == cluster])
    size_cluster_i = len(sorted_values)
    y_upper = y_lower + size_cluster_i

    # Horizontal band: one row per sample, filled from 0 to its silhouette.
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        sorted_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7
    )

    # Write the cluster label at the vertical middle of its band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, cluster)

    # The next band starts after this one plus the blank gap.
    y_lower = y_upper + band_gap

ax1.set_xlabel("Coeficiente de silhouette")
ax1.set_ylabel("Cluster label")

# Dashed red vertical line at the average silhouette score of all samples.
ax1.axvline(
    x=silhouette_avg,
    color="red",
    linestyle="--"
)

ax1.set_yticks([])  # No y ticks: the band labels are drawn as text instead
ax1.set_xticks([-0.4, -0.3, -0.2, 0, 0.2, 0.4, 0.6])

plt.show()