In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os 
import numpy as np

import scipy
import sklearn.metrics
# Relative location of the gold-layer data directory (two levels above the
# notebook). The trailing slash lets a file name be appended directly.
GOLD_DATA_PATH = os.path.join(os.pardir, os.pardir, "data/gold/")

In [None]:
# Load the data-card-4 table: semicolon-separated, read with the 'latin' codec.
df4 = pd.read_csv(
    os.path.join(GOLD_DATA_PATH, "data_card_4_df.csv"),
    sep=";",
    encoding='latin',
)

In [None]:
# Discard the leftover "Unnamed: 0" column and index the table by province.
df4 = df4.drop(columns=["Unnamed: 0"]).set_index("Provincias")
df4.head()

## Correlación

In [None]:
from itertools import combinations

# Imported explicitly: a bare `import scipy` does not guarantee that the
# `scipy.stats` submodule is loaded on every SciPy release.
from scipy import stats

# Pearson (linear) and Spearman (rank) correlation, with p-values, for every
# unordered pair of columns in df4. Each pair is visited once, in column order.
columns = df4.columns
crr_results = []

for col1, col2 in combinations(columns, 2):
    # Both result objects unpack as (statistic, p-value); unpacking avoids
    # depending on attribute names that differ between SciPy versions.
    pearson_corr, pearson_pval = stats.pearsonr(df4[col1], df4[col2])
    spear_corr, spear_pval = stats.spearmanr(df4[col1], df4[col2])

    crr_results.append({
        'Feature 1': col1,
        'Feature 2': col2,
        'Pearson Correlation': pearson_corr,
        'Pearson p-value': pearson_pval,
        'Spearman Correlation': spear_corr,
        'Spearman p-value': spear_pval
    })

# One row per column pair.
crr_results_df = pd.DataFrame(crr_results)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column so that all variables are on a comparable range
# before PCA and distance-based clustering.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df4)
# Keep the province index so rows of the scaled table stay identifiable.
df4_scaled = pd.DataFrame(scaled_data, columns=df4.columns, index=df4.index)

# Project the scaled data onto its first two principal components.
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(scaled_data)
print(estimator.explained_variance_ratio_)

# Loadings: one row per original variable, one column per component.
pd.DataFrame(estimator.components_.T, index=df4.columns, columns=["PC1", "PC2"])


In [None]:
fig, ax = plt.subplots()
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
ax.scatter(pc1, pc2, s=50)

# Label every point with the index value (province) of its row in df4.
for province, x_coord, y_coord in zip(df4.index, pc1, pc2):
    ax.annotate(province, (x_coord, y_coord), fontsize=8)

In [None]:
import plotly.express as px

# Refit PCA with three components. `estimator` and `X_pca` are rebound here,
# so every later cell sees the 3-component projection.
estimator = PCA(n_components=3)
X_pca = estimator.fit_transform(scaled_data)
print(estimator.explained_variance_ratio_)
# Printed explicitly: a bare expression in the middle of a cell is not rendered.
print(pd.DataFrame(estimator.components_.T, index=df4.columns, columns=["PC1", "PC2", "PC3"]))

# Interactive 3D scatter; hovering over a point shows its province.
fig = px.scatter_3d(
    df4,
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=X_pca[:, 2],
    hover_name=df4.index,
)
fig.show()

In [None]:
# Ward linkage: merges the pair of clusters that minimises within-cluster variance.
# NOTE: despite the `_avg` suffix, this matrix holds the Ward linkage. The name
# is kept because later cells cut this tree with fcluster.
from scipy.cluster.hierarchy import dendrogram, linkage

link_matrix_avg = linkage(scaled_data, method='ward', metric='euclidean')
plt.figure(figsize=(8, 5))
# Labels are passed as a plain list of province names.
dendrogram(link_matrix_avg, labels=df4.index.to_list())
plt.title("Dendrograma - método ward")
plt.show()

In [None]:
# Single linkage: cluster distance is the distance between their closest points.
from scipy.cluster.hierarchy import dendrogram, linkage

link_matrix = linkage(scaled_data, method='single', metric='euclidean')
plt.figure(figsize=(8, 5))
# Labels are passed as a plain list of province names.
dendrogram(link_matrix, labels=df4.index.to_list())
plt.title("Dendrograma - método single")
plt.show()

In [None]:
# Complete linkage: cluster distance is the distance between their farthest points.
from scipy.cluster.hierarchy import dendrogram, linkage

link_matrix = linkage(scaled_data, method='complete', metric='euclidean')
plt.figure(figsize=(8, 5))
# Labels are passed as a plain list of province names.
dendrogram(link_matrix, labels=df4.index.to_list())
plt.title("Dendrograma - método complete")
plt.show()

In [None]:
# Average linkage: cluster distance is the mean distance over all point pairs.
from scipy.cluster.hierarchy import dendrogram, linkage

link_matrix = linkage(scaled_data, method='average', metric='euclidean')
plt.figure(figsize=(8, 5))
# Labels are passed as a plain list of province names.
dendrogram(link_matrix, labels=df4.index.to_list())
plt.title("Dendrograma - método average")
plt.show()

In [None]:
from scipy.cluster.hierarchy import fcluster
import numpy as np

# Realizar el clustering jerárquico (usando la matriz de enlace creada previamente)
# 'linkage_matrix' debe estar calculada con el método 'ward' u otro método.
# # Paso 1: Generar muchos clusters inicialmente (granularidad alta)
# clusters_granular = fcluster(link_matrix_avg, t=10, criterion='maxclust')  # Generar 10 clusters

# # Agregar los clusters al DataFrame original para observar los resultados
# df4["Cluster_Granular"] = clusters_granular
# df4_reset = df4.reset_index()  # Reset the index to access "Provincias" column
# print("Clusters iniciales con granularidad alta:")
# print(df4_reset[["Provincias", "Cluster_Granular"]])

In [None]:
# # Paso 2: Reagrupar o inspeccionar los clusters manualmente
# # Aquí observamos los tamaños de los clusters
# cluster_sizes = df4["Cluster_Granular"].value_counts()
# print("\nTamaño de cada cluster inicial:")
# print(cluster_sizes)

Silhouette: cuantifica la cohesión y la separación de los grupos. Toma valores entre -1 y 1: los valores cercanos a 1 indican que los puntos están bien agrupados y que los grupos están bien separados entre sí.

In [None]:
from sklearn import metrics

# Cut the Ward tree into 2..9 flat clusters and, for each cut, show the
# clusters on the first two principal components together with the
# silhouette score. One subplot per cluster count on a 2 x 4 grid.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, i in zip(axes, range(2, 10)):
    clusters = fcluster(link_matrix_avg, t=i, criterion='maxclust')  # at most i clusters
    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, s=50, cmap='rainbow')
    # Score on the scaled array the linkage was built from. df4_scaled is
    # given a "Cluster" column further down the notebook, so scoring on it
    # would mix the labels into the features if this cell were re-run.
    coef = metrics.silhouette_score(scaled_data, clusters)
    ax.legend(*scatter.legend_elements(), title="Clusters")
    ax.set_title(f'{i} Clusters score: {coef:.3f}')

plt.tight_layout()
plt.show()

In [None]:
# Step 3: cut the Ward tree into at most N_CLUSTERS_FINAL flat clusters.
N_CLUSTERS_FINAL = 3
clusters_final = fcluster(link_matrix_avg, t=N_CLUSTERS_FINAL, criterion='maxclust')

In [None]:
from sklearn import metrics

# Every distinct value in clusters_final is a real cluster: the labels come
# from fcluster, which has no noise/outlier label such as -1.
n_clusters_ = len(set(clusters_final))
print('Estimated number of clusters: %d' % n_clusters_)
# Score in the same scaled feature space the linkage was computed on, so the
# value is comparable with the per-cut scores and is unaffected by any
# "Cluster" column added to df4.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(scaled_data, clusters_final))

In [None]:
# Attach the final flat-cluster label to both the raw and the scaled table.
for frame in (df4, df4_scaled):
    frame["Cluster"] = clusters_final

In [None]:
# Points on the first two principal components, coloured by final cluster label.
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], s=50, c=clusters_final)
legend_handles, legend_labels = scatter.legend_elements()
plt.legend(
    legend_handles,
    legend_labels,
    title="Clusters",
    loc='upper left',
    bbox_to_anchor=(1.05, 1),  # place the legend just outside the axes
)
plt.show()

In [None]:
import plotly.express as px

# 3D view of the principal components, coloured by final cluster.
fig = px.scatter_3d(
    df4,
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=X_pca[:, 2],
    # Labels are converted to strings so plotly draws one discrete colour per
    # cluster instead of a continuous colour bar over the integer labels.
    color=clusters_final.astype(str),
    # Hovering over a point shows its province.
    hover_name=df4.index,
)
fig.show()

In [None]:
import seaborn as sns

# Long format: one row per (observation, variable) pair, keeping the cluster label.
df4_melted = pd.melt(
    df4_scaled,
    id_vars="Cluster",
    var_name="Variable",
    value_name="Valor",
)

# Box plot of each scaled variable, split by cluster.
plt.figure(figsize=(12, 6))
sns.boxplot(data=df4_melted, x="Variable", y="Valor", hue="Cluster", palette="Set3")
plt.xticks(rotation=90)
plt.title("Distribución de las variables por cluster (estandarizadas)")
plt.show()

In [None]:
# Write the cluster label of each row (indexed by province) to a CSV in the working directory.
df4["Cluster"].to_frame().to_csv("clusters_mapa.csv")

![img](img/Edad_mediana_2C.png)

![img](img/Edad_mediana_3C.png)