In [None]:
import numpy as np
import pandas as pd

In [None]:
from case_hours import *
from pract2_utils import *

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from collections import Counter

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Case 1

Accidentes que se producen a la hora de entrada y salida del trabajo y del colegio (entre las 8-10 horas y las 13-16 horas), de lunes a viernes. Además, para reducir el número de muestras del problema, consideramos lo comentado anteriormente en una provincia concreta, por ejemplo Madrid, que tiene la mayor densidad de población de España.

In [None]:
# Select seaborn's "whitegrid" style for the figures produced by this notebook.
sns.set_style(style="whitegrid")

In [None]:
# Read the 2013 accident CSV and lower-case every column label, so later
# cells can refer to columns by their lower-case names.
dgt_data = pd.read_csv("../data/accidentes_2013.csv")
dgt_data.columns = list(map(str.lower, dgt_data.columns))

In [None]:
# 2-D histogram of `tot_muertos` against `hora`, one facet per value of
# `diasemana` (wrapped every 4 facets), saved under the case-1 figures folder.
g = (
    sns.FacetGrid(dgt_data, col="diasemana", col_wrap=4)
    .map_dataframe(sns.histplot, x="hora", y="tot_muertos", binwidth=2)
    .set_axis_labels("hora", "tot_muertos")
)
g.savefig("figures/2_segmentation/case_1/eda_diasemana_hora_muertos.png")

In [None]:
# 2-D histogram of `tot_victimas` against `hora`, one facet per value of
# `diasemana` (wrapped every 4 facets), saved under the case-1 figures folder.
g = (
    sns.FacetGrid(dgt_data, col="diasemana", col_wrap=4)
    .map_dataframe(sns.histplot, x="hora", y="tot_victimas", binwidth=2)
    .set_axis_labels("hora", "tot_victimas")
)
g.savefig("figures/2_segmentation/case_1/eda_diasemana_hora_victimas.png")

In [None]:
# Build the working subset for case 1 with the project helper `define_case_1`
# (brought in by a wildcard import). NOTE(review): presumably it applies the
# hour / weekday / province filters described in the markdown above — confirm.
case_1_subset = define_case_1()

In [None]:
# Preview the first rows of the case-1 subset.
case_1_subset.head()

In [None]:
# Size of the case-1 subset (assumes a pandas object, so (rows, columns) — TODO confirm).
case_1_subset.shape

In [None]:
# Sanity check of the filter: head of the frequency ranking of `provincia`.
case_1_subset['provincia'].value_counts().head()

In [None]:
# Sanity check of the filter: head of the frequency ranking of `diasemana`.
case_1_subset['diasemana'].value_counts().head()

In [None]:
# Sanity check of the filter: head of the frequency ranking of `hora`.
case_1_subset['hora'].value_counts().head()

In [None]:
# Columns used as features for the pair plots and for the clustering input
# matrix built later in this notebook.
atributos = [
    'tot_vehiculos_implicados',
    'tot_heridos_leves',
    'tot_heridos_graves',
    'tot_muertos',
    'tot_victimas',
]

In [None]:
# Pairwise scatter plots of the selected attributes, with KDE curves on the
# diagonal; the figure is saved as the case-1 exploratory overview.
hm = sns.pairplot(
    case_1_subset,
    vars=atributos,
    diag_kind="kde",
)
hm.savefig("figures/2_segmentation/case_1/eda.png")

In [None]:
# Histogram of `hora`, one facet per distinct value of
# `tot_vehiculos_implicados`. The grid returned by the last call is the
# cell's displayed output; this figure is not saved to disk.
g = sns.FacetGrid(case_1_subset, col="tot_vehiculos_implicados")
g.map_dataframe(sns.histplot, x="hora")

In [None]:
# Extract the selected attributes from the subset with the project helper
# `to_matrix` (presumably a numeric matrix, one column per attribute — TODO confirm).
case_1_data = to_matrix(case_1_subset, atributos)

In [None]:
# Normalise the feature matrix with the project helper `norm`.
# NOTE(review): the scaling method is not visible here — verify in the helper module.
case_1_data_norm = norm(case_1_data)

In [None]:
# Build the collection of clustering algorithms to compare (project helper).
# Later cells slice this collection positionally (`[3:5]`), so its order matters.
case_1_algorithms = definition_clusters(case_1_data_norm)

In [None]:
# Run every configured algorithm on the normalised data. The helper returns
# two objects: the predictions (later read as `[i][1]` for the labels of
# entry i) and the timings.
case_1_predictions, case_1_times = get_predictions(case_1_algorithms, case_1_data_norm)

In [None]:
# Compute the per-algorithm quality measures from the data, predictions and timings.
# NOTE(review): presumably the Calinski / Silhouette columns shown in the table below — confirm.
case_1_measures = calcule_measures(case_1_data_norm, case_1_predictions, case_1_times)

In [None]:
# Display the measures computed for the initial run.
case_1_measures

In [None]:
# Initial measures, printed as LaTeX source for the report.
# NOTE(review): the meaning of the third argument (False) is defined by the
# project helper `latex_table` and is not visible here.
columns = ['Algoritmo', 'Clusters', 'Calinski', 'Silhouette', 'time(s)']
case_1_measures_tex = latex_table(case_1_measures, columns, False)
print(case_1_measures_tex)

In [None]:
# Range parameters for the elbow sweep: the triple is passed to the k-means
# helpers and later used as range(a, b, n).
a = 1
b = 10
n = 1

In [None]:
# Elbow plot: call `kmeans_parameters` with the current (a, b, n) values and
# plot the returned inertia series against the timing series.
(
    case_1_distortions,
    case_1_inertia,
    case_1_time_kmeans,
) = kmeans_parameters(case_1_data_norm, a, b, n)
plot_time(
    case_1_inertia,
    case_1_time_kmeans,
    "elbow",
    "time(s)",
    "figures/2_segmentation/case_1/kmeans_elbow.pdf",
    a,
    b,
    n,
)

In [None]:
# Wider range parameters for the remaining k-means sweeps; this replaces the
# (1, 10, 1) values used for the elbow plot.
a = 5
b = 50
n = 5

In [None]:
# Recompute the k-means series for the wider (a, b, n) range; this overwrites
# the distortion / inertia / timing results of the elbow run.
(
    case_1_distortions,
    case_1_inertia,
    case_1_time_kmeans,
) = kmeans_parameters(case_1_data_norm, a, b, n)

In [None]:
# Plot the distortion series together with the timing series and save it.
plot_time(
    case_1_distortions,
    case_1_time_kmeans,
    "Distortions",
    "time(s)",
    "figures/2_segmentation/case_1/kmeans_distortions.pdf",
    a,
    b,
    n,
)

In [None]:
# Plot the inertia series together with the timing series and save it.
plot_time(
    case_1_inertia,
    case_1_time_kmeans,
    "Inertia",
    "time(s)",
    "figures/2_segmentation/case_1/kmeans_inertia.pdf",
    a,
    b,
    n,
)

In [None]:
# Silhouette and Calinski score series for the same (a, b, n) range, as
# returned by the project helper `measures_kmeans_range`.
(
    case_1_silhouette_scores,
    case_1_calinski_scores,
) = measures_kmeans_range(case_1_data_norm, a, b, n)

In [None]:
# Compare the silhouette and Calinski series in a single saved figure.
plot_time(
    case_1_silhouette_scores,
    case_1_calinski_scores,
    "silhouette",
    "calinski",
    "figures/2_segmentation/case_1/silhouette_calinski.pdf",
    a,
    b,
    n,
)

In [None]:
# Gather the per-range series into one mapping, keyed by the column headings
# used for the LaTeX table.
case_1_table = {
    "Silhouette": case_1_silhouette_scores,
    "Calinski-Harabaz": case_1_calinski_scores,
    "time(s)": case_1_time_kmeans,
}

In [None]:
# Print the k-means sweep as LaTeX source, indexed by range(a, b, n).
# NOTE(review): assumes the helpers produced one entry per value of
# range(a, b, n) — confirm against their implementation.
columns = ['Silhouette', 'Calinski-Harabaz', 'time(s)']
index = range(a, b, n)
case_1_table_tex = latex_table_index(case_1_table, columns, index)
print(case_1_table_tex)

## Outliers

In [None]:
# Build a new subset without outliers, based on the prediction at position 3
# of the initial run. NOTE(review): position 3 is presumably the Ward
# configuration (per the variable name) and the trailing 2 a threshold used
# by `delete_outliers` — both are defined outside this notebook; confirm.
ward_no_outliers_subset = delete_outliers(case_1_predictions[3], case_1_subset, 2)

In [None]:
# Rebuild the feature matrix from the outlier-free subset, then normalise it.
ward_no_outliers_subset_data = to_matrix(ward_no_outliers_subset, atributos)
ward_no_outliers_subset_data_norm = norm(ward_no_outliers_subset_data)

In [None]:
# Re-run only the algorithms at positions 3 and 4 of the initial collection
# on the outlier-free, normalised data.
(
    ward_no_outliers_subset_prediction,
    ward_no_outliers_times,
) = get_predictions(
    case_1_algorithms[3:5],
    ward_no_outliers_subset_data_norm,
)

In [None]:
# Quality measures for the re-run on the outlier-free data.
# NOTE(review): this result is not displayed or used in the visible cells.
ward_no_outliers_measures = calcule_measures(ward_no_outliers_subset_data_norm, ward_no_outliers_subset_prediction, ward_no_outliers_times)

In [None]:
# Ward-linkage agglomerative clustering with 35 clusters on the outlier-free
# data. The labels are wrapped in a (name, labels) pair, presumably the layout
# `delete_outliers` expects — TODO confirm.
ward_35 = AgglomerativeClustering(linkage='ward', n_clusters=35)
ward_35_cluster_ids = ward_35.fit_predict(ward_no_outliers_subset_data_norm)
prediction_ward_35 = ('ward_35', ward_35_cluster_ids)

In [None]:
# Second outlier-removal pass, this time driven by the 35-cluster Ward labels
# (same trailing argument 2 as the first pass).
ward_35_no_outliers_subset = delete_outliers(prediction_ward_35, ward_no_outliers_subset, 2)

In [None]:
# Collection of agglomerative-clustering configurations supplied by the
# project helper; reused below for both the ward-35 and ward-5 subsets.
case_1_agglomerative = configuraciones_agglomerative()

In [None]:
# Feature matrix of the twice-filtered subset, then its normalised version.
ward_35_no_outliers_data = to_matrix(ward_35_no_outliers_subset, atributos)
ward_35_no_outliers_data_norm = norm(ward_35_no_outliers_data)

In [None]:
# Run every agglomerative configuration on the ward-35 filtered data.
(
    case_ward_35_predictions_agglomerative,
    case_ward_35_times_agglomerative,
) = get_predictions(
    case_1_agglomerative,
    ward_35_no_outliers_data_norm,
)

In [None]:
# Quality measures for the agglomerative configurations on the ward-35 filtered data.
case_35_measures_ward = calcule_measures(ward_35_no_outliers_data_norm, case_ward_35_predictions_agglomerative, case_ward_35_times_agglomerative)

In [None]:
# Table of measures without outliers, printed as LaTeX source.
columns = ['Algoritmo', 'Clusters', 'Calinski', 'Silhouette', 'time(s)']
case_35_measures_ward_tex = latex_table(case_35_measures_ward, columns, False)
print(case_35_measures_ward_tex)

## Interpretación

In [None]:
# Labels of the first entry of the initial predictions; element [1] of each
# entry holds the label array. NOTE(review): entry 0 is presumably k-means,
# as the variable name suggests — confirm against `definition_clusters`.
kmeans_label = case_1_predictions[0][1]

In [None]:
# Number of samples assigned to each label.
Counter(kmeans_label)

In [None]:
# Pair plot of the case-1 subset using the k-means labels (project helper
# `pairplot`, which also receives the output path).
pairplot(
    case_1_subset,
    atributos,
    "figures/2_segmentation/case_1/pairplot_kmeans.png",
    kmeans_label,
)

In [None]:
# Centroids in normalised space: label the normalised features, attach each
# row's cluster label, and average per cluster. The intermediate frame has its
# own name so `df_kmeans_centroids` only ever holds the final centroid table.
kmeans_norm_frame = pd.DataFrame(case_1_data_norm)
kmeans_norm_frame.columns = atributos
kmeans_norm_frame['cluster'] = kmeans_label
df_kmeans_centroids = kmeans_norm_frame.groupby(by='cluster').mean()

In [None]:
# Centroid figure drawn against the normalised data.
visualize_centroids(
    df_kmeans_centroids.values,
    case_1_data_norm,
    "figures/2_segmentation/case_1/centroids_kmeans_norm.pdf",
    atributos,
    0.0,
)

In [None]:
# Same centroids, drawn against the un-normalised data.
# NOTE(review): the centroids are in normalised space; presumably the helper
# uses the data argument to map them back to original units — confirm.
visualize_centroids(
    df_kmeans_centroids.values,
    case_1_data,
    "figures/2_segmentation/case_1/centroids_kmeans.pdf",
    atributos,
    0.0,
)

## Agglomerative sin outliers

In [None]:
# Labels of the first agglomerative configuration run on the ward-35 filtered data.
# NOTE(review): which configuration sits at position 0 is decided by
# `configuraciones_agglomerative` — confirm it is the intended one.
ward_35_labels = case_ward_35_predictions_agglomerative[0][1]

In [None]:
# Pair plot of the ward-35 filtered subset using its agglomerative labels.
pairplot(
    ward_35_no_outliers_subset,
    atributos,
    "figures/2_segmentation/case_1/pairplot_ward35.png",
    ward_35_labels,
)

In [None]:
# Ward-linkage agglomerative clustering with 5 clusters, fitted on the same
# outlier-free data as the 35-cluster run, and wrapped as a (name, labels) pair.
ward_5 = AgglomerativeClustering(linkage='ward', n_clusters=5)
ward_5_cluster_ids = ward_5.fit_predict(ward_no_outliers_subset_data_norm)
prediction_ward_5 = ('ward_5', ward_5_cluster_ids)

In [None]:
# Outlier-removal pass driven by the 5-cluster Ward labels (trailing argument 2, as before).
ward_5_no_outliers_subset = delete_outliers(prediction_ward_5, ward_no_outliers_subset, 2)

In [None]:
# Un-normalised feature matrix of the ward-5 filtered subset; kept separately
# because it is passed to the centroid plot later on.
ward_5_no_outliers_data = to_matrix(ward_5_no_outliers_subset, atributos)

In [None]:
# Normalised version of the ward-5 feature matrix (input to the clustering).
ward_5_no_outliers_data_norm = norm(ward_5_no_outliers_data)

In [None]:
# Run every agglomerative configuration on the ward-5 filtered data.
(
    case_ward_5_predictions_agglomerative,
    case_ward_5_times_agglomerative,
) = get_predictions(
    case_1_agglomerative,
    ward_5_no_outliers_data_norm,
)

In [None]:
# Quality measures for the agglomerative configurations on the ward-5 filtered data.
# NOTE(review): this result is not displayed or used in the visible cells.
case_5_measures_ward = calcule_measures(ward_5_no_outliers_data_norm, case_ward_5_predictions_agglomerative, case_ward_5_times_agglomerative)

In [None]:
# Labels of the first agglomerative configuration run on the ward-5 filtered data.
ward_5_labels = case_ward_5_predictions_agglomerative[0][1]

In [None]:
# Pair plot of the ward-5 filtered subset using its agglomerative labels.
pairplot(
    ward_5_no_outliers_subset,
    atributos,
    "figures/2_segmentation/case_1/pairplot_ward5.png",
    ward_5_labels,
)

## Dendrograma

In [None]:
# Ward agglomerative clustering with no preset number of clusters;
# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(linkage='ward', distance_threshold=0, n_clusters=None)

# Fit on the normalised ward-35 filtered data.
model = model.fit(ward_35_no_outliers_data_norm)
plt.title('Hierarchical Clustering Dendrogram')
# Plot a truncated dendrogram with truncate_mode='level' and p=4 (the previous
# "top three levels" wording did not match the p value passed here).
# NOTE(review): `plot_dendrogram` is a project helper; presumably it forwards
# these keyword arguments to scipy's `dendrogram` — confirm.
plot_dendrogram(model, truncate_mode='level', p=4)
#plt.xlabel("Number of points in node (or index of point if no parenthesis).")
# Save before plt.show() so the figure is written while it is still current.
plt.savefig("figures/2_segmentation/case_1/hierarchical_clustering.pdf")
plt.show()

In [None]:
# Keep only the clustering attributes of the ward-35 filtered subset (same
# row index) and draw a row-clustered heat map with Ward linkage; columns are
# left in their given order and row tick labels are hidden.
# NOTE(review): the heat map uses the un-normalised attribute values.
dendograma_subset = ward_35_no_outliers_subset.reindex(columns=atributos)
hm = sns.clustermap(
    dendograma_subset,
    method='ward',
    col_cluster=False,
    figsize=(20, 10),
    cmap="YlGnBu",
    yticklabels=False,
)
hm.savefig("figures/2_segmentation/case_1/clustermap.pdf")

In [None]:
# Number of samples per label for the ward-35 filtered data.
Counter(ward_35_labels)

In [None]:
# Number of samples per label for the ward-5 filtered data.
Counter(ward_5_labels)

In [None]:
# Centroids in normalised space for the ward-5 data: wrap the normalised
# matrix in a frame, name its columns after the attributes, attach each row's
# cluster label, and take the per-cluster mean of every attribute.
df_ward_5_no_outliers = pd.DataFrame(ward_5_no_outliers_data_norm)
df_ward_5_no_outliers.columns = atributos
df_ward_5_no_outliers['cluster'] = ward_5_labels
# One row per cluster label; the labelled frame above is kept under its own name.
df_ward_5_centroides = df_ward_5_no_outliers.groupby('cluster').mean()

In [None]:
# Ward-5 centroid figure drawn against the normalised data.
visualize_centroids(
    df_ward_5_centroides.values,
    ward_5_no_outliers_data_norm,
    "figures/2_segmentation/case_1/centroids_ward_5_norm.pdf",
    atributos,
    0.0,
)

In [None]:
# Same ward-5 centroids, drawn against the un-normalised data.
# NOTE(review): the centroids are in normalised space; presumably the helper
# uses the data argument to map them back to original units — confirm.
visualize_centroids(
    df_ward_5_centroides.values,
    ward_5_no_outliers_data,
    "figures/2_segmentation/case_1/centroids_ward_5.pdf",
    atributos,
    0.0,
)