In [0]:
!pip install tabulate

In [0]:
import csv
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [0]:
# Load the metrics CSV (comma-separated, UTF-8, double-quoted fields).
df = pd.read_csv(
    "./Datos/sv_202312_metricas_locales.csv",
    sep=",",
    encoding="utf-8",
    quotechar='"',
)

In [0]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [0]:
# Select the clustering features by column-name pattern: a "footfall" prefix,
# a "_dwell" suffix, or any of the substrings listed below.
feature_substrings = ("repeat", "horas_", "peak_visits", "total_devices")
feature_cols = [
    col
    for col in df.columns
    if col.startswith("footfall")
    or col.endswith("_dwell")
    or any(token in col for token in feature_substrings)
]

# Standardise every feature (zero mean, unit variance) so that no single
# column dominates the Euclidean distances used by K-Means.
X = df[feature_cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [0]:
# Elbow method: fit K-Means for every candidate k and record the inertia
# (within-cluster sum of squared distances) to pick the number of clusters.
sse = []
K = range(2, 11)

for k in K:
    # n_init is explicit on purpose: scikit-learn changed its default from 10
    # to "auto" in 1.4, so leaving it implicit makes the curve depend on the
    # installed version (and raises a FutureWarning on 1.2/1.3).
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_scaled)
    sse.append(km.inertia_)

plt.figure(figsize=(8, 4))
plt.plot(K, sse, marker='o')
plt.xlabel('Número de clusters (k)')
plt.ylabel('SSE (inertia)')
plt.title('Método del Codo')
plt.grid(True)
plt.show()

In [0]:
# Final model with k=4. n_init is explicit so that the labels do not depend on
# the installed scikit-learn version (the default changed from 10 to "auto" in 1.4).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Store the assigned cluster label of every row next to its metrics.
df["cluster"] = kmeans.fit_predict(X_scaled)

In [0]:
# Project the scaled features onto their first two principal components and
# draw a scatter plot coloured by cluster label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

fig_2d, ax_2d = plt.subplots(figsize=(8, 6))
scatter = ax_2d.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df["cluster"],  # df must already contain the 'cluster' column
    cmap="tab10",
    alpha=0.7,
)

ax_2d.set_title("Visualización PCA de 4 Clusters")
ax_2d.set_xlabel("PC 1")
ax_2d.set_ylabel("PC 2")
ax_2d.grid(True)
plt.show()

In [0]:
# Same idea as the 2D view, but with three principal components rendered as an
# interactive Plotly scatter.
pca = PCA(n_components=3)
X_pca_3d = pca.fit_transform(X_scaled)

# One column per component, plus the cluster label cast to text so that Plotly
# treats it as a categorical colour instead of a continuous scale.
components = dict(zip(("PC1", "PC2", "PC3"), X_pca_3d.T))
df_plotly = pd.DataFrame({**components, "cluster": df["cluster"].astype(str)})

# Interactive 3D chart
fig = px.scatter_3d(
    df_plotly,
    x="PC1",
    y="PC2",
    z="PC3",
    color="cluster",
    opacity=0.7,
    title="Clusters en PCA 3D (Interactivo)",
)

fig.update_traces(marker={"size": 4})  # point size
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 40})
fig.show()

In [0]:
# Inspect the rows assigned to cluster 0.
df.loc[df["cluster"] == 0]

In [0]:
# Per-cluster mean of every numeric column, rounded to two decimals.
# osm_id is an identifier (it is the merge key used later on), so averaging it
# is meaningless; it is dropped first. errors="ignore" keeps the cell working
# if the column is absent.
cluster_summary = (
    df.drop(columns=["osm_id"], errors="ignore")
    .groupby("cluster")
    .mean(numeric_only=True)
    .round(2)
)

In [0]:
# Add the number of rows per cluster; the assignment aligns on the cluster index.
cluster_summary["n_locales"] = df["cluster"].value_counts()

In [0]:
# Turn the cluster index back into a regular column and render the summary as a
# Markdown table (needs the tabulate package).
summary_table = cluster_summary.reset_index()
summary_text = summary_table.to_markdown(index=False)
print(summary_text)

In [0]:
# Preview the frame again, now including the 'cluster' column.
df.head()

Agregando los polígonos de las locations: primero debo generar el GeoDataFrame de las locations sin duplicados, al igual que en Databricks.

In [0]:
# Read every .geojson file in the shapes directory and stack them into a single
# GeoDataFrame.
shapes_dir = "./Datos/shapes"

gdfs = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the combined frame (and any later keep-first de-duplication) reproducible.
for filename in sorted(os.listdir(shapes_dir)):
    if filename.endswith(".geojson"):
        file_path = os.path.join(shapes_dir, filename)
        gdf = gpd.read_file(file_path)
        gdfs.append(gdf)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not gdfs:
    raise FileNotFoundError(f"No .geojson files found in {shapes_dir!r}")

gdf_zonas = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))
# Declare the coordinate reference system (this labels the data, it does not reproject).
gdf_zonas = gdf_zonas.set_crs("EPSG:4326")

print(gdf_zonas.columns)

In [0]:
# Number of rows contributed by each source layer, before de-duplication.
gdf_zonas['source_layer'].value_counts()

In [0]:
# Priority of each source layer when the same osm_id appears more than once.
# The frame is sorted ascending and the first row per osm_id is kept, so the
# layer with the LOWEST number wins. Layers missing from this mapping get NaN,
# sort last, and therefore lose against any ranked layer.
layer_ranking = {
    "gis_osm_buildings_a_free_1": 5,
    "gis_osm_pois_a_free_1": 8,
    "gis_osm_landuse_a_free_1": 2,
    "gis_osm_traffic_a_free_1": 4,
    "gis_osm_pofw_a_free_1": 7,
    "gis_osm_places_a_free_1": 6,
    "gis_osm_natural_a_free_1": 1,
    "gis_osm_transport_a_free_1": 3,
}

# The helper column only exists inside this chain, so gdf_zonas itself is left
# untouched and the cell can be re-run safely. A stable sort keeps the original
# row order among rows that share the same rank.
gdf_zonas_unique = (
    gdf_zonas
    .assign(_layer_rank=gdf_zonas["source_layer"].map(layer_ranking))
    .sort_values("_layer_rank", kind="stable")
    .drop_duplicates("osm_id", keep="first")
    .drop(columns=["_layer_rank"])
)

In [0]:
# Sanity check: after de-duplication both numbers are expected to match.
n_rows = len(gdf_zonas_unique)
n_unique_ids = gdf_zonas_unique['osm_id'].nunique()
print(f"Registros en DF: {n_rows}")
print(f"Ids unicos (osm_id) en DF: {n_unique_ids}")

In [0]:
# Number of rows kept from each source layer after de-duplication.
gdf_zonas_unique['source_layer'].value_counts()

In [0]:
# Cast the join key to a fixed 64-bit integer. Plain `int` maps to the
# platform's default integer, which is 32-bit on Windows with NumPy < 2 and
# cannot hold OSM ids above 2**31 - 1.
gdf_zonas_unique['osm_id'] = gdf_zonas_unique['osm_id'].astype('int64')

In [0]:
# Preview the de-duplicated polygons.
gdf_zonas_unique.head()

In [0]:
# Attach the polygon of each location through its osm_id.
# - Dropping a pre-existing 'geometry' column makes the cell idempotent:
#   re-running it no longer produces geometry_x / geometry_y duplicates.
# - validate="many_to_one" raises if osm_id is not unique on the polygon side,
#   which would otherwise silently multiply rows.
# - how="inner" keeps only rows whose osm_id has a polygon.
df = (
    df.drop(columns=["geometry"], errors="ignore")
    .merge(
        gdf_zonas_unique[["osm_id", "geometry"]],
        on="osm_id",
        how="inner",
        validate="many_to_one",
    )
)

In [0]:
# Preview the merged frame, now carrying the 'geometry' column.
df.head()

In [0]:
# Export the clustered, georeferenced table. Every field is quoted
# (csv.QUOTE_ALL; `csv` is imported in the notebook's import cell).
df.to_csv(
    './Datos/sv_202312_clusters_georef.csv',
    encoding='utf-8',
    index=False,
    quoting=csv.QUOTE_ALL,
)