# Clustering (taking into account the pre-calculated Clusters)


## 1. Import Libraries


In [None]:
# Import Libraries

import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import contextily as cx

import spopt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise as skm
from sklearn.metrics import silhouette_score, silhouette_samples

import scipy.cluster.hierarchy as sch

from spopt.region import MaxPHeuristic as MaxP
from libpysal.weights import Queen

import scipy.sparse as sp
from splot.libpysal import plot_spatial_weights

%matplotlib inline
os.environ["USE_PYGEOS"] = "0"

## 2. Loading Data, Transforming Variables


In [None]:
# Read the pickled BGRI table; later cells aggregate its rows by cluster id.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
bgri_cluster = pd.read_pickle("../Data/piclo_bgri.piclo")

In [None]:
# Read the pickled "all_data" table.
# NOTE(review): not referenced again in this part of the notebook — confirm it is still needed.
all_data = pd.read_pickle("../Data/all_data.piclo")

In [None]:
# (rows, columns) of the BGRI table as loaded.
bgri_cluster.shape

In [None]:
# Same size check again (this cell repeats the one before it).
bgri_cluster.shape

In [None]:
# Read the three pickled cluster tables (FR, LUG and LP) in one pass.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
clusters_fr, clusters_lug, clusters_lp = (
    pd.read_pickle(f"../Data/piclo_clusters_{_suffix}.piclo")
    for _suffix in ("fr", "lug", "lp")
)

In [None]:
# Preview the first rows of the BGRI table.
bgri_cluster.head()

In [None]:
# Size before the rows without a Cluster_LP value are removed.
bgri_cluster.shape

In [None]:
# Show the rows that have no Cluster_LP value.
bgri_cluster[bgri_cluster["Cluster_LP"].isnull()]

In [None]:
# Keep only the rows that carry a Cluster_LP value.
bgri_cluster = bgri_cluster[bgri_cluster["Cluster_LP"].notnull()]

In [None]:
# Size once only rows with a Cluster_LP value remain.
bgri_cluster.shape

In [None]:
# Preview the FR cluster table.
clusters_fr.head()

In [None]:
# Preview the LUG cluster table.
clusters_lug.head()

In [None]:
# Preview the LP cluster table.
clusters_lp.head()

In [None]:
# (rows, columns) of the FR cluster table.
clusters_fr.shape

In [None]:
# (rows, columns) of the LUG cluster table.
clusters_lug.shape

In [None]:
# (rows, columns) of the LP cluster table.
clusters_lp.shape

In [None]:
# Preview the BGRI table again before aggregating it.
bgri_cluster.head()

In [None]:
# Size of the BGRI table that goes into the aggregation.
bgri_cluster.shape

In [None]:
# Aggregate the subsection rows into one row per cluster by summing every column,
# giving the socioeconomic indicators of each cluster. Done once per cluster key
# (Cluster_LP, LUG11, FR11); the key is turned back into a regular column.
# NOTE(review): FR11 and LUG11 are dropped from the LP table later on, so they
# appear to be summed here as well — confirm those sums are never used as features.
bgri_cluster_LP = bgri_cluster.groupby("Cluster_LP").sum().reset_index()
bgri_cluster_LUG = bgri_cluster.groupby("LUG11").sum().reset_index()
bgri_cluster_FR = bgri_cluster.groupby("FR11").sum().reset_index()

In [None]:
# (rows, columns) of the table aggregated by Cluster_LP.
bgri_cluster_LP.shape

In [None]:
# routines used to check for the presence of NaNs - dropped rows

# set of points (X) with NaNs after StandardScaler()
# this output only appears if the block "bgri_cluster.drop([24, 32, 47, 94, 101], axis=0, inplace=True)" is not applied


# {10, 14, 49, 56, 59, 61, 62}



In [None]:
# True when at least one value of the aggregated LP table is NaN.
bgri_cluster_LP.isnull().values.any()

In [None]:
# List every column name of the aggregated LP table.
list(bgri_cluster_LP.columns)

In [None]:
## code below was used to check for NaNs in the rows identified with NaN values
# bgri_cluster_LP.loc[[10]].transpose()[0:60]
# bgri_cluster_LP.loc[[14]].transpose()[0:60]
# bgri_cluster_LP.loc[[49]].transpose()[0:60]
# bgri_cluster_LP.loc[[56]].transpose()[0:60]
# bgri_cluster_LP.loc[[61]].transpose()[0:60]
# bgri_cluster_LP.loc[[62]].transpose()[0:60]

In [None]:
## after analyzing all the columns, we decided to drop the following columns:
## - N_RES_HABITUAL_1_2_DIV
## - N_RES_HABITUAL_3_4_DIV
## - N_RES_HABITUAL_ESTAC_1
## - N_RES_HABITUAL_ESTAC_2
## - N_RES_HABITUAL_ESTAC_3

In [None]:
# Columns singled out (in the note above this cell) for removal after the NaN analysis.
_nan_prone_columns = [
    "N_RES_HABITUAL_1_2_DIV",
    "N_RES_HABITUAL_3_4_DIV",
    "N_RES_HABITUAL_ESTAC_1",
    "N_RES_HABITUAL_ESTAC_2",
    "N_RES_HABITUAL_ESTAC_3",
]
bgri_cluster_LP.drop(columns=_nan_prone_columns, inplace=True)

In [None]:
## drop row with ONLY NaNs - other 6 rows with NaNs were not dropped; columns with NaN values were dropped instead
# bgri_cluster_LP.drop([61], axis=0, inplace=True)

In [None]:
# Preview the aggregated LP table.
bgri_cluster_LP.head()

### 2.1 New variables, enhancing the information available in the dataset


In [None]:
# Check on the LP table that ISOLADOS + GEMIN + EMBANDA adds up to
# N_EDIFICIOS_CLASSICOS_1OU2. Only the grand totals are compared, not each row.
_parts_total_lp = (
    bgri_cluster_LP["N_EDIFICIOS_CLASSICOS_ISOLADOS"]
    + bgri_cluster_LP["N_EDIFICIOS_CLASSICOS_GEMIN"]
    + bgri_cluster_LP["N_EDIFICIOS_CLASSICOS_EMBANDA"]
).sum()
bgri_cluster_LP["N_EDIFICIOS_CLASSICOS_1OU2"].sum() == _parts_total_lp

In [None]:
# Check on the LUG table that ISOLADOS + GEMIN + EMBANDA adds up to
# N_EDIFICIOS_CLASSICOS_1OU2. Only the grand totals are compared, not each row.
_parts_total_lug = (
    bgri_cluster_LUG["N_EDIFICIOS_CLASSICOS_ISOLADOS"]
    + bgri_cluster_LUG["N_EDIFICIOS_CLASSICOS_GEMIN"]
    + bgri_cluster_LUG["N_EDIFICIOS_CLASSICOS_EMBANDA"]
).sum()
bgri_cluster_LUG["N_EDIFICIOS_CLASSICOS_1OU2"].sum() == _parts_total_lug

In [None]:
# Check on the FR table that ISOLADOS + GEMIN + EMBANDA adds up to
# N_EDIFICIOS_CLASSICOS_1OU2. Only the grand totals are compared, not each row.
_parts_total_fr = (
    bgri_cluster_FR["N_EDIFICIOS_CLASSICOS_ISOLADOS"]
    + bgri_cluster_FR["N_EDIFICIOS_CLASSICOS_GEMIN"]
    + bgri_cluster_FR["N_EDIFICIOS_CLASSICOS_EMBANDA"]
).sum()
bgri_cluster_FR["N_EDIFICIOS_CLASSICOS_1OU2"].sum() == _parts_total_fr

In [None]:
# Preview the LP table before the share variables are derived.
bgri_cluster_LP.head()

In [None]:
# Size of the LP table before the share variables are derived.
bgri_cluster_LP.shape

In [None]:
# The original analysis states that N_EDIFICIOS_CLASSICOS = N_EDIFICIOS_CLASSICOS_ISOLADOS
# + N_EDIFICIOS_CLASSICOS_GEMIN + N_EDIFICIOS_CLASSICOS_EMBANDA
# + N_EDIFICIOS_CLASSICOS_3OUMAIS + N_EDIFICIOS_CLASSICOS_OUTROS.
# Each building-type count is therefore expressed as a share of N_EDIFICIOS_CLASSICOS.
# Possible drop: N_EDIFICIOS_CLASSICOS_OUTROS (so that the shares do not add up to 100%).

_classic_building_counts = [
    "N_EDIFICIOS_CLASSICOS_ISOLADOS",
    "N_EDIFICIOS_CLASSICOS_GEMIN",
    "N_EDIFICIOS_CLASSICOS_EMBANDA",
    "N_EDIFICIOS_CLASSICOS_1OU2",
    "N_EDIFICIOS_CLASSICOS_3OUMAIS",
    "N_EDIFICIOS_CLASSICOS_OUTROS",
]
_classic_total = bgri_cluster_LP["N_EDIFICIOS_CLASSICOS"]

# "N_<name>" becomes "PER_<name>": the count divided by the total.
for _count_col in _classic_building_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _classic_total

# The raw counts and their total are removed once the shares exist.
bgri_cluster_LP.drop(
    columns=["N_EDIFICIOS_CLASSICOS", *_classic_building_counts],
    inplace=True,
)

In [None]:
# The original analysis states that N_EDIFICIOS_EXCLUSIV_RESID + N_EDIFICIOS_PRINCIPAL_RESID
# + N_EDIFICIOS_PRINCIP_NAO_RESID equals N_EDIFICIOS_1OU2_PISOS + N_EDIFICIOS_3OU4_PISOS
# + N_EDIFICIOS_5OU_MAIS_PISOS, so both groups are expressed as shares of the same total.
# Possible drop: PER_EDIFICIOS_5OU_MAIS_PISOS (so that the shares do not add up to 100%).

_use_counts = [
    "N_EDIFICIOS_EXCLUSIV_RESID",
    "N_EDIFICIOS_PRINCIPAL_RESID",
    "N_EDIFICIOS_PRINCIP_NAO_RESID",
]
_floor_counts = [
    "N_EDIFICIOS_1OU2_PISOS",
    "N_EDIFICIOS_3OU4_PISOS",
    "N_EDIFICIOS_5OU_MAIS_PISOS",
]

# Building total: the three use categories added up from left to right.
_buildings_total = bgri_cluster_LP[_use_counts[0]]
for _count_col in _use_counts[1:]:
    _buildings_total = _buildings_total + bgri_cluster_LP[_count_col]

# "N_<name>" becomes "PER_<name>": the count divided by the total.
for _count_col in _use_counts + _floor_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _buildings_total

bgri_cluster_LP.drop(columns=_use_counts + _floor_counts, inplace=True)

In [None]:
# Convert the construction-period counts into shares of the total number of buildings.
# Possible drop: N_EDIFICIOS_CONSTR_2006A2011 (so that the shares do not add up to 100%).

_construction_counts = [
    "N_EDIFICIOS_CONSTR_ANTES_1919",
    "N_EDIFICIOS_CONSTR_1919A1945",
    "N_EDIFICIOS_CONSTR_1946A1960",
    "N_EDIFICIOS_CONSTR_1961A1970",
    "N_EDIFICIOS_CONSTR_1971A1980",
    "N_EDIFICIOS_CONSTR_1981A1990",
    "N_EDIFICIOS_CONSTR_1991A1995",
    "N_EDIFICIOS_CONSTR_1996A2000",
    "N_EDIFICIOS_CONSTR_2001A2005",
    "N_EDIFICIOS_CONSTR_2006A2011",
]

# Total across all construction periods, added up from left to right.
_construction_total = bgri_cluster_LP[_construction_counts[0]]
for _count_col in _construction_counts[1:]:
    _construction_total = _construction_total + bgri_cluster_LP[_count_col]

# "N_<name>" becomes "PER_<name>": the count divided by the total.
for _count_col in _construction_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = (
        bgri_cluster_LP[_count_col] / _construction_total
    )

bgri_cluster_LP.drop(columns=_construction_counts, inplace=True)

In [None]:
# Drop the structure-type variables: it was decided not to use them because they
# are believed not to "inform" the model.
_structure_columns = [
    "N_EDIFICIOS_ESTRUT_BETAO",
    "N_EDIFICIOS_ESTRUT_COM_PLACA",
    "N_EDIFICIOS_ESTRUT_SEM_PLACA",
    "N_EDIFICIOS_ESTRUT_ADOBE_PEDRA",
    "N_EDIFICIOS_ESTRUT_OUTRA",
]
bgri_cluster_LP.drop(columns=_structure_columns, inplace=True)

In [None]:
# Variable group under analysis: N_ALOJAMENTOS, N_ALOJAMENTOS_FAM_CLASSICOS,
# N_ALOJAMENTOS_FAM_N_CLASSICOS, N_ALOJAMENTOS_COLECTIVOS, N_CLASSICOS_RES_HABITUAL,
# N_ALOJAMENTOS_RES_HABITUAL, N_ALOJAMENTOS_VAGOS.
# After a separate analysis it was decided that all the information in these
# variables is captured by the following shares of N_ALOJAMENTOS:

_dwelling_counts = [
    "N_ALOJAMENTOS_FAM_CLASSICOS",
    "N_ALOJAMENTOS_FAM_N_CLASSICOS",
    "N_ALOJAMENTOS_COLECTIVOS",
    "N_CLASSICOS_RES_HABITUAL",
    "N_ALOJAMENTOS_RES_HABITUAL",
    "N_ALOJAMENTOS_VAGOS",
    "N_ALOJAMENTOS_FAMILIARES",
]
_dwellings_total = bgri_cluster_LP["N_ALOJAMENTOS"]

# "N_<name>" becomes "PER_<name>": the count divided by the number of dwellings.
for _count_col in _dwelling_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _dwellings_total

bgri_cluster_LP.drop(columns=["N_ALOJAMENTOS", *_dwelling_counts], inplace=True)

In [None]:
# Drop the amenity variables 'N_RES_HABITUAL_COM_AGUA', 'N_RES_HABITUAL_COM_RETRETE',
# 'N_RES_HABITUAL_COM_ESGOTOS' and 'N_RES_HABITUAL_COM_BANHO'.
_amenity_columns = [
    "N_RES_HABITUAL_COM_AGUA",
    "N_RES_HABITUAL_COM_RETRETE",
    "N_RES_HABITUAL_COM_ESGOTOS",
    "N_RES_HABITUAL_COM_BANHO",
]
bgri_cluster_LP.drop(columns=_amenity_columns, inplace=True)

In [None]:
# Next group: 'N_RES_HABITUAL_AREA_50', 'N_RES_HABITUAL_AREA_50_100',
# 'N_RES_HABITUAL_AREA_100_200', 'N_RES_HABITUAL_AREA_200'.
# Each area class becomes a share of the sum of the four classes.

_area_counts = [
    "N_RES_HABITUAL_AREA_50",
    "N_RES_HABITUAL_AREA_50_100",
    "N_RES_HABITUAL_AREA_100_200",
    "N_RES_HABITUAL_AREA_200",
]

# Total across the four area classes, added up from left to right.
_area_total = bgri_cluster_LP[_area_counts[0]]
for _count_col in _area_counts[1:]:
    _area_total = _area_total + bgri_cluster_LP[_count_col]

# "N_<name>" becomes "PER_<name>": the count divided by the total.
for _count_col in _area_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _area_total

bgri_cluster_LP.drop(columns=_area_counts, inplace=True)

In [None]:
# Next group: 'N_RES_HABITUAL_PROP_OCUP', 'N_RES_HABITUAL_ARREND'.
# No relationship to the other variables could be found (the totals do not match),
# so a subtotal is built and both variables become shares of that subtotal.

_tenure_counts = ["N_RES_HABITUAL_PROP_OCUP", "N_RES_HABITUAL_ARREND"]
_tenure_total = bgri_cluster_LP[_tenure_counts[0]] + bgri_cluster_LP[_tenure_counts[1]]

# "N_<name>" becomes "PER_<name>": the count divided by the subtotal.
for _count_col in _tenure_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _tenure_total

bgri_cluster_LP.drop(columns=_tenure_counts, inplace=True)

In [None]:
# Next group: 'N_FAMILIAS_CLASSICAS', 'N_FAMILIAS_INSTITUCIONAIS',
# 'N_FAMILIAS_CLASSICAS_1OU2_PESS', 'N_FAMILIAS_CLASSICAS_3OU4_PESS',
# 'N_FAMILIAS_CLASSICAS_NPES65', 'N_FAMILIAS_CLASSICAS_NPES14',
# 'N_FAMILIAS_CLASSIC_SEM_DESEMP', 'N_FAMILIAS_CLASSIC_1DESEMPREG',
# 'N_FAMILIAS_CLASS_2MAIS_DESEMP'.

# No relationship between all the variables could be found (not all totals match),
# so every variable becomes a share of N_FAMILIAS_CLASSICAS.

_family_counts = [
    "N_FAMILIAS_INSTITUCIONAIS",
    "N_FAMILIAS_CLASSICAS_1OU2_PESS",
    "N_FAMILIAS_CLASSICAS_3OU4_PESS",
    "N_FAMILIAS_CLASSICAS_NPES65",
    "N_FAMILIAS_CLASSICAS_NPES14",
    "N_FAMILIAS_CLASSIC_SEM_DESEMP",
    "N_FAMILIAS_CLASSIC_1DESEMPREG",
    "N_FAMILIAS_CLASS_2MAIS_DESEMP",
]
_families_total = bgri_cluster_LP["N_FAMILIAS_CLASSICAS"]

# "N_<name>" becomes "PER_<name>": the count divided by the number of classic families.
for _count_col in _family_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _families_total

bgri_cluster_LP.drop(columns=["N_FAMILIAS_CLASSICAS", *_family_counts], inplace=True)

In [None]:
# Next group: 'N_NUCLEOS_FAMILIARES', 'N_NUCLEOS_1FILH_NAO_CASADO',
# 'N_NUCLEOS_2FILH_NAO_CASADO', 'N_NUCLEOS_FILH_INF_6ANOS',
# 'N_NUCLEOS_FILH_INF_15ANOS', 'N_NUCLEOS_FILH_MAIS_15ANOS'.

# No relationship between all the variables could be found (not all totals match),
# so every variable becomes a share of N_NUCLEOS_FAMILIARES.

_nucleus_counts = [
    "N_NUCLEOS_1FILH_NAO_CASADO",
    "N_NUCLEOS_2FILH_NAO_CASADO",
    "N_NUCLEOS_FILH_INF_6ANOS",
    "N_NUCLEOS_FILH_INF_15ANOS",
    "N_NUCLEOS_FILH_MAIS_15ANOS",
]
_nuclei_total = bgri_cluster_LP["N_NUCLEOS_FAMILIARES"]

# "N_<name>" becomes "PER_<name>": the count divided by the number of family nuclei.
for _count_col in _nucleus_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _nuclei_total

bgri_cluster_LP.drop(columns=["N_NUCLEOS_FAMILIARES", *_nucleus_counts], inplace=True)

In [None]:
# Before moving on to the next group of indicators, remove every variable that
# splits residents by sex (suffix H / M), keeping only the age information.
_age_bands = ["0A4", "5A9", "10A13", "14A19", "15A19", "20A24", "20A64", "25A64", "65"]
_sex_split_columns = [
    "N_INDIVIDUOS_PRESENT_H",
    "N_INDIVIDUOS_PRESENT_M",
    "N_INDIVIDUOS_RESIDENT_H",
    "N_INDIVIDUOS_RESIDENT_M",
]
# One column per sex and age band, e.g. N_INDIVIDUOS_RESIDENT_H_0A4.
_sex_split_columns += [
    f"N_INDIVIDUOS_RESIDENT_{_sex}_{_band}" for _sex in ("H", "M") for _band in _age_bands
]
bgri_cluster_LP.drop(columns=_sex_split_columns, inplace=True)

In [None]:
# Next group: 'N_INDIVIDUOS_PRESENT', 'N_INDIVIDUOS_RESIDENT', 'N_INDIVIDUOS_RESIDENT_0A4',
# 'N_INDIVIDUOS_RESIDENT_5A9', 'N_INDIVIDUOS_RESIDENT_10A13', 'N_INDIVIDUOS_RESIDENT_14A19',
# 'N_INDIVIDUOS_RESIDENT_15A19', 'N_INDIVIDUOS_RESIDENT_20A24', 'N_INDIVIDUOS_RESIDENT_20A64',
# 'N_INDIVIDUOS_RESIDENT_25A64', 'N_INDIVIDUOS_RESIDENT_65'.

# The indicators are computed as shares of N_INDIVIDUOS_RESIDENT.
# NOTE: N_INDIVIDUOS_RESIDENT itself is not dropped here because the next batch of
# indicators still needs it.

_resident_counts = [
    "N_INDIVIDUOS_PRESENT",
    "N_INDIVIDUOS_RESIDENT_0A4",
    "N_INDIVIDUOS_RESIDENT_5A9",
    "N_INDIVIDUOS_RESIDENT_10A13",
    "N_INDIVIDUOS_RESIDENT_14A19",
    "N_INDIVIDUOS_RESIDENT_20A24",
    "N_INDIVIDUOS_RESIDENT_25A64",
    "N_INDIVIDUOS_RESIDENT_65",
]
# These two bands get no share of their own; they are only removed.
_unused_bands = ["N_INDIVIDUOS_RESIDENT_15A19", "N_INDIVIDUOS_RESIDENT_20A64"]

_residents_total = bgri_cluster_LP["N_INDIVIDUOS_RESIDENT"]

# "N_<name>" becomes "PER_<name>": the count divided by the number of residents.
for _count_col in _resident_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _residents_total

bgri_cluster_LP.drop(columns=_resident_counts + _unused_bands, inplace=True)

In [None]:
# Next group: 'N_INDIVIDUOS_RESIDENT', 'N_INDIV_RESIDENT_N_LER_ESCRV', the
# 'N_IND_RESIDENT_FENSINO_*' and 'N_IND_RESIDENT_ENSINCOMP_*' education variables,
# 'N_IND_RESID_DESEMP_PROC_1EMPRG', 'N_IND_RESID_DESEMP_PROC_EMPRG', 'N_IND_RESID_EMPREGADOS',
# 'N_IND_RESID_PENS_REFORM', 'N_IND_RESID_SEM_ACT_ECON', the 'N_IND_RESID_EMPREG_SECT_*'
# sector variables, 'N_IND_RESID_ESTUD_MUN_RESID' and 'N_IND_RESID_TRAB_MUN_RESID'.

# The indicators are computed as shares of N_INDIVIDUOS_RESIDENT. The
# employment/unemployment indicators are ignored because they are very
# dependent on the economic cycle.

_person_counts = [
    "N_INDIV_RESIDENT_N_LER_ESCRV",
    "N_IND_RESIDENT_FENSINO_1BAS",
    "N_IND_RESIDENT_FENSINO_2BAS",
    "N_IND_RESIDENT_FENSINO_3BAS",
    "N_IND_RESIDENT_FENSINO_SEC",
    "N_IND_RESIDENT_FENSINO_POSSEC",
    "N_IND_RESIDENT_FENSINO_SUP",
    "N_IND_RESIDENT_ENSINCOMP_1BAS",
    "N_IND_RESIDENT_ENSINCOMP_2BAS",
    "N_IND_RESIDENT_ENSINCOMP_3BAS",
    "N_IND_RESIDENT_ENSINCOMP_SEC",
    "N_IND_RESIDENT_ENSINCOMP_POSEC",
    "N_IND_RESIDENT_ENSINCOMP_SUP",
    "N_IND_RESID_PENS_REFORM",
    "N_IND_RESID_SEM_ACT_ECON",
    "N_IND_RESID_EMPREG_SECT_PRIM",
    "N_IND_RESID_EMPREG_SECT_SEQ",
    "N_IND_RESID_EMPREG_SECT_TERC",
    "N_IND_RESID_ESTUD_MUN_RESID",
    "N_IND_RESID_TRAB_MUN_RESID",
]
# Employment/unemployment counts: removed without deriving a share.
_ignored_counts = [
    "N_IND_RESID_DESEMP_PROC_1EMPRG",
    "N_IND_RESID_DESEMP_PROC_EMPRG",
    "N_IND_RESID_EMPREGADOS",
]

_residents_total = bgri_cluster_LP["N_INDIVIDUOS_RESIDENT"]

# "N_<name>" becomes "PER_<name>": the count divided by the number of residents.
for _count_col in _person_counts:
    bgri_cluster_LP["PER_" + _count_col[2:]] = bgri_cluster_LP[_count_col] / _residents_total

# The resident total is no longer needed after this batch, so it goes as well.
bgri_cluster_LP.drop(
    columns=["N_INDIVIDUOS_RESIDENT", *_person_counts, *_ignored_counts],
    inplace=True,
)

In [None]:
# Size of the LP table after the counts were replaced by shares.
bgri_cluster_LP.shape

In [None]:
# Preview the LP table after the counts were replaced by shares.
bgri_cluster_LP.head()

In [None]:
# Start from the full list of column names; it is trimmed to the indicators next.
a = bgri_cluster_LP.columns.tolist()

In [None]:
# Skip the leading non-indicator columns so the data can be standardized
# (`a` is the list of indicators used in the next step).
# NOTE(review): the slice removes the first THREE names, not just the first one —
# presumably Cluster_LP, FR11 and LUG11; confirm against the column order.
a = a[3:]

In [None]:
# Show the indicator columns that will be standardized.
a

In [None]:
# Standardizing the features.
# The fit is done with the `scaler` instance (instead of a second, throwaway
# StandardScaler) so the fitted parameters stay available afterwards.
scaler = StandardScaler()
bgri_cluster_LP[a] = scaler.fit_transform(bgri_cluster_LP[a])

In [None]:
# Preview the standardized table.
bgri_cluster_LP.head()

In [None]:
# Replace every remaining NaN with 0.
# NOTE(review): the NaNs presumably come from shares with a zero denominator — confirm.
# Because this runs before the NaN check further down, that check finds nothing.
bgri_cluster_LP.fillna(0, inplace=True)

In [None]:
# Feature matrix for the PCA: the standardized indicator columns only.
X_pca = bgri_cluster_LP[a]

In [None]:
# Routine to find which points are NaN after standardization — those points, or the rows they belong to, are to be excluded.

# The first iteration of the code did not apply the block "bgri_cluster.drop([24, 32, 47, 94, 101], axis=0, inplace=True)", which produced a
# list of rows with NaN points — those rows are then discarded, removing the problem observed in the factorization.

# nonzero() on the NaN mask gives the (row, column) positions of the NaNs; only the distinct rows are printed.
x, y = sp.coo_matrix(bgri_cluster_LP.isnull()).nonzero()
print(set(x))

In [None]:
# routines used to check for the presence of NaNs - dropped rows

# set of points (X) with NaNs after StandardScaler()
# this output only appears if the block "bgri_cluster.drop([10, 14, 49, 56, 59, 61, 62], axis=0, inplace=True)" is not applied


# {10, 14, 49, 56, 59, 61, 62}

In [None]:
# Display the table after standardization and NaN filling.
bgri_cluster_LP

In [None]:
# Give both tables the same integer Cluster_LP key and a fresh 0..n-1 index.
for _frame in (clusters_lp, bgri_cluster_LP):
    _frame["Cluster_LP"] = _frame["Cluster_LP"].astype("int64")
    _frame.reset_index(drop=True, inplace=True)

In [None]:
# Filter the cluster tables, keeping only the clusters that exist in the
# corresponding aggregated BGRI table.
clusters_lp = clusters_lp.loc[
    clusters_lp["Cluster_LP"].isin(bgri_cluster_LP["Cluster_LP"])
]
# The LUG and FR filters used to compare each table with itself, which removed
# nothing; they now compare against the aggregated BGRI tables, like the LP one.
# NOTE(review): unlike Cluster_LP, the LUG11 / FR11 keys are not cast to a common
# dtype anywhere in this part of the notebook — confirm both sides match.
clusters_lug = clusters_lug.loc[clusters_lug["LUG11"].isin(bgri_cluster_LUG["LUG11"])]
clusters_fr = clusters_fr.loc[clusters_fr["FR11"].isin(bgri_cluster_FR["FR11"])]

In [None]:
# Renumber both tables from 0 after the filtering.
for _frame in (clusters_lp, bgri_cluster_LP):
    _frame.reset_index(drop=True, inplace=True)

In [None]:
# Display the filtered LP cluster table.
clusters_lp

In [None]:
# Number of distinct Cluster_LP ids in the aggregated table (to compare with clusters_lp).
len(bgri_cluster_LP.Cluster_LP.unique())

In [None]:
# (rows, columns) of the filtered LP cluster table.
clusters_lp.shape

### 2.2 Factorization (socioeconomic variables)


In [None]:
# PCA for the housing indicators: first pass with 20 components, used to inspect
# the explained variance before choosing how many components to keep.
pca = PCA(n_components=20)

principalComponents = pca.fit_transform(X_pca)

# One score column per component, named PCA_1 ... PCA_20.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=[f"PCA_{_i}" for _i in range(1, 21)],
)

In [None]:
# Total explained variance (sum of the per-component ratios).
sum(pca.explained_variance_ratio_)

In [None]:
# Explained-variance ratio of each component.
pca.explained_variance_ratio_

In [None]:
# Eigenvalues, all greater than 1 (as stated by the original author).
pca.explained_variance_

In [None]:
# Based on the eigenvalues and the scree plot, it was decided to keep the components
# with an eigenvalue above 1. Another method (such as the elbow method) could have
# been used, but this one is simpler and more intuitive and guarantees more than 80%
# of explained variance.

# Component numbers 1..n for the x axis.
PC_values = np.arange(1, pca.n_components + 1)
plt.plot(PC_values, pca.explained_variance_, "o-", linewidth=2, color="blue")
plt.xlabel("Principal Component")
plt.ylabel("Variance Explained")
plt.title("Scree Plot")
plt.show()

In [None]:
# PCA for the housing indicators, refitted with the 17 components that are kept.
pca = PCA(n_components=17)

principalComponents = pca.fit_transform(X_pca)

# One score column per component, named PCA_1 ... PCA_17.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=[f"PCA_{_i}" for _i in range(1, 18)],
)

In [None]:
# Table of component weights: one row per input indicator, one column per component.
# NOTE(review): these are the raw principal axes (pca.components_), not scaled by
# the eigenvalues — confirm that is what "loadings" is meant to show.
loadings = pd.DataFrame(
    pca.components_.T,
    index=X_pca.columns,
    columns=[f"PCA_{_i}" for _i in range(1, 18)],
)

In [None]:
# Renumber the rows from 0 so they line up with principalDf in the column-wise concat below.
bgri_cluster_LP.reset_index(inplace=True, drop=True)

In [None]:
# (rows, columns) of the PCA score table.
principalDf.shape

In [None]:
# Concatenate the data: append the PCA score columns to the LP table (aligned on the index).
bgri_cluster_LP = pd.concat([bgri_cluster_LP, principalDf], axis=1)

In [None]:
# Size after the PCA columns were appended.
bgri_cluster_LP.shape

In [None]:
# Remove the FR11 and LUG11 columns from the LP table.
bgri_cluster_LP.drop(columns=["FR11", "LUG11"], inplace=True)

In [None]:
# Display the LP table with the PCA columns.
bgri_cluster_LP

#### 2.2.a PCA Loadings


In [None]:
# View the component weights of every indicator, one column per principal component.
loadings

### 2.3 Dendrogram (socioeconomic variables)


In [None]:
# Compute the dendrogram with Ward linkage on the 17 principal-component columns.
_pca_columns = [f"PCA_{_i}" for _i in range(1, 18)]
_ward_links = sch.linkage(bgri_cluster_LP[_pca_columns], method="ward")
dendrogram = sch.dendrogram(_ward_links)
plt.title("Dendrograma")
# NOTE(review): the x label reads "Clientes" (customers) although the rows are
# LP clusters — confirm whether the label should change.
plt.xlabel("Clientes")
plt.ylabel("Distâncias Euclidianas")
# Dashed red line marking the cut height of 16.2.
plt.axhline(16.2, color="red", linestyle="--", linewidth=1)
plt.grid(False)
plt.show()

In [None]:
# Number of clusters suggested by the dendrogram; it is passed to the Ward,
# Ward + Queen and SKATER models below.
n_clusters = 9

## 3. Clustering


### 3.1 Ward Linkage


In [None]:
# Agglomerative Clustering, no contiguity matrix, ward linkage
hc = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")

In [None]:
y_hc = hc.fit_predict(bgri_cluster_LP)

In [None]:
# write labels in our dataframe
bgri_cluster_LP["Zona_Ward"] = y_hc.T

In [None]:
bgri_cluster_LP.head()

In [None]:
# prepare data for merging
bgri_cluster2 = bgri_cluster_LP.iloc[:, -18:]

In [None]:
bgri_cluster2

In [None]:
# prepare data for merging
bgri_cluster3 = bgri_cluster_LP.iloc[:, :1]

In [None]:
bgri_cluster3

In [None]:
# concatenate dataframes
bgri_cluster_LP = pd.concat([bgri_cluster2, bgri_cluster3], axis=1)

In [None]:
bgri_cluster_LP.head()

In [None]:
# merge relevant data (resulting from the PCA) with the cluster dataframe
clusters_lp = clusters_lp.merge(bgri_cluster_LP, how="left", on="Cluster_LP")

In [None]:
clusters_lp.head()

In [None]:
clusters_lp.shape

In [None]:
# remove the column with the cluster Zona Ward labels
bgri_cluster_LP.drop(["Zona_Ward"], axis=1, inplace=True)

In [None]:
# Map of the Ward zones, one colour per zone, drawn over an OpenStreetMap basemap
ward_map_style = {
    "column": "Zona_Ward",
    "categorical": True,
    "cmap": "tab20",
    "legend": True,
    "edgecolor": "b",
    "linewidth": 0.2,
    "figsize": (10, 10),
}
ax = clusters_lp.plot(**ward_map_style)
cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik, crs=clusters_lp.crs)
plt.title("Clusterização - Ward Linkage. Clusters = {}".format(n_clusters), fontsize=16)

### 3.2 Ward Linkage + Queen Contiguity


In [None]:
# Contiguity matrix Queen
RANDOM_SEED = 123456

wqueen = Queen.from_dataframe(clusters_lp)

In [None]:
# Contiguity matrix Queen (arry like)
df = pd.DataFrame(*wqueen.full()).astype(int)

arr = df.to_numpy()

arr2d = np.transpose(arr)

In [None]:
wqueen.set_transform("R")

In [None]:
plot_queen = plot_spatial_weights(wqueen, clusters_lp)
plt.title(
    "Matriz de Contiguidade 'Queen' aplicada às unidades territoriais base", fontsize=14
)
plt.show()

In [None]:
# Repeat the Ward clustering, now constrained by the Queen contiguity matrix.
# The features are read from clusters_lp so that
#   (a) only the 17 principal components are clustered, not the Cluster_LP
#       identifier that bgri_cluster_LP still carries, and
#   (b) the rows are in the same order as the contiguity matrix (built from
#       clusters_lp) and as the column the labels are written to.
queen_features = clusters_lp[[f"PCA_{k}" for k in range(1, 18)]]

hc2 = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", connectivity=arr2d)
y_hc2 = hc2.fit_predict(queen_features)
clusters_lp["Zona_Ward_Queen"] = y_hc2

In [None]:
clusters_lp.head()

In [None]:
# Map of the contiguity-constrained Ward zones over an OpenStreetMap basemap
ward_queen_map_style = {
    "column": "Zona_Ward_Queen",
    "categorical": True,
    "cmap": "tab20",
    "legend": True,
    "edgecolor": "b",
    "linewidth": 0.2,
    "figsize": (10, 10),
}
ax = clusters_lp.plot(**ward_queen_map_style)
cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik, crs=clusters_lp.crs)
plt.title(
    "Clusterização - Ward Linkage. Matriz contiguidade Queen. Clusters = {}".format(
        n_clusters
    ),
    fontsize=14,
)

### 3.3 Max-P Regionalization

https://www.tandfonline.com/doi/full/10.1080/13658816.2020.1759806

https://www.youtube.com/watch?v=HFVKHej1EOc&ab_channel=GeoDaSoftware

https://pysal.org/spopt/generated/spopt.region.MaxPHeuristic.html#spopt.region.MaxPHeuristic

https://pysal.org/spopt/notebooks/maxp.html

É necessário definir a lista de atributos, matriz de pesos espaciais, threshold, threshold_name e o top_n.


In [None]:
# load data (pickle) casasapo
# NOTE: unpickling executes code from the file; only load pickles produced by this project.
casasapo = pd.read_pickle("../Data/piclo_casasapo.piclo")

# load data (pickle) py
py = pd.read_pickle("../Data/piclo_py.piclo")

In [None]:
# definition of seed (same value as the one set in the Queen contiguity cell)
RANDOM_SEED = 123456

In [None]:
# helper column of ones: summing it per Cluster_LP gives the number of rows per cluster
casasapo["a"] = 1
py["a"] = 1

In [None]:
# cast the cluster key to the nullable integer dtype
# NOTE(review): py["Cluster_LP"] is not cast here — confirm both keys share a dtype compatible with clusters_lp
casasapo["Cluster_LP"] = casasapo["Cluster_LP"].astype("Int64")

In [None]:
# number of dwellings per cluster in each source: sum of the helper column "a"
# grouped by Cluster_LP (the result keeps the columns Cluster_LP and a)
df_subtotal_cs = casasapo.groupby("Cluster_LP", as_index=False)[["a"]].agg("sum")
df_subtotal_py = py.groupby("Cluster_LP", as_index=False)[["a"]].agg("sum")

In [None]:
# merge with clusters_lp dataframe
clusters_lp = clusters_lp.merge(
    df_subtotal_cs[["Cluster_LP", "a"]], how="left", on="Cluster_LP"
)
clusters_lp = clusters_lp.merge(
    df_subtotal_py[["Cluster_LP", "a"]], how="left", on="Cluster_LP"
)

In [None]:
# drop column 'a'
casasapo.drop(columns=["a"], inplace=True)
py.drop(columns=["a"], inplace=True)

In [None]:
# rename columns
clusters_lp.rename(columns={"a_x": "tot_cs", "a_y": "tot_py"}, inplace=True)

In [None]:
clusters_lp.head()

In [None]:
clusters_lp[["tot_cs", "tot_py"]].describe()

In [None]:
# define the minimum number of dwellings per cluster, pre and post intervention (casasapo and py).
# A cluster missing from one source has NaN after the left merges; that means
# zero dwellings there, so it is counted as 0 instead of being skipped by min().
clusters_lp["tot_min"] = clusters_lp[["tot_cs", "tot_py"]].fillna(0).min(axis=1)

In [None]:
# Names of the 17 principal-component columns used as clustering attributes
# (selecting them from clusters_lp first makes a missing column fail here)
attrs_name = clusters_lp[[f"PCA_{k}" for k in range(1, 18)]].columns.tolist()

In [None]:
# minimum number of dwellings per cluster
threshold = 102

In [None]:
# The number of top candidate regions to consider for enclave assignment.
top_n = 3

In [None]:
# criteria for the alghoritm - number of dwellings per cluster
threshold_name = "tot_min"

In [None]:
# model with MaxP
np.random.seed(RANDOM_SEED)
model_maxp = MaxP(clusters_lp, wqueen, attrs_name, threshold_name, threshold, top_n)
model_maxp.solve()

In [None]:
clusters_lp["Zona_Maxp"] = model_maxp.labels_

In [None]:
clusters_lp[["Zona_Maxp"]].groupby(by="Zona_Maxp").count()

In [None]:
a = model_maxp.p
a

In [None]:
# Map of the Max-P regions over an OpenStreetMap basemap
maxp_map_style = {
    "column": "Zona_Maxp",
    "categorical": True,
    "cmap": "tab10",
    "legend": True,
    "edgecolor": "b",
    "linewidth": 0.2,
    "figsize": (10, 10),
}
ax = clusters_lp.plot(**maxp_map_style)
cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik, crs=clusters_lp.crs)
plt.title(
    "Clusterização - Max-P. Threshold = %s imóveis. Clusters = %s" % (threshold, a),
    fontsize=16,
)

In [None]:
clusters_lp.head()

### 3.4 SKATER (Spatial ’K’luster Analysis by Tree Edge Removal)

https://www.tandfonline.com/doi/full/10.1080/13658810600665111

https://pysal.org/spopt/notebooks/skater.html

https://pysal.org/spopt/generated/spopt.region.Skater.html#spopt.region.Skater

https://www.dshkol.com/post/spatially-constrained-clustering-and-regionalization/


In [None]:
# minimum number of zones per cluster
floor = 3

In [None]:
# Flag denoting whether to store intermediate labelings as the tree gets pruned. (default False)
trace = False

In [None]:
# Description of what to do with islands. If 'ignore', the algorithm will discover n_clusters regions, treating islands as their own regions.
# If “increase”, the algorithm will discover n_clusters regions, treating islands as separate from n_clusters. (default ‘increase’)
islands = "increase"

In [None]:
# standard definition for the spannig tree algorithm
default = dict(
    dissimilarity=skm.manhattan_distances,
    affinity=None,
    reduction=np.sum,
    center=np.mean,
    verbose=False,
)

In [None]:
# model using the skater algorithm
model_skater = spopt.region.Skater(
    clusters_lp,
    wqueen,
    attrs_name,
    n_clusters=n_clusters,
    floor=floor,
    trace=trace,
    islands=islands,
    spanning_forest_kwds=default,
)
model_skater.solve()

In [None]:
# write skater cluster info
clusters_lp["Zona_SKATER"] = model_skater.labels_

In [None]:
# resulting number of clusters, for the graph title
temp = len(clusters_lp["Zona_SKATER"].unique())
temp

In [None]:
# result of the SKATER Regionalization
ax = clusters_lp.plot(
    figsize=(10, 10),
    column="Zona_SKATER",
    categorical=True,
    edgecolor="b",
    legend=True,
    linewidth=0.2,
    cmap="tab20",
)
cx.add_basemap(ax, crs=clusters_lp.crs, source=cx.providers.OpenStreetMap.Mapnik)
plt.title("Clusterização - SKATER. Clusters = {}".format(temp), fontsize=16)

In [None]:
# result of the SKATER Regionalization
ax = clusters_lp.plot(
    figsize=(10, 10),
    column="Zona_SKATER",
    categorical=True,
    legend=False,
    linewidth=0.1,
    cmap="tab20",
)
cx.add_basemap(ax, crs=clusters_lp.crs, source=cx.providers.OpenStreetMap.Mapnik)
ax.set_title("SKATER Clusters", fontweight="bold", fontsize=16)
ax.set_axis_off()

# 4. Export Data to Pickle Files


In [None]:
# save data (pickle) bgri
bgri_cluster.to_pickle("../Data/piclo_bgri_2.piclo")

In [None]:
# save data (pickle) clusters
clusters_lp.to_pickle("../Data/piclo_clusters_2.piclo")

# 5. Cluster Metrics

Neste Link podem ser encontrados os métodos de avaliação de clusterização:
https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

https://journals.sagepub.com/doi/10.1177/2399808319875752

https://pysal.org/esda/notebooks/geosilhouettes.html

https://www.scikit-yb.org/en/latest/index.html


In [None]:
# Feature matrix for the silhouette metrics: the 17 principal components
silhouette_columns = [f"PCA_{k}" for k in range(1, 18)]
x_val = clusters_lp[silhouette_columns]

In [None]:
clusters_lp.head()

## 5.1 - Metrics - Ward Linkage


In [None]:
score_ward = silhouette_score(x_val, clusters_lp.Zona_Ward, metric="manhattan")

In [None]:
silhouettes_ward = silhouette_samples(x_val, clusters_lp.Zona_Ward)

## 5.2 - Metrics - Ward Linkage + Queen Contiguity


In [None]:
score_ward_queen = silhouette_score(
    x_val, clusters_lp.Zona_Ward_Queen, metric="manhattan"
)

In [None]:
silhouettes_ward_queen = silhouette_samples(x_val, clusters_lp.Zona_Ward_Queen)

## 5.3 - Metrics - Max-P Regionalization


In [None]:
score_maxp = silhouette_score(x_val, clusters_lp.Zona_Maxp, metric="manhattan")

In [None]:
silhouettes_maxp = silhouette_samples(x_val, clusters_lp.Zona_Maxp)

## 5.4 - Metrics - SKATER (Spatial ’K’luster Analysis by Tree Edge Removal)


In [None]:
score_skater = silhouette_score(x_val, clusters_lp.Zona_SKATER, metric="manhattan")

In [None]:
silhouettes_skater = silhouette_samples(x_val, clusters_lp.Zona_SKATER)

## 5.5 - Metrics - Clustering - Comparison


### 5.5.1 - Comparison - Silhouette Score (average) - Sklearn


In [None]:
score = [score_ward, score_ward_queen, score_maxp, score_skater]
method = ["Ward", "Ward Queen", "Max-P", "SKATER"]

In [None]:
# Plot dos histogramas das silhuetas para cada clusterização (média), para cada método de clusterização

plt.bar(method, score)
plt.xlabel("Método de Clusterização", fontsize=12)
plt.ylabel("Coeficiente", fontsize=12)
plt.title("Coeficiente da Silhueta (média)", fontsize=16)
plt.grid(False)
plt.show()

### 5.5.2 - Comparison - Silhouette Score (Samples) - Sklearn


In [None]:
# One row per clustering method: a histogram of the per-sample silhouette
# coefficients on the left and a map coloured by the same values on the right

f, ax = plt.subplots(4, 2, figsize=(8, 12))
silhouette_panels = (
    ("Ward - Coeficiente de Silhueta", silhouettes_ward),
    ("Ward + Queen - Coeficiente de Silhueta", silhouettes_ward_queen),
    ("Max-P - Coeficiente de Silhueta", silhouettes_maxp),
    ("SKATER - Coeficiente de Silhueta", silhouettes_skater),
)
for row, (panel_title, values) in enumerate(silhouette_panels):
    hist_ax = ax[row, 0]
    map_ax = ax[row, 1]

    hist_ax.hist(values)
    hist_ax.set_title(panel_title)
    hist_ax.grid(False)

    # every map shares the same colour scale
    clusters_lp.plot(
        values, ax=map_ax, cmap="viridis", vmin=-0.5, vmax=0.5, legend=True
    )
    map_ax.set_title(panel_title)
    map_ax.axes.get_xaxis().set_visible(False)
    map_ax.axes.get_yaxis().set_visible(False)
f.tight_layout()
plt.show()