# Principal Component Analysis (PCA)

## Importing packages

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from os.path import join
import numpy as np


In [2]:
from fancyimpute import Solver

In [3]:
pd.options.display.max_columns = None

sns.set_theme(style='darkgrid')
sns.set_palette("twilight_shifted")

## Reading Database

In [4]:
numerical_columns = ['IN001', 'IN009', 'IN020', 'IN023', 'IN037', 'IN038', 'IN055', 'IN057']
fmt_func = lambda x: x.str.replace(",", ".").astype(float)
columns_transformations = {c: fmt_func for c in numerical_columns}
columns_transformations

{'IN001': <function __main__.<lambda>(x)>,
 'IN009': <function __main__.<lambda>(x)>,
 'IN020': <function __main__.<lambda>(x)>,
 'IN023': <function __main__.<lambda>(x)>,
 'IN037': <function __main__.<lambda>(x)>,
 'IN038': <function __main__.<lambda>(x)>,
 'IN055': <function __main__.<lambda>(x)>,
 'IN057': <function __main__.<lambda>(x)>}

In [5]:
def get_consumption_bands(in022: float):
    if np.isnan(in022):
        return np.nan
    elif in022 <= 50:
        return "Critic low"
    elif in022 > 50 and in022 <= 100:
        return "Low"
    elif in022 > 100 and in022 <= 150:
        return "Moderate"
    elif in022 > 150 and in022 <= 200:
        return "Moderate to High"
    else:
        return "Very High"


def preprocess_water_consumption_data(data: pd.DataFrame) -> pd.DataFrame:

    columns_new_names = {
        'Município': 'nm_municipio',
        # 'Ano de Referência': 'ano',
        # 'Sigla do Prestador': 'sg_prestador',
        'Abrangência': 'abrangencia',
        'Tipo de Serviço': 'tipo_servico',
        'Natureza Jurídica': 'nat_juridica',
        'IN001 - Densidade de economias de água por ligação': 'IN001',
        'IN009 - Índice de hidrometração': 'IN009',
        'IN020 - Extensão da rede de água por ligação': 'IN020',
        'IN022 - Consumo médio percapita de água': 'IN022',
        'IN023 - Índice de atendimento urbano de água': 'IN023',
        'IN037 - Participação da despesa com energia elétrica nas despesas de exploração': 'IN037',
        'IN038 - Participação da despesa com produtos químicos nas despesas de exploração (DEX)': 'IN038',
        'IN055 - Índice de atendimento total de água': 'IN055',
        'IN057 - Índice de fluoretação de água': 'IN057'
    }

    columns_transformations = {
        "IN001": lambda x: x.str.replace(",", ".").astype(float), 
        "IN009": lambda x: x.str.replace(",", ".").astype(float), 
        "IN020": lambda x: x.str.replace(",", ".").astype(float), 
        "IN022": lambda x: x.str.replace(",", ".").astype(float), 
        "IN023": lambda x: x.str.replace(",", ".").astype(float), 
        "IN037": lambda x: x.str.replace(",", ".").astype(float), 
        "IN038": lambda x: x.str.replace(",", ".").astype(float), 
        "IN055": lambda x: x.str.replace(",", ".").astype(float), 
        "IN057": lambda x: x.str.replace(",", ".").astype(float)
        }
    
    selected_cols = columns_new_names.keys()
    numerical_columns = ['IN001', 'IN009', 'IN020', 'IN023', 'IN037', 'IN038', 'IN055', 'IN057']
    categorical_columns = ['tipo_servico']
    target_col = 'consumption_bands'

    fmt_func = lambda x: x.str.replace(",", ".").astype(float)

    df = (data[selected_cols]
        .rename(columns=columns_new_names))

    df["tipo_servico"] = df["tipo_servico"].str.rstrip()
    df[list(columns_transformations.keys())] = df.agg(columns_transformations)

    df[target_col] = df["IN022"].apply(get_consumption_bands)

    df = (df[df["consumption_bands"].notna()]
        .drop(columns=["IN022"]))
            

    return df

In [6]:
data = pd.read_csv("../../../data/external/Agregado-20241114133303.csv", 
                   encoding="latin-1",
                   sep=";")
df = preprocess_water_consumption_data(data)
print(df.shape)
df.head()

(511, 13)


Unnamed: 0,nm_municipio,abrangencia,tipo_servico,nat_juridica,IN001,IN009,IN020,IN023,IN037,IN038,IN055,IN057,consumption_bands
0,Acopiara,Microrregional,Água,Organização social,1.0,100.0,37.69,,54.21,18.39,20.76,0.0,Critic low
1,Aiuaba,Local,Água,Administração pública direta,1.0,90.95,33.41,,38.6,2.63,65.39,0.0,Low
2,Aiuaba,Local,Água,Administração pública direta,1.0,45.72,30.18,100.0,40.04,2.69,59.03,0.0,Low
3,Aiuaba,Local,Água,Administração pública direta,1.0,0.0,12.77,100.0,19.23,9.66,58.31,0.0,Low
4,Aiuaba,Local,Água,Administração pública direta,1.0,0.0,10.65,98.26,25.45,1.49,57.78,0.0,Moderate to High


In [7]:
df.dtypes

nm_municipio          object
abrangencia           object
tipo_servico          object
nat_juridica          object
IN001                float64
IN009                float64
IN020                float64
IN023                float64
IN037                float64
IN038                float64
IN055                float64
IN057                float64
consumption_bands     object
dtype: object

# 5. Perform an unconditional multi-variate analysis of the predictors.

Perform an unconditional multi-variate analysis of the predictors. Specifically, you must perform a principal components analysis of the predictors, for the sake of visualisation, retain only the first two principal components (those associated with the two largest eigenvalues) and plot the scatter plot of the projected observations. Again, for each projected point (observation) you must use colours or symbols to indicate the associated class label. [Remember to perform the necessary preprocessing of the data]

In [10]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

In [11]:
X = (df[numerical_columns + ["consumption_bands"]].dropna()
     )

labels = X["consumption_bands"]

X.drop(columns=["consumption_bands"], inplace=True)

In [12]:
ohe = make_column_transformer(
  (StandardScaler(), numerical_columns),
  # (OneHotEncoder(handle_unknown = 'ignore', sparse_output=False, dtype=float), categorical_columns),
  remainder='passthrough')

ohe.fit(X)
X_transformed = ohe.transform(X)

In [13]:
dummies_columns = [x.split("__")[1] for x in ohe.get_feature_names_out()]
dummies = pd.DataFrame(X_transformed, columns=dummies_columns)
X = dummies.astype({c: float for c in dummies_columns})

print(X.shape)
X.head()

(422, 8)


Unnamed: 0,IN001,IN009,IN020,IN023,IN037,IN038,IN055,IN057
0,-0.563576,-1.241541,2.047297,0.418907,1.378935,-0.715113,-0.699146,-0.966671
1,-0.563576,-2.866769,0.148696,0.418907,-0.213527,0.542527,-0.733658,-0.966671
2,-0.563576,-2.866769,-0.082495,0.283465,0.262451,-0.931636,-0.759062,-0.966671
3,-0.191678,-2.866769,-0.058503,0.326277,-1.685081,-0.875701,-0.743724,-0.966671
4,-0.005728,-2.866769,-0.041055,0.060843,-1.685081,-1.200486,-2.366273,-0.966671


## Custom PCA

In [26]:
_X = X.values

cov_x = np.cov(_X.T)


eig_vals, eig_vecs = np.linalg.eig(cov_x)

Na prática, o algoritmo do PCA sempre vai convergir para a mesma solução até os sinais dos vetores de *loadings* e *scores*. Como $VAR(Z) = VAR(-Z)$, desde que ambos os valores $z_{im}$ e $\phi_{jm}$ estejam com sinais trocados o produto final das duas quantidades permanece inalterado. lembrando que $x_{ij} \approx \sum_{m=1}^{M}  z_{im} \phi_{jm}$ (ILS, Pág. 514).

Uma forma de padronizar a saída do PCA é ajustando os sinais dos vetores de *loadings* para serem positivos na direção do maior valor em módulo. Como exemplificado abaixo. 

In [None]:
## Isso é opcional
# Índice do maior valor absoluto da coluna
max_abs_idx = np.argmax(np.abs(eig_vecs), axis=0)

# Sinal dos maiores valores de cada coluna
signs = np.sign(eig_vecs[max_abs_idx, range(eig_vecs.shape[0])])

# Ajustando os autovetores (loadings) para serem positivos na direção do maior valor
eig_vecs = eig_vecs * signs#[np.newaxis, :]
eig_vecs = eig_vecs.T

print('Eigenvalues \n', eig_vals)
print('Eigenvectors \n', eig_vecs)



Eigenvalues 
 [2.04402062 1.37188609 1.17171433 1.0093241  0.39631954 0.75490843
 0.68294189 0.58788737]
Eigenvectors 
 [[ 0.15807505  0.51125571 -0.03734526  0.30873008 -0.3021006   0.41950534
   0.30240383  0.50815684]
 [ 0.15291694 -0.15852641  0.08728502  0.45684927  0.46790961 -0.33019268
   0.63800346  0.01186726]
 [-0.4622682   0.21196123  0.73855033  0.09457468  0.29100911  0.29517131
  -0.06400368 -0.10521744]
 [ 0.72659109  0.14851757  0.46647706 -0.46864054  0.04822698 -0.03307759
   0.08294779 -0.04982782]
 [-0.23448784  0.35764182 -0.2969182  -0.40485446  0.03285073  0.1571145
   0.5420245  -0.49546497]
 [-0.34068433  0.0142399   0.00558033 -0.52277361  0.18829605 -0.3007262
   0.15240089  0.67918358]
 [ 0.19642526  0.18121046 -0.37279505 -0.02114954  0.75238412  0.34095222
  -0.31299725  0.09412161]
 [-0.01920784  0.69681724 -0.02759526  0.17250937  0.01293237 -0.62880697
  -0.27485056 -0.11156845]]


In [87]:
eig_vecs[:, sorted_index[::-1]].T

array([[ 0.15807505,  0.15291694, -0.4622682 ,  0.72659109, -0.23448784,
        -0.34068433,  0.19642526, -0.01920784],
       [ 0.51125571, -0.15852641,  0.21196123,  0.14851757,  0.35764182,
         0.0142399 ,  0.18121046,  0.69681724],
       [-0.03734526,  0.08728502,  0.73855033,  0.46647706, -0.2969182 ,
         0.00558033, -0.37279505, -0.02759526],
       [ 0.30873008,  0.45684927,  0.09457468, -0.46864054, -0.40485446,
        -0.52277361, -0.02114954,  0.17250937],
       [ 0.41950534, -0.33019268,  0.29517131, -0.03307759,  0.1571145 ,
        -0.3007262 ,  0.34095222, -0.62880697],
       [ 0.30240383,  0.63800346, -0.06400368,  0.08294779,  0.5420245 ,
         0.15240089, -0.31299725, -0.27485056],
       [ 0.50815684,  0.01186726, -0.10521744, -0.04982782, -0.49546497,
         0.67918358,  0.09412161, -0.11156845],
       [-0.3021006 ,  0.46790961,  0.29100911,  0.04822698,  0.03285073,
         0.18829605,  0.75238412,  0.01293237]])

In [83]:
sorted_index = np.argsort(eig_vals)
eig_vals[sorted_index[::-1]]

array([2.04402062, 1.37188609, 1.17171433, 1.0093241 , 0.75490843,
       0.68294189, 0.58788737, 0.39631954])

In [72]:
# Agrupando os pares de autovalores e autovetores
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[i,:]) for i in range(len(eig_vals))]

# Ordenando os dados em ordem de Magnitude
eig_pairs.sort(key=lambda x: x[0], reverse=True)

# For further usage
eig_vals_sorted = np.array([x[0] for x in eig_pairs])
eig_vecs_sorted = np.array([x[1] for x in eig_pairs])


In [82]:
eig_vals_sorted

array([2.04402062, 1.37188609, 1.17171433, 1.0093241 , 0.75490843,
       0.68294189, 0.58788737, 0.39631954])

In [77]:
eig_vecs_sorted.T

array([[ 0.15807505,  0.15291694, -0.4622682 ,  0.72659109, -0.34068433,
         0.19642526, -0.01920784, -0.23448784],
       [ 0.51125571, -0.15852641,  0.21196123,  0.14851757,  0.0142399 ,
         0.18121046,  0.69681724,  0.35764182],
       [-0.03734526,  0.08728502,  0.73855033,  0.46647706,  0.00558033,
        -0.37279505, -0.02759526, -0.2969182 ],
       [ 0.30873008,  0.45684927,  0.09457468, -0.46864054, -0.52277361,
        -0.02114954,  0.17250937, -0.40485446],
       [-0.3021006 ,  0.46790961,  0.29100911,  0.04822698,  0.18829605,
         0.75238412,  0.01293237,  0.03285073],
       [ 0.41950534, -0.33019268,  0.29517131, -0.03307759, -0.3007262 ,
         0.34095222, -0.62880697,  0.1571145 ],
       [ 0.30240383,  0.63800346, -0.06400368,  0.08294779,  0.15240089,
        -0.31299725, -0.27485056,  0.5420245 ],
       [ 0.50815684,  0.01186726, -0.10521744, -0.04982782,  0.67918358,
         0.09412161, -0.11156845, -0.49546497]])

In [64]:
eig_vecs[max_abs_idx, range(eig_vecs.shape[0])]

array([ 0.51125571,  0.63800346, -0.73855033,  0.72659109,  0.5420245 ,
        0.67918358, -0.75238412,  0.69681724])

## Using Scikit-learn

In [None]:
pca = PCA(n_components=X.shape[1])
pca_features  = pca.fit_transform(X)


In [None]:
# Principal components correlation coefficients
loadings = pca.components_
 
# Number of features before PCA
n_features = pca.n_features_in_
 
# Feature names before PCA
feature_names = pca.feature_names_in_

# PC names
pc_list = [f'PC{i}' for i in list(range(1, n_features + 1))]
 
# Match PC names to loadings
pc_loadings = dict(zip(pc_list, loadings))
 
# Matrix of corr coefs between feature names and PCs
loadings_df = pd.DataFrame.from_dict(pc_loadings)
loadings_df['feature_names'] = feature_names
loadings_df = loadings_df.set_index('feature_names')
loadings_df.head(10)

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8
feature_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IN001,-0.158075,-0.152917,-0.462268,0.726591,-0.340684,0.196425,0.019208,-0.234488
IN009,-0.511256,0.158526,0.211961,0.148518,0.01424,0.18121,-0.696817,0.357642
IN020,0.037345,-0.087285,0.73855,0.466477,0.00558,-0.372795,0.027595,-0.296918
IN023,-0.30873,-0.456849,0.094575,-0.468641,-0.522774,-0.02115,-0.172509,-0.404854
IN037,0.302101,-0.46791,0.291009,0.048227,0.188296,0.752384,-0.012932,0.032851
IN038,-0.419505,0.330193,0.295171,-0.033078,-0.300726,0.340952,0.628807,0.157115
IN055,-0.302404,-0.638003,-0.064004,0.082948,0.152401,-0.312997,0.274851,0.542025
IN057,-0.508157,-0.011867,-0.105217,-0.049828,0.679184,0.094122,0.111568,-0.495465
