In [8]:
# bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Load the São Paulo district indicators and preview the first rows
df = pd.read_csv(filepath_or_buffer='../data/distritos_sp.csv')
df.head(n=5)

Unnamed: 0,cod_ibge,distritos,renda,quota,escolaridade,idade,mortalidade,txcresc,causasext,favel,denspop
0,1,Água Rasa,1961,34.619999,7.6,32,13.86,-1.84,52.98,0.0,125.610001
1,12,Alto de Pinheiros,4180,75.959999,8.4,33,8.68,-2.52,38.57,0.69,57.560001
2,23,Anhanguera,1093,4.5,5.8,23,15.36,18.120001,22.68,0.0,8.57
3,34,Aricanduva,1311,21.02,6.8,27,18.43,-1.07,76.220001,5.38,138.539993
4,45,Artur Alvim,1248,15.91,7.0,27,19.73,-1.4,67.25,4.11,167.399994


In [3]:
# Data preparation: remove the 'cod_ibge' and 'distritos' columns so that only
# the indicator columns remain for scaling and PCA.
# `errors='ignore'` keeps this cell idempotent: because it rebinds `df`,
# re-running it after the columns are already gone would otherwise raise KeyError.
df = df.drop(columns=['cod_ibge', 'distritos'], errors='ignore')
df.head()

Unnamed: 0,renda,quota,escolaridade,idade,mortalidade,txcresc,causasext,favel,denspop
0,1961,34.619999,7.6,32,13.86,-1.84,52.98,0.0,125.610001
1,4180,75.959999,8.4,33,8.68,-2.52,38.57,0.69,57.560001
2,1093,4.5,5.8,23,15.36,18.120001,22.68,0.0,8.57
3,1311,21.02,6.8,27,18.43,-1.07,76.220001,5.38,138.539993
4,1248,15.91,7.0,27,19.73,-1.4,67.25,4.11,167.399994


In [4]:
# Standardize every indicator with StandardScaler so the variables are on a
# comparable scale before running the PCA.
scaler = StandardScaler()
# Build the DataFrame in a single step (no intermediate ndarray bound to
# `df_norm`) and carry over both the column names and the row index, so the
# scaled rows stay aligned with `df`.
df_norm = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
df_norm.head()

Unnamed: 0,renda,quota,escolaridade,idade,mortalidade,txcresc,causasext,favel,denspop
0,0.108502,0.180715,0.523096,1.055413,-0.429282,-0.538806,-0.780681,-0.756406,0.521805
1,2.406305,1.984386,1.320195,1.284643,-1.471886,-0.733574,-1.401978,-0.668425,-0.857184
2,-0.790323,-1.133425,-1.270377,-1.007657,-0.127369,5.178222,-2.087086,-0.756406,-1.849934
3,-0.564581,-0.412655,-0.274003,-0.090737,0.490545,-0.318259,0.221327,-0.070409,0.783822
4,-0.629818,-0.635605,-0.074728,-0.090737,0.752202,-0.412779,-0.16542,-0.232345,1.368651


In [None]:
# Choosing the number of components: at most one per variable (9 here)
len(df_norm.columns)

9

In [6]:
# Fit a PCA that keeps one component per variable. The count is read from the
# data instead of being hard-coded, so it stays correct if the set of
# indicator columns changes.
pca = PCA(n_components=df_norm.shape[1])
pca.fit(df_norm)

In [7]:
# Share of the total variance explained by each principal component
variancia = pca.explained_variance_ratio_
variancia

array([0.55067008, 0.1293709 , 0.11070601, 0.08045185, 0.05881136,
       0.04425824, 0.01440763, 0.00722961, 0.00409432])

In [11]:
# Give each component a label (F1, F2, ...). The count comes from the fitted
# PCA itself, so the labels always match the length of the variance arrays
# they are plotted against.
fatores = [f'F{i}' for i in range(1, pca.n_components_ + 1)]
fatores

['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']

In [12]:
# Bar chart of the variance share explained by each factor
fig = px.bar(
    x=fatores,
    y=variancia,
    text=np.round(variancia, 2),
    title='Variância dos Fatores',
)
fig.update_layout(
    yaxis=dict(title='Porcentagem da variância explicada', tickfont=dict(size=15)),
    xaxis=dict(title='Fatores', tickfont=dict(size=15)),
    title=dict(font=dict(size=25)),
)
fig.show()

In [13]:
# Running total of the explained-variance shares, factor by factor
variancia_acumulada = variancia.cumsum()
variancia_acumulada

array([0.55067008, 0.68004098, 0.790747  , 0.87119884, 0.9300102 ,
       0.97426844, 0.98867608, 0.99590568, 1.        ])

*O 1º fator explica 55% da variância; o 1º e o 2º juntos explicam 68%; os três primeiros explicam 79%, e assim por diante.*

In [15]:
# Bar chart of the cumulative explained variance across factors
fig = px.bar(
    x=fatores,
    y=variancia_acumulada,
    text=np.round(variancia_acumulada, 3),
    title='Variância Acumulada',
)
fig.update_layout(
    yaxis=dict(title='Porcentagem da variância acumulada', tickfont=dict(size=15)),
    xaxis=dict(title='Fatores', tickfont=dict(size=15)),
    title=dict(font=dict(size=25)),
)
fig.show()

In [18]:
# Eigenvalues: scale each factor's variance share by the number of variables,
# so a value of 1 means the factor carries as much variance as one variable.
autovalores = np.multiply(pca.explained_variance_ratio_, df_norm.shape[1])
autovalores

array([4.95603069, 1.16433814, 0.99635412, 0.72406663, 0.52930223,
       0.39832415, 0.12966871, 0.06506649, 0.03684884])

In [19]:
# Flag the factors whose eigenvalue is greater than 1
np.greater(autovalores, 1)

array([ True,  True, False, False, False, False, False, False, False])

In [21]:
# Bar chart of the eigenvalue of each factor, coloured by whether the
# eigenvalue is greater than 1.
fig = px.bar(
    x=fatores,
    y=autovalores,
    text=np.around(autovalores, decimals=2),
    title='Autovalores',
    color=autovalores > 1
)
fig.update_layout(
    yaxis={
        'title':'Quantidade de variáveis explicadas',
        'tickfont': {'size':15}
    },
    # The x-axis holds the factor labels (F1..F9), not eigenvalues, so it is
    # titled 'Fatores' like the other charts in this notebook.
    xaxis={
        'title':'Fatores',
        'tickfont': {'size':15}
    },
    title={'font':{'size':25}}
)
fig.show()

In [22]:
# Summary table: one row per factor with its eigenvalue, explained variance
# share and cumulative variance share.
resumo = pd.DataFrame(
    dict(
        zip(
            ['Fator', 'Autovalor', 'Variancia explicada', 'Variancia acumulada'],
            [fatores, autovalores, variancia, variancia_acumulada],
        )
    )
)
resumo

Unnamed: 0,Fator,Autovalor,Variancia explicada,Variancia acumulada
0,F1,4.956031,0.55067,0.55067
1,F2,1.164338,0.129371,0.680041
2,F3,0.996354,0.110706,0.790747
3,F4,0.724067,0.080452,0.871199
4,F5,0.529302,0.058811,0.93001
5,F6,0.398324,0.044258,0.974268
6,F7,0.129669,0.014408,0.988676
7,F8,0.065066,0.00723,0.995906
8,F9,0.036849,0.004094,1.0
