# Análise de Cluster

O conjunto de dados usado neste experimento é a base do PNAE 2018, reestruturada por cidade.

## Importando pacotes

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import iqr

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

## Carregando dados

In [3]:
# Read the source CSV (path is relative to the notebook's working directory).
base = pd.read_csv(filepath_or_buffer='PNAE_RECURSOS_REPASSADOS_2018_FINAL.csv')

In [4]:
# Show column names, non-null counts and dtypes of the loaded frame.
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5513 entries, 0 to 5512
Data columns (total 13 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   CIDADE                                            5513 non-null   object 
 1   ATENDIMENTO EDUCACIONAL ESPECIALIZADO (AEE)       5513 non-null   float64
 2   CRECHE                                            5513 non-null   float64
 3   EJA                                               5513 non-null   float64
 4   ENSINO FUNDAMENTAL                                5513 non-null   float64
 5   ENSINO Mï¿½DIO                                    5513 non-null   float64
 6   ENSINO Mï¿½DIO INTEGRAL                           5513 non-null   float64
 7   INDï¿½GENA                                        5513 non-null   float64
 8   MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - FUNDAMENTAL  5513 non-null   float64
 9   MAIS EDUCAï¿½ï¿½O (

## Explorando dados

In [5]:
# Summary statistics, transposed so each column of `base` becomes one row.
base.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ATENDIMENTO EDUCACIONAL ESPECIALIZADO (AEE),5513.0,5460.85,28188.88,0.0,0.0,1060.0,3710.0,1102930.0
CRECHE,5513.0,91768.77,415035.6,0.0,14552.0,31393.8,72332.0,24561636.0
EJA,5513.0,24977.67,193400.89,0.0,0.0,3584.0,14784.0,6104498.0
ENSINO FUNDAMENTAL,5513.0,304055.66,1505341.77,0.0,36626.0,93480.0,257020.0,66987801.7
ENSINO Mï¿½DIO,5513.0,63772.59,1019313.88,0.0,0.0,0.0,0.0,42052995.8
ENSINO Mï¿½DIO INTEGRAL,5513.0,10787.47,195108.27,0.0,0.0,0.0,0.0,7917920.0
INDï¿½GENA,5513.0,5446.85,61433.02,0.0,0.0,0.0,0.0,1995436.0
MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - FUNDAMENTAL,5513.0,14998.17,52939.57,0.0,0.0,0.0,1022.4,1549560.8
MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - INDï¿½GENA,5513.0,53.9,1435.82,0.0,0.0,0.0,0.0,75060.8
MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - QUILOMBOLA,5513.0,207.77,2257.96,0.0,0.0,0.0,0.0,96457.6


In [6]:
# New column holding the first two characters of CIDADE
# (presumably the state abbreviation -- confirm against the CSV contents).
base['ESTADO'] = base['CIDADE'].str[:2]

In [7]:
# Interquartile range and outlier limits
def ver_outliers(df, feature):
    """Print IQR-based limits for ``feature`` and flag the rows outside them.

    The limits are ``mean +/- 1.5 * IQR``. Note that this is centred on the
    mean, not on the quartiles (the usual Tukey fences are ``Q1 - 1.5 * IQR``
    and ``Q3 + 1.5 * IQR``); the original formula is kept so existing results
    do not change.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature``. It is modified in place: a new column
        named ``feature + '_OUT'`` is added (1 = outside the limits, 0 = inside).
    feature : str
        Name of the numeric column to inspect.

    Returns
    -------
    None
        The IQR, mean, both limits and the count of flagged / unflagged rows
        are printed.
    """
    valores = df[feature]
    valor_iqr = iqr(valores)
    media = valores.mean()

    l_sup = media + 1.5*valor_iqr
    l_inf = media - 1.5*valor_iqr

    print('Intervalo interquartil:', valor_iqr)
    print('Média', media)
    print('Limite Superior:', l_sup)
    print('Limite inferior:', l_inf)

    # Vectorised flag; comparisons with NaN are False, so missing values get 0.
    coluna_out = feature+'_OUT'
    df[coluna_out] = ((valores < l_inf) | (valores > l_sup)).astype(int)
    print()
    # Count on the frame that was passed in, not on a notebook-level global.
    print(df.groupby(coluna_out)[coluna_out].count())

In [8]:
# Flag CRECHE outliers; this also adds a CRECHE_OUT column to `base`.
ver_outliers(base, 'CRECHE')

Intervalo interquartil: 57780.0
Média 91768.77049156553
Limite Superior: 178438.77049156552
Limite inferior: 5098.770491565534

CRECHE_OUT
0    4499
1    1014
Name: CRECHE_OUT, dtype: int64


In [33]:
# Write the per-CIDADE total of the pre-school column to CSV.
# The column is selected before aggregating so that the text columns
# (and every other unused column) are never summed.
base.groupby('CIDADE')[['PRï¿½-ESCOLA']].sum().to_csv('PNAE_RECURSOS_REPASSADOS_2018_PRE.csv')

In [None]:
# Names of the 12 value columns of `base`.
# NOTE(review): `campos` is rebound to a shorter list in a later cell before
# it is first read, so this full list is currently unused.
campos = ['ATENDIMENTO EDUCACIONAL ESPECIALIZADO (AEE)',
'CRECHE',
'EJA',
'ENSINO FUNDAMENTAL',
'ENSINO Mï¿½DIO',
'ENSINO Mï¿½DIO INTEGRAL',
'INDï¿½GENA',
'MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - FUNDAMENTAL',
'MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - INDï¿½GENA',
'MAIS EDUCAï¿½ï¿½O (PARC. COMPLEM.) - QUILOMBOLA',
'PRï¿½-ESCOLA',
'QUILOMBOLA']

In [None]:
# Mean of every numeric column per ESTADO.
# numeric_only=True is required because `base` still holds the text column
# CIDADE, and pandas >= 2.0 raises instead of silently dropping it.
base.groupby('ESTADO').mean(numeric_only=True)

In [None]:
# Correlation heatmap of the numeric columns of `base`.
sns.set_theme(style="white")
# Restrict to numeric columns explicitly: `base` also holds text columns
# (CIDADE, ESTADO), and pandas >= 2.0 raises on them instead of dropping them.
corr = base.select_dtypes(include='number').corr()
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
# save heatmap as .png file
# dpi - sets the resolution of the saved image in dots/inches
# bbox_inches - when set to 'tight' - does not allow the labels to be cropped
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')

In [None]:
# Columns used from here on: one numeric feature plus the ESTADO label.
# NOTE(review): this rebinds `campos`, discarding the 12-column list defined
# in an earlier cell; ESTADO is text and cannot be fed to a scaler or KMeans.
campos = ['CRECHE', 'ESTADO']

In [None]:
# Working subset of `base`, then one CRECHE box plot per ESTADO.
X = base.loc[:, campos]
X.boxplot(by='ESTADO')

In [None]:
# Scatter the first column of X against the second.
# `.iloc[:, k]` selects column k; the previous `.iloc[k]` selected row k,
# which plotted two single records against each other.
plt.scatter(X.iloc[:, 0], X.iloc[:, 1])
plt.xlabel(X.columns[0])
plt.ylabel(X.columns[1])
plt.grid()
plt.tight_layout()
plt.savefig('scatter.png', dpi=300)
plt.show()

In [None]:
# k-means model with 5 clusters; random_state fixes the initialisation
# so repeated runs give the same result.
km = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=21,
)

In [None]:
# Feature scaler used before clustering. StandardScaler (imported at the top
# of the notebook) is the alternative that was tried here.
scaler = MinMaxScaler()

In [None]:
# Scale only the numeric columns of X: X also carries the text column ESTADO,
# which the scaler cannot convert to float.
X_std = scaler.fit_transform(X.select_dtypes(include='number'))
X_std

In [None]:
# Fit the model on the scaled features and keep one cluster label per row.
y_km = km.fit_predict(X_std)

In [None]:
# Inertia of the fitted model, shown with two decimals.
print(f'Inércia: {km.inertia_:.2f}')

In [None]:
# Elbow plot: inertia for k = 1..15.
# The models are fitted on X_std, the same scaled numeric features used for
# the other KMeans fits; raw X contains the text column ESTADO and cannot be
# clustered. A separate name (km_i) is used so the model held in `km` is not
# overwritten by the last iteration.
inercia = []
for i in range(1, 16):
    km_i = KMeans(n_clusters=i,
                  init='k-means++',
                  n_init=10,
                  max_iter=300,
                  random_state=21)
    km_i.fit(X_std)
    inercia.append(km_i.inertia_)
plt.plot(range(1, 16), inercia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inércia')
plt.tight_layout()
plt.show()

In [None]:
# Silhouette plot for a 7-cluster k-means solution.
# Both the fit and the silhouette scores use X_std (scaled numeric features):
# raw X contains the text column ESTADO and would make both calls fail.
# `np`, `cm` and `silhouette_samples` come from the import cell at the top.
km = KMeans(n_clusters=7,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=21)
y_km = km.fit_predict(X_std)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(X_std, y_km, metric='euclidean')

# Stack one horizontal band of bars per cluster, sorted by silhouette value.
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = np.sort(silhouette_vals[y_km == c])
    y_ax_upper = y_ax_lower + len(c_silhouette_vals)
    color = cm.jet(float(i) / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
             edgecolor='none', color=color)

    # Tick in the middle of the band, then move the baseline to its top.
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower = y_ax_upper

# Dashed line marks the mean silhouette over all samples.
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

# Cluster labels are shown 1-based on the axis.
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

plt.tight_layout()
plt.savefig('11_04.png', dpi=300)
plt.show()