### 4.ISOLATION_FOREST

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [2]:
# Load the prepared base; the file is semicolon-delimited.
df = pd.read_csv(
    "base_pronta.csv",
    sep=";",
)

In [3]:
df.columns.tolist()

['CO_UF',
 'UF',
 'SIG_UF',
 'CO_MUNICIPIO',
 'MUNICIPIO',
 'CO_MESORREGIAO',
 'CO_MICRORREGIAO',
 'CO_ENTIDADE',
 'NOME_ENTIDADE',
 'POPULACAO_ESTIMADA',
 'N_MATRICULA_MUN_19',
 'N_MATRICULA_ENT_19',
 'N_MATRICULA_INTEGRAL_ENT_19',
 'N_MATRICULA_MUN_20',
 'N_MATRICULA_ENT_20',
 'N_MATRICULA_INTEGRAL_ENT_20',
 'SCORE_MAT_MUN_20_19',
 'SCORE_MAT_ENT_20_19',
 'SCORE_MAT_INTEGRAL_ENT_20_19']

In [4]:
# Columns fed to the Isolation Forest: the population estimate plus the
# three SCORE_MAT_* columns.
df_var_numericas = df.loc[:, [
    'POPULACAO_ESTIMADA',
    'SCORE_MAT_MUN_20_19',
    'SCORE_MAT_ENT_20_19',
    'SCORE_MAT_INTEGRAL_ENT_20_19',
]]

In [5]:
def check_missing_att(df):
    """Report the columns of ``df`` that contain missing values.

    For every column holding at least one null (NaN) entry, prints one line
    of the form ``<column> - faltantes: <count>``. Columns without missing
    values produce no output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
    """
    for coluna in df:
        # Number of null entries in this column.
        qtd_nulos = df[coluna].isnull().sum()
        if qtd_nulos <= 0:
            continue
        print(coluna, "- faltantes: " + str(qtd_nulos))

In [6]:
check_missing_att(df_var_numericas)

In [7]:
df_var_numericas.head(3)

Unnamed: 0,POPULACAO_ESTIMADA,SCORE_MAT_MUN_20_19,SCORE_MAT_ENT_20_19,SCORE_MAT_INTEGRAL_ENT_20_19
0,1516113,0.989204,0.791139,0.0
1,235647,1.045727,1.188869,0.0
2,168468,0.966079,0.87602,0.0


In [8]:
# Replace every missing value with 0. fillna returns a new frame, so the
# name is rebound rather than the frame being modified in place.
df_var_numericas = df_var_numericas.fillna(value=0)

In [9]:
check_missing_att(df_var_numericas)

In [10]:
df_var_numericas.head()

Unnamed: 0,POPULACAO_ESTIMADA,SCORE_MAT_MUN_20_19,SCORE_MAT_ENT_20_19,SCORE_MAT_INTEGRAL_ENT_20_19
0,1516113,0.989204,0.791139,0.0
1,235647,1.045727,1.188869,0.0
2,168468,0.966079,0.87602,0.0
3,21993,0.860054,0.913677,0.0
4,42900,0.939561,0.720461,0.068182


#### Padronizando os dados (StandardScaler: média 0 e desvio-padrão 1 por coluna):

In [11]:
# Standardize the selected features: fit the scaler on the frame, then
# transform that same frame. The result is a NumPy array, not a DataFrame.
base_normal = StandardScaler().fit(df_var_numericas).transform(df_var_numericas)

In [12]:
base_normal

array([[ 2.13308437,  0.19069779, -0.72313812, -0.09796226],
       [-0.20935784,  1.17727102,  0.61626152, -0.09796226],
       [-0.33225289, -0.21293223, -0.43729378, -0.09796226],
       ...,
       [-0.25181196, -0.07408363, -1.17185438, -0.09796226],
       [-0.63482287, -0.0572268 , -0.90726973, -0.09796226],
       [-0.61115635,  0.90618571,  0.18170591, -0.09796226]])

In [13]:
base_normal.shape

(4486, 4)

#### Ajustando a Isolation Forest

In [14]:
# Isolation Forest hyperparameters.
n_estimators = 100  # scikit-learn default number of trees
# Each tree is trained on every row of the standardized base. The count is
# derived from the data (4486 rows for the current base) instead of being
# hard-coded, so it keeps meaning "all rows" if the input file changes.
max_samples = base_normal.shape[0]
# Fraction of rows the model labels as outliers; 0.028 flags 126 rows on the
# current base (other values tried are listed in the table further down).
contamination = 0.028

In [15]:
# Build the model from the hyperparameters defined above; random_state is
# fixed so repeated runs give the same labels.
clf = IsolationForest(
    n_estimators=n_estimators,
    max_samples=max_samples,
    contamination=contamination,
    random_state=0,
)
# Fit on the standardized base and label the same rows:
# 1 = inlier, -1 = outlier.
y = clf.fit(base_normal).predict(base_normal)

In [16]:
y.shape

(4486,)

In [17]:
# Wrap the label vector in a single-column frame (default 0..n-1 index).
Isolation_result = pd.DataFrame({'IF_resultado': y})

In [18]:
Isolation_result.loc[Isolation_result['IF_resultado'] == -1].shape

(126, 1)

| n_estimators | max_samples | contamination | quant. outliers |
| --- | --- | --- | --- |
| 100 | 4486 | 0.1 | 449 |
| 100 | 4486 | 0.09 | 404 |
| 100 | 4486 | 0.05 | 225 |
| 100 | 4486 | 0.03 | 135 |
| 100 | 4486 | 0.028 | 126 |

In [19]:
Isolation_result.sample(3)

Unnamed: 0,IF_resultado
1859,1
2467,1
641,1


In [20]:
Isolation_result.shape

(4486, 1)

In [21]:
# Append the model labels as an extra column. concat aligns rows on the
# index, so both frames must share the same row index.
df_avalicacao = pd.concat(objs=[df, Isolation_result], axis="columns")

In [22]:
df_avalicacao.sample(3)

Unnamed: 0,CO_UF,UF,SIG_UF,CO_MUNICIPIO,MUNICIPIO,CO_MESORREGIAO,CO_MICRORREGIAO,CO_ENTIDADE,NOME_ENTIDADE,POPULACAO_ESTIMADA,N_MATRICULA_MUN_19,N_MATRICULA_ENT_19,N_MATRICULA_INTEGRAL_ENT_19,N_MATRICULA_MUN_20,N_MATRICULA_ENT_20,N_MATRICULA_INTEGRAL_ENT_20,SCORE_MAT_MUN_20_19,SCORE_MAT_ENT_20_19,SCORE_MAT_INTEGRAL_ENT_20_19,IF_resultado
1664,52,Goias,GO,5203500,Bom Jesus de Goiás,5205,52015,52099334,ESCOLA MUNICIPAL RUI BARBOSA PEREIRA FILHO,25216,5523,279,0.0,5563,293,0.0,1.007242,1.050179,0.0,1
4107,52,Goias,GO,5208608,Goianésia,5203,52006,52015963,ESCOLA ESTADUAL LUIZ GONZAGA SOBRINHO,70084,16771,238,0.0,17097,253,0.0,1.019438,1.063025,0.0,1
1690,52,Goias,GO,5211800,Jaraguá,5203,52007,52024733,COLEGIO ESTADUAL MANOEL RIBEIRO FREITAS MACHADO,50511,9795,503,0.0,9521,514,0.0,0.972027,1.021869,0.0,1


In [23]:
# Keep only the rows the model labelled as outliers (IF_resultado == -1).
df_noise = df_avalicacao[df_avalicacao['IF_resultado'].eq(-1)]

In [24]:
df_noise.shape

(126, 20)

In [25]:
df_noise.head(3)

Unnamed: 0,CO_UF,UF,SIG_UF,CO_MUNICIPIO,MUNICIPIO,CO_MESORREGIAO,CO_MICRORREGIAO,CO_ENTIDADE,NOME_ENTIDADE,POPULACAO_ESTIMADA,N_MATRICULA_MUN_19,N_MATRICULA_ENT_19,N_MATRICULA_INTEGRAL_ENT_19,N_MATRICULA_MUN_20,N_MATRICULA_ENT_20,N_MATRICULA_INTEGRAL_ENT_20,SCORE_MAT_MUN_20_19,SCORE_MAT_ENT_20_19,SCORE_MAT_INTEGRAL_ENT_20_19,IF_resultado
11,52,Goias,GO,5208707,Goiânia,5203,52010,52094308,ESC MUL LUZIA DE SOUZA FIUZA,1516113,279830,519,1.0,276809,611,3.0,0.989204,1.177264,3.0,-1
84,52,Goias,GO,5203906,Buriti Alegre,5205,52015,52108201,ESCOLA MUNICIPAL DE TEMPO INTEGRAL MARIA INEZ ...,9459,2269,307,110.0,1963,119,119.0,0.865139,0.387622,1.081818,-1
89,52,Goias,GO,5220702,Sítio d'Abadia,5204,52011,52077730,CEMEI - CENTRO MUNICIPAL DE EDUCACAO INFANTIL,2989,912,23,23.0,593,19,19.0,0.650219,0.826087,0.826087,-1


#### Carregando a base de Taxa de Risco

In [26]:
# Load the risk-rate (Taxa de Risco) spreadsheet.
# The separator is a forward slash on purpose: in a regular string literal
# "\B" is an invalid escape sequence (modern Python warns about it), and a
# backslash separator only resolves on Windows. "/" works on every platform.
df_tr_escolas = pd.read_excel('Base_TR/BRASIL_TR_2020_21_ajustada_escola.xlsx')

In [27]:
df_tr_escolas.shape

(126, 7)

#### Cruzando a base de Taxa de Risco das escolas com os outliers do Isolation Forest

In [28]:
# Inner join on the school code: keeps only the rows whose CO_ENTIDADE is
# present both among the Isolation Forest outliers and in the risk-rate base.
df_compara_is_tr = pd.merge(df_noise, df_tr_escolas, on='CO_ENTIDADE', how='inner')

In [29]:
# Report how many rows the join produced.
print(
    "Ocorrências similares na base Isolation Forest e Taxa de risco:",
    len(df_compara_is_tr),
)

Ocorrências similares na base Isolation Forest e Taxa de risco: 6


In [30]:
# Select and order the columns of the validation table: municipality-level
# figures, then school-level figures, then the model label and the risk rate.
# 'NOME_ENTIDADE_x' is the left-hand (df_noise) copy of the school name, as
# suffixed by the merge.
validacao_ordenada_IF = df_compara_is_tr.loc[:, [
    # municipality
    'MUNICIPIO',
    'POPULACAO_ESTIMADA',
    'N_MATRICULA_MUN_19',
    'N_MATRICULA_MUN_20',
    'SCORE_MAT_MUN_20_19',
    # school
    'CO_ENTIDADE',
    'NOME_ENTIDADE_x',
    'N_MATRICULA_ENT_19',
    'N_MATRICULA_ENT_20',
    'SCORE_MAT_ENT_20_19',
    'N_MATRICULA_INTEGRAL_ENT_19',
    'N_MATRICULA_INTEGRAL_ENT_20',
    'SCORE_MAT_INTEGRAL_ENT_20_19',
    # model label and risk rate
    'IF_resultado',
    'TAXA_RISCO_ENT',
]]

In [31]:
validacao_ordenada_IF.head(10)

Unnamed: 0,MUNICIPIO,POPULACAO_ESTIMADA,N_MATRICULA_MUN_19,N_MATRICULA_MUN_20,SCORE_MAT_MUN_20_19,CO_ENTIDADE,NOME_ENTIDADE_x,N_MATRICULA_ENT_19,N_MATRICULA_ENT_20,SCORE_MAT_ENT_20_19,N_MATRICULA_INTEGRAL_ENT_19,N_MATRICULA_INTEGRAL_ENT_20,SCORE_MAT_INTEGRAL_ENT_20_19,IF_resultado,TAXA_RISCO_ENT
0,Novo Gama,115711,20192,19788,0.979992,52047172,CAIC NOVO GAMA,176,445,2.528409,0.0,0.0,0.0,-1,9
1,Aparecida de Goiânia,578179,116323,118693,1.020374,52104257,COLEGIO ESTADUAL MICHELLE DO PRADO RODRIGUES,920,376,0.408696,0.0,0.0,0.0,-1,5
2,Campos Verdes,2141,929,939,1.010764,52006638,COLEGIO ESTADUAL EDMUNDO ROCHA,204,492,2.411765,0.0,0.0,0.0,-1,5
3,Anicuns,21850,4142,3560,0.859488,52028518,CENTRO DE ENSINO EM PERIODO INTEGRAL PROFESSOR...,84,224,2.666667,0.0,0.0,0.0,-1,6
4,Goiânia,1516113,279830,276809,0.989204,52104028,ESCOLA MUNICIPAL PROFESSORA LOUSINHA,126,283,2.246032,126.0,283.0,2.246032,-1,6
5,Goiânia,1516113,279830,276809,0.989204,52035913,ESCOLA MUNICIPAL ANA DAS NEVES DE FREITAS,276,187,0.677536,1.0,187.0,187.0,-1,5


In [33]:
# Export the validation table to an Excel workbook in the working directory
# (the row index is written as the first column, pandas' default).
validacao_ordenada_IF.to_excel(excel_writer='resultado_Isolation_forest.xlsx')

#### Quantitativo de escolas apontadas tanto pelo Isolation Forest quanto pelo DBSCAN (sem considerar a Taxa de Risco)

In [34]:
# Load the DBSCAN noise rows from a semicolon-delimited CSV.
df_noise_dbscan = pd.read_csv(
    "df_noise_dbscan.csv",
    sep=";",
)

In [36]:
# Inner join on the school code: rows flagged by both the Isolation Forest
# and DBSCAN. Columns present in both frames get the default _x / _y suffixes.
df_compara_is_DBSCAN = pd.merge(df_noise, df_noise_dbscan, on='CO_ENTIDADE', how='inner')

In [38]:
df_compara_is_DBSCAN.shape

(85, 39)