In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('database_fires.csv') 

In [3]:
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires
0,1,AC,50484,01/01/2016,,31.4,,2.3,,,96.0,0.0,170.0,0
1,2,AC,50484,02/01/2016,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0
2,3,AC,50484,03/01/2016,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0
3,4,AC,50484,04/01/2016,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0
4,5,AC,50484,05/01/2016,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0


In [4]:
# Removendo coletas com menos de 11 valores
df.dropna(thresh=11, inplace=True)

In [5]:
#porcentagem de valores faltantes
(df.isna().sum() / df.count()) * 100 

id                   0.000000
estado               0.000000
estacao              0.000000
data                 0.000000
precipitacao         0.136133
temp_max             6.013400
temp_min             2.029289
insolacao           16.107691
evaporacao_piche    36.627985
temp_comp_med       11.842538
umidade_rel_med     10.936761
vel_vento_med       16.124355
altitude             0.000000
fires                0.000000
dtype: float64

In [6]:
#removendo linhas sem valores de precipitação
df.dropna(subset=['precipitacao'], inplace=True) 

In [7]:
#trocar estados por região
def getRegiao(estado):
    if (estado in ['AM', 'RR', 'AC', 'RO', 'PA', 'TO', 'AP']):
        return 1
    elif (estado in ['BA', 'SE', 'AL', 'PE', 'PB', 'RN', 'CE', 'MA', 'PI']):
        return 2
    elif (estado in ['GO', 'MT', 'MS', 'DF']):
        return 3        
    elif (estado in ['PR', 'SC', 'RS']):
        return 4
    elif (estado in ['ES', 'MG', 'SP', 'RJ']):
        return 5
df['regiao'] = df['estado'].map(getRegiao)

In [8]:
#trocar as datas completas pelo mês apenas
import re
df['mes'] = df['data'].map(lambda x: int(re.search('/(.+?)/', x).group(1)))

In [9]:
#remover colunas de estado e data
df.drop(['estado', 'data'], axis=1, inplace=True)
df.head()

Unnamed: 0,id,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires,regiao,mes
1,2,50484,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0,1,1
2,3,50484,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0,1,1
3,4,50484,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0,1,1
4,5,50484,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0,1,1
5,6,50484,2.7,31.7,23.3,1.2,,26.28,95.0,0.0,170.0,0,1,1


In [10]:
#remover coluna de id
df.drop('id', axis=1, inplace=True)

In [11]:
#Importando biblioteca para completar os dados faltantes
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [12]:
#Completando dados faltantes pelo IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)

df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [13]:
#Separando Df de treino e df de teste
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('fires', axis=1), df['fires'], test_size=0.2, random_state=42)

In [14]:
#Treino de Arvore de Decisao
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [15]:
#PS: O score no df de treino foi quase 100%, parece overfitted
tree.score(X_test, y_test)

0.7603166474629796

In [16]:
#Treinando KNN e testando
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(3)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.80358895574617

In [17]:
#Não parece tão overfitted quanto a arvore de decisão
knn.score(X_train, y_train)
knn.score(X_test, y_test)

0.80358895574617