In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('database_fires.csv') 
respId = df['id']

In [3]:
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires
0,1,AC,50484,01/01/2016,,31.4,,2.3,,,96.0,0.0,170.0,0
1,2,AC,50484,02/01/2016,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0
2,3,AC,50484,03/01/2016,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0
3,4,AC,50484,04/01/2016,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0
4,5,AC,50484,05/01/2016,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0


In [4]:
# Removendo coletas com menos de 11 valores
df.dropna(thresh=11, inplace=True)

In [5]:
#porcentagem de valores faltantes
(df.isna().sum() / df.count()) * 100 

id                   0.000000
estado               0.000000
estacao              0.000000
data                 0.000000
precipitacao         0.136133
temp_max             6.013400
temp_min             2.029289
insolacao           16.107691
evaporacao_piche    36.627985
temp_comp_med       11.842538
umidade_rel_med     10.936761
vel_vento_med       16.124355
altitude             0.000000
fires                0.000000
dtype: float64

In [6]:
#removendo linhas sem valores de precipitação
df.dropna(subset=['precipitacao'], inplace=True) 

In [7]:
#trocar estados por região
def getRegiao(estado):
    if (estado in ['AM', 'RR', 'AC', 'RO', 'PA', 'TO', 'AP']):
        return 1
    elif (estado in ['BA', 'SE', 'AL', 'PE', 'PB', 'RN', 'CE', 'MA', 'PI']):
        return 2
    elif (estado in ['GO', 'MT', 'MS', 'DF']):
        return 3        
    elif (estado in ['PR', 'SC', 'RS']):
        return 4
    elif (estado in ['ES', 'MG', 'SP', 'RJ']):
        return 5
df['regiao'] = df['estado'].map(getRegiao)

In [8]:
#trocar as datas completas pelo mês apenas
import re
df['mes'] = df['data'].map(lambda x: int(re.search('/(.+?)/', x).group(1)))

In [9]:
#remover colunas de estado e data
df.drop(['estado', 'data'], axis=1, inplace=True)
df.head()

Unnamed: 0,id,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires,regiao,mes
1,2,50484,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0,1,1
2,3,50484,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0,1,1
3,4,50484,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0,1,1
4,5,50484,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0,1,1
5,6,50484,2.7,31.7,23.3,1.2,,26.28,95.0,0.0,170.0,0,1,1


In [10]:
#remover coluna de id
df.drop('id', axis=1, inplace=True)

In [11]:
#Importando biblioteca para completar os dados faltantes
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [12]:
#Completando dados faltantes pelo IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)

df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [13]:
#Separando Df de treino e df de teste
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('fires', axis=1), df['fires'], test_size=0.2, random_state=42)

In [14]:
#Treino de Arvore de Decisao
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [15]:
#PS: O score no df de treino foi quase 100%, parece overfitted
tree.score(X_test, y_test)

0.7590790765160245

In [16]:
#Treinando KNN e testando
from sklearn.neighbors import KNeighborsClassifier


In [17]:
#Não parece tão overfitted quanto a arvore de decisão


In [18]:
df.head()

Unnamed: 0,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires,regiao,mes
0,50484.0,0.0,32.5,23.3,4.4,1.374398,26.44,95.0,0.51444,170.0,0.0,1.0,1.0
1,50484.0,35.2,30.5,24.0,0.6,0.850312,25.78,97.0,0.51444,170.0,0.0,1.0,1.0
2,50484.0,60.2,31.7,22.3,2.0,0.055928,25.94,96.0,0.0,170.0,0.0,1.0,1.0
3,50484.0,28.4,28.5,23.0,0.1,0.391262,25.04,95.0,0.0,170.0,0.0,1.0,1.0
4,50484.0,2.7,31.7,23.3,1.2,1.187128,26.28,95.0,0.0,170.0,0.0,1.0,1.0


In [19]:
target = df['fires'].copy()
df.drop('fires', axis=1, inplace=True)


In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.18182,-0.363162,0.367415,0.799693,-0.713105,-1.011128,0.445115,1.56045,-0.967507,-0.645363,-1.275397,-1.469086
1,-0.18182,2.812267,-0.096113,0.968142,-1.86859,-1.176387,0.273772,1.703605,-0.967507,-0.645363,-1.275397,-1.469086
2,-0.18182,5.067543,0.182004,0.55905,-1.442885,-1.426877,0.31531,1.632028,-1.381487,-0.645363,-1.275397,-1.469086
3,-0.18182,2.198832,-0.559641,0.7275,-2.020628,-1.321137,0.08166,1.56045,-1.381487,-0.645363,-1.275397,-1.469086
4,-0.18182,-0.119592,0.182004,0.799693,-1.686145,-1.07018,0.403577,1.56045,-1.381487,-0.645363,-1.275397,-1.469086


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)

In [22]:
knn = KNeighborsClassifier(3)
knn.fit(X_train, y_train)
knn.score(X_train, y_train)

0.8875197371228609

In [23]:
knn.score(X_test, y_test)

0.800516365808902

In [32]:
gridPar = {'n_neighbors' : np.arange(1, 15, 2),'weights': ['uniform', 'distance']}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(KNeighborsClassifier(), gridPar, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13]),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [44]:
grid_search.best_params_ 

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude
0,251768,AC,50484,27/01/2016,5.8,34.7,23.7,8.8,,27.84,92.0,1.02888,170.0
1,251769,AC,50484,28/01/2016,0.0,30.0,23.7,0.2,,26.6,93.0,0.0,170.0
2,251770,AC,50484,29/01/2016,0.0,35.5,24.1,7.1,,27.78,97.0,0.0,170.0
3,251771,AC,50484,05/08/2016,,,,,,,,0.0,170.0
4,251772,AC,88948,28/05/2019,0.0,34.7,21.6,8.3,2.0,26.56,86.75,0.666667,160.0


In [40]:
respostas = pd.read_csv('respostas.csv')
respId = respostas['id']
respostas.head()

In [45]:
respostas['mes'] = respostas['data'].map(lambda x: int(re.search('/(.+?)/', x).group(1)))
respostas['regiao'] = respostas['estado'].map(getRegiao)
respostas.drop(['estado', 'data', 'id'], axis=1, inplace=True)
imp.fit(respostas)
respostas = pd.DataFrame(imp.transform(respostas), columns=respostas.columns)
respostas.head()

Unnamed: 0,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,mes,regiao
0,50484.0,5.8,34.7,23.7,8.8,1.577761,27.84,92.0,1.02888,170.0,1.0,1.0
1,50484.0,0.0,30.0,23.7,0.2,1.379076,26.6,93.0,0.0,170.0,1.0,1.0
2,50484.0,0.0,35.5,24.1,7.1,1.251912,27.78,97.0,0.0,170.0,1.0,1.0
3,50484.0,2.903674,31.652886,19.809776,6.684685,2.511524,25.07333,78.253462,0.0,170.0,8.0,1.0
4,88948.0,0.0,34.7,21.6,8.3,2.0,26.56,86.75,0.666667,160.0,5.0,1.0


In [46]:
respostas = pd.DataFrame(scaler.fit_transform(respostas))
respostas.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.182251,0.335614,0.796642,0.96499,0.529846,-1.150927,0.810966,1.446064,-0.616765,-0.653632,-3.022169,-1.270882
1,-0.182251,-0.312614,-0.219171,0.96499,-2.266231,-1.235978,0.51032,1.513043,-1.465892,-0.653632,-3.022169,-1.270882
2,-0.182251,-0.312614,0.969546,1.05443,-0.022867,-1.290413,0.796419,1.780962,-1.465892,-0.653632,-3.022169,-1.270882
3,-0.182251,0.011911,0.138068,0.095141,-0.157896,-0.751213,0.140169,0.525323,-1.465892,-0.653632,-0.206287,-1.270882
4,1.323337,-0.312614,0.796642,0.495433,0.367283,-0.97018,0.500622,1.09442,-0.915697,-0.683845,-1.413094,-1.270882


In [47]:
gabarito = pd.Series(grid_search.predict(respostas), index=respId, dtype='int32')
gabarito.head()

id
251768    0
251769    0
251770    0
251771    0
251772    0
dtype: int32

In [51]:
gabarito.to_csv('gabaritoo.csv', header=False)

{'n_neighbors': 13, 'weights': 'distance'}