In [76]:
import numpy as np
import pandas as pd

In [77]:
df = pd.read_csv('database_fires.csv') 

In [78]:
df.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude,fires
0,1,AC,50484,01/01/2016,,31.4,,2.3,,,96.0,0.0,170.0,0
1,2,AC,50484,02/01/2016,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,170.0,0
2,3,AC,50484,03/01/2016,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,170.0,0
3,4,AC,50484,04/01/2016,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,170.0,0
4,5,AC,50484,05/01/2016,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,170.0,0


In [79]:
# Removendo coletas com menos de 11 valores
df.dropna(thresh=11, inplace=True)

In [80]:
#porcentagem de valores faltantes
(df.isna().sum() / df.count()) * 100 

id                   0.000000
estado               0.000000
estacao              0.000000
data                 0.000000
precipitacao         0.136133
temp_max             6.013400
temp_min             2.029289
insolacao           16.107691
evaporacao_piche    36.627985
temp_comp_med       11.842538
umidade_rel_med     10.936761
vel_vento_med       16.124355
altitude             0.000000
fires                0.000000
dtype: float64

In [81]:
#removendo linhas sem valores de precipitação
df.dropna(subset=['precipitacao'], inplace=True) 

In [82]:
df.corr()['fires'].sort_values()

umidade_rel_med    -0.357786
precipitacao       -0.169521
temp_min           -0.032186
estacao             0.002006
altitude            0.015150
vel_vento_med       0.034038
id                  0.039549
temp_comp_med       0.112953
temp_max            0.229382
evaporacao_piche    0.271760
insolacao           0.321320
fires               1.000000
Name: fires, dtype: float64

In [83]:
#trocar estados por região
def getRegiao(estado):
    if (estado in ['AM', 'RR', 'AC', 'RO', 'PA', 'TO', 'AP']):
        return 1
    elif (estado in ['BA', 'SE', 'AL', 'PE', 'PB', 'RN', 'CE', 'MA', 'PI']):
        return 2
    elif (estado in ['GO', 'MT', 'MS', 'DF']):
        return 3        
    elif (estado in ['PR', 'SC', 'RS']):
        return 4
    elif (estado in ['ES', 'MG', 'SP', 'RJ']):
        return 5
df['regiao'] = df['estado'].map(getRegiao)

In [84]:
#trocar as datas completas pelo mês apenas
import re
df['mes'] = df['data'].map(lambda x: int(re.search('/(.+?)/', x).group(1)))

In [85]:
#remover colunas de estado e data
df.drop(['estado', 'data', 'id', 'altitude'], axis=1, inplace=True)
df.head()

Unnamed: 0,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,fires,regiao,mes
1,50484,0.0,32.5,23.3,4.4,,26.44,95.0,0.51444,0,1,1
2,50484,35.2,30.5,24.0,0.6,,25.78,97.0,0.51444,0,1,1
3,50484,60.2,31.7,22.3,2.0,,25.94,96.0,0.0,0,1,1
4,50484,28.4,28.5,23.0,0.1,,25.04,95.0,0.0,0,1,1
5,50484,2.7,31.7,23.3,1.2,,26.28,95.0,0.0,0,1,1


In [86]:
#Importando biblioteca para completar os dados faltantes
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [87]:
#Completando dados faltantes pelo IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(df)

df = pd.DataFrame(imp.transform(df), columns=df.columns)

In [91]:
from sklearn.preprocessing import OneHotEncoder
def oneHot(column):
    cat_encoder = OneHotEncoder()
    resultado = cat_encoder.fit_transform(column)
    resultado = pd.DataFrame(resultado.toarray())
    resultado.columns = resultado.columns.map(lambda x: column.columns[0]+str(x))
    return resultado

df = pd.concat([df, oneHot(df[['regiao']]), oneHot(df[['mes']])], axis=1, sort=False)
df.drop(['regiao', 'mes'], axis=1, inplace=True)
df.head()

Unnamed: 0,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,fires,...,mes2,mes3,mes4,mes5,mes6,mes7,mes8,mes9,mes10,mes11
0,50484.0,0.0,32.5,23.3,4.4,1.36681,26.44,95.0,0.51444,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50484.0,35.2,30.5,24.0,0.6,0.86125,25.78,97.0,0.51444,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50484.0,60.2,31.7,22.3,2.0,0.074433,25.94,96.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,50484.0,28.4,28.5,23.0,0.1,0.402582,25.04,95.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50484.0,2.7,31.7,23.3,1.2,1.191513,26.28,95.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
df.columns

Index(['estacao', 'precipitacao', 'temp_max', 'temp_min', 'insolacao',
       'evaporacao_piche', 'temp_comp_med', 'umidade_rel_med', 'vel_vento_med',
       'fires', 'regiao0', 'regiao1', 'regiao2', 'regiao3', 'regiao4', 'mes0',
       'mes1', 'mes2', 'mes3', 'mes4', 'mes5', 'mes6', 'mes7', 'mes8', 'mes9',
       'mes10', 'mes11'],
      dtype='object')

In [93]:
target = df['fires'].copy()
df.drop('fires', axis=1, inplace=True)

In [94]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(df))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.18182,-0.363162,0.367913,0.799908,-0.713199,-1.012397,0.445028,1.56368,-0.969392,2.147155,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
1,-0.18182,2.812267,-0.095687,0.96833,-1.868592,-1.171825,0.273634,1.707139,-0.969392,2.147155,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
2,-0.18182,5.067543,0.182473,0.559305,-1.442921,-1.419946,0.315184,1.635409,-1.383517,2.147155,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
3,-0.18182,2.198832,-0.559287,0.727727,-2.020618,-1.316465,0.081465,1.56368,-1.383517,2.147155,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
4,-0.18182,-0.119592,0.182473,0.799908,-1.686162,-1.067677,0.403478,1.56368,-1.383517,2.147155,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108


In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
12701,0.352161,-0.363162,0.530173,0.727727,0.350978,-0.448924,0.767041,0.667060,-0.555267,2.147155,...,-0.333404,-0.325661,-0.322849,3.488946,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
7060,0.974887,-0.345120,-0.536107,0.775847,0.928675,0.038720,0.294409,1.348491,-0.578515,-0.465733,...,-0.333404,-0.325661,3.097428,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
97903,-0.368657,1.783861,-1.324226,-0.234685,-1.777377,-0.844256,-1.040387,0.900181,0.226486,-0.465733,...,-0.333404,3.070673,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
155646,-1.390730,-0.363162,0.344733,0.270581,1.141510,3.129135,0.502160,-1.144112,1.272988,-0.465733,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,3.477010,-0.280364,-0.283109,-0.278171,-0.280108
164218,-1.520339,-0.363162,0.599713,0.607425,0.411788,-0.510254,0.678747,1.061572,0.763154,-0.465733,...,-0.333404,3.070673,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,1.188565,0.142020,0.112933,0.107048,0.138143,-1.065000,0.030510,0.971910,-0.041847,-0.465733,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
103694,-1.132407,-0.236866,-1.022887,-1.076796,1.111105,-0.907326,-1.196199,0.326344,0.763154,-0.465733,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
131932,0.028040,2.018410,0.391093,0.511184,-0.460814,-1.001931,0.595647,0.577398,-0.846849,2.147155,...,-0.333404,-0.325661,3.097428,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108
146867,0.812943,-0.101550,0.112933,1.232993,0.807054,-0.528908,0.969598,0.200817,0.199653,-0.465733,...,-0.333404,-0.325661,-0.322849,-0.286619,-0.282316,-0.287603,-0.280364,-0.283109,-0.278171,-0.280108


In [96]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(3)
knn.fit(X_train, y_train)
knn.score(X_train, y_train)

0.8839243801476551

In [97]:
knn.score(X_test, y_test)

0.793112277557291

In [98]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=14, min_samples_split=14, random_state=42)
print('treinando...')
tree.fit(X_train, y_train)
print('testando...')
tree.score(X_train, y_train)

treinando...
testando...


0.835461741987795

In [99]:
tree.score(X_test, y_test)

0.7959714931933598

In [100]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(max_depth=20, min_samples_split=5, random_state=42)
print('treinando...')
forest.fit(X_train, y_train)
print('testando...')
forest.score(X_train, y_train)

treinando...
testando...


0.9080410105406905

In [101]:
forest.score(X_test, y_test)

0.8193573166047882

In [53]:
gridPar = {'max_depth' : np.arange(5, 21, 3), 'min_samples_split' : np.arange(2, 15, 3)}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(forest, gridPar, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=20,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=10,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                   

In [56]:
grid_search.best_params_

{'max_depth': 20, 'min_samples_split': 5}

In [89]:
respostas = pd.read_csv('respostas.csv')['id']
respId = respostas['id']
respostas.head()

Unnamed: 0,id,estado,estacao,data,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,altitude
0,251768,AC,50484,27/01/2016,5.8,34.7,23.7,8.8,,27.84,92.0,1.02888,170.0
1,251769,AC,50484,28/01/2016,0.0,30.0,23.7,0.2,,26.6,93.0,0.0,170.0
2,251770,AC,50484,29/01/2016,0.0,35.5,24.1,7.1,,27.78,97.0,0.0,170.0
3,251771,AC,50484,05/08/2016,,,,,,,,0.0,170.0
4,251772,AC,88948,28/05/2019,0.0,34.7,21.6,8.3,2.0,26.56,86.75,0.666667,160.0


In [90]:
respostas['mes'] = respostas['data'].map(lambda x: int(re.search('/(.+?)/', x).group(1)))
respostas['regiao'] = respostas['estado'].map(getRegiao)
respostas.drop(['estado', 'data', 'id', 'altitude'], axis=1, inplace=True)
imp.fit(df.drop('fires', axis=1))

respostas = pd.DataFrame(imp.transform(respostas), columns=respostas.columns)
respostas.head()

Unnamed: 0,estacao,precipitacao,temp_max,temp_min,insolacao,evaporacao_piche,temp_comp_med,umidade_rel_med,vel_vento_med,mes,regiao
0,50484.0,5.8,34.7,23.7,8.8,1.826289,27.84,92.0,1.02888,1.0,1.0
1,50484.0,0.0,30.0,23.7,0.2,1.045803,26.6,93.0,0.0,1.0,1.0
2,50484.0,0.0,35.5,24.1,7.1,1.115198,27.78,97.0,0.0,1.0,1.0
3,50484.0,4.23681,31.987726,19.703147,5.941907,3.062923,24.44681,70.916977,0.0,8.0,1.0
4,88948.0,0.0,34.7,21.6,8.3,2.0,26.56,86.75,0.666667,5.0,1.0


In [102]:
respostas = pd.concat([respostas, oneHot(respostas[['regiao']]), oneHot(respostas[['mes']])], axis=1, sort=False)
respostas.drop(['regiao', 'mes'], axis=1, inplace=True)

respostas = pd.DataFrame(scaler.fit_transform(respostas))
respostas.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.182251,0.33469,0.760394,0.954934,0.548631,-0.933214,0.829348,1.503579,-0.490351,2.16076,...,-0.120438,-0.129197,-0.221008,-0.370258,-0.382752,-0.39646,-0.381279,-0.389308,-0.383255,-0.387539
1,-0.182251,-0.313401,-0.240916,0.954934,-2.172974,-1.232524,0.526712,1.572377,-1.31654,2.16076,...,-0.120438,-0.129197,-0.221008,-0.370258,-0.382752,-0.39646,-0.381279,-0.389308,-0.383255,-0.387539
2,-0.182251,-0.313401,0.930829,1.044065,0.010639,-1.205911,0.814705,1.847569,-1.31654,2.16076,...,-0.120438,-0.129197,-0.221008,-0.370258,-0.382752,-0.39646,-0.381279,-0.389308,-0.383255,-0.387539
3,-0.182251,0.160019,0.182559,0.064319,-0.355858,-0.458976,0.0012,0.053108,-1.31654,2.16076,...,-0.120438,-0.129197,-0.221008,-0.370258,-0.382752,2.522323,-0.381279,-0.389308,-0.383255,-0.387539
4,1.323337,-0.313401,0.760394,0.486993,0.390398,-0.866597,0.516949,1.142389,-0.781207,2.16076,...,-0.120438,-0.129197,4.524724,-0.370258,-0.382752,-0.39646,-0.381279,-0.389308,-0.383255,-0.387539


In [104]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
count,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,...,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0,234330.0
mean,1.888402e-14,-2.34497e-14,2.294379e-14,1.740186e-14,1.040315e-15,-6.309884e-17,1.050633e-15,-5.315739e-15,-1.126382e-14,2.159685e-13,...,1.93103e-15,-2.163994e-15,-1.848857e-15,9.83629e-15,-7.984511e-15,4.838073e-15,-2.248871e-15,-1.962972e-15,8.516764e-15,1.080695e-15
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-1.756768,-0.363162,-6.609265,-6.490364,-3.417444,-2.457299,-7.158634,-4.891984,-1.811192,-0.4657325,...,-0.3334045,-0.3256615,-0.3228485,-0.2866195,-0.2823164,-0.2876034,-0.2803644,-0.2831086,-0.2781714,-0.2801076
25%,-0.8582282,-0.363162,-0.4652784,-0.4993486,-0.7036764,-0.7040612,-0.4482982,-0.624073,-0.6933084,-0.4657325,...,-0.3334045,-0.3256615,-0.3228485,-0.2866195,-0.2823164,-0.2876034,-0.2803644,-0.2831086,-0.2781714,-0.2801076
50%,0.01635333,-0.363162,0.1592932,0.2224604,0.1989529,-0.2152514,0.2113092,0.1470202,-0.171313,-0.4657325,...,-0.3334045,-0.3256615,-0.3228485,-0.2866195,-0.2823164,-0.2876034,-0.2803644,-0.2831086,-0.2781714,-0.2801076
75%,0.8277079,-0.2098032,0.6692531,0.7277268,0.8374594,0.4486728,0.6735536,0.7567217,0.4948196,-0.4657325,...,-0.3334045,-0.3256615,-0.3228485,-0.2866195,-0.2823164,-0.2876034,-0.2803644,-0.2831086,-0.2781714,-0.2801076
max,1.739727,20.78231,3.010432,2.60443,2.479333,7.63862,2.678344,2.400101,11.04023,2.147155,...,2.99936,3.070673,3.097428,3.488946,3.542125,3.47701,3.566786,3.532214,3.594906,3.570057


In [105]:
respostas.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
count,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,...,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0,44342.0
mean,3.141384e-16,4.110141e-15,9.899665e-16,-1.70532e-17,5.568997e-15,3.023306e-18,-5.009399e-16,-1.887835e-15,-1.333129e-15,-6.430242e-14,...,-1.465796e-16,2.975933e-15,5.157533e-15,-1.890882e-15,-2.287224e-15,-8.647731e-16,1.655495e-17,6.076256e-16,1.672621e-16,7.148973e-16
std,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,...,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011
min,-1.764715,-1.475219,-5.311376,-5.195145,-3.208863,-3.917922,-5.818896,-4.550649,-2.230463,-0.4628002,...,-0.1204382,-0.1291971,-0.221008,-0.370258,-0.382752,-0.39646,-0.3812791,-0.3893081,-0.3832553,-0.3875394
25%,-0.8449382,-0.313401,-0.4539602,-0.5256105,-0.6539388,-0.7335684,-0.3909608,-0.6463599,-0.703648,-0.4628002,...,-0.1204382,-0.1291971,-0.221008,-0.370258,-0.382752,-0.39646,-0.3812791,-0.3893081,-0.3832553,-0.3875394
50%,0.01686838,-0.313401,0.163869,0.1750326,0.2005186,-0.1562878,0.1264503,0.09321905,-0.2458752,-0.4628002,...,-0.1204382,-0.1291971,-0.221008,-0.370258,-0.382752,-0.39646,-0.3812791,-0.3893081,-0.3832553,-0.3875394
75%,0.8272805,-0.2687051,0.6538715,0.7543878,0.8018034,0.5871498,0.6926738,0.7468005,0.503591,-0.4628002,...,-0.1204382,-0.1291971,-0.221008,-0.370258,-0.382752,-0.39646,-0.3812791,-0.3893081,-0.3832553,-0.3875394
max,1.748467,20.06793,2.809522,2.624144,2.461713,6.496434,2.484088,2.159553,7.114948,2.16076,...,8.303015,7.740112,4.524724,2.700819,2.612658,2.522323,2.62275,2.56866,2.609227,2.580383


In [113]:
gabarito = pd.Series(forest.predict(respostas), index=respId, dtype='int32')
gabarito.head()

id
251768    0
251769    0
251770    0
251771    0
251772    0
dtype: int32

In [115]:
gabarito.to_csv('gabarito2.csv', header=False)