In [11]:
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE, SelectKBest, chi2
from imblearn.under_sampling import RandomUnderSampler

In [12]:
# Load the 2020 fire-foci records and report the size of the raw table.
df = pd.read_csv('./Focos_2020-01-01_2020-12-31.csv')

n_rows, n_cols = df.shape
print(f'Tamanho da Base de Dados: \n\n Linhas: {n_rows},  Colunas; {n_cols}\n')

# Show the last rows as a quick sanity check of the load.
df.tail()


Tamanho da Base de Dados: 

 Linhas: 222797,  Colunas; 12



Unnamed: 0,datahora,satelite,pais,estado,municipio,bioma,diasemchuva,precipitacao,riscofogo,latitude,longitude,frp
222792,2020/07/11 16:50:00,AQUA_M-T,Brasil,PARA,PARAGOMINAS,Amazonia,10.0,0.0,0.3,-3.183,-47.246,12.7
222793,2020/07/11 16:45:00,AQUA_M-T,Brasil,RIO DE JANEIRO,RIO DE JANEIRO,Mata Atlantica,10.0,0.0,0.8,-22.855,-43.508,11.1
222794,2020/07/11 16:45:00,AQUA_M-T,Brasil,MINAS GERAIS,FORMIGA,Cerrado,11.0,0.2,0.8,-20.542,-45.689,11.2
222795,2020/07/11 16:50:00,AQUA_M-T,Brasil,PARA,TOME-ACU,Amazonia,10.0,1.8,0.1,-2.84,-48.11,19.5
222796,2020/07/11 16:50:00,AQUA_M-T,Brasil,PARA,PORTEL,Amazonia,2.0,1.6,0.1,-2.371,-50.491,5.7


### Tratamento dos dados

In [13]:
# Build the cleaned frame `dfClean` from the raw `df`.
#
# The cell always starts again from `df`, so it can be re-run safely.
# Column updates are written as plain assignments instead of
# `Series.fillna(..., inplace=True)` on a column selection, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.

# Remove duplicated records. keep=False discards every copy of a duplicated
# row, not only the repeats.
dfClean = df.drop_duplicates(keep=False).copy()

# Remove the attributes that are not used from here on.
dfClean = dfClean.drop(columns=['pais', 'municipio', 'latitude', 'longitude', 'satelite'])

# --- Days without rain (diasemchuva) ---------------------------------------
# NOTE(review): each median below is computed before negative values are
# clamped, so negative entries (if present) take part in it - confirm intended.
median = dfClean['diasemchuva'].median()

# Missing values -> median; values below zero -> 0.0.
dfClean['diasemchuva'] = dfClean['diasemchuva'].fillna(median).clip(lower=0.0)

print('Informações do Atributo Dias Sem Chuva:\n')

# --- Precipitation (precipitacao) ------------------------------------------
# Use the median of precipitacao itself; the previous code reused the
# diasemchuva median here.
median = dfClean['precipitacao'].median()

# Missing values -> median; result limited to the range [0, 200].
dfClean['precipitacao'] = dfClean['precipitacao'].fillna(median).clip(lower=0.0, upper=200.0)

# --- Fire risk (riscofogo) -------------------------------------------------
median = dfClean['riscofogo'].median()
medians = dfClean.groupby('bioma')['riscofogo'].median()

# Missing values -> median of the row's biome. The overall median is used when
# the biome is missing or when the biome has no median of its own.
biome_fill = dfClean['bioma'].map(medians).fillna(median)
dfClean['riscofogo'] = dfClean['riscofogo'].fillna(biome_fill).clip(lower=0.0)

# --- Fire Radiative Power (frp) --------------------------------------------
median = dfClean['frp'].median()

# Missing values -> median; result limited to the range [0, 8000].
dfClean['frp'] = dfClean['frp'].fillna(median).clip(lower=0.0, upper=8000.0)

# --- Month (mes) -----------------------------------------------------------
# Replace the timestamp by its month number and rename the column accordingly.
dfClean['datahora'] = pd.DatetimeIndex(dfClean['datahora']).month
dfClean = dfClean.rename(columns={'datahora': 'mes'})

dfClean.head()

Informações do Atributo Dias Sem Chuva:



Unnamed: 0,mes,estado,bioma,diasemchuva,precipitacao,riscofogo,frp
0,7,GOIAS,Cerrado,43.0,0.0,1.0,11.0
1,7,GOIAS,Cerrado,42.0,0.0,0.9,9.5
2,7,RIO DE JANEIRO,Mata Atlantica,38.0,0.0,0.8,13.0
3,7,MINAS GERAIS,Caatinga,46.0,0.0,1.0,17.6
4,7,PARA,Amazonia,0.0,0.4,0.2,18.7


### Normalizar atributos quantitativos.

In [14]:
# Min-max normalisation of the days-without-rain attribute (diasemchuva).

# Column limits. The names are specific to the column so the built-in
# max() / min() functions are not overwritten in the kernel namespace.
diasemchuva_max = dfClean['diasemchuva'].max()
diasemchuva_min = dfClean['diasemchuva'].min()
diasemchuva_range = diasemchuva_max - diasemchuva_min

# (x - min) / (max - min), computed on the whole column at once.
# A constant column has zero range; it is mapped to 0.0 instead of dividing by zero.
if diasemchuva_range == 0:
    dfClean['diasemchuva'] = 0.0
else:
    dfClean['diasemchuva'] = (dfClean['diasemchuva'] - diasemchuva_min) / diasemchuva_range

# Report the limits that were used for the scaling.
print('Informações do Atributo Dias Sem Chuva:\n')

print(f'Valor Máximo: {diasemchuva_max}')
print(f'Valor Miníno: {diasemchuva_min}\n')

dfClean

Informações do Atributo Dias Sem Chuva:

Valor Máximo: 120.0
Valor Miníno: 0.0



Unnamed: 0,mes,estado,bioma,diasemchuva,precipitacao,riscofogo,frp
0,7,GOIAS,Cerrado,0.358333,0.0,1.0,11.0
1,7,GOIAS,Cerrado,0.350000,0.0,0.9,9.5
2,7,RIO DE JANEIRO,Mata Atlantica,0.316667,0.0,0.8,13.0
3,7,MINAS GERAIS,Caatinga,0.383333,0.0,1.0,17.6
4,7,PARA,Amazonia,0.000000,0.4,0.2,18.7
...,...,...,...,...,...,...,...
222792,7,PARA,Amazonia,0.083333,0.0,0.3,12.7
222793,7,RIO DE JANEIRO,Mata Atlantica,0.083333,0.0,0.8,11.1
222794,7,MINAS GERAIS,Cerrado,0.091667,0.2,0.8,11.2
222795,7,PARA,Amazonia,0.083333,1.8,0.1,19.5


In [15]:
# Min-max normalisation of the precipitation attribute (precipitacao).

# Column limits. The names are specific to the column so the built-in
# max() / min() functions are not overwritten in the kernel namespace.
precipitacao_max = dfClean['precipitacao'].max()
precipitacao_min = dfClean['precipitacao'].min()
precipitacao_range = precipitacao_max - precipitacao_min

# (x - min) / (max - min), computed on the whole column at once.
# A constant column has zero range; it is mapped to 0.0 instead of dividing by zero.
if precipitacao_range == 0:
    dfClean['precipitacao'] = 0.0
else:
    dfClean['precipitacao'] = (dfClean['precipitacao'] - precipitacao_min) / precipitacao_range

# Report the limits that were used for the scaling.
print('Informações do Atributo Precipitação:\n')

print(f'Valor Máximo: {precipitacao_max}')
print(f'Valor Miníno: {precipitacao_min}\n')

dfClean

Informações do Atributo Precipitação:

Valor Máximo: 125.5
Valor Miníno: 0.0



Unnamed: 0,mes,estado,bioma,diasemchuva,precipitacao,riscofogo,frp
0,7,GOIAS,Cerrado,0.358333,0.000000,1.0,11.0
1,7,GOIAS,Cerrado,0.350000,0.000000,0.9,9.5
2,7,RIO DE JANEIRO,Mata Atlantica,0.316667,0.000000,0.8,13.0
3,7,MINAS GERAIS,Caatinga,0.383333,0.000000,1.0,17.6
4,7,PARA,Amazonia,0.000000,0.003187,0.2,18.7
...,...,...,...,...,...,...,...
222792,7,PARA,Amazonia,0.083333,0.000000,0.3,12.7
222793,7,RIO DE JANEIRO,Mata Atlantica,0.083333,0.000000,0.8,11.1
222794,7,MINAS GERAIS,Cerrado,0.091667,0.001594,0.8,11.2
222795,7,PARA,Amazonia,0.083333,0.014343,0.1,19.5


In [16]:
# Min-max normalisation of the Fire Radiative Power attribute (frp).

# Column limits. The names are specific to the column so the built-in
# max() / min() functions are not overwritten in the kernel namespace.
frp_max = dfClean['frp'].max()
frp_min = dfClean['frp'].min()
frp_range = frp_max - frp_min

# (x - min) / (max - min), computed on the whole column at once.
# A constant column has zero range; it is mapped to 0.0 instead of dividing by zero.
if frp_range == 0:
    dfClean['frp'] = 0.0
else:
    dfClean['frp'] = (dfClean['frp'] - frp_min) / frp_range

# Report the limits that were used for the scaling.
print('Informações do Atributo Fire Radiative Power:\n')

print(f'Valor Máximo: {frp_max}')
print(f'Valor Miníno: {frp_min}\n')

dfClean

Informações do Atributo Fire Radiative Power:

Valor Máximo: 8000.0
Valor Miníno: 0.0



Unnamed: 0,mes,estado,bioma,diasemchuva,precipitacao,riscofogo,frp
0,7,GOIAS,Cerrado,0.358333,0.000000,1.0,0.001375
1,7,GOIAS,Cerrado,0.350000,0.000000,0.9,0.001187
2,7,RIO DE JANEIRO,Mata Atlantica,0.316667,0.000000,0.8,0.001625
3,7,MINAS GERAIS,Caatinga,0.383333,0.000000,1.0,0.002200
4,7,PARA,Amazonia,0.000000,0.003187,0.2,0.002337
...,...,...,...,...,...,...,...
222792,7,PARA,Amazonia,0.083333,0.000000,0.3,0.001587
222793,7,RIO DE JANEIRO,Mata Atlantica,0.083333,0.000000,0.8,0.001388
222794,7,MINAS GERAIS,Cerrado,0.091667,0.001594,0.8,0.001400
222795,7,PARA,Amazonia,0.083333,0.014343,0.1,0.002437


### 7 - Transformar atributos qualitativos em quantitativos (dummies).

In [17]:
# One-hot encode the qualitative attributes: each category of `estado` becomes
# an `E_<value>` column and each category of `bioma` a `B_<value>` column.
categorical_columns = ['estado', 'bioma']
dummy_prefixes = ['E', 'B']

dfClean = pd.get_dummies(dfClean, prefix=dummy_prefixes, columns=categorical_columns)

dfClean

Unnamed: 0,mes,diasemchuva,precipitacao,riscofogo,frp,E_ACRE,E_ALAGOAS,E_AMAPA,E_AMAZONAS,E_BAHIA,...,E_SANTA CATARINA,E_SAO PAULO,E_SERGIPE,E_TOCANTINS,B_Amazonia,B_Caatinga,B_Cerrado,B_Mata Atlantica,B_Pampa,B_Pantanal
0,7,0.358333,0.000000,1.0,0.001375,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,7,0.350000,0.000000,0.9,0.001187,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,7,0.316667,0.000000,0.8,0.001625,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,7,0.383333,0.000000,1.0,0.002200,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,7,0.000000,0.003187,0.2,0.002337,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222792,7,0.083333,0.000000,0.3,0.001587,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
222793,7,0.083333,0.000000,0.8,0.001388,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
222794,7,0.091667,0.001594,0.8,0.001400,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
222795,7,0.083333,0.014343,0.1,0.002437,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [18]:
# Feature matrix: every cleaned column except the response variable.
X = dfClean.drop(columns=['riscofogo'])

# Response variable, taken from the cleaned frame so that it has the same rows
# as X and carries the missing-value and negative-value treatment applied to
# `riscofogo`. Reading it from the raw `df` would bypass that treatment.
y = dfClean['riscofogo']

# First split: 60% for training, 40% held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=2023)

# Second split: 40% of the held-out part becomes the validation set and the
# remaining 60% stays as the test set.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.40, random_state=2023)

print(f" Treino {len(X_train)}, Teste {len(X_test)}, Validacao {len(X_val)}")

 Treino 133678, Teste 53471, Validacao 35648
