In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the dataset used for fitting and discard the 'id' column in one step.
df = pd.read_csv('database_fires.csv').drop(columns=['id'])

In [3]:
# Keep the 'fires' column as the prediction target and remove it from the features.
target = df['fires'].copy()
df = df.drop(columns='fires')

In [4]:
class EstadosEmRegiao(BaseEstimator, TransformerMixin):
    """Replace the 'estado' column with a numeric 'regioes' column.

    Each state abbreviation in ``X['estado']`` is mapped to an integer region
    code (1-5, see ``_REGIAO_POR_ESTADO``). Abbreviations that are not in the
    table become missing values.

    Parameters
    ----------
    toNumbers : bool, default=True
        Stored on the instance but not read by any method of this class.
    """

    # State abbreviation -> region code.
    _REGIAO_POR_ESTADO = {
        **dict.fromkeys(['AM', 'RR', 'AC', 'RO', 'PA', 'TO', 'AP'], 1),              # North
        **dict.fromkeys(['BA', 'SE', 'AL', 'PE', 'PB', 'RN', 'CE', 'MA', 'PI'], 2),  # Northeast
        **dict.fromkeys(['GO', 'MT', 'MS', 'DF'], 3),                                # Center-West
        **dict.fromkeys(['PR', 'SC', 'RS'], 4),                                      # South
        **dict.fromkeys(['ES', 'MG', 'SP', 'RJ'], 5),                                # Southeast
    }

    def __init__(self, toNumbers=True):
        self.toNumbers = toNumbers

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with 'regioes' added and 'estado' removed.

        The input frame is left untouched: ``assign`` builds a new frame
        instead of writing the new column into the caller's (possibly sliced)
        DataFrame.
        """
        regioes = X['estado'].map(self._REGIAO_POR_ESTADO)
        return X.assign(regioes=regioes).drop('estado', axis=1)

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

In [5]:
# Replace the full dates with just the month
import re
class DataEmMes(BaseEstimator, TransformerMixin):
    """Replace the 'data' column with an integer 'mes' column.

    The month is taken as the text between the first two '/' characters of
    each date string, converted with ``int``.
    NOTE(review): this presumably expects dates such as 'dd/mm/yyyy' - confirm
    against the CSV. Values without two '/' separators, or non-string values,
    raise an exception.

    Parameters
    ----------
    toDf : bool, default=False
        Stored on the instance but not read by any method of this class.
    """
    def __init__(self, toDf=False):
        self.toDf = toDf
    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self
    def transform(self, X, y=None):
        """Return a new frame with 'mes' added and 'data' removed.

        The input frame is not modified; ``assign`` creates a new frame rather
        than writing the column into the caller's (possibly sliced) DataFrame.
        """
        mes = X['data'].map(lambda x: int(re.search('/(.+?)/', x).group(1)))
        return X.assign(mes=mes).drop('data', axis=1)
    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

In [6]:
from sklearn.pipeline import Pipeline

# Categorical branch: date -> month, state -> region code, then one-hot encode
# every resulting column (including 'estacao', which is passed through as is).
cat_features = ['estado','estacao','data']
cat_transformer = Pipeline([
    ('emMes', DataEmMes()),
    ('emRegiao', EstadosEmRegiao()),
    # handle_unknown='ignore' lets the fitted encoder transform later data that
    # contains categories not seen during fit (encoded as all zeros) instead
    # of raising, while keeping the output width fixed.
    ('oneHot', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric branch: iterative imputation of missing values, then standardization.
num_features = ['precipitacao','temp_max','temp_min','insolacao','evaporacao_piche',
 'temp_comp_med','umidade_rel_med','vel_vento_med','altitude']
num_transformer = Pipeline([
    ('imputer', IterativeImputer(max_iter=20)),
    ('scaler', StandardScaler()),
])

# Columns not listed in cat_features / num_features are dropped
# (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, cat_features),
    ('num', num_transformer, num_features)
])

# NOTE(review): the preprocessor is fitted on the full frame before the
# train/test split, so imputer and scaler statistics include the held-out rows.
df_tr = preprocessor.fit_transform(df)

In [7]:
# Split the preprocessed features and the target into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_tr,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Train the KNN classifier on the training split
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    n_neighbors=27,
    weights='distance',
    metric='minkowski',
    algorithm='auto',
    leaf_size=5,
    p=1,  # Minkowski with p=1 is the Manhattan distance
)
print('fitting...')
knn.fit(X_train, y_train)

fitting...


KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=27, p=1,
                     weights='distance')

In [9]:
# Load the rows to be predicted; ids are kept aside to label the output.
respostas = pd.read_csv('respostas.csv')
respId = respostas['id'].copy()
respostas = respostas.drop(columns='id')
# Reuse the preprocessor exactly as it was fitted on the training data.
# Calling fit_transform here would re-learn the imputer, scaler and one-hot
# categories from this file, so the features would no longer be on the scale
# (or possibly in the column layout) that `knn` was trained on.
# If this file contains categories unseen at fit time, OneHotEncoder raises
# unless it was built with handle_unknown='ignore'.
resp_tr = preprocessor.transform(respostas)
resp_tr

<44342x265 sparse matrix of type '<class 'numpy.float64'>'
	with 532104 stored elements in Compressed Sparse Row format>

In [10]:
# Predict a label for every preprocessed row in resp_tr with the fitted KNN model.
gabarito = knn.predict(resp_tr)

In [21]:
# Pair each prediction with its original id (as the index) and write the
# result as header-less "id,prediction" rows.
final = pd.Series(data=gabarito, index=respId, dtype='int32')
final.to_csv('final.csv', header=False)