# Regressão Logística

## Importações

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from scipy.stats import uniform

from model_pipeline import *

# Master seed for the notebook: the random_state values handed to
# train_test_split and KFold below are drawn from this one generator.
seed = 777
rng = np.random.default_rng(seed)


def rng_int():
    """Draw the next integer in [1, 10000) from the shared generator ``rng``.

    Every call advances ``rng``, so the sequence of values handed out
    depends on the order in which callers invoke this function.
    """
    return rng.integers(low=1, high=10000)

## Preparação e separação do conjunto de dados

In [4]:
# Location of the preprocessed CSV and the columns used as model inputs.
dataset_path = '../../data/preprocessed/_90_drp_outl.csv'
feature_columns = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']

# Rows are indexed by the 'obj_ID' column.
df = pd.read_csv(dataset_path, index_col='obj_ID')

X = df[feature_columns]
y = df['class']

# Stratified 80% training split; the complementary 20% is intentionally
# discarded here (bound to `_`). The split seed comes from the shared RNG,
# so this must remain the first rng_int() call to reproduce the same split.
X_train, _, y_train, _ = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=rng_int(),
)

## Configurando pipeline

In [5]:
# Shuffled 5-fold splitter; its seed is the next draw from the shared RNG.
kfold = KFold(n_splits=5, shuffle=True, random_state=rng_int())

# Standardise -> keep the k best features by ANOVA F-score -> PCA projection
# -> logistic-regression classifier.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(f_classif)),
    ('pca', PCA()),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Search space, keyed as '<step name>__<parameter>'.
# Every candidate k (4..7) is >= every candidate n_components (2..4), so PCA
# never asks for more components than the selector leaves available.
param_dist = {
    'feature_selection__k': list(range(4, 8)),
    'pca__n_components': list(range(2, 5)),
    # Continuous uniform over [0.01, 10.01] for the regularisation parameter C.
    'classifier__C': uniform(loc=0.01, scale=10),
}

# get_best_params is a project helper (wildcard-imported from model_pipeline).
# NOTE(review): presumably a randomized hyper-parameter search scored by
# accuracy -- confirm in model_pipeline.
search_result = get_best_params(
    pipeline,
    param_dist,
    kfold,
    X_train,
    y_train,
)
best_model, best_params, best_score = search_result

print("Melhores parâmetros:", best_params)
print("Melhor acurácia:", best_score)

Melhores parâmetros: {'classifier__C': 9.42033870069984, 'feature_selection__k': 6, 'pca__n_components': 3}
Melhor acurácia: 0.9559485845533671


## Armazenando melhor modelo

In [7]:
# Pass the tuned model, the target path and the best search score to the
# project's dump_model helper.
# NOTE(review): file format/extension and how the score is used are decided
# inside dump_model (model_pipeline) -- confirm there.
model_dump_path = '../dump/logistic_regression'
dump_model(best_model, model_dump_path, best_score)