In [58]:
from river import datasets
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [125]:
init_train_ratio = 0.1
stream_ratio = 0.9

In [143]:
datasets_dict = {
    'Elect2': {'path': '../data/electricity.csv',
               'header':'infer',
               'y':'class',
               'X':['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice', 'vicdemand', 'transfer']
               },
    # 'iris': {'path': '../data/iris.csv',
    #          'header':None,
    #          'y': 4,
    #          'X': [0, 1, 2, 3]}
}

models = {
    'LogisticRegression': {
        'model': LogisticRegression(solver = 'saga', max_iter = 10000),
        'params': {
            'penalty': ['l1','l2','elasticnet'],
            'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100],
            'l1_ratio': [0.1, 0.5, 0.9]
        }
    }
}

In [144]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [146]:
for dataset in datasets_dict.keys():
    print(f'Iniciating {dataset}')
    df = pd.read_csv(
        datasets_dict[dataset]['path'],
        header =  datasets_dict[dataset]['header']
        )

    X = df[datasets_dict[dataset]['X']]

    y = df[datasets_dict[dataset]['y']]

    X_init_train = X.iloc[:round(X.shape[0]*init_train_ratio)]
    y_init_train = y.iloc[:round(y.shape[0]*init_train_ratio)]

    X_stream = X.iloc[round(X.shape[0]*init_train_ratio):round(X.shape[0]*stream_ratio)]
    y_stream = y.iloc[round(y.shape[0]*init_train_ratio):round(y.shape[0]*stream_ratio)]

    X_final_val = X.iloc[round(X.shape[0]*stream_ratio):]
    y_final_val = y.iloc[round(y.shape[0]*stream_ratio):]

    print(y.shape)
    print(y_init_train.shape)
    print(y_stream.shape)
    print(y_final_val.shape)
    print (y.shape[0] == y_init_train.shape[0] + y_stream.shape[0] +  y_final_val.shape[0])

    best_models = {}

    for name, config in models.items():
        print(f"Treinando {name}...")
        grid_search = GridSearchCV(config['model'], config['params'], cv=5, scoring='accuracy', n_jobs=-1, error_score = 'raise')
        grid_search.fit(X_init_train, y_init_train)

        # Salvando o melhor modelo e seus hiperparâmetros
        best_models[name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_
        }

        print(f"Melhores parâmetros para {name}: {grid_search.best_params_}")
        print(f"Melhor acurácia (validação cruzada): {grid_search.best_score_:.4f}")

    # Exemplo de como acessar os melhores modelos
    print(best_models)

Iniciating Elect2
(45312,)
(4531,)
(36250,)
(4531,)
True
Treinando LogisticRegression...




Melhores parâmetros para LogisticRegression: {'C': 1, 'l1_ratio': 0.1, 'penalty': 'l1'}
Melhor acurácia (validação cruzada): 0.7954
{'LogisticRegression': {'best_estimator': LogisticRegression(C=1, l1_ratio=0.1, max_iter=10000, penalty='l1',
                   solver='saga'), 'best_params': {'C': 1, 'l1_ratio': 0.1, 'penalty': 'l1'}, 'best_score': np.float64(0.7954396878825715)}}


In [100]:
df = pd.read_csv('../data/iris.csv', header = None)
df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [None]:
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'penalty': ['l1', 'l2', 'elasticnet'],
            'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100],
            'max_iter': [1000],
            'l1_ratio': [0.1, 0.5, 0.9]
        }
    }
}

# Executando GridSearchCV
best_models = {}

for name, config in models.items():
    print(f"Treinando {name}...")
    grid_search = GridSearchCV(config['model'], config['params'], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)

    # Salvando o melhor modelo e seus hiperparâmetros
    best_models[name] = {
        'best_estimator': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_
    }

    print(f"Melhores parâmetros para {name}: {grid_search.best_params_}")
    print(f"Melhor acurácia (validação cruzada): {grid_search.best_score_:.4f}")

# Exemplo de como acessar os melhores modelos
print(best_models)

Treinando LogisticRegression...




Melhores parâmetros para LogisticRegression: {'C': 100, 'l1_ratio': 0.1, 'max_iter': 1000, 'penalty': 'l2'}
Melhor acurácia (validação cruzada): 0.9800
{'LogisticRegression': {'best_estimator': LogisticRegression(C=100, l1_ratio=0.1, max_iter=1000), 'best_params': {'C': 100, 'l1_ratio': 0.1, 'max_iter': 1000, 'penalty': 'l2'}, 'best_score': np.float64(0.9800000000000001)}}


240 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/home/chacon/.pyenv/versions/mestrado/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/chacon/.pyenv/versions/mestrado/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/chacon/.pyenv/versions/mestrado/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.