In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score

In [2]:
df = pd.read_csv('../../../../datasets/parte1/dataset_cleaned.csv')

In [3]:
X = df.drop(['RainTomorrow'], axis=1)
y = df['RainTomorrow']

if 'weight' in df:
    class_weights_dict = df.set_index('RainTomorrow')['weight'].to_dict()

    X.drop(['weight'], axis=1, inplace=True)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2023)

In [5]:
model = None

if 'weight' in df:
    model = LogisticRegression(random_state=2023, class_weight=class_weights_dict)

else:
    model = LogisticRegression(random_state=2023)

In [6]:
# Use GridSearchCV para encontrar os melhores hiperparâmetros
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear'], #  'liblinear'
    'penalty' : [None, 'l1', 'l2'],
    'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, refit=True, verbose=2, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_

Fitting 5 folds for each of 105 candidates, totalling 525 fits


In [None]:
grid_search.best_params_

In [None]:
predictions = model.predict(X_test)

In [None]:
print("%0.2f accuracy" % (accuracy_score(y_test, predictions)))

In [None]:
print(classification_report(y_test,predictions))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, predictions)
plt.show()