In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

**Grid Search Cross Validation** ile hem KNN'deki optimum k değerini bulabilir hem de farklı k değerlerini denerken cross validation yaparız.

# 1) Read Data

In [2]:
iris = load_iris()
x = iris.data
y = iris.target
x_train = x[:100, :] # ilk 2 class'ı almak için ilk 100 satır seçildi. 100'den sonraki verilerde 3. class da bulunduğu için ilk 100 satırı aldık.
y_train = y[:100] 

# 2) Preprocessing

In [3]:
x = (x-np.min(x)) / (np.max(x) - np.min(x))

# 3) Split Data

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

# 4) Create Logistic Regression Model
LR: binary outputları classifier etmede kullanılır. Örneğin 1 ya da 0, Kedi ya da Köpek etc.

- Bu yüzden target classı 3 ten 2'ye düşürmemiz gerekir.

- C büyükse overfitting(aşırı ezberleme), C çok küçükse underfitting olur. Hiçbir şekilde datayı öğrenemez.

- Dolayısıyla C'yi güzel seçmeliyiz.

- l1 ve l2 = loss functions


In [5]:
grid = {"C" : np.logspace(-3, 3, 7), "penalty":["l1","l2"]}  # l1 = lasso ve l2 = ridge
logreg = LogisticRegression()

# 5) Create Grid Search Model with Cross Validation

In [6]:
logreg_cv = GridSearchCV(logreg, grid, cv = 10)
logreg_cv.fit(x_train, y_train)

Traceback (most recent call last):
  File "C:\Users\Pointo2\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Pointo2\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Pointo2\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Pointo2\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Pointo2\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.sol

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']})

# 6) Train Data

Aşağıdaki sonuca göre en optimum C değeri 100'dür. Bu C parametresini kullanarak accuracy değeri 1'dir. 

In [7]:
print("tuned hyperparameters: (best parameters): ", logreg_cv.best_params_)
print("accuracy: ", logreg_cv.best_score_)

tuned hyperparameters: (best parameters):  {'C': 100.0, 'penalty': 'l2'}
accuracy:  0.9627272727272727


# 7) Test Data

In [12]:
logreg2 = LogisticRegression(C = 100, penalty='l2')
logreg2.fit(x_train, y_train)
print("accuracy: ", logreg2.score(x_test, y_test))

accuracy:  1.0
