# 다항 로지스틱 회귀모형 실습

__[예제]__   
iris 데이터의 Species 를 분류하는 다항 로지스틱 회귀분석을 실시하고 오분류표를 만들어라

In [16]:
import pandas as pd
import numpy as np
iris = pd.read_csv('./data/iris.csv')

X = iris.drop(['target'], axis = 1)
y = iris.target

In [3]:
X

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [5]:
# 훈련셋 테스트셋 분리하기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size = 0.7,
                                                   test_size = 0.3, random_state = 123)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(105, 4) (45, 4) (105,) (45,)


## sklearn 모델 생성

* sklearn 모델은 l2 패널티를 이용하여 전통적 통계모델에서 다중공선성의 문제를 내부적으로 해결해준다. 
* 독립변수 간의 상관성이 높은 변수라면, l2패널티를 0에 가깝게하여 변수를 삭제하는 것과 같은 효과를 보인다. 

In [7]:
# 모델 적합하기
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print(model)

LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 모델 평가

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
# 테스트셋 예측
predicted = model.predict(X_test)

# 오분류표 생성
cm = confusion_matrix(y_test, predicted)
cmtb = pd.DataFrame(cm, columns=['predicted_setosa', 'predicted_versicolor', 'predicted_virginica'], 
                   index = ['setosa', 'versicolor', 'virginica'])

cmtb

Unnamed: 0,predicted_setosa,predicted_versicolor,predicted_virginica
setosa,15,0,0
versicolor,0,14,1
virginica,0,0,15


In [11]:
model.predict_proba(X_test)

array([[1.19460728e-03, 5.20347683e-01, 4.78457709e-01],
       [1.00943407e-05, 4.96999454e-02, 9.50289960e-01],
       [4.31666756e-02, 9.33399362e-01, 2.34339626e-02],
       [3.50915452e-03, 7.32085919e-01, 2.64404926e-01],
       [2.47468778e-07, 4.75253027e-03, 9.95247222e-01],
       [9.76993748e-01, 2.30061565e-02, 9.53088477e-08],
       [9.84980591e-01, 1.50193779e-02, 3.08186927e-08],
       [3.08338600e-03, 7.21544109e-01, 2.75372505e-01],
       [9.81963203e-01, 1.80367386e-02, 5.83423966e-08],
       [9.72865099e-01, 2.71348119e-02, 8.95204688e-08],
       [8.96074379e-08, 4.79987800e-03, 9.95200032e-01],
       [9.79252348e-01, 2.07475557e-02, 9.67598849e-08],
       [1.08341553e-04, 1.26062253e-01, 8.73829405e-01],
       [9.94239845e-01, 5.76014822e-03, 7.17543429e-09],
       [7.01820025e-03, 7.80937973e-01, 2.12043826e-01],
       [9.64954547e-01, 3.50453043e-02, 1.48459213e-07],
       [4.04859845e-04, 2.37475764e-01, 7.62119376e-01],
       [4.78818602e-04, 1.85360

In [12]:
# 정확도
print('Accuracy Score: ', accuracy_score(y_test, predicted))

print('\n')
# 분류 리포트 생성하기
class_report = classification_report(y_test, predicted)
print(class_report)

Accuracy Score:  0.9777777777777777


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        15
Iris-versicolor       1.00      0.93      0.97        15
 Iris-virginica       0.94      1.00      0.97        15

       accuracy                           0.98        45
      macro avg       0.98      0.98      0.98        45
   weighted avg       0.98      0.98      0.98        45



In [13]:
roc_auc_score(y_test, model.predict_proba(X_test), multi_class = 'ovr')

0.9985185185185186

## 다항 로지스틱 회귀 계수 해석 

In [14]:
# 회귀계수 확인하기
print('Intercept: \n', model.intercept_)
print('Coefficient: \n', model.coef_)

Intercept: 
 [  9.42940015   2.10066833 -11.53006848]
Coefficient: 
 [[-0.45747705  0.87262687 -2.30840796 -0.96053751]
 [ 0.37578083 -0.19466078 -0.16297032 -0.75289644]
 [ 0.08169622 -0.6779661   2.47137828  1.71343395]]


In [17]:
pd.DataFrame(np.exp(model.coef_), columns = X_train.columns, index = model.classes_)

Unnamed: 0,sepal length,sepal width,petal length,petal width
Iris-setosa,0.632878,2.393189,0.099419,0.382687
Iris-versicolor,1.456128,0.823114,0.849616,0.471
Iris-virginica,1.085126,0.507648,11.838753,5.54798


*<b> 다른 변수가 일정할 때, sepal width가 1단위 증가하면 Iris-setosa로 분류될 확률이 2.393189배 증가한다. </b>