In [64]:
import numpy as np
import matplotlib.pyplot as plt

# 1. LogisticRegression 클래스 사용하기

In [65]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the breast-cancer dataset as pandas objects (as_frame=True),
# then split the returned bunch into the feature table and the target column.
cancer = load_breast_cancer(as_frame=True)
X, y = cancer.data, cancer.target
y  # show the target labels

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int32

In [66]:
# Hold out 33% of the rows for testing with a fixed seed.
# Only the first three feature columns (positions 0, 1, 2) are used.
X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[:, :3],
    y,
    test_size=0.33,
    random_state=1234,
)

In [67]:
# Logistic regression fitted with the L-BFGS solver on the three selected features.
clf = LogisticRegression(solver='lbfgs', C=100, max_iter=100, random_state=1234)
clf.fit(X_train, y_train)  # fit() returns the estimator itself, so no re-assignment is needed

# Hard class predictions for both splits.
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Accuracy in percent: the fraction of predictions equal to the true label.
train_acc = (y_train == y_train_pred).mean() * 100
test_acc = (y_test == y_pred).mean() * 100
print(f'학습 데이터셋 정확도:{train_acc: .2f}%')
print(f'테스트 데이터셋 정확도:{test_acc: .2f}%')

학습 데이터셋 정확도: 93.18%
테스트 데이터셋 정확도: 87.23%


In [68]:
# Show the fitted coefficients, intercept, number of input features,
# and the number of iterations the solver ran.
(
    clf.coef_,
    clf.intercept_,
    clf.n_features_in_,
    clf.n_iter_,
)

(array([[ 8.63354352, -0.27382884, -1.47585863]]),
 array([19.85290223]),
 3,
 array([45]))

In [69]:
# Same model as before, but optimized with the SAG solver and a much larger iteration budget.
# NOTE(review): the features are not standardized in this cell. The scikit-learn docs state
# that fast convergence of 'sag' is only guaranteed when features share a similar scale --
# confirm that running it on raw features is the intended comparison.
clf = LogisticRegression(solver='sag', C=100, max_iter=10000, random_state=1234)
clf.fit(X_train, y_train)  # fit() returns the estimator itself

# Hard class predictions (c.f. clf.predict_proba(X_train) for class probabilities).
y_train_pred = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Accuracy in percent: the fraction of predictions equal to the true label.
train_acc = (y_train == y_train_pred).mean() * 100
test_acc = (y_test == y_pred).mean() * 100
print(f'학습 데이터셋 정확도:{train_acc: .2f}%')
print(f'테스트 데이터셋 정확도:{test_acc: .2f}%')

학습 데이터셋 정확도: 92.39%
테스트 데이터셋 정확도: 86.17%


In [70]:
# Show the SAG-fitted coefficients, intercept, number of input features,
# and the number of iterations the solver ran.
(
    clf.coef_,
    clf.intercept_,
    clf.n_features_in_,
    clf.n_iter_,
)

(array([[ 4.71308493, -0.00534137, -0.77250937]]),
 array([5.07447802]),
 3,
 array([3866]))

### tips 예시

In [71]:
import pandas as pd

In [72]:
# Read the tips table from the local examples directory and preview its first five rows.
tips = pd.read_csv("examples/tips.csv")
tips.head(n=5)

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


### total bill과 tip을 통해, smoker를 분류하는 로지스틱 회귀 모델을 학습하고, 성능을 확인하자

In [73]:
# Predict `smoker` from `total_bill` and `tip`.
# Columns are selected by name instead of by position, so the features and target stay
# correct even if the column order of the CSV changes.
X_train, X_test, y_train, y_test = train_test_split(
    tips[["total_bill", "tip"]],  # features
    tips["smoker"],               # target
    test_size=0.33,
    random_state=1234,
)

In [74]:
from sklearn.preprocessing import StandardScaler
# The scaler must be fitted before it can transform: fitting computes the mean and
# variance from the training split, and the same statistics are reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on train, then scale it
X_test_scaled = scaler.transform(X_test)        # scale test with the training statistics


In [75]:
# SAG-solver logistic regression; it is fitted on the standardized features in the next cell.
clf = LogisticRegression(
    solver='sag',
    C=1000,
    max_iter=10000,
    random_state=1234,
)

In [76]:
# Fit on the standardized training features (fit() returns the estimator itself).
clf.fit(X_train_scaled, y_train)

# Hard class predictions (c.f. clf.predict_proba(X_train) for class probabilities).
y_train_pred = clf.predict(X_train_scaled)
y_pred = clf.predict(X_test_scaled)

# Accuracy in percent: the fraction of predictions equal to the true label.
train_acc = (y_train == y_train_pred).mean() * 100
test_acc = (y_test == y_pred).mean() * 100
print(f'학습 데이터셋 정확도:{train_acc: .2f}%')
print(f'테스트 데이터셋 정확도:{test_acc: .2f}%')

학습 데이터셋 정확도: 61.96%
테스트 데이터셋 정확도: 61.73%


In [77]:
# Show the fitted coefficients, intercept, number of input features,
# and the number of iterations the solver ran.
(
    clf.coef_,
    clf.intercept_,
    clf.n_features_in_,
    clf.n_iter_,
)

(array([[0.05740051, 0.01046315]]), array([-0.4884633]), 2, array([34]))

### total bill과 tip을 통해, day를 분류하는 로지스틱 회귀 모델을 학습하고, 성능을 확인하자 (Thur vs Sun)

In [78]:
# Keep only the rows whose `day` is Sunday or Thursday (two-class problem).
tips_selected = tips[tips["day"].isin(["Sun", "Thur"])]

In [79]:
# Predict `day` (Sun vs Thur) from `total_bill` and `tip`.
# Columns are selected by name instead of by position, so the features and target stay
# correct even if the column order of the CSV changes.
X_train, X_test, y_train, y_test = train_test_split(
    tips_selected[["total_bill", "tip"]],  # features
    tips_selected["day"],                  # target
    test_size=0.33,
    random_state=1234,
)

In [80]:
from sklearn.preprocessing import StandardScaler
# The scaler must be fitted before it can transform: fitting computes the mean and
# variance from the training split, and the same statistics are reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on train, then scale it
X_test_scaled = scaler.transform(X_test)        # scale test with the training statistics


In [81]:
# Display the training labels (the `day` column) to check the split.
y_train

129    Thur
192    Thur
43      Sun
55      Sun
199    Thur
       ... 
45      Sun
185     Sun
118    Thur
81     Thur
112     Sun
Name: day, Length: 92, dtype: object

In [82]:
# Display the held-out test labels (the `day` column) to check the split.
y_test

167     Sun
182     Sun
6       Sun
158     Sun
164     Sun
122    Thur
131    Thur
83     Thur
116     Sun
51      Sun
156     Sun
85     Thur
159     Sun
139    Thur
126    Thur
128    Thur
188     Sun
151     Sun
175     Sun
143    Thur
133    Thur
78     Thur
177     Sun
79     Thur
8       Sun
119    Thur
46      Sun
44      Sun
124    Thur
155     Sun
49      Sun
191    Thur
54      Sun
160     Sun
179     Sun
9       Sun
195    Thur
201    Thur
176     Sun
123    Thur
1       Sun
204    Thur
113     Sun
117    Thur
17      Sun
127    Thur
Name: day, dtype: object

In [83]:
# SAG-solver logistic regression; it is fitted on the standardized features in the next cell.
clf = LogisticRegression(
    solver='sag',
    C=1000,
    max_iter=10000,
    random_state=1234,
)

In [84]:
# Fit on the standardized training features (fit() returns the estimator itself).
clf.fit(X_train_scaled, y_train)

# Hard class predictions (c.f. clf.predict_proba(X_train) for class probabilities).
y_train_pred = clf.predict(X_train_scaled)
y_pred = clf.predict(X_test_scaled)

# Accuracy in percent: the fraction of predictions equal to the true label.
train_acc = (y_train == y_train_pred).mean() * 100
test_acc = (y_test == y_pred).mean() * 100
print(f'학습 데이터셋 정확도:{train_acc: .2f}%')
print(f'테스트 데이터셋 정확도:{test_acc: .2f}%')

학습 데이터셋 정확도: 67.39%
테스트 데이터셋 정확도: 60.87%


### iris 데이터셋에서 target이 0과 1인 데이터에 대해서만, 로지스틱 회귀 분류를 통해 정확도를 확인하세요

### digits 데이터셋에서 target이 0과 7인 데이터에 대해서만, 로지스틱 회귀 분류를 통해 정확도를 확인하세요