## [1] 다중 로지스틱 회귀 예제

### breast cancer dataset

In [28]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=1234)
scaler = StandardScaler()
scaler.fit(X_train)         
X_train_scaled = scaler.transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

In [36]:
clf = LogisticRegression(random_state=1234, solver='sag', penalty='l2', C=10, max_iter=10000, multi_class='multinomial')
clf.fit(X_train_scaled,y_train)

In [37]:
y_pred = clf.predict(X_test_scaled)
print(f'정확도:{(y_pred == y_test).mean() * 100: .2f}%')

정확도: 96.28%


### iris dataset

In [38]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=1234)
scaler = MinMaxScaler()
scaler.fit(X_train)         
X_train_scaled = scaler.transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

In [42]:
clf = LogisticRegression(random_state=1234, solver='sag', penalty='l2', C=10, max_iter=10000, multi_class='multinomial')
clf.fit(X_train_scaled,y_train)

In [43]:
y_pred = clf.predict(X_test_scaled)
print(f'정확도:{(y_pred == y_test).mean() * 100: .2f}%')

정확도: 96.00%


### tips dataset

In [52]:
import pandas as pd
tips = pd.read_csv("examples/tips.csv")
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


#### tip을 4 분위로 이산화하고 나머지 피처를 인풋으로 분류하는 모델을 학습

In [53]:
tips.tip=pd.cut(tips['tip'],bins=3,labels=['low','midde','high'])
tips['smoker']=tips['smoker'].map({'No':0,"Yes":1})
tips['day']=tips['day'].map({'Thur':0,"Fri":1,"Sat":2,"Sun":3})
tips['time']=tips['time'].map({'Lunch':0,"Dinner":1})

In [54]:
X_train, X_test, y_train, y_test = train_test_split(tips.drop(columns='tip'), tips['tip'],test_size=0.33,random_state=1234)

In [55]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)         

X_train_scaled = scaler.transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

In [56]:
clf = LogisticRegression(random_state=1234, solver='sag', penalty='l2', C=10, max_iter=10000, multi_class='multinomial')
clf.fit(X_train_scaled,y_train)

In [57]:
y_pred = clf.predict(X_test_scaled)
print(f'정확도:{(y_pred == y_test).mean() * 100: .2f}%')

정확도: 80.25%


## [2] 회귀에서의 성능 지표

In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.linear_model import SGDRegressor

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=1234)


reg = SGDRegressor(penalty=None, max_iter=10000, learning_rate='constant', eta0=0.1)
reg = reg.fit(X_train, y_train)
y_pred_train = reg.predict(X_train)
y_pred = reg.predict(X_test)
print('테스트 데이터셋 MAE:',np.abs(y_pred - y_test).mean())
print('테스트 데이터셋 MSE:',((y_pred - y_test)**2).mean())
print('테스트 데이터셋 RMSE:',np.sqrt(((y_pred - y_test)**2).mean()))
print('테스트 데이터셋 R2 score', reg.score(X_test,y_test))

테스트 데이터셋 MAE: 46.031956468579686
테스트 데이터셋 MSE: 3187.571941475622
테스트 데이터셋 RMSE: 56.45858607400315
테스트 데이터셋 R2 score 0.44480663449386015


In [67]:
tips = pd.read_csv("examples/tips.csv")
tips.head()
tips['smoker']=tips['smoker'].map({'No':0,"Yes":1})
tips['day']=tips['day'].map({'Thur':0,"Fri":1,"Sat":2,"Sun":3})
tips['time']=tips['time'].map({'Lunch':0,"Dinner":1})
X_train, X_test, y_train, y_test = train_test_split(tips.drop(columns="tip"),tips.tip,test_size=0.33,random_state=1234)
reg = SGDRegressor(penalty=None, max_iter=10000, learning_rate='constant', eta0=0.001)
reg = reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
print('테스트 데이터셋 MAE:',np.abs(y_pred - y_test).mean())
print('테스트 데이터셋 MSE:',((y_pred - y_test)**2).mean())
print('테스트 데이터셋 RMSE:',np.sqrt(((y_pred - y_test)**2).mean()))
print('테스트 데이터셋 R2 score', reg.score(X_test,y_test))

테스트 데이터셋 MAE: 1.0152324107303334
테스트 데이터셋 MSE: 2.0559952285868293
테스트 데이터셋 RMSE: 1.433874202497147
테스트 데이터셋 R2 score 0.12116008467906614


## [3] 분류에서의 성능 지표

In [70]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=1234)
scaler = StandardScaler()
scaler.fit(X_train)         
X_train_scaled = scaler.transform(X_train)  
X_test_scaled = scaler.transform(X_test)  
clf = LogisticRegression(random_state=1234, solver='sag', penalty='l2', C=10, max_iter=10000, multi_class='multinomial')
clf.fit(X_train_scaled,y_train)

In [96]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

y_pred = clf.predict(X_test_scaled)
print(f'정확도:{(y_pred == y_test).mean() : }')
print(f'정확도:{clf.score(X_test_scaled,y_test): }')
print(f'정확도:{accuracy_score(y_test,y_pred): }')  #### y_test가 먼저, y_pred가 그 다음

정확도: 0.9627659574468085
정확도: 0.9627659574468085
정확도: 0.9627659574468085


In [97]:
print(f'정밀도:{precision_score(y_test,y_pred): }')  #### y_test가 먼저, y_pred가 그 다음

정밀도: 0.9508196721311475


In [98]:
print(f'재현율:{recall_score(y_test,y_pred): }')  #### y_test가 먼저, y_pred가 그 다음

재현율: 0.9914529914529915


In [99]:
print(f'F1 score:{f1_score(y_test,y_pred): }')  #### y_test가 먼저, y_pred가 그 다음

F1 score: 0.9707112970711297


In [101]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.98      0.92      0.95        71
           1       0.95      0.99      0.97       117

    accuracy                           0.96       188
   macro avg       0.97      0.95      0.96       188
weighted avg       0.96      0.96      0.96       188



In [102]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=1234)
scaler = MinMaxScaler()
scaler.fit(X_train)         
X_train_scaled = scaler.transform(X_train)  
X_test_scaled = scaler.transform(X_test)
clf = LogisticRegression(random_state=1234, solver='sag', penalty='l2', C=10, max_iter=10000, multi_class='multinomial')
clf.fit(X_train_scaled,y_train)  

In [103]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.94      0.94      0.94        17
           2       0.94      0.94      0.94        16

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



In [104]:
import pandas as pd
tips = pd.read_csv("examples/tips.csv")
tips.tip=pd.cut(tips['tip'],bins=3,labels=['low','midde','high'])
tips['smoker']=tips['smoker'].map({'No':0,"Yes":1})
tips['day']=tips['day'].map({'Thur':0,"Fri":1,"Sat":2,"Sun":3})
tips['time']=tips['time'].map({'Lunch':0,"Dinner":1})
X_train, X_test, y_train, y_test = train_test_split(tips.drop(columns='tip'), tips['tip'],test_size=0.33,random_state=1234)

In [105]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)         

X_train_scaled = scaler.transform(X_train)  
X_test_scaled = scaler.transform(X_test) 
clf = LogisticRegression(random_state=1234, solver='sag', penalty='l2', C=10, max_iter=10000, multi_class='multinomial')
clf.fit(X_train_scaled,y_train) 

In [106]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.50      0.50      0.50         2
         low       0.89      0.89      0.89        66
       midde       0.38      0.38      0.38        13

    accuracy                           0.80        81
   macro avg       0.59      0.59      0.59        81
weighted avg       0.80      0.80      0.80        81

