### 선형 분류 모델
- 선형 회귀 모델에서는 가설함수(y=wx+b)의 출력을 예측값으로 직접 활용했다면, 선형 분류 모델에서는 가설함수를 결정경계로 활용
- Logistic Regression(로지스틱 회귀 -> 분류만 가능), SVM(서포트 벡터 머신) 등

In [1]:
# 와인 정보를 토대로 레드, 화이트 와인 분류해보자! -> 이진 분류
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the wine dataset from the local CSV file into a DataFrame.
wine = pd.read_csv("./data/wine.csv")
# Preview the first rows to sanity-check the columns and values.
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


| 컬럼 이름             | 설명                             |
|----------------------|---------------------------------|
| fixed acidity        | 고정 산도 (타르타르산 같은 비휘발성 산의 양) |
| volatile acidity     | 휘발성 산도 (와인의 향에 영향을 주는 아세트산의 양) |
| citric acid          | 구연산의 양 (와인의 신선함에 영향을 줌) |
| residual sugar       | 잔류 설탕량 (발효 후 남은 설탕의 양) |
| chlorides            | 염화물의 양 (와인의 소금기에 영향을 줌) |
| free sulfur dioxide  | 유리 이산화황의 양 (와인을 신선하게 보존하는 데 도움) |
| total sulfur dioxide | 총 이산화황의 양 (발효 과정과 와인의 보존에 중요) |
| density              | 밀도 (와인의 알코올 함량과 설탕량에 영향을 받음) |
| pH                   | pH 수치 (와인의 산성도를 나타냄) |
| sulphates            | 황산염의 양 (와인의 발효 과정과 저장 기간에 영향) |
| alcohol              | 알코올 함량 (와인의 도수) |
| quality              | 품질 (와인의 전반적인 품질을 나타내는 등급) |
| color                | 색 (와인의 색상을 나타내는 지표, 예: 화이트(0), 레드(1)) |

In [5]:
# Summarize each column's dtype and non-null count (a quick missing-value check).
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  color                 6497 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 660.0 KB


In [6]:
# Descriptive statistics per column, transposed so each feature is one row.
wine.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,6497.0,7.215307,1.296434,3.8,6.4,7.0,7.7,15.9
volatile acidity,6497.0,0.339666,0.164636,0.08,0.23,0.29,0.4,1.58
citric acid,6497.0,0.318633,0.145318,0.0,0.25,0.31,0.39,1.66
residual sugar,6497.0,5.443235,4.757804,0.6,1.8,3.0,8.1,65.8
chlorides,6497.0,0.056034,0.035034,0.009,0.038,0.047,0.065,0.611
free sulfur dioxide,6497.0,30.525319,17.7494,1.0,17.0,29.0,41.0,289.0
total sulfur dioxide,6497.0,115.744574,56.521855,6.0,77.0,118.0,156.0,440.0
density,6497.0,0.994697,0.002999,0.98711,0.99234,0.99489,0.99699,1.03898
pH,6497.0,3.218501,0.160787,2.72,3.11,3.21,3.32,4.01
sulphates,6497.0,0.531268,0.148806,0.22,0.43,0.51,0.6,2.0


#### 이상치 탐지 기법
- 기술통계치(요약본)를 통해 max값이 이상하다는 거를 확인
- 이상치를 걸러내야 하는데, 기준이 필요 -> IQR 기반 필터링, Z-score 등

In [26]:
# Z-score : Z-score(표준 점수)를 이용하여 이상치를 필터링하는 방법
from scipy import stats # 검정, 추론을 위한 도구

In [49]:
# z-score: how far each data point lies from the mean, in standard deviations.
z_score = stats.zscore(wine["residual sugar"])
# Select the rows whose absolute z-score is 3 or more, i.e. the outliers.
wine[np.abs(z_score) >= 3]

# The mask above keeps the outliers (|z| >= 3), not the inliers.
# Rule of thumb: a point with z-score above 3 or below -3 is treated as an outlier.

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
1599,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0
1606,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0
1781,6.8,0.28,0.4,22.0,0.048,48.0,167.0,1.001,2.93,0.5,8.7,5,0
1790,6.8,0.28,0.4,22.0,0.048,48.0,167.0,1.001,2.93,0.5,8.7,5,0
1891,7.4,0.28,0.42,19.8,0.066,53.0,195.0,1.0,2.96,0.44,9.1,5,0
2043,6.9,0.24,0.36,20.8,0.031,40.0,139.0,0.9975,3.2,0.33,11.0,6,0
3053,8.3,0.21,0.49,19.8,0.054,50.0,231.0,1.0012,2.99,0.54,9.2,5,0
3207,6.9,0.27,0.49,23.5,0.057,59.0,235.0,1.0024,2.98,0.47,8.6,5,0
3252,7.9,0.33,0.28,31.6,0.053,35.0,176.0,1.0103,3.15,0.38,8.8,6,0
3262,7.9,0.33,0.28,31.6,0.053,35.0,176.0,1.0103,3.15,0.38,8.8,6,0


In [17]:
# IQR method: use the interquartile range to decide which values are outliers.
# quantile() returns the requested quartile of the column.
Q1, Q3 = (wine["residual sugar"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences for outlier detection, placed 1.5 * IQR outside the quartiles.
under = Q1 - 1.5 * IQR  # lower fence
over = Q3 + 1.5 * IQR  # upper fence

In [25]:
# `and` / `or` are logical operators: they only work on a single truth value.
# `&` / `|` are bitwise operators: they combine boolean Series element-wise.
# Keep the rows at or beyond either IQR fence, i.e. the outliers.
wine[(wine["residual sugar"] >= over) | (wine["residual sugar"] <= under)]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
1599,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.80,6,0
1606,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.80,6,0
1613,8.3,0.420,0.62,19.25,0.040,41.0,172.0,1.00020,2.98,0.67,9.70,5,0
1637,7.3,0.240,0.39,17.95,0.057,45.0,149.0,0.99990,3.21,0.36,8.60,5,0
1638,7.3,0.240,0.39,17.95,0.057,45.0,149.0,0.99990,3.21,0.36,8.60,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6290,6.9,0.190,0.31,19.25,0.043,38.0,167.0,0.99954,2.93,0.52,9.10,7,0
6293,6.9,0.190,0.31,19.25,0.043,38.0,167.0,0.99954,2.93,0.52,9.10,7,0
6347,6.1,0.340,0.24,18.35,0.050,33.0,184.0,0.99943,3.12,0.61,9.30,5,0
6348,6.2,0.350,0.25,18.40,0.051,28.0,182.0,0.99946,3.13,0.62,9.30,6,0


In [52]:
# Separate features from the label by position: the last column (color) is
# the target y, and every column before it goes into the feature matrix X.
X, y = wine.iloc[:, :-1], wine.iloc[:, -1]

In [54]:
from sklearn.model_selection import train_test_split

In [58]:
# Split into train and test sets, holding out 30% for testing.
# stratify=y splits according to the class ratio of y, so both splits keep
# the same label proportions (only meaningful for classification).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2024,
    stratify=y,
)

In [59]:
# Class counts in the training split (checks the effect of stratify=y).
y_train.value_counts()

color
0    3428
1    1119
Name: count, dtype: int64

In [60]:
# Class counts in the test split; the ratio should mirror the training split.
y_test.value_counts()

color
0    1470
1     480
Name: count, dtype: int64

In [61]:
# Confirm that features and labels have matching row counts in each split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4547, 12), (1950, 12), (4547,), (1950,))

### Logistic Regression(로지스틱 회귀, 분류 모델!)
- 시그모이드 함수 : 0 ~ 1 사이의 확률값으로 반환

1. 파라미터 초기화
2. 예측
3. 오차 계산 -> MSE가 아닌 Cross Entropy(교차 엔트로피)
4. 기울기 계산
5. 파라미터 업데이트

In [62]:
from sklearn.linear_model import LogisticRegression

In [72]:
# Create the classifier; max_iter=10000 sets the solver's iteration limit.
lr_clf = LogisticRegression(max_iter=10000)

In [73]:
# Train the logistic regression model on the training split.
lr_clf.fit(X_train, y_train)

In [74]:
# Evaluate on the held-out test split; score() reports accuracy here.
# The data is already well cleaned, so the model predicts well.
lr_clf.score(X_test, y_test)

0.9830769230769231

### 분류 평가 지표
| 혼동행렬 | 예측 | 실제 |
|---------|------|-----|
|TP(True Positive)|P|P|
|FP(False Positive)|P|N|
|TN(True Negative)|N|N|
|FN(False Negative)|N|P|

- accuracy(정확도) : (TP + TN) / (TP + FP + TN + FN)
    - 전체 데이터 중에 맞춘(예측을 성공한) 비율
    - 일반적으로 높은 정확도를 가진 모델 선호

- recall(재현율) : TP / (TP + FN)
    - 실제 양성 샘플들 중에서 양성으로 올바르게 예측한 샘플의 비율
    - 실제 질병이 있는 환자를 걸러내는게(양성을 걸러내는게) 중요한 경우 선호
    - 재현율이 높다 -> 실제 "양성"을 모델이 놓치지 않고 잘 포착한다를 의미!

- precision(정밀도) : TP / (TP + FP)
    - 양성으로 예측 된 샘플들 중에서 실제 양성인 샘플의 비율(얼마나 정밀하게 잘 맞추냐?)
    - 양성으로 잘못 분류했을 때(FP)의 비용이 클 때 선호 (ex. 어린이에게 안전한 영상만 제공하기 위해 선정적인 영상과 안전한 영상을 분류할 때)
    - 정밀도가 높다 -> "양성"이라고 분류한 것들 중에 실제로 양성일 확률이 높다는 것 -> 얼마나 정밀하게 잘 맞추는지를 의미
 
- f1_score(조화평균) : (2\*정밀도\*재현율) / (정밀도 + 재현율)
    - 정밀도와 재현율의 조화 평균
    - 두 지표가 한 쪽에 치우치지 않고, 양성 케이스들을 얼마나 정확하고 효율적으로 식별하는지에 대한 정도를 나타냄

In [75]:
# 분류 평가 지표 도구 불러오기
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [76]:
# Predict labels for the test split with the trained logistic regression model.
y_pred = lr_clf.predict(X_test)

# Accuracy: fraction of test samples whose predicted label matches the true one.
accuracy_score(y_test, y_pred)

0.9830769230769231

In [77]:
# Precision: TP / (TP + FP), the share of predicted positives that are truly positive.
precision_score(y_test, y_pred)

0.9848156182212582

In [78]:
# Recall: TP / (TP + FN), the share of actual positives the model caught.
recall_score(y_test, y_pred)

0.9458333333333333

In [79]:
# F1 score: the harmonic mean of precision and recall.
f1_score(y_test, y_pred)

0.9649309245483528

In [90]:
# 서포트 벡터 머신으로도 모델링
from sklearn.svm import SVC # 서포트 벡터머신

In [91]:
# Support vector classifier with the solver limited to 100000 iterations.
# NOTE(review): the features are passed in unscaled; SVMs are usually
# sensitive to feature scale, so standardizing first may help - confirm.
svc_model = SVC(max_iter=100000)

In [92]:
# Train the support vector classifier on the same training split.
svc_model.fit(X_train, y_train)

In [93]:
# Predict with the SVM; this rebinds y_pred, replacing the earlier predictions.
y_pred = svc_model.predict(X_test)

# Accuracy of the SVM on the test split.
accuracy_score(y_test, y_pred)

0.936923076923077

In [95]:
# Per-feature statistics of the training split, one feature per row, to
# compare the value ranges of the individual features.
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,4547.0,7.207203,1.279084,3.8,6.4,7.0,7.7,15.6
volatile acidity,4547.0,0.342695,0.167454,0.08,0.23,0.3,0.41,1.58
citric acid,4547.0,0.316793,0.145021,0.0,0.24,0.31,0.39,1.0
residual sugar,4547.0,5.432714,4.791851,0.6,1.8,3.0,8.1,65.8
chlorides,4547.0,0.056199,0.033858,0.009,0.038,0.047,0.066,0.611
free sulfur dioxide,4547.0,30.435232,17.679733,1.0,17.0,29.0,41.0,289.0
total sulfur dioxide,4547.0,115.555311,56.506885,6.0,77.25,118.0,156.0,440.0
density,4547.0,0.9947,0.003028,0.98711,0.992345,0.99488,0.996995,1.03898
pH,4547.0,3.218133,0.159431,2.77,3.11,3.21,3.32,4.01
sulphates,4547.0,0.531513,0.147764,0.22,0.43,0.51,0.6,1.95
