In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

## 머신러닝 프로세스
1. 문제정의
2. 데이터수집(크롤링, 파일데이터, DB 등)
3. 데이터전처리(결측치제거, 이상치, 특성제거)
4. 탐색적 데이터 분석(EDA : 각 특성관련 영향이해)
5. 모델선정 및 하이퍼파라미터 튜닝
6. 모델학습(train, validation, test)
7. 모델평가(회귀 : 오차, 분류 : 맞춘갯수)
    - 회귀 : MSE(평균제곱오차), RMSE(제곱오차에 다시 루트로 원복), MAE(평균절대값오차)
    - 분류 : Accurocy(정확도), Precision(정밀도), Recall(재현율), f1-score, RCC Curve

#### 1. 문제정의
- 와인데이터를 이용한 분류
- 컬럼 중 등급(quality)사용

#### 2. 데이터수집

In [2]:
data = pd.read_csv('data/winequality-red.csv')
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
x = data.loc[:,:'alcohol']
y = data['quality']

In [5]:
x.shape

(1599, 11)

In [6]:
y.shape

(1599,)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [10]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1199, 11)
(1199,)
(400, 11)
(400,)


#### 3.데이터탐색

In [11]:
data.info() #결측치확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [12]:
data.describe() #기술통계

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [13]:
data.corr() #상관계수

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [15]:
data['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

#### 4. 전처리

In [17]:
minMax = MinMaxScaler()

In [18]:
minMax.fit(x_train) #최대값과 최소값을 구하기

MinMaxScaler(copy=True, feature_range=(0, 1))

In [19]:
x_train_scaled = minMax.transform(x_train)

In [20]:
x_train_scaled

array([[0.32110092, 0.41780822, 0.        , ..., 0.50393701, 0.07784431,
        0.21428571],
       [0.27522936, 0.35958904, 0.03      , ..., 0.49606299, 0.07784431,
        0.33928571],
       [0.52293578, 0.21917808, 0.5       , ..., 0.42519685, 0.2994012 ,
        0.55357143],
       ...,
       [0.29357798, 0.2739726 , 0.25      , ..., 0.54330709, 0.19161677,
        0.10714286],
       [0.44036697, 0.14383562, 0.59      , ..., 0.2992126 , 0.1257485 ,
        0.64285714],
       [0.30275229, 0.36986301, 0.        , ..., 0.54330709, 0.1497006 ,
        0.19642857]])

In [22]:
x_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1238,8.1,0.73,0.0,2.5,0.081,12.0,24.0,0.99798,3.38,0.46,9.6
1352,7.6,0.645,0.03,1.9,0.086,14.0,57.0,0.9969,3.37,0.46,10.3
327,10.3,0.44,0.5,4.5,0.107,5.0,13.0,0.998,3.28,0.83,11.5
798,9.4,0.5,0.34,3.6,0.082,5.0,14.0,0.9987,3.29,0.52,10.7
627,8.8,0.6,0.29,2.2,0.098,5.0,15.0,0.9988,3.36,0.49,9.1


#### 5. 모델링
- 교차검증
    1. KNN
    2. DecisionTree
    3. Logistic
    4. LinearSVC

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [25]:
knn_model = KNeighborsClassifier()
tree_model = DecisionTreeClassifier()
lin_model = LogisticRegression()
svc_model = LinearSVC()

knn_model.fit(x_train, y_train)
tree_model.fit(x_train, y_train)
lin_model.fit(x_train, y_train)
svc_model.fit(x_train, y_train)

In [46]:
result = cross_val_score(knn_model, x_train_scaled, y_train, cv=5)
result.mean()

0.5921896792189679

In [47]:
result = cross_val_score(tree_model, x_train_scaled, y_train, cv=5)
result.mean()

0.6180160390516038

In [48]:
result = cross_val_score(lin_model, x_train_scaled, y_train, cv=5)
result.mean()

0.6071513249651325

In [49]:
result = cross_val_score(svc_model, x_train_scaled, y_train, cv=5)
result.mean()

0.5913040446304045

#### 6. 하이퍼파라미터튜닝

In [50]:
for k in range(1, 50, 2) :
    knn_rs = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train_scaled, y_train, cv=5)
    print("이웃의 숫자 : ",k)
    print("정확도 : ",knn_rs.mean())

이웃의 숫자 :  1
정확도 :  0.6380509065550907
이웃의 숫자 :  3
정확도 :  0.6021931659693165
이웃의 숫자 :  5
정확도 :  0.5921896792189679
이웃의 숫자 :  7
정확도 :  0.5854776847977685
이웃의 숫자 :  9
정확도 :  0.5938284518828452
이웃의 숫자 :  11
정확도 :  0.5996582984658299
이웃의 숫자 :  13
정확도 :  0.6021513249651325
이웃의 숫자 :  15
정확도 :  0.6046582984658299
이웃의 숫자 :  17
정확도 :  0.6038214783821478
이웃의 숫자 :  19
정확도 :  0.6054881450488144
이웃의 숫자 :  21
정확도 :  0.5938284518828452
이웃의 숫자 :  23
정확도 :  0.5921513249651326
이웃의 숫자 :  25
정확도 :  0.5946513249651325
이웃의 숫자 :  27
정확도 :  0.5979881450488145
이웃의 숫자 :  29
정확도 :  0.5904707112970712
이웃의 숫자 :  31
정확도 :  0.5871408647140864
이웃의 숫자 :  33
정확도 :  0.5854672245467226
이웃의 숫자 :  35
정확도 :  0.5879741980474199
이웃의 숫자 :  37
정확도 :  0.5946478382147837
이웃의 숫자 :  39
정확도 :  0.5971513249651326
이웃의 숫자 :  41
정확도 :  0.5954811715481172
이웃의 숫자 :  43
정확도 :  0.5971582984658299
이웃의 숫자 :  45
정확도 :  0.5954881450488145
이웃의 숫자 :  47
정확도 :  0.6021582984658298
이웃의 숫자 :  49
정확도 :  0.5963110181311018


In [51]:
final_knn = KNeighborsClassifier(n_neighbors=19)

In [53]:
for dep_no in range(1, 30) :
    tree_rs =  cross_val_score(DecisionTreeClassifier(max_depth=dep_no), x_train_scaled, y_train, cv=5)
    print("깊이의 숫자 : ",dep_no)
    print("정확도 : ",tree_rs.mean())

깊이의 숫자 :  1
정확도 :  0.5671164574616457
깊이의 숫자 :  2
정확도 :  0.5687831241283124
깊이의 숫자 :  3
정확도 :  0.5579672245467225
깊이의 숫자 :  4
정확도 :  0.5779707112970712
깊이의 숫자 :  5
정확도 :  0.5779707112970712
깊이의 숫자 :  6
정확도 :  0.5788249651324965
깊이의 숫자 :  7
정확도 :  0.5854916317991632
깊이의 숫자 :  8
정확도 :  0.5963249651324964
깊이의 숫자 :  9
정확도 :  0.6055125523012552
깊이의 숫자 :  10
정확도 :  0.6063598326359833
깊이의 숫자 :  11
정확도 :  0.6105230125523012
깊이의 숫자 :  12
정확도 :  0.6104951185495118
깊이의 숫자 :  13
정확도 :  0.6055299860529986
깊이의 숫자 :  14
정확도 :  0.6146966527196652
깊이의 숫자 :  15
정확도 :  0.6146757322175732
깊이의 숫자 :  16
정확도 :  0.6163563458856346
깊이의 숫자 :  17
정확도 :  0.6221966527196653
깊이의 숫자 :  18
정확도 :  0.6146931659693166
깊이의 숫자 :  19
정확도 :  0.6121792189679219
깊이의 숫자 :  20
정확도 :  0.6205334728033473
깊이의 숫자 :  21
정확도 :  0.6138563458856346
깊이의 숫자 :  22
정확도 :  0.6080090655509066
깊이의 숫자 :  23
정확도 :  0.6238702928870292
깊이의 숫자 :  24
정확도 :  0.6155125523012552
깊이의 숫자 :  25
정확도 :  0.6146687587168758
깊이의 숫자 :  26
정확도 :  0.608009065550

In [55]:
final_tree = DecisionTreeClassifier(max_depth=17)

#### 7.평가하기

In [56]:
final_knn.fit(x_train, y_train)
final_knn_pre = final_knn.predict(x_test)
print(classification_report(final_knn_pre, y_test))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.67      0.62      0.65       194
           6       0.60      0.46      0.52       190
           7       0.11      0.40      0.18        15
           8       0.00      0.00      0.00         0

    accuracy                           0.54       400
   macro avg       0.23      0.25      0.22       400
weighted avg       0.61      0.54      0.57       400



  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
final_tree.fit(x_train, y_train)
final_tree_pre = final_tree.predict(x_test)
print(classification_report(final_tree_pre, y_test))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.09      0.07      0.08        14
           5       0.65      0.67      0.66       176
           6       0.53      0.50      0.51       157
           7       0.51      0.59      0.55        46
           8       0.14      0.33      0.20         3

    accuracy                           0.56       400
   macro avg       0.32      0.36      0.33       400
weighted avg       0.56      0.56      0.56       400

