In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

## 머신러닝 프로세스
1. 문제정의
2. 데이터수집(크롤링, 파일데이터, DB 등)
3. 데이터전처리(결측치제거, 이상치, 특성제거)
4. 탐색적 데이터 분석(EDA : 각 특성관련 영향이해)
5. 모델선정 및 하이퍼파라미터 튜닝
6. 모델학습(train, validation, test)
7. 모델평가(회귀 : 오차, 분류 : 맞춘갯수)
    - 회귀 : MSE(평균제곱오차), RMSE(제곱오차에 다시 루트로 원복), MAE(평균절대값오차)
    - 분류 : Accurocy(정확도), Precision(정밀도), Recall(재현율), f1-score, RCC Curve

#### 1. 문제정의
- 와인데이터를 이용한 분류
- 컬럼 중 등급(quality)사용

#### 2. 데이터수집

In [38]:
data = pd.read_csv('data/winequality-red.csv')
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [39]:
x = data.loc[:,:'alcohol']
y = data['quality']

In [40]:
x.shape

(1599, 11)

In [41]:
y.shape

(1599,)

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [43]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1199, 11)
(1199,)
(400, 11)
(400,)


#### 3.데이터탐색

In [44]:
data.info() #결측치확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [45]:
data.describe() #기술통계

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [46]:
data.corr() #상관계수

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [47]:
data['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

#### 4. 전처리

In [48]:
minMax = MinMaxScaler()

In [49]:
minMax.fit(x_train) #최대값과 최소값을 구하기

MinMaxScaler(copy=True, feature_range=(0, 1))

In [50]:
x_train_scaled = minMax.transform(x_train)
x_test_scaled = minMax.transform(x_test)

In [51]:
x_train_scaled

array([[0.26548673, 0.38356164, 0.02      , ..., 0.33858268, 0.4491018 ,
        0.12307692],
       [0.21238938, 0.34246575, 0.1       , ..., 0.42519685, 0.16766467,
        0.12307692],
       [0.38938053, 0.18493151, 0.4       , ..., 0.36220472, 0.2994012 ,
        0.38461538],
       ...,
       [0.27433628, 0.48972603, 0.        , ..., 0.44094488, 0.11377246,
        0.13846154],
       [0.19469027, 0.3630137 , 0.02      , ..., 0.48031496, 0.17365269,
        0.30769231],
       [0.26548673, 0.11643836, 0.49      , ..., 0.4488189 , 0.16766467,
        0.26153846]])

In [52]:
x_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
161,7.6,0.68,0.02,1.3,0.072,9.0,20.0,0.9965,3.17,1.08,9.2
1309,7.0,0.62,0.1,1.4,0.071,27.0,63.0,0.996,3.28,0.61,9.2
1219,9.0,0.39,0.4,1.3,0.044,25.0,50.0,0.99478,3.2,0.83,10.9
913,9.4,0.395,0.46,4.6,0.094,3.0,10.0,0.99639,3.27,0.64,12.2
1007,9.1,0.3,0.34,2.0,0.064,12.0,25.0,0.99516,3.26,0.84,11.7


#### 5. 모델링
- 교차검증
    1. KNN
    2. DecisionTree
    3. Logistic
    4. LinearSVC

In [53]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [54]:
knn_model = KNeighborsClassifier()
tree_model = DecisionTreeClassifier()
lin_model = LogisticRegression()
svc_model = LinearSVC()

knn_model.fit(x_train, y_train)
tree_model.fit(x_train, y_train)
lin_model.fit(x_train, y_train)
svc_model.fit(x_train, y_train)

In [55]:
result = cross_val_score(knn_model, x_train_scaled, y_train, cv=5)
result.mean()

0.5546827057182706

In [56]:
result = cross_val_score(tree_model, x_train_scaled, y_train, cv=5)
result.mean()

0.5546757322175733

In [57]:
result = cross_val_score(lin_model, x_train_scaled, y_train, cv=5)
result.mean()

0.5863423988842399

In [58]:
result = cross_val_score(svc_model, x_train_scaled, y_train, cv=5)
result.mean()

0.5779846582984658

#### 6. 하이퍼파라미터튜닝

In [59]:
for k in range(1, 50, 2) :
    knn_rs = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train_scaled, y_train, cv=5)
    print("이웃의 숫자 : ",k)
    print("정확도 : ",knn_rs.mean())

이웃의 숫자 :  1
정확도 :  0.5946966527196652
이웃의 숫자 :  3
정확도 :  0.5496792189679219
이웃의 숫자 :  5
정확도 :  0.5546827057182706
이웃의 숫자 :  7
정확도 :  0.5596861924686192
이웃의 숫자 :  9
정확도 :  0.5730299860529986
이웃의 숫자 :  11
정확도 :  0.5805404463040447
이웃의 숫자 :  13
정확도 :  0.585536959553696
이웃의 숫자 :  15
정확도 :  0.5780195258019526
이웃의 숫자 :  17
정확도 :  0.5947245467224547
이웃의 숫자 :  19
정확도 :  0.6013842398884239
이웃의 숫자 :  21
정확도 :  0.593036959553696
이웃의 숫자 :  23
정확도 :  0.5913702928870294
이웃의 숫자 :  25
정확도 :  0.5755230125523012
이웃의 숫자 :  27
정확도 :  0.5830160390516039
이웃의 숫자 :  29
정확도 :  0.5830299860529986
이웃의 숫자 :  31
정확도 :  0.5897036262203625
이웃의 숫자 :  33
정확도 :  0.5913668061366806
이웃의 숫자 :  35
정확도 :  0.5905299860529986
이웃의 숫자 :  37
정확도 :  0.5972036262203627
이웃의 숫자 :  39
정확도 :  0.5947001394700139
이웃의 숫자 :  41
정확도 :  0.5930230125523013
이웃의 숫자 :  43
정확도 :  0.5863458856345886
이웃의 숫자 :  45
정확도 :  0.5913633193863319
이웃의 숫자 :  47
정확도 :  0.5830230125523013
이웃의 숫자 :  49
정확도 :  0.5880195258019525


In [60]:
final_knn = KNeighborsClassifier(n_neighbors=19)

In [61]:
for dep_no in range(1, 30) :
    tree_rs =  cross_val_score(DecisionTreeClassifier(max_depth=dep_no), x_train_scaled, y_train, cv=5)
    print("깊이의 숫자 : ",dep_no)
    print("정확도 : ",tree_rs.mean())

깊이의 숫자 :  1
정확도 :  0.5588075313807531
깊이의 숫자 :  2
정확도 :  0.5571408647140864
깊이의 숫자 :  3
정확도 :  0.5588040446304043
깊이의 숫자 :  4
정확도 :  0.5563249651324965
깊이의 숫자 :  5
정확도 :  0.5680055788005578
깊이의 숫자 :  6
정확도 :  0.5596478382147838
깊이의 숫자 :  7
정확도 :  0.5588145048814505
깊이의 숫자 :  8
정확도 :  0.5579741980474198
깊이의 숫자 :  9
정확도 :  0.5696827057182705
깊이의 숫자 :  10
정확도 :  0.561352859135286
깊이의 숫자 :  11
정확도 :  0.5571827057182707
깊이의 숫자 :  12
정확도 :  0.5613354253835425
깊이의 숫자 :  13
정확도 :  0.5488214783821478
깊이의 숫자 :  14
정확도 :  0.5454776847977685
깊이의 숫자 :  15
정확도 :  0.5479986052998606
깊이의 숫자 :  16
정확도 :  0.5463284518828451
깊이의 숫자 :  17
정확도 :  0.5563110181311018
깊이의 숫자 :  18
정확도 :  0.5588389121338911
깊이의 숫자 :  19
정확도 :  0.5504951185495118
깊이의 숫자 :  20
정확도 :  0.5504951185495118
깊이의 숫자 :  21
정확도 :  0.5555090655509065
깊이의 숫자 :  22
정확도 :  0.5546966527196653
깊이의 숫자 :  23
정확도 :  0.5555125523012553
깊이의 숫자 :  24
정확도 :  0.5513458856345885
깊이의 숫자 :  25
정확도 :  0.5480055788005579
깊이의 숫자 :  26
정확도 :  0.5529986052998

In [62]:
final_tree = DecisionTreeClassifier(max_depth=17)

#### 7.평가하기

In [63]:
final_knn.fit(x_train, y_train)
final_knn_pre = final_knn.predict(x_test_scaled)
print(classification_report(final_knn_pre, y_test))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       1.00      0.39      0.56       400
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0

    accuracy                           0.39       400
   macro avg       0.17      0.06      0.09       400
weighted avg       1.00      0.39      0.56       400



  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
final_tree.fit(x_train, y_train)
final_tree_pre = final_tree.predict(x_test_scaled)
print(classification_report(final_tree_pre, y_test))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.67      0.05      0.10       115
           5       0.00      0.00      0.00         1
           6       0.78      0.44      0.56       282
           7       0.02      0.50      0.03         2
           8       0.00      0.00      0.00         0

    accuracy                           0.33       400
   macro avg       0.24      0.17      0.12       400
weighted avg       0.74      0.33      0.43       400



  _warn_prf(average, modifier, msg_start, len(result))


## 앙상블
1. voting 투표진행
    - 3개의 모델샘플에 대해 판단해서 과반수의 결론을 내림
2. bagging
    - 같은 알고리즘을 가진 모델을 여러개 생성하고 나온 결과값을 종합해서 결론을 내림
    - 같은 알고리즘을 가지기에 비슷한 예측결과가 나온다. 다양한 결과값이 나올 수 있도록 모델에 차이를 줘야함
    - 별도의 작업없이 병렬적으로 결과값 도출 가능
3. boosting 실시간 업그레이드모델
    - 같은 알고리즘을 가진 모델을 여러개 생성한다.
    - 단, 이전 모델이 가졌던 오차를 반영해서 보다 업그레이드된 다른 모델을 만들어 예측함
    - 기존의 오차를 반영해서 하기에 학습 등 시간이 오래걸리고 순차적인 속성을 가짐

In [65]:
from sklearn.ensemble import VotingClassifier

In [66]:
voting = VotingClassifier(estimators=[('knn',final_knn), ('tree',final_tree), ('logi', lin_model)],
                         voting='soft') 
#어떤 모델을 넣을지 적는 estimators
#투표방법을 정하는 voting(최종예측정보로 결정하는 hard, 확률(확신도)로 종합해서 표현하는 soft)

In [67]:
voting.fit(x_train_scaled, y_train)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=19,
                                                   p=2, weights='uniform')),
                             ('tree',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=17,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_de

In [68]:
voting.predict(x_test)
voting.score(x_test_scaled, y_test)

0.62

In [69]:
rs = cross_val_score(voting, x_train_scaled, y_train, cv=5)

In [70]:
rs.mean()

0.5771931659693166

#### 함수로 만들어진 앙상블(sklearn에서 제공하는 함수)
1. Randomforest - bagging방식으로 만들어진 함수
    - 트리를 만들 때 사용하는 데이터포인트샘플을 무작위로 선택 (n_estimer)
    - 노드구성시 기준이 되는 특성을 무작위로 선택 (max_features)
    - 랜덤하게 만들어지기에 random_state로 고정해야 기존의 결과값을 볼 수 있음
2. Gradientboosting - boosting방식으로 만들어진 함수
- 그외의 것들은 보통 sklearn이 아닌 외부에서 가져와서 사용하는 앙상블함수