# 랜덤포레스트 다중 분류
- wine dataset

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 1.데이터 준비

In [20]:
# !pip install gdown

In [21]:
# !wget https://raw.githubusercontent.com/devdio/flyai_datasets/main/winequalityN.csv

In [22]:
# Fetch the wine-quality CSV with gdown; the recorded run saved it as /content/winequalityN.csv.
!gdown https://raw.githubusercontent.com/devdio/flyai_datasets/main/winequalityN.csv

Downloading...
From: https://raw.githubusercontent.com/devdio/flyai_datasets/main/winequalityN.csv
To: /content/winequalityN.csv
  0% 0.00/95.2k [00:00<?, ?B/s]390kB [00:00, 21.0MB/s]        


In [23]:
# Load the downloaded CSV into a DataFrame and show its (rows, columns) shape.
# The path is relative to the working directory, which is where the download
# cell writes the file, so the notebook is not tied to Colab's /content folder.

wine = pd.read_csv('winequalityN.csv')
wine.shape

(6497, 13)

In [24]:
# Preview the first rows of the raw data.
wine.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [25]:
# Work on a copy so the raw `wine` frame stays untouched.
df = wine.copy()
# Distinct values of the wine-type column.
df['type'].unique()

array(['white', 'red'], dtype=object)

In [26]:
# Number of rows per wine type.
df['type'].value_counts()

white    4898
red      1599
Name: type, dtype: int64

In [27]:
# Number of rows per quality score, ordered by score.
df['quality'].value_counts().sort_index()

3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
Name: quality, dtype: int64

In [28]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,6487.0,7.216579,1.29675,3.8,6.4,7.0,7.7,15.9
volatile acidity,6489.0,0.339691,0.164649,0.08,0.23,0.29,0.4,1.58
citric acid,6494.0,0.318722,0.145265,0.0,0.25,0.31,0.39,1.66
residual sugar,6495.0,5.444326,4.758125,0.6,1.8,3.0,8.1,65.8
chlorides,6495.0,0.056042,0.035036,0.009,0.038,0.047,0.065,0.611
free sulfur dioxide,6497.0,30.525319,17.7494,1.0,17.0,29.0,41.0,289.0
total sulfur dioxide,6497.0,115.744574,56.521855,6.0,77.0,118.0,156.0,440.0
density,6497.0,0.994697,0.002999,0.98711,0.99234,0.99489,0.99699,1.03898
pH,6488.0,3.218395,0.160748,2.72,3.11,3.21,3.32,4.01
sulphates,6493.0,0.531215,0.148814,0.22,0.43,0.51,0.6,2.0


In [29]:
# Missing values: count of NaN entries per column.
df.isna().sum(axis=0)

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [30]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


- 결측치 삭제

In [31]:
# Discard every row that has at least one missing value ...
df = df.dropna(axis=0, how='any')
# ... and verify that no column has NaN entries left.
df.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [32]:
# Preview the frame after dropping rows with missing values.
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [33]:
# Re-base the quality labels from 3..9 to 0..6 so the classes start at zero.
# The shifted values are computed from the untouched `wine` frame (selected by
# df's surviving row index), so re-running this cell cannot subtract 3 twice.

df['quality'] = wine.loc[df.index, 'quality'] - 3
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,3
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,3
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,3
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,3
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,3


In [35]:
# Class counts after re-basing the labels, ordered by class.
df['quality'].value_counts().sort_index()

0      30
1     214
2    2128
3    2820
4    1074
5     192
6       5
Name: quality, dtype: int64

### 2.테스트 분리

In [36]:
# Feature matrix: everything except the label and the string-valued `type` column.
X = df.drop(columns=['type', 'quality'])
# Target vector: the quality class.
y = df['quality']

In [37]:
# Preview the feature matrix.
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [38]:
# Preview the target vector.
y.head()

0    3
1    3
2    3
3    3
4    3
Name: quality, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps each quality
# class at the same proportion in both parts; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# Shapes of the four pieces, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5170, 11), (1293, 11), (5170,), (1293,))

In [41]:
# Class counts in the training split.
y_train.value_counts().sort_index()

0      24
1     171
2    1702
3    2256
4     859
5     154
6       4
Name: quality, dtype: int64

In [42]:
# Class counts in the test split.
y_test.value_counts().sort_index()

0      6
1     43
2    426
3    564
4    215
5     38
6      1
Name: quality, dtype: int64

- 오버샘플링

In [43]:
!pip install -U imbalanced-learn



In [46]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training classes by randomly duplicating minority-class rows.
# Only the training split is resampled; the test split is left as it is.
# The fixed seed makes the set of duplicated rows, and therefore every model
# trained on them, repeatable from run to run.
ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [47]:
# Class counts in the training split after oversampling.
y_train.value_counts().sort_index()

0    2256
1    2256
2    2256
3    2256
4    2256
5    2256
6    2256
Name: quality, dtype: int64

##### 스케일링
- 표준화(StandardScaler)

In [48]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training data,
# then standardize the training data with those statistics.
ss = StandardScaler()
ss.fit(X_train)
X_train_s = ss.transform(X_train)
# Peek at the first five standardized rows.
X_train_s[:5]

array([[ 0.4221439 ,  0.73324613, -1.61566684, -0.64195462,  1.79329554,
        -0.7511279 , -1.17801188,  0.99678906, -0.36474217,  0.81117756,
        -1.02741246],
       [-2.67281256, -0.68994224,  0.52795904, -0.17254003, -0.60661161,
        -0.12426769,  0.00445769, -1.59760484,  2.00625708, -1.04916672,
         1.45430122],
       [-0.37143468, -0.30179996,  0.04391449,  1.84594271, -0.29603539,
         0.89850423,  0.7823982 ,  0.83343833, -0.85110099,  0.38186734,
        -0.80180212],
       [ 0.58085962,  1.19901687, -1.89226373, -0.68889608,  1.05920629,
        -0.68514262, -1.33359998,  1.07686295,  0.72956518,  0.73962586,
        -0.65139523],
       [ 0.26342819,  0.83675074,  1.15030204,  2.12759146, -0.4936748 ,
         0.13967345,  0.59569248,  0.58040485, -0.60792158, -0.90606331,
         0.70226677]])

In [50]:
# Scale the test set with the scaler already fitted on the training set (transform only, no refit).
X_test_s = ss.transform(X_test)

### 3.학습
- 베이스라인

In [51]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyperparameters.
# The seed fixes the forest's internal randomness, so the baseline score is
# the same on every run and can be compared with the tuned models.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_s, y_train)

In [52]:
# Predict quality classes for the scaled test set.
y_pred = rfc.predict(X_test_s)
# Accuracy is computed in the following cell.


In [54]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted class equals the true class.
print('정확도:', accuracy_score(y_test, y_pred))
# Precision and recall: not computed in this cell (left as a TODO).

정확도: 0.7153905645784996


### 4.튜닝

In [55]:
%%time

from sklearn.model_selection import GridSearchCV

params = {
    'max_depth':[50, 100, 150, 200]
}
rfc = RandomForestClassifier()

grid_cv = GridSearchCV(rfc, param_grid=params, n_jobs=-1)
grid_cv.fit(X_train_s, y_train)

CPU times: user 2.82 s, sys: 93.8 ms, total: 2.91 s
Wall time: 38.8 s


In [56]:
# Show the best estimator found by the grid search.
grid_cv.best_estimator_

In [57]:
# Parameter combination selected by the grid search.
grid_cv.best_params_

{'max_depth': 200}

In [58]:
# Score of the selected estimator on the scaled test set.
grid_cv.best_estimator_.score(X_test_s, y_test)

0.7122969837587007

### 모델 비교

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN
# SVC
# DecisionTree

In [None]:
# model.score(X_test_s, y_test)  # labels are never scaled, so there is no y_test_s