### Ensenble - RandomForest & ExtraTree
 - 배깅 방식의 앙상블 ==> 중복을 허용한 랜덤 샘플 + 동일 모델(DT)
    * 대표 알고리즘 : RandomForestC/R :
 - 페이스트 방식의 앙상블 ==> 랜덤 샘플 + 동일 모델 (DT)
    * 대표 알고리즘 : ExtraTreeC/R 

 [목표] 와인 분류 => 0과 1 종류 분류

[1] 모듈 로딩 및 데이터 준비

In [1]:
# 모듈로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# 데이터
DATA_FILE = '../data/wine.csv'

# CSV >> DataFrame 
wineDF = pd.read_csv(DATA_FILE)

In [5]:
# 데이터 확인
wineDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [6]:
wineDF.head(2)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0


In [7]:
# 타겟/라벨의 클래스 분포
wineDF['class'].value_counts()

class
1.0    4898
0.0    1599
Name: count, dtype: int64

In [8]:
wineDF.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


[2] 학습 준비

In [9]:
# 학습용 & 테스트용 데이터셋 분할
from sklearn.model_selection import train_test_split

In [11]:
featureDF = wineDF[wineDF.columns[:-1]]
targetSR = wineDF[wineDF.columns[-1]]

print(f'featureDF : {featureDF.shape}, targetSR : {targetSR.shape}')

featureDF : (6497, 3), targetSR : (6497,)


In [12]:
# 학습용 테스트용 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    test_size=0.2,
                                                    stratify=targetSR,
                                                    random_state=1
                                                    )

In [13]:
print(f'X_train : {X_train.shape} y_train : {y_train}')
print(f'X_test : {X_test.shape} y_trest : {y_test}')

X_train : (5197, 3) y_train : 6378    1.0
3069    1.0
4986    1.0
993     0.0
4988    1.0
       ... 
453     0.0
1460    0.0
5755    1.0
6299    1.0
3369    1.0
Name: class, Length: 5197, dtype: float64
X_test : (1300, 3) y_trest : 1002    0.0
2345    1.0
79      0.0
4208    1.0
2092    1.0
       ... 
4179    1.0
2803    1.0
6454    1.0
5451    1.0
1360    0.0
Name: class, Length: 1300, dtype: float64


[3] 학습진행

In [14]:
# 학습방법 : 지도학습 > 분류
# 알고리즘 : 앙상블 > 배깅 - RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
# 인스턴스 생성 => 100개의 내부 DT 모델에서 사용할 데이터셋 생성
#                random_state 매개변수 설정으로 고정된 데이터셋 생성
#                oob_score 배개변ㅇ수 ㅣ 샘플 데이터셋 추출 휴ㅜ 남은 데이터셋 검즈용ㅇ으로 사용
lf_model = RandomForestClassifier(random_state=7. oob_Score = True)

# 학습
lf_model.fit(X_train, y_train)

In [17]:
# 모델 파라미터
print(f'classes_ : {lf_model.classes_}')
print(f'n_classes_ : {lf_model.n_classes_}')
print()
print(f'feature_names_in_ : {lf_model.feature_names_in_}')
print(f'n_features_in_ : {lf_model.n_features_in_}')
print(f'feature_importances_ : {lf_model.feature_importances_}')

classes_ : [0. 1.]
n_classes_ : 2

feature_names_in_ : ['alcohol' 'sugar' 'pH']
n_features_in_ : 3
feature_importances_ : [0.23572103 0.49995154 0.26432743]


In [25]:
# 모델 파라미터
print(f'clases_         : {lf_model.estimator__}')

for est in lf_model.estimators_: print(est.samples_)

AttributeError: 'RandomForestClassifier' object has no attribute 'estimators__'

In [None]:
# 모델 파라미터
print(f'clases_         : {lf_model.classes_}')

for est in lf_model.estimators_: print(est)

In [None]:
print(f'clases_         : {lf_model.classes_}')

for est in lf_model.estimators_: print(est)

[4] 성능평가

In [21]:
train_score = lf_model.score(X_train, y_train)
test_score = lf_model.score(X_test, y_test)


In [22]:
print(f'oob_score : {lf_model.oob_score_}')

AttributeError: 'RandomForestClassifier' object has no attribute 'oob_score_'

[5] 튜닝
 - RandomizedSearchCV 하이퍼파라미터 최적화 클래스
    * 범위가 넓은 하이퍼파라미터 설정에 좋음
    * 지정된 범위에서 지정된 횟수만큼 하이퍼파라미터를 추출항 조합 진행

In [26]:
# 모듈 로딩
from sklearn.model_selection import RandomizedSearchCV                                     

In [27]:
# RandomForestClassifier 하이퍼파라미터 
params = {'max_depth' : range(2,16),
          'min_samples_leaf' : range(5,16),
          'criterion' : ['gini', 'entropy', 'log_loss']
          }

In [28]:
rf_model = RandomForestClassifier(random_state=7)

In [34]:
searchCV = RandomizedSearchCV(rf_model, param_distributions=params, n_iter = 50, verbose=4)
# verbose 

In [35]:
# searchCV.best_params_

searchCV.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ...max_depth=3, min_samples_leaf=8;, score=0.805 total time=   0.1s
[CV 2/5] END ...max_depth=3, min_samples_leaf=8;, score=0.821 total time=   0.1s
[CV 3/5] END ...max_depth=3, min_samples_leaf=8;, score=0.835 total time=   0.1s
[CV 4/5] END ...max_depth=3, min_samples_leaf=8;, score=0.843 total time=   0.1s
[CV 5/5] END ...max_depth=3, min_samples_leaf=8;, score=0.825 total time=   0.1s
[CV 1/5] END .max_depth=10, min_samples_leaf=11;, score=0.873 total time=   0.2s
[CV 2/5] END .max_depth=10, min_samples_leaf=11;, score=0.838 total time=   0.3s
[CV 3/5] END .max_depth=10, min_samples_leaf=11;, score=0.880 total time=   0.2s
[CV 4/5] END .max_depth=10, min_samples_leaf=11;, score=0.882 total time=   0.2s
[CV 5/5] END .max_depth=10, min_samples_leaf=11;, score=0.869 total time=   0.2s
[CV 1/5] END ..max_depth=8, min_samples_leaf=12;, score=0.869 total time=   0.2s
[CV 2/5] END ..max_depth=8, min_samples_leaf=12

In [36]:
# 모델 파라미터
print(f'[searchCV.best_score_] {searchCV.best_score_}')
print(f'[searchCV.best_param_] {searchCV.best_params_}')
print(f'[searchCV.best_estimator_] {searchCV.best_estimator_}')

cv_resultDF = pd.DataFrame(searchCV.cv_results_)
cv_resultDF

[searchCV.best_score_] 0.877817613089509
[searchCV.best_param_] {'min_samples_leaf': 3, 'max_depth': 12}
[searchCV.best_estimator_] RandomForestClassifier(max_depth=12, min_samples_leaf=3, random_state=7)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.21418,0.012052,0.016913,0.001384,8,3,"{'min_samples_leaf': 8, 'max_depth': 3}",0.804808,0.821154,0.835419,0.843118,0.824832,0.825866,0.013081,42
1,0.313653,0.009673,0.020245,0.000596,11,10,"{'min_samples_leaf': 11, 'max_depth': 10}",0.873077,0.838462,0.879692,0.881617,0.869105,0.86839,0.015626,18
2,0.310828,0.025501,0.019178,0.001237,12,8,"{'min_samples_leaf': 12, 'max_depth': 8}",0.869231,0.8375,0.875842,0.87488,0.866218,0.864734,0.014074,27
3,0.202709,0.007491,0.015328,0.00055,6,3,"{'min_samples_leaf': 6, 'max_depth': 3}",0.803846,0.821154,0.835419,0.843118,0.824832,0.825674,0.013393,43
4,0.336964,0.006393,0.022943,0.002691,5,12,"{'min_samples_leaf': 5, 'max_depth': 12}",0.879808,0.85,0.884504,0.882579,0.876805,0.874739,0.01264,6
5,0.250739,0.010439,0.017274,0.001141,5,5,"{'min_samples_leaf': 5, 'max_depth': 5}",0.853846,0.833654,0.868142,0.872955,0.853705,0.856461,0.013734,37
6,0.307479,0.005459,0.02074,0.001117,12,13,"{'min_samples_leaf': 12, 'max_depth': 13}",0.873077,0.836538,0.881617,0.881617,0.870067,0.868583,0.016667,15
7,0.283122,0.011932,0.018298,0.001255,13,7,"{'min_samples_leaf': 13, 'max_depth': 7}",0.867308,0.835577,0.875842,0.87873,0.86333,0.864157,0.015338,29
8,0.287823,0.012364,0.018832,0.000885,11,7,"{'min_samples_leaf': 11, 'max_depth': 7}",0.866346,0.836538,0.876805,0.873917,0.86333,0.863387,0.014287,31
9,0.355524,0.018465,0.021954,0.00169,4,13,"{'min_samples_leaf': 4, 'max_depth': 13}",0.882692,0.848077,0.882579,0.888354,0.87488,0.875317,0.014278,4
