<a href="https://colab.research.google.com/github/chasubeen/python_selfstudy/blob/master/%E1%84%92%E1%85%A9%E1%86%AB%E1%84%8C%E1%85%A1%20%E1%84%80%E1%85%A9%E1%86%BC%E1%84%87%E1%85%AE%E1%84%92%E1%85%A1%E1%84%82%E1%85%B3%E1%86%AB%20%E1%84%86%E1%85%A5%E1%84%89%E1%85%B5%E1%86%AB%E1%84%85%E1%85%A5%E1%84%82%E1%85%B5%E1%86%BC%2B%E1%84%83%E1%85%B5%E1%86%B8%E1%84%85%E1%85%A5%E1%84%82%E1%85%B5%E1%86%BC%20%E1%84%89%E1%85%B5%E1%86%AF%E1%84%89%E1%85%B3%E1%86%B8/5-3.%20%ED%8A%B8%EB%A6%AC%EC%9D%98%20%EC%95%99%EC%83%81%EB%B8%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **⭐ 용어 정리**

### **정형 데이터와 비정형 데이터**

- 정형 데이터: 행과 열로 구조화되어 CSV, 데이터베이스, 엑셀 등에 저장하기 쉬운 데이터
- 비정형 데이터: 텍스트, 이미지, 음성처럼 데이터베이스나 엑셀 등으로 정형화하기 어려운 데이터

### **랜덤 포레스트**

In [2]:
# Load the wine dataset and split it into a training set and a test set.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

wine = pd.read_csv('https://bit.ly/wine_csv_data')

# Three numeric columns are the model inputs; the 'class' column is the label.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Cross-validate a random forest on the training set.
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    rf,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
# Mean training score first, then mean validation score, across the folds.
print(scores['train_score'].mean(), scores['test_score'].mean())

# The training score is clearly higher than the validation score, so the
# model can be judged to be somewhat overfit to the training set.

0.9973541965122431 0.8905151032797809


In [5]:
# Train the random forest on the training set, then print its feature
# importances (one value per input column, in column order).
rf.fit(train_input, train_target)
rf_importances = rf.feature_importances_
print(rf_importances)

[0.23167441 0.50039841 0.26792718]


In [6]:
# Print the out-of-bag (OOB) score of a random forest.
rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    oob_score=True,  # ask the forest to compute the OOB estimate while fitting
)
rf.fit(train_input, train_target)
print(rf.oob_score_)

0.8934000384837406


### **엑스트라 트리**

In [9]:
# Cross-validate an extra-trees classifier on the training set.
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    et,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
# Mean training score first, then mean validation score, across the folds.
mean_train_score = np.mean(scores['train_score'])
mean_valid_score = np.mean(scores['test_score'])
print(mean_train_score, mean_valid_score)

0.9974503966084433 0.8887848893166506


In [10]:
# Fit the extra-trees model on the training set and print its feature
# importances.
et.fit(train_input, train_target)
et_importances = et.feature_importances_
print(et_importances)

[0.20183568 0.52242907 0.27573525]


### **그레이디언트 부스팅**

In [12]:
# Cross-validate a gradient boosting classifier with its default settings.
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(
    gb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
# Mean training score first, then mean validation score, across the folds.
print(scores['train_score'].mean(), scores['test_score'].mean())

0.8881086892152563 0.8720430147331015


In [13]:
# Increase the learning rate and the number of trees to try to improve
# performance, then cross-validate again.
gb = GradientBoostingClassifier(
    learning_rate=0.2,
    n_estimators=500,
    random_state=42,
)
scores = cross_validate(
    gb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
# Mean training score first, then mean validation score, across the folds.
mean_train_score = np.mean(scores['train_score'])
mean_valid_score = np.mean(scores['test_score'])
print(mean_train_score, mean_valid_score)

0.9464595437171814 0.8780082549788999


In [14]:
# Fit the gradient boosting model on the training set and print its feature
# importances.
gb.fit(train_input, train_target)
gb_importances = gb.feature_importances_
print(gb_importances)

[0.15872278 0.68010884 0.16116839]


### **히스토그램 기반 그레이디언트 부스팅**

In [15]:
# Cross-validate a histogram-based gradient boosting classifier.
#
# Since scikit-learn 1.0 the estimator is stable and can be imported directly;
# the `enable_hist_gradient_boosting` opt-in is no longer needed and only
# triggers a warning. The fallback keeps the cell working on older releases
# that still require the experimental opt-in.
try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(hgb, train_input, train_target, return_train_score=True)
# Mean training score first, then mean validation score, across the folds.
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

  "Since version 1.0, "


0.9321723946453317 0.8801241948619236


In [16]:
# Estimate feature importance with permutation importance.
from sklearn.inspection import permutation_importance

hgb.fit(train_input, train_target)
result = permutation_importance(
    hgb,
    train_input,
    train_target,
    n_repeats=10,  # shuffle each feature 10 times
    random_state=42,
    n_jobs=-1,
)
# Only the per-feature mean over the repeats is printed here; the result
# object also carries the raw importances and their standard deviation.
print(result.importances_mean)

[0.08876275 0.23438522 0.08027708]


In [17]:
# Check the fitted model's performance on the held-out test set.
test_accuracy = hgb.score(test_input, test_target)
test_accuracy

0.8723076923076923

In [18]:
# Try the XGBoost implementation, using its histogram-based tree method.
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42, tree_method='hist')
scores = cross_validate(
    xgb,
    train_input,
    train_target,
    return_train_score=True,
)
# Mean training score first, then mean validation score, across the folds.
print(scores['train_score'].mean(), scores['test_score'].mean())

0.8824322471423747 0.8726214185237284


In [20]:
# Try the LightGBM boosting implementation.
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=42)
scores = cross_validate(
    lgb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
# Mean training score first, then mean validation score, across the folds.
mean_train_score = np.mean(scores['train_score'])
mean_valid_score = np.mean(scores['test_score'])
print(mean_train_score, mean_valid_score)

0.9338079582727165 0.8789710890649293
