# random forest
- 앙상블 학습의 대표 주자
- 훈련 데이터에서 랜덤하게 샘플을 추출하여 훈련 데이터를 만듦 (중복 발생 가능) => 부트스트랩 샘플(bootstrap sample, 데이터 세트에서 중복을 허용하여 추출한 샘플)
- 랜덤하게 선택한 샘플과 특성을 사용하기 때문에 훈련 세트에 과대적합되는 것을 막아주고
- 검증 세트와 테스트 세트에서 안정적인 성능을 얻을 수 있음



In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the wine data set from the remote CSV into a DataFrame.
wine = pd.read_csv('https://bit.ly/wine_csv_data')

# Three columns form the feature matrix; the 'class' column holds the labels.
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [2]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Random forest that may use every available core, seeded for reproducibility.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Cross-validate on the training split and keep the training-fold scores too,
# so they can be compared against the validation-fold scores.
scores = cross_validate(
    rf,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

# Show every entry of the result (timings and per-fold scores).
for name in scores:
    print(name, scores[name])

# Mean training score followed by mean validation score.
print(scores['train_score'].mean(), scores['test_score'].mean())

fit_time [0.68380952 0.58343172 0.65770268 0.58043933 0.45149183]
score_time [0.10766935 0.10225987 0.1034832  0.10220242 0.10226512]
test_score [0.88461538 0.88942308 0.90279115 0.88931665 0.88642926]
train_score [0.9971133  0.99663219 0.9978355  0.9973545  0.9978355 ]
0.9973541965122431 0.8905151032797809


In [3]:
# Train the forest on the whole training split.
rf.fit(train_input, train_target)

# Importances follow the feature column order: 'alcohol', 'sugar', 'pH'.
importances = rf.feature_importances_
print(importances)

[0.23167441 0.50039841 0.26792718]


## RandomForestClassifier's oob_score
- 부트스트랩 샘플(중복 허용)을 만들어 결정 트리를 훈련, 이때 부트스트랩 샘플에 포함되지 않고 남는 샘플을 Out of Bag 샘플이라고 하여, 이 샘플로 검증을 수행함

In [4]:
# Rebuild the forest with out-of-bag scoring switched on. fit() returns the
# estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    oob_score=True,
).fit(train_input, train_target)

# Score obtained from the samples left out of each tree's bootstrap sample.
print(rf.oob_score_)

0.8934000384837406


# Gradient Boosting
- 깊이가 얕은 결정 트리를 사용, 과대적합에 강하고, 일반적으로 높은 일반화 성능


In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with only the seed specified.
gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(
    gb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Raw result dict first, then the mean training / validation scores.
print(scores)
train_mean = np.mean(scores['train_score'])
test_mean = np.mean(scores['test_score'])
print(train_mean, test_mean)

{'fit_time': array([0.52005076, 0.52520967, 0.52149487, 0.53356552, 0.34815049]), 'score_time': array([0.00412202, 0.00439501, 0.00403571, 0.00461769, 0.00391245]), 'test_score': array([0.86634615, 0.87019231, 0.89412897, 0.86044273, 0.86910491]), 'train_score': array([0.89006495, 0.88958383, 0.88239538, 0.89249639, 0.88600289])}
0.8881086892152563 0.8720430147331015


In [7]:

# Gradient boosting with 500 estimators and a learning rate of 0.2.
gb = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.2,
    n_estimators=500,
)
scores = cross_validate(
    gb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

# Raw result dict, then mean training score and mean validation score.
print(scores)
print(scores['train_score'].mean(), scores['test_score'].mean())

{'fit_time': array([2.47571707, 2.48331618, 2.47680306, 2.48959398, 1.63700652]), 'score_time': array([0.01302719, 0.0130012 , 0.01347756, 0.01302195, 0.00912833]), 'test_score': array([0.875     , 0.87211538, 0.89701636, 0.8719923 , 0.87391723]), 'train_score': array([0.9494828 , 0.94443108, 0.94468494, 0.94324194, 0.95045695])}
0.9464595437171814 0.8780082549788999


In [8]:
# Train the boosted model on the whole training split.
gb.fit(train_input, train_target)

# Importances follow the column order used to build the inputs:
# wine[['alcohol', 'sugar', 'pH']].to_numpy()
importances = gb.feature_importances_
print(importances)

[0.15872278 0.68010884 0.16116839]


## xgboost


In [11]:
from xgboost import XGBClassifier
# XGBoost classifier using the histogram-based tree method, seeded.
xgb = XGBClassifier(random_state=42, tree_method='hist')
scores = cross_validate(
    xgb,
    train_input,
    train_target,
    return_train_score=True,
)

# Raw result dict, then mean training score and mean validation score.
print(scores)
train_mean = np.mean(scores['train_score'])
test_mean = np.mean(scores['test_score'])
print(train_mean, test_mean)

{'fit_time': array([0.07817364, 0.06521797, 0.06423998, 0.07020569, 0.06615543]), 'score_time': array([0.00407147, 0.00397897, 0.0041008 , 0.00395489, 0.00431418]), 'test_score': array([0.86057692, 0.87115385, 0.89316651, 0.86236766, 0.87584216]), 'train_score': array([0.88669714, 0.87972095, 0.87782588, 0.88552189, 0.88239538])}
0.8824322471423747 0.8726214185237284


## LightGBM

In [12]:
from lightgbm import LGBMClassifier

# LightGBM classifier with only the seed specified.
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(
    lgb,
    train_input,
    train_target,
    return_train_score=True,
)

# Raw result dict, then mean training score and mean validation score.
print(scores)
print(scores['train_score'].mean(), scores['test_score'].mean())

{'fit_time': array([0.09331393, 0.08685398, 0.07536006, 0.08238935, 0.07152987]), 'score_time': array([0.00831389, 0.0093472 , 0.00797272, 0.00819659, 0.00927401]), 'test_score': array([0.86442308, 0.88076923, 0.90856593, 0.86429259, 0.87680462]), 'train_score': array([0.93312485, 0.93432764, 0.93217893, 0.93241943, 0.93698894])}
0.9338079582727165 0.8789710890649293
