# 랜덤포레스트
: 배깅 + 랜덤 특성(feature) -> 특성 중요도 제공

- RandomForestClassifier()
- RandomForestRegressor()

In [None]:
# Binary classification, scored with ROC AUC

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and pull out the feature matrix and labels
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% for testing; stratify so both splits keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=2205,
)

# Build the random-forest classifier, then fit it on the training split
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features="sqrt",
    criterion="gini",
    random_state=2022,
)
model.fit(X_train, y_train)

# Evaluate: use column index 1 of predict_proba as the ranking score,
# trace the ROC curve from it, and integrate the curve to get the AUC
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
AUC = auc(fpr, tpr)
print(AUC)



0.9996693121693121


In [12]:
# Multi-class classification (macro F1 score)
# NOTE(review): load_breast_cancer has a two-class target, so the macro
# average below is taken over two classes — confirm whether a genuinely
# multi-class dataset was intended for this cell.

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and labels
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% for testing; stratify so both splits keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=2205,
)

# Build the random-forest classifier, then fit it on the training split
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=15,
    max_features="sqrt",
    criterion="gini",
    random_state=2022,
)
model.fit(X_train, y_train)

# Evaluate: hard label predictions scored with the macro-averaged F1
y_pred = model.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average="macro")
print(macro_f1)

0.962987012987013


In [19]:
# Regression (RMSE)

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the diabetes dataset and pull out the feature matrix and targets
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target

# Hold out 20% of the rows for testing (no stratification: continuous target)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205
)

# Build the random-forest regressor, then fit it on the training split
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features=3,
    random_state=2022,
)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises TypeError.
# Taking the square root of the plain MSE gives the same RMSE on every
# scikit-learn version.
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

63.49825173792558
