랜덤 포레스트(Random Forest)

In [2]:
import pandas as pd 
# Load the feature table: whitespace-separated, no header row, two columns
# (col_index, col_name).
# The separator is a regex, so it is written as a raw string; a plain '\s+'
# is an invalid string escape and triggers a warning on modern Python.
feature_name_df = pd.read_csv('../00.data/UCI HAR Dataset/features.txt', sep=r'\s+',
                                header=None, names=['col_index','col_name'])

In [3]:
def get_new_feature_name_df(old_df):
    """Return a copy of *old_df* in which repeated ``col_name`` values are made unique.

    The first occurrence of a name is kept as is; the k-th repeat (k >= 1)
    is renamed to ``<name>_<k>``. A ``dup_cnt`` column holding that k
    (0 for first occurrences) is appended as the last column.

    Args:
        old_df: DataFrame with a string ``col_name`` column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the columns of *old_df*
        followed by ``dup_cnt``. *old_df* itself is not modified.
    """
    # Work on a re-indexed copy so the caller's frame is left untouched.
    new_df = old_df.reset_index(drop=True)

    # Count duplicates on the frame that was passed in, not on a module-level
    # global, so the function is correct for any input.
    dup_cnt = new_df.groupby('col_name').cumcount()
    new_df['dup_cnt'] = dup_cnt

    # Vectorised rename: keep the name where dup_cnt == 0, otherwise append
    # the "_<k>" suffix. Avoids positional Series indexing (x[0] / x[1]),
    # which is deprecated for label-indexed rows.
    names = new_df['col_name']
    new_df['col_name'] = names.where(dup_cnt == 0, names + '_' + dup_cnt.astype(str))
    return new_df

In [4]:

# De-duplicate the feature names, then keep them as a plain Python list
# for use as column names.
new_feature_df = get_new_feature_name_df(old_df=feature_name_df)
feature_list = new_feature_df['col_name'].tolist()

In [6]:
# Load the train/test splits. All four files are whitespace-separated with no
# header row. The X_* frames take their column names from feature_list; the
# y_* frames have a single 'action' column.
# The regex separator is a raw string; a plain '\s+' is an invalid string
# escape and triggers a warning on modern Python.
X_train = pd.read_csv('../00.data/UCI HAR Dataset/train/X_train.txt',
                      header=None, sep=r'\s+', names=feature_list)
X_test = pd.read_csv('../00.data/UCI HAR Dataset/test/X_test.txt',
                     header=None, sep=r'\s+', names=feature_list)
y_train = pd.read_csv('../00.data/UCI HAR Dataset/train/y_train.txt',
                      header=None, sep=r'\s+', names=['action'])
y_test = pd.read_csv('../00.data/UCI HAR Dataset/test/y_test.txt',
                     header=None, sep=r'\s+', names=['action'])

랜덤 포레스트 모델 생성/학습/예측/평가

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: random forest with default hyper-parameters and a fixed seed.
rf_clf = RandomForestClassifier(random_state=156)
# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# emit a DataConversionWarning about a column-vector y.
rf_clf.fit(X_train, y_train.values.ravel())
pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'랜덤 포레스트 정확도: {acc:.4f}')

최적 파라미터 찾기

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the search: 3 x 3 x 3 = 27 combinations.
params = dict(
    n_estimators=[10, 50, 100],
    max_depth=[8, 12, 16],
    min_samples_split=[12, 16, 20],
)

In [None]:
# Search the `params` grid with 3-fold cross-validation.
rf_clf = RandomForestClassifier(random_state=156, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
# y_train is a single-column DataFrame; flatten it to 1-D so each fit does
# not emit a DataConversionWarning about a column-vector y.
grid_cv.fit(X_train, y_train.values.ravel())
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

In [None]:
# Tabulate the search results, keeping only the three searched parameters
# and the mean cross-validation score per combination.
df = pd.DataFrame(grid_cv.cv_results_).loc[:, [
    'param_n_estimators',
    'param_max_depth',
    'param_min_samples_split',
    'mean_test_score',
]]
df

튜닝된 파라미터로 재평가

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

재탐색

In [None]:
# Second search: vary only the number of trees and leave every other
# hyper-parameter at its default.
params = {
    'n_estimators': [10, 50, 100]
}
rf_clf = RandomForestClassifier(random_state=156, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1)
# y_train is a single-column DataFrame; flatten it to 1-D so each fit does
# not emit a DataConversionWarning about a column-vector y.
grid_cv.fit(X_train, y_train.values.ravel())
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

In [None]:

# Score the best estimator from the most recent grid search on the test set.
best_clf = grid_cv.best_estimator_
pred = best_clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'최적 파라미터 랜덤 포레스트 정확도: {acc:.4f}')

K 최근접 이웃(K-Nearest Neighbor)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Build a k-nearest-neighbours classifier with default settings and show
# the parameters it was created with.
knn = KNeighborsClassifier()
knn.get_params(deep=True)

In [None]:
# Fit KNN and report test accuracy as the cell's output value.
# y_train is a single-column DataFrame; flatten it to 1-D so fit() does not
# emit a DataConversionWarning about a column-vector y.
knn.fit(X_train, y_train.values.ravel())
pred = knn.predict(X_test)
accuracy_score(y_test, pred)