In [5]:
from sklearn.utils import all_estimators
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

In [6]:
# 경고 무시
import warnings
warnings.filterwarnings(action = "ignore")

## [1] 데이터 가져오기 및 전처리

In [7]:
mushroom = pd.read_csv("../Data/mushroom.csv", header = None)

In [8]:
mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      8124 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 22  22      8124 non-null   

In [9]:
# 데이터와 타겟으로 분리
data = mushroom.iloc[:, 1:]
target = mushroom[0]

In [10]:
# labelencoding 사용
le = LabelEncoder()

for column_name in data.columns:
    if data[column_name].dtype == object:
        data[column_name] = le.fit_transform(data[column_name])
    else:
        pass


In [11]:
# 특징 배열 및 벡터 형태로 변경
X = data.to_numpy()
y = target.to_numpy()

X.shape, y.shape

((8124, 22), (8124,))

In [12]:
# train test 셋 분리
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2
)

## [2] 분류 모델 선택

In [13]:
def all_estimators_classifier(data, target):
    from collections import defaultdict
    # 필터 타입에 해당하는 sklearn에 존재하는 모든 모델 이름과 객체 리스트로 반환
    models = all_estimators(type_filter = "classifier")

    train_X, test_X, train_y, test_y = train_test_split(
    data, target, test_size=0.2, random_state=42
    )

    scores = defaultdict()

    for name, model in models:
        try:
            # 모델 객체 생성
            md = model()
            # 학습
            md.fit(train_X, train_y)
            # 평가
            score = md.score(test_X, test_y)
            scores[name] = score
        except:
            pass

    return dict(scores)

In [14]:
all_estimators_classifier(train_X, train_y)

{'AdaBoostClassifier': 1.0,
 'BaggingClassifier': 1.0,
 'BernoulliNB': 0.8384615384615385,
 'CalibratedClassifierCV': 0.9553846153846154,
 'CategoricalNB': 0.9461538461538461,
 'ComplementNB': 0.8084615384615385,
 'DecisionTreeClassifier': 1.0,
 'DummyClassifier': 0.5223076923076924,
 'ExtraTreeClassifier': 1.0,
 'ExtraTreesClassifier': 1.0,
 'GaussianNB': 0.9407692307692308,
 'GaussianProcessClassifier': 1.0,
 'GradientBoostingClassifier': 1.0,
 'HistGradientBoostingClassifier': 1.0,
 'KNeighborsClassifier': 0.9992307692307693,
 'LabelPropagation': 1.0,
 'LabelSpreading': 1.0,
 'LinearDiscriminantAnalysis': 0.9523076923076923,
 'LinearSVC': 0.9576923076923077,
 'LogisticRegression': 0.9523076923076923,
 'LogisticRegressionCV': 0.9684615384615385,
 'MLPClassifier': 1.0,
 'MultinomialNB': 0.8053846153846154,
 'NearestCentroid': 0.7923076923076923,
 'NuSVC': 0.9069230769230769,
 'PassiveAggressiveClassifier': 0.9261538461538461,
 'Perceptron': 0.9553846153846154,
 'QuadraticDiscriminantA

## [3] RandomForestClassifier 사용

In [15]:
# 모델 학습 및 평가
rf = RandomForestClassifier(
    n_jobs = 4
)

rf.fit(train_X, train_y)

rf.score(test_X, test_y)

1.0

In [16]:
# cross validate 사용
result = cross_validate(rf, X, y, return_train_score=True, cv=10)

In [17]:
result

{'fit_time': array([2.88328624, 0.4459703 , 0.4584043 , 0.46194148, 0.44521952,
        0.45227814, 0.45282578, 0.44771528, 0.44543743, 0.45608687]),
 'score_time': array([0.10420394, 0.10367823, 0.10351825, 0.10555243, 0.10363507,
        0.10369134, 0.1034646 , 0.1035738 , 0.10393667, 0.10355902]),
 'test_score': array([0.68511685, 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 0.95197044, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}