# sklearn

In [119]:
import sklearn
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
sklearn.__version__

'1.6.1'

## sklearn을 이용하여 Iris 데이터 품종 예측하기

In [120]:
# Load the Iris dataset and pull out the pieces that later cells reuse.
iris = load_iris()
keys = iris.keys()
iris_data, iris_label = iris.data, iris.target

# Inspect the container, its keys, the feature matrix and the label vector.
print(type(iris))
print(type(keys), keys)
print(type(iris_data), iris_data.shape)
print(type(iris_label), len(iris_label), iris_label)

# Feature names and target names share the same "type, length, value" report.
for names in (iris.feature_names, iris.target_names):
    print(type(names), len(names), names)

<class 'sklearn.utils._bunch.Bunch'>
<class 'dict_keys'> dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
<class 'numpy.ndarray'> (150, 4)
<class 'numpy.ndarray'> 150 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
<class 'list'> 4 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
<class 'numpy.ndarray'> 3 ['setosa' 'versicolor' 'virginica']


### 데이터 세트 분리 (학습 데이터와 테스트 데이터)

In [121]:
# Hold out 20% of the samples as a test set; the seed is fixed at 11.
x_train, x_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=11,
)

# Report the training split first, then the test split:
# features as "type, length, shape", labels as "type, length, values".
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(type(split_x), len(split_x), split_x.shape)
    print(type(split_y), len(split_y), split_y)

<class 'numpy.ndarray'> 120 (120, 4)
<class 'numpy.ndarray'> 120 [0 2 2 0 0 2 2 1 0 1 1 2 0 1 2 1 1 0 2 0 2 2 1 2 1 0 0 1 0 0 2 2 2 0 0 0 1
 0 1 2 2 1 1 2 2 0 1 1 2 2 2 0 2 0 0 0 0 2 0 0 0 1 0 1 1 2 1 0 0 0 1 1 1 2
 1 0 1 2 0 2 2 1 0 0 0 2 1 0 2 1 2 0 0 1 1 2 1 2 2 1 1 2 2 0 1 2 0 2 2 0 1
 2 0 1 1 1 0 1 1 1]
<class 'numpy.ndarray'> 30 (30, 4)
<class 'numpy.ndarray'> 30 [2 2 2 1 2 0 1 0 0 1 2 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1]


### (학습 데이터 세트로) 모델 학습(Train) 수행

In [122]:
# Decision tree classifier with a fixed random_state (11).
dt_clf = DecisionTreeClassifier(random_state=11)
# Fit on the training split only. The bare call is the last expression of the
# cell, so its return value is what the notebook displays.
dt_clf.fit(x_train, y_train)

### (테스트 데이터 세트로) 예측(Predict) 수행 

In [123]:
# Predict labels for the held-out split, then print the predictions directly
# above the true labels so mismatches can be compared position by position.
y_pred = dt_clf.predict(x_test)
for label_vector in (y_pred, y_test):
    print(len(label_vector), label_vector)

30 [2 2 1 1 2 0 1 0 0 1 1 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1]
30 [2 2 2 1 2 0 1 0 0 1 2 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1]


### 평가 (예측 정확도)

In [124]:
# Score the predictions against the true test labels and print with 4 decimals.
test_accuracy = accuracy_score(y_test, y_pred)
print("accuracy_score : {0:.4f}".format(test_accuracy))

accuracy_score : 0.9333


## DataFrame 예제

In [125]:
# Build a labelled frame: one column per feature, with the class label
# appended as the final 'target' column.
iris_df = pd.DataFrame(data=iris_data, columns=iris.feature_names).assign(
    target=iris_label
)
print(iris_df.shape[0])
iris_df.head()

150


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [126]:
# Positional split of the frame: the last column holds the label and
# everything before it is a feature.
last_col = -1
data_df = iris_df.iloc[:, :last_col]
target_s = iris_df.iloc[:, last_col]
print(type(data_df), len(data_df), data_df.shape)
print(type(target_s), len(target_s))

# Same 80/20 split and seed as the ndarray example, this time fed with a
# DataFrame and a Series.
x_train, x_test, y_train, y_test = train_test_split(
    data_df, target_s, test_size=0.2, random_state=11
)
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    print(type(split_x), len(split_x), split_x.shape)
    print(type(split_y), len(split_y))

# Train and predict; the printed types show what predict() returns next to
# the type of y_test.
dt_clf = DecisionTreeClassifier(random_state=11).fit(x_train, y_train)
y_pred = dt_clf.predict(x_test)
print(len(y_pred), y_pred, type(y_pred))
print(len(y_test), y_test.values, type(y_test))
test_accuracy = accuracy_score(y_test, y_pred)
print("accuracy_score : {0:.4f}".format(test_accuracy))

# Label distribution of the full data, the training split and the test split,
# in that order.
for label_series in (iris_df.target, y_train, y_test):
    print(label_series.value_counts())

<class 'pandas.core.frame.DataFrame'> 150 (150, 4)
<class 'pandas.core.series.Series'> 150
<class 'pandas.core.frame.DataFrame'> 120 (120, 4)
<class 'pandas.core.series.Series'> 120
<class 'pandas.core.frame.DataFrame'> 30 (30, 4)
<class 'pandas.core.series.Series'> 30
30 [2 2 1 1 2 0 1 0 0 1 1 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1] <class 'numpy.ndarray'>
30 [2 2 2 1 2 0 1 0 0 1 2 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1] <class 'pandas.core.series.Series'>
accuracy_score : 0.9333
target
0    50
1    50
2    50
Name: count, dtype: int64
target
0    41
1    40
2    39
Name: count, dtype: int64
target
2    11
1    10
0     9
Name: count, dtype: int64


## 교차 검증

### KFold

In [127]:
# Manual 5-fold cross validation with KFold (no shuffle argument is passed).
dt_clf = DecisionTreeClassifier(random_state=156)
kfold = KFold(n_splits=5)
cv_accruacy = []
n_iter = 0
report_template = "{0}. 교차 검증 정확도 : {1}, 학습 데이터 크기 : {2}, 검증 데이터 크기 : {3}"

for n_iter, (train_index, test_index) in enumerate(kfold.split(iris.data), start=1):
    # Row indices assigned to the training part and the validation part.
    for fold_index in (train_index, test_index):
        print(type(fold_index), len(fold_index), fold_index)

    x_train, y_train = iris.data[train_index], iris.target[train_index]
    x_test, y_test = iris.data[test_index], iris.target[test_index]

    # Class counts per part, training first and validation second.
    for fold_labels in (y_train, y_test):
        print(pd.Series(fold_labels).value_counts())

    # The same estimator object is re-fitted from the current training part.
    dt_clf.fit(x_train, y_train)
    y_pred = dt_clf.predict(x_test)
    accuracy = np.round(accuracy_score(y_test, y_pred), 4)
    cv_accruacy.append(accuracy)

    print(report_template.format(n_iter, accuracy, x_train.shape, x_test.shape))
    # Predictions, true labels, then a single-space separator line.
    print(y_pred)
    print(y_test)
    print(' ')

print("평균 검증 정확도 : ", np.mean(cv_accruacy))

<class 'numpy.ndarray'> 120 [ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
<class 'numpy.ndarray'> 30 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
1    50
2    50
0    20
Name: count, dtype: int64
0    30
Name: count, dtype: int64
1. 교차 검증 정확도 : 1.0, 학습 데이터 크기 : (120, 4), 검증 데이터 크기 : (30, 4)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 
<class 'numpy.ndarray'> 120 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15

### Stratified KFold

In [128]:
# Manual 5-fold cross validation with StratifiedKFold; split() is given the
# labels as well as the features.
dt_clf = DecisionTreeClassifier(random_state=156)
skf = StratifiedKFold(n_splits=5)
cv_accruacy = []
n_iter = 0
report_template = "{0}. 교차 검증 정확도 : {1}, 학습 데이터 크기 : {2}, 검증 데이터 크기 : {3}"

for n_iter, (train_index, test_index) in enumerate(skf.split(iris.data, iris.target), start=1):
    # Row indices assigned to the training part and the validation part.
    for fold_index in (train_index, test_index):
        print(type(fold_index), len(fold_index), fold_index)

    x_train, y_train = iris.data[train_index], iris.target[train_index]
    x_test, y_test = iris.data[test_index], iris.target[test_index]

    # Class counts per part, each under its own heading.
    train_counts = pd.Series(y_train).value_counts()
    test_counts = pd.Series(y_test).value_counts()
    print("train\n", train_counts)
    print("test\n", test_counts)

    # The same estimator object is re-fitted from the current training part.
    dt_clf.fit(x_train, y_train)
    y_pred = dt_clf.predict(x_test)
    accuracy = np.round(accuracy_score(y_test, y_pred), 4)
    cv_accruacy.append(accuracy)

    print(report_template.format(n_iter, accuracy, x_train.shape, x_test.shape))
    # Predictions, true labels, then a single-space separator line.
    print(y_pred)
    print(y_test)
    print(' ')

# Per-fold accuracies followed by their mean.
print("교차 검증별 정확도 : ", np.round(cv_accruacy, 4))
print("평균 검증 정확도 : ", np.mean(cv_accruacy))

<class 'numpy.ndarray'> 120 [ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
<class 'numpy.ndarray'> 30 [  0   1   2   3   4   5   6   7   8   9  50  51  52  53  54  55  56  57
  58  59 100 101 102 103 104 105 106 107 108 109]
train
 0    40
1    40
2    40
Name: count, dtype: int64
test
 0    10
1    10
2    10
Name: count, dtype: int64
1. 교차 검증 정확도 : 0.9667, 학습 데이터 크기 : (120, 4), 검증 데이터 크기 : (30, 4)
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2]
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]
 
<class 'numpy.ndarray'> 120 [ 

### cross_val_score()

In [129]:
from sklearn.model_selection import cross_val_score, cross_validate

# cross_val_score runs the split / fit / predict / score loop in a single call:
# 5 folds, accuracy as the metric.
dt_clf = DecisionTreeClassifier(random_state=156)
scores = cross_val_score(
    estimator=dt_clf,
    X=iris.data,
    y=iris.target,
    scoring='accuracy',
    cv=5,
)
fold_scores = np.round(scores, 4)
mean_score = np.mean(scores)
print("교차 검증별 정확도 : ", fold_scores)
print("평균 검증 정확도 : ", mean_score)

교차 검증별 정확도 :  [0.9667 0.9667 0.9    0.9667 1.    ]
평균 검증 정확도 :  0.9600000000000002


### GridSearchCV

In [130]:
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the data; the grid search below is fitted on the training
# split only.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=121)

# Seed the estimator like every other tree in this notebook. An unseeded
# DecisionTreeClassifier may break ties between equally good splits
# differently from run to run, so the scores below would not be reproducible.
dtree = DecisionTreeClassifier(random_state=121)

# 3 values of max_depth x 2 values of min_samples_split = 6 candidates.
parameters = {'max_depth' : [1, 2, 3], 'min_samples_split':[2,3]}

# 3-fold cross validation per candidate; refit=True is passed so the search
# object can be used for prediction afterwards.
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True)
grid_dtree.fit(x_train, y_train)

# Show only the columns needed to compare candidates: parameters, mean score,
# rank, and the score of each of the three folds.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [131]:
# Raw cross-validation results of the grid search: print the container type,
# then let the notebook display the full object.
cv_results = grid_dtree.cv_results_
print(type(cv_results))
cv_results

<class 'dict'>


{'mean_fit_time': array([0.00053159, 0.00050894, 0.00040404, 0.00041024, 0.00042669,
        0.00041135]),
 'std_fit_time': array([1.34790281e-05, 1.02187574e-04, 1.35435371e-05, 2.04561968e-05,
        1.87218897e-05, 5.56196474e-06]),
 'mean_score_time': array([0.00042542, 0.00036446, 0.00030812, 0.00030168, 0.00029826,
        0.00031678]),
 'std_score_time': array([4.98244761e-05, 5.40463361e-05, 6.85865053e-06, 7.32875944e-06,
        9.45493258e-06, 7.33651155e-06]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_split': 3},
  {'max_depth': 3, 'min_sample

In [132]:
# Best parameter combination found by the grid search and its best score.
best_params = grid_dtree.best_params_
best_score = grid_dtree.best_score_
print("GridSearchCV 최적 파라미터 : ", best_params)
print("GridSearchCV 최고 정확도 : {0:.4f}".format(best_score))
# Evaluation on the held-out test split, currently disabled:
# y_pred = grid_dtree.predict(x_test)
# print("테스트 데이터 세트 정확도 : {0:.4f}".format(accuracy_score(y_test, y_pred)))
# print(y_pred)
# print(y_test)

GridSearchCV 최적 파라미터 :  {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최고 정확도 : 0.9750


## 데이터 인코딩

### Label Encoding

In [133]:
from sklearn.preprocessing import LabelEncoder
items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Learn the category -> integer mapping, then encode the same list.
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items)

# Decode an arbitrary sequence of integer codes back into category names.
encoded_sample = [4,5,2,0,1,1,3,3]
codes = encoder.inverse_transform(encoded_sample)

# Learned classes, encoded labels, decoded names — in that order.
for shown in (encoder.classes_, labels, codes):
    print(shown)

['TV' '냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']
[0 1 4 5 3 3 2 2]
['전자레인지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


### One-Hot encoding

In [134]:
from sklearn.preprocessing import OneHotEncoder
items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Step 1: turn the strings into integer labels.
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items)
print(labels)

# Step 2: reshape the flat label vector into a single column (n_samples, 1).
labels = labels.reshape(-1, 1)
print(labels)

# Step 3: one-hot encode the column. Note that `encoder` now refers to the
# OneHotEncoder, no longer to the LabelEncoder above.
encoder = OneHotEncoder().fit(labels)
oh_labels = encoder.transform(labels)

# toarray() converts the transform result to a dense array for printing.
print(oh_labels.toarray())
print(oh_labels.shape)

[0 1 4 5 3 3 2 2]
[[0]
 [1]
 [4]
 [5]
 [3]
 [3]
 [2]
 [2]]
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
(8, 6)


In [135]:
# pandas one-hot encoding: get_dummies works directly on the string column,
# with no separate integer-encoding step.
item_names = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']
df = pd.DataFrame({'item': item_names})
pd.get_dummies(df)

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자레인지,item_컴퓨터
0,True,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,True,False
3,False,False,False,False,False,True
4,False,False,False,True,False,False
5,False,False,False,True,False,False
6,False,False,True,False,False,False
7,False,False,True,False,False,False


## Feature Scaling과 정규화

### StandardScaler

In [136]:
# Rebuild the feature-only frame (no 'target' column) and print the
# per-feature mean and variance before any scaling.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
for heading, stats in (("# mean", iris_df.mean()), ("# var", iris_df.var())):
    print(heading, stats, sep="\n")

# mean
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64
# var
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [137]:
from sklearn.preprocessing import StandardScaler
# Fit the standard scaler on the feature frame and scale that same frame.
scaler = StandardScaler().fit(iris_df)
iris_scaled = scaler.transform(iris_df)

# Wrap the scaled values in a frame again so the per-feature statistics can be
# printed with their column names.
iris_scaled_df = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
scaled_mean = iris_scaled_df.mean()
scaled_std = iris_scaled_df.std()
print("# mean", scaled_mean, sep="\n")
print("# std", scaled_std, sep="\n")

# mean
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64
# std
sepal length (cm)    1.00335
sepal width (cm)     1.00335
petal length (cm)    1.00335
petal width (cm)     1.00335
dtype: float64


### MinMaxScaler

In [138]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the feature frame and scale that same frame.
scaler = MinMaxScaler().fit(iris_df)
iris_scaled = scaler.transform(iris_df)
print(type(iris_scaled), iris_scaled.shape)

# Per-feature minimum and maximum after scaling.
iris_scaled_df = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
for heading, extreme in (("# min", iris_scaled_df.min()), ("# max", iris_scaled_df.max())):
    print(heading, extreme, sep="\n")

<class 'numpy.ndarray'> (150, 4)
# min
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
# max
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


### scaler를 이용하여 학습 데이터와 테스트 데이터에 fit(), transform(), fit_transform() 적용시 유의사항

In [139]:
# Toy data as single-column 2-D arrays: training values 0..10 and test values
# 0..5, so the two sets have different maxima.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)
# One scaler instance shared by the cells that follow.
scaler = MinMaxScaler()

In [140]:
# Fit on the training data and scale it. The original line called
# `scalar.transform`, a name that is not defined anywhere in the notebook, so
# the cell raised NameError on a fresh kernel; `scaler` is the intended object.
scaler.fit(train_array)
train_scaled = scaler.transform(train_array)
print("origin : ", np.round(train_array.reshape(-1), 2))
print("scaled : ", np.round(train_scaled.reshape(-1), 2))

# Anti-pattern, shown on purpose: calling fit() again on the test data replaces
# the range learned from the training data, so the test values are scaled
# against their own 0..5 range and are no longer comparable to train_scaled.
scaler.fit(test_array)
test_scaled = scaler.transform(test_array)
print("origin : ", np.round(test_array.reshape(-1), 2))
print("scaled : ", np.round(test_scaled.reshape(-1), 2))

origin :  [ 0  1  2  3  4  5  6  7  8  9 10]
scaled :  [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
origin :  [0 1 2 3 4 5]
scaled :  [0.  0.2 0.4 0.6 0.8 1. ]


In [141]:
# Fit on the training data and scale it. The original line called
# `scalar.transform`, a name that is not defined anywhere in the notebook, so
# the cell raised NameError on a fresh kernel; `scaler` is the intended object.
scaler.fit(train_array)
train_scaled = scaler.transform(train_array)
print("origin : ", np.round(train_array.reshape(-1), 2))
print("scaled : ", np.round(train_scaled.reshape(-1), 2))

# Correct usage: never call fit() on the test data. Apply transform() only, so
# the test values are scaled with the range learned from the training data.
test_scaled = scaler.transform(test_array)
print("origin : ", np.round(test_array.reshape(-1), 2))
print("scaled : ", np.round(test_scaled.reshape(-1), 2))

origin :  [ 0  1  2  3  4  5  6  7  8  9 10]
scaled :  [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
origin :  [0 1 2 3 4 5]
scaled :  [0.  0.1 0.2 0.3 0.4 0.5]
