# 분류 예측의 불확실성 추정

#### intro

In [2]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.datasets import make_circles

In [5]:
# Build a noisy two-circle toy dataset:
#   noise=0.25      -> standard deviation of the Gaussian noise added to the points
#   factor=0.5      -> scale factor between the inner and outer circle (default 0.8)
#   random_state=1  -> fixed seed so the same data is generated on every run
# NOTE(review): the original note said factor=0.5 "narrows" the gap between the
# circles; a factor smaller than the default shrinks the inner circle, which
# should widen the gap instead -- confirm against the make_circles docs.
X, y = make_circles(noise=0.25, factor=0.5, random_state=1)

In [None]:
'''
make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, factor=0.8)
Docstring:
Make a large circle containing a smaller circle in 2d.

A simple toy dataset to visualize clustering and classification
algorithms.

Read more in the :ref:`User Guide <sample_generators>`.

Parameters
----------
n_samples : int, optional (default=100)
    The total number of points generated.

shuffle : bool, optional (default=True)
    Whether to shuffle the samples.

noise : double or None (default=None)
    Standard deviation of Gaussian noise added to the data.
    데이터에 더해지는 가우시안 노이즈의 표준편차. 값이 클수록 점들이 원에서 더 많이 흩어진다

random_state : int, RandomState instance or None, optional (default=None)
    If int, random_state is the seed used by the random number generator;
    random_state가 int로 주어지면 그 값이 난수 생성기의 시드로 쓰인다. 같은 시드를 주면 결과를 재현할 수 있음
    If RandomState instance, random_state is the random number generator;
    If None, the random number generator is the RandomState instance used
    by `np.random`.

factor : double < 1 (default=.8)
    Scale factor between inner and outer circle.
    안쪽 원과 바깥쪽 원 사이의 크기 비율(스케일 팩터). 거리가 아니라 비율이며, 값이 작을수록 안쪽 원이 작아진다

Returns
-------
X : array of shape [n_samples, 2]
    The generated samples.

y : array of shape [n_samples]
    The integer labels (0 or 1) for class membership of each sample.
    
    X와 y를 return(numpy.ndarray)
    
File:      ~/anaconda3/lib/python3.6/site-packages/sklearn/datasets/samples_generator.py
Type:      function
'''

In [7]:
X

array([[-0.37748684, -0.0400769 ],
       [ 0.02701955, -0.48423931],
       [-0.36118825, -1.4007017 ],
       [-0.68154165,  0.00577195],
       [ 0.79966997, -0.1374838 ],
       [-0.44067409,  0.65738634],
       [-0.51680069, -0.64905973],
       [-0.48984706, -0.07126104],
       [-0.24030273,  0.4245925 ],
       [-0.15480597,  0.3987051 ],
       [ 0.22202915, -0.56040775],
       [ 0.14845311, -0.83918918],
       [ 0.55447318,  0.52619405],
       [ 0.73776469, -0.06026285],
       [ 0.56048402, -0.22169632],
       [-0.24288865, -0.81429088],
       [-1.16124083, -0.74556572],
       [ 0.77723853, -0.37259801],
       [-0.47721837, -0.37664926],
       [ 0.11550395,  1.10356127],
       [ 0.58181291,  0.19749795],
       [-0.85136464,  0.13473314],
       [ 0.47159612, -0.34434378],
       [-0.0609338 ,  0.20718973],
       [-0.72283227,  0.43895304],
       [-0.38474582, -1.09366122],
       [ 0.13179551, -0.53591877],
       [-0.93195251, -0.462223  ],
       [ 0.41735023,

In [8]:
y

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0])

In [10]:
type(X)

numpy.ndarray

In [11]:
type(y)

numpy.ndarray

In [12]:
# Convert the integer labels to names with integer-array indexing:
# every 0 in y becomes "blue" and every 1 becomes "red".
y_named = np.array(["blue", "red"])[y]

In [13]:
y_named

array(['red', 'red', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red',
       'red', 'red', 'blue', 'red', 'red', 'red', 'blue', 'blue', 'blue',
       'red', 'blue', 'blue', 'red', 'red', 'red', 'blue', 'blue', 'red',
       'blue', 'blue', 'blue', 'red', 'red', 'red', 'red', 'red', 'blue',
       'blue', 'red', 'blue', 'blue', 'red', 'red', 'red', 'blue', 'red',
       'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue',
       'blue', 'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red',
       'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'red',
       'blue', 'red', 'red', 'blue', 'red', 'blue', 'red', 'red', 'blue',
       'blue', 'blue', 'blue', 'blue', 'blue', 'red', 'blue', 'blue',
       'red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'red', 'blue',
       'red', 'blue', 'blue'], dtype='<U4')

In [15]:
# Lookup table of class names: index 0 -> "blue", index 1 -> "red".
blue_red = np.array(["blue", "red"])

In [17]:
blue_red[0]

'blue'

In [18]:
blue_red[1]

'red'

In [19]:
blue_red[y]
# "blue" (index 0) is mapped onto every 0 in y
# "red"  (index 1) is mapped onto every 1 in y
# General pattern: (ndarray A)[integer ndarray B]
#   B is the array being converted; each element b of B is replaced by A[b]

array(['red', 'red', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red',
       'red', 'red', 'blue', 'red', 'red', 'red', 'blue', 'blue', 'blue',
       'red', 'blue', 'blue', 'red', 'red', 'red', 'blue', 'blue', 'red',
       'blue', 'blue', 'blue', 'red', 'red', 'red', 'red', 'red', 'blue',
       'blue', 'red', 'blue', 'blue', 'red', 'red', 'red', 'blue', 'red',
       'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue',
       'blue', 'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red',
       'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'red',
       'blue', 'red', 'red', 'blue', 'red', 'blue', 'red', 'red', 'blue',
       'blue', 'blue', 'blue', 'blue', 'blue', 'red', 'blue', 'blue',
       'red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'red', 'blue',
       'red', 'blue', 'blue'], dtype='<U4')

In [20]:
type(blue_red)

numpy.ndarray

In [21]:
type(y)

numpy.ndarray

In [23]:
blue_red[np.array([1, 0])]

array(['red', 'blue'], dtype='<U4')

In [24]:
# X, y_named and y are split in a single call; with no test_size given the
# result is a 75:25 train:test split (75 and 25 rows for this 100-sample set).
(
    X_train, X_test,
    y_train_named, y_test_named,
    y_train, y_test,
) = train_test_split(X, y_named, y, random_state=0)

In [28]:
len(y_train_named)

75

In [27]:
len(y_test_named)

25

In [29]:
len(blue_red[y])

100

In [30]:
# Only random_state is set explicitly; every other hyperparameter keeps its default.
gbrt = GradientBoostingClassifier(random_state=0)

In [32]:
gbrt

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

In [34]:
# Fit on the string labels ("blue"/"red") rather than the integer y_train.
gbrt.fit(X_train, y_train_named)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)

- 분류기가 자신의 예측을 어느 정도 확신하는지는 불확실성 추정으로 알아볼 수 있으며, scikit-learn이 이 기능을 제공한다.
- from sklearn.ensemble import GradientBoostingClassifier
- from sklearn.datasets import make_circles
- numpy.ndarray X, numpy.ndarray y = make_circles(noise, random_state, factor)
  - 임의의 데이터셋 만들기
- numpy.ndarray A[numpy.ndarray B]
  - 정수 배열 B의 각 원소를 인덱스로 사용해 A의 해당 값으로 변환(라벨링)
- A1,A2, B1,B2, C1,C2 = train_test_split(A, B, C, random_state)
- GradientBoostingClassifier(random_state).fit(A1, B1)