# **sklearn基础**

## 1. 导入模块

In [1]:
# Third-party imports, grouped together so every dependency of the notebook
# is visible before any statement runs.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNC

# Load the iris dataset once; later cells read its data, target and
# name fields from this object.
iris_datasets = load_iris()

## 2. 数据结构

#### **数据集的结构可以用下面的伪代码来表示：**
```python
class dataset:
    def __init__(self):
        self.data = ...
        self.target = ...
        self.target_names = ...
        self.DESCR = ...
        self.feature_names = ...
        self.filename = ...
```

In [2]:
# List the fields exposed by the loaded dataset object.
dataset_keys = iris_datasets.keys()
print(dataset_keys)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


## 3. 数据集特性

In [3]:
# Horizontal rule printed before, between and after the two summaries.
divider = '\n----------------------------------------------------------------------------\n'
features = iris_datasets.data
labels = iris_datasets.target

# Feature matrix: a five-row preview, its shape and the column names.
print(divider)
print('First five rows of data:')
print(features[:5])
print('Shape:{}'.format(features.shape))
print('Feature names:{}'.format(iris_datasets.feature_names))

# Target vector: every label, its shape and the class names.
print(divider)
print('Target sets:')
print(labels)
print('Shape:{}'.format(labels.shape))
print('Target names:{}'.format(iris_datasets.target_names))
print(divider)


----------------------------------------------------------------------------

First five rows of data:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
Shape:(150, 4)
Feature names:['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

----------------------------------------------------------------------------

Target sets:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Shape:(150,)
Target names:['setosa' 'versicolor' 'virginica']

----------------------------------------------------------------------------



In [4]:
# Show which container types hold the features and the labels.
data_type = type(iris_datasets.data)
target_type = type(iris_datasets.target)
print('data:{}'.format(data_type))
print('target:{}'.format(target_type))

data:<class 'numpy.ndarray'>
target:<class 'numpy.ndarray'>


## 4. 创建训练集和测试集

#### **train_test_split参数选择:**
`random_state`（int）：随机数种子；设为固定整数后，每次运行得到的划分结果相同    
`stratify`（类数组，通常传入 target）：按该数组中的类别比例进行分层抽样，使训练集和测试集中各类别的占比与之一致（要求 `shuffle=True`）    
`shuffle`（bool）：划分前是否打乱数据，默认为 True    
`test_size`（float 或 int）：测试集的大小。传入 0 到 1 之间的小数表示测试集所占比例，传入整数表示测试集的样本个数；若 `test_size` 与 `train_size` 均未指定，默认取 0.25

In [5]:
# Hold out part of the samples for evaluation; random_state=0 fixes the
# shuffle so the split is the same on every run.
features, labels = iris_datasets.data, iris_datasets.target
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=0)

# Report the shape of the feature and label arrays on each side of the split.
print('Shape of traing set:')
for split_part in (X_train, y_train):
    print(split_part.shape)
print('\n')
print('Shape of testing set:')
for split_part in (X_test, y_test):
    print(split_part.shape)

Shape of traing set:
(112, 4)
(112,)


Shape of testing set:
(38, 4)
(38,)


## 5. 模型建立与预测

In [6]:
# Build a 1-nearest-neighbour classifier and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn = KNC(n_neighbors=1).fit(X_train, y_train)
# Leave the fitted estimator as the last expression so its repr is displayed.
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [7]:
# Classify a single hand-written measurement, passed as a one-row 2-D array.
X_new = np.array([[5, 2.9, 1, 0.2]])
prediction = knn.predict(X_new)
print(type(prediction))
# The predicted class index doubles as an index into the class-name array.
predicted_class = prediction[0]
print(iris_datasets.target_names[predicted_class])
print('\n')

# Predict the whole held-out set and show it next to the true labels.
# NOTE: `prediction` is rebound here and now holds the test-set predictions.
print('predict again:')
prediction = knn.predict(X_test)
print('prediciton:')
print(prediction)
print('y_test:')
print(y_test)
print('\n')

# score() reports the mean accuracy on the given test data and labels.
accuracy = knn.score(X_test, y_test)
print('Accuracy:' + str(accuracy))

<class 'numpy.ndarray'>
setosa


predict again:
prediciton:
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
y_test:
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 1]


Accuracy:0.9736842105263158
