### MNIST

In [1]:
import numpy as np 
from sklearn.datasets import fetch_mldata

In [2]:
mnist = fetch_mldata("MNIST original") # 也就是下载一些机器学习数据集

In [3]:
# Display the loaded dataset object to inspect its keys and array contents.
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}

In [4]:
# Pull the feature matrix and the label vector out of the dataset container.
X = mnist['data']
y = mnist['target']

In [5]:
X.shape # This dataset does not need train_test_split

(70000, 784)

In [6]:
# The dataset is already divided for us: the first 60,000 rows are the
# training set and the remaining rows are the test set.
N_TRAIN = 60000

# np.array(..., dtype=float) converts each slice to float and copies it
# (a bare X[:N_TRAIN] on an ndarray would only be a view of X).
X_train = np.array(X[:N_TRAIN], dtype=float)
y_train = np.array(y[:N_TRAIN], dtype=float)
X_test = np.array(X[N_TRAIN:], dtype=float)
y_test = np.array(y[N_TRAIN:], dtype=float)

In [7]:
X_train.shape # Training features: (n_samples, n_features)

(60000, 784)

In [8]:
y_train.shape # Not a matrix: the labels are a 1-D vector, which has no row/column orientation

(60000,)

In [9]:
X_test.shape # Test set features

(10000, 784)

In [10]:
y_test.shape # Matching labels for the test set

(10000,)

### 使用kNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: kNN with default hyper-parameters trained on the full-dimensional data.
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train, y_train) # The dataset is fairly large, so time the fit; presumably scikit-learn builds a tree-based index for large inputs to speed up later queries (TODO confirm)

Wall time: 50.3 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [12]:
# %time knn_clf.score(X_test, y_test) # Left disabled: prediction is even slower than fitting, because distances must be computed for every test sample

In [13]:
# No normalization step is used: every feature in this data is on the same scale, so feature scaling can be skipped

### PCA进行降维

In [14]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) tells PCA to keep as many principal
# components as are needed to explain that fraction (here 90%) of the variance.
# fit() learns the new coordinate system from the training data and returns
# the estimator itself, so it can be chained onto the constructor.
pca = PCA(n_components=0.9).fit(X_train)

# Project the training data onto the retained components.
X_train_reduction = pca.transform(X_train)

In [15]:
X_train_reduction.shape # Only 87 dimensions remain, yet they retain 90% of the original information

(60000, 87)

In [16]:
# Fresh classifier (rebinding knn_clf discards the earlier model), trained on the PCA-reduced features; timed for comparison with the fit on the raw data.
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train_reduction, y_train)

Wall time: 1.12 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [17]:
# Project the test set with the PCA that was fitted on the training data (it is not refitted on the test data).
X_test_reduction = pca.transform(X_test)

In [18]:
X_test_reduction.shape # Column count should match X_train_reduction

(10000, 87)

In [19]:
y_test.shape # One label per row of X_test_reduction

(10000,)

In [20]:
# NOTE(review): this refits the same classifier on the same reduced training data as the earlier timed cell; it looks redundant and could probably be dropped.
%time knn_clf.fit(X_train_reduction, y_train)

Wall time: 1.08 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [21]:
knn_clf.score(X_test_reduction, y_test) # Accuracy of the kNN classifier on the PCA-reduced test set

0.9728