In [16]:
import numpy as np
import gzip
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import datetime

In [18]:
# 数据集加载函数
def load_data(data_folder):
  files = [
      'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
      't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
  ]# 原始文件列表
  paths = []
  for fname in files: # 将文件读入内存
    paths.append(os.path.join(data_folder,fname))
  with gzip.open(paths[0], 'rb') as lbpath: # 读入训练集label
    y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)
  with gzip.open(paths[1], 'rb') as imgpath: # 读入训练集image
    x_train = np.frombuffer(
        imgpath.read(), np.uint8, offset=16).reshape(len(y_train), 28, 28)
  with gzip.open(paths[2], 'rb') as lbpath: # 读入测试集label
    y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)
  with gzip.open(paths[3], 'rb') as imgpath: #读入测试机image
    x_test = np.frombuffer(
        imgpath.read(), np.uint8, offset=16).reshape(len(y_test), 28, 28)
  return (x_train, y_train), (x_test, y_test)


In [22]:
# 加载数据，完成规整
(X_train,Y_train),(X_test,Y_test) = load_data('.\data\MNIST\raw')
X_train = X_train.reshape(60000,784)
Y_train = Y_train.reshape(-1,1)
X_test = X_train.reshape(60000,784)
Y_test = Y_train.reshape(-1,1)

In [24]:
# Lineaer SVC
starttime = datetime.datetime.now() #时间戳

l_svc = LinearSVC()
Y_train = Y_train.reshape(-1,1).ravel() #最后加上.ravel()，不然jupyter notebook会报错
l_svc.fit(X_train,Y_train)

y_pred = l_svc.predict(X_test)
print("精确率",precision_score(Y_test, y_pred, average='weighted'))
print("召回率",recall_score(Y_test, y_pred, average='weighted'))
print("F1度量值",f1_score(Y_test, y_pred, average='weighted'))

endtime = datetime.datetime.now()
time = (endtime - starttime).seconds
print("训练完成，时间为 ",time)



精确率 0.8919066641797292
召回率 0.8914166666666666
F1度量值 0.8909880814795903
训练完成，时间为  124


In [33]:
# 决策树
starttime = datetime.datetime.now() #时间戳

gini_model = DecisionTreeClassifier(criterion='gini',max_depth=10,splitter='best')
gini_model.fit(X_train,Y_train)
gini_y_pred = gini_model.predict(X_test)

print("精确率",precision_score(Y_test, gini_y_pred, average='weighted'))
print("召回率",recall_score(Y_test, gini_y_pred, average='weighted'))
print("F1度量值",f1_score(Y_test, gini_y_pred, average='weighted'))

endtime = datetime.datetime.now()
time = (endtime - starttime).seconds
print("训练完成，时间为 ",time)

精确率 0.9005920173366391
召回率 0.8995166666666666
F1度量值 0.8997757185746292
训练完成，时间为  9


In [30]:
# 随机森林
starttime = datetime.datetime.now() #时间戳

random_forest = RandomForestClassifier(n_estimators=10)
random_forest.fit(X_train,Y_train)
rf_y_pred = random_forest.predict(X_test)

print("精确率",precision_score(Y_test, rf_y_pred, average='weighted'))
print("召回率",recall_score(Y_test, rf_y_pred, average='weighted'))
print("F1度量值",f1_score(Y_test, rf_y_pred, average='weighted'))

endtime = datetime.datetime.now()
time = (endtime - starttime).seconds
print("训练完成，时间为 ",time)

精确率 0.9992667060926934
召回率 0.9992666666666666
F1度量值 0.9992666387689756
训练完成，时间为  3
