In [1]:
import urllib.request as req
import gzip, os, os.path
savepath = "./mnist"
baseurl = "https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data/"
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz"]
# 다운로드
if not os.path.exists(savepath): os.mkdir(savepath)
for f in files:
    url = baseurl + "/" + f
    loc = savepath + "/" + f
    print("download:", url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)
# GZip 압축 해제
for f in files:
    gz_file = savepath + "/" + f
    raw_file = savepath + "/" + f.replace(".gz", "")
    print("gzip:", f)
    with gzip.open(gz_file, "rb") as fp:
        body = fp.read()
        with open(raw_file, "wb") as w:
            w.write(body)
print("ok")

download: https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data//train-images-idx3-ubyte.gz
download: https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data//train-labels-idx1-ubyte.gz
download: https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data//t10k-images-idx3-ubyte.gz
download: https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data//t10k-labels-idx1-ubyte.gz
gzip: train-images-idx3-ubyte.gz
gzip: train-labels-idx1-ubyte.gz
gzip: t10k-images-idx3-ubyte.gz
gzip: t10k-labels-idx1-ubyte.gz
ok


In [1]:
import struct

def to_csv(name, maxdata):
    lbl_f = open('./mnist/' + name + '-labels-idx1-ubyte', 'rb')
    img_f = open('./mnist/' + name + '-images-idx3-ubyte', 'rb')
    csv_f = open('./mnist/' + name + '.csv', 'w', encoding='utf-8')

    # 헤더 정보 읽기
    mag, lbl_count = struct.unpack('>II', lbl_f.read(8))
    mag, img_count = struct.unpack('>II', img_f.read(8))
    rows, cols = struct.unpack('>II', img_f.read(8))
    pixels = rows * cols

    # 이미지 데이터를 읽고 csv로 저장하기
    res = []
    for idx in range(lbl_count):
        if idx > maxdata: break
        label = struct.unpack('B', lbl_f.read(1))[0]
        bdata = img_f.read(pixels)
        sdata = list(map(lambda n: str(n), bdata))
        csv_f.write(str(label) + ',')
        csv_f.write(','.join(sdata)+'\r\n')

        # 테스트
        if idx < 10:
            s = 'P2 28 28 255\n'
            s += ' '.join(sdata)
            iname = './mnist/{0}-{1}-{2}.pgm'.format(name, idx, label)
            with open(iname, 'w', encoding='utf-8') as f:
                f.write(s)

    csv_f.close()
    lbl_f.close()
    img_f.close()

to_csv('train', 1000)
to_csv('t10k', 500)

print('ok')

ok


In [2]:
from sklearn import model_selection, svm, metrics

# CSV 파일을 읽어 들이고 가공하기 --- (※1)
def load_csv(fname):
    labels = []
    images = []
    with open(fname, "r") as f:
        for line in f:
            cols = line.split(",")
            if len(cols) < 2: continue
            labels.append(int(cols.pop(0)))
            vals = list(map(lambda n: int(n) / 256, cols))
            images.append(vals)
    return {"labels":labels, "images":images}
data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

# 학습하기 --- (※2)
clf = svm.SVC()
clf.fit(data["images"], data["labels"])

# 예측하기 --- (※3)
predict = clf.predict(test["images"])

# 결과 확인하기 --- (※4)
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("정답률 =", ac_score)
print("리포트 =")
print(cl_report)

정답률 = 0.9720558882235529
리포트 =
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        42
           1       1.00      1.00      1.00        67
           2       0.96      0.96      0.96        55
           3       0.98      0.98      0.98        46
           4       0.98      0.96      0.97        55
           5       1.00      0.96      0.98        50
           6       0.98      0.95      0.96        43
           7       0.96      0.98      0.97        49
           8       0.93      0.97      0.95        40
           9       0.96      0.94      0.95        54

    accuracy                           0.97       501
   macro avg       0.97      0.97      0.97       501
weighted avg       0.97      0.97      0.97       501

