In [1]:
# written by Python3

In [2]:
import urllib.request as req
import gzip, os, os.path

In [3]:
savepath = "./mnist"
baseurl = "http://yann.lecun.com/exdb/mnist"

In [4]:
files = [
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz"
]

In [5]:
## download
if not os.path.exists(savepath): os.mkdir(savepath)

In [7]:
for f in files:
    url = baseurl + "/" + f
    loc = savepath + "/" + f
    print("download:", url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)

download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz


In [11]:
# GZip 압축 해제
for f in files:
    gz_file = savepath + "/" + f
    raw_file = savepath + "/" + f.replace(".gz", "")
    print("gzip:", f)
    with gzip.open(gz_file, "rb") as fp:
        body = fp.read()
        with open(raw_file, "wb") as w:
            w.write(body)

print("ok")

gzip: train-images-idx3-ubyte.gz
gzip: train-labels-idx1-ubyte.gz
gzip: t10k-images-idx3-ubyte.gz
gzip: t10k-labels-idx1-ubyte.gz
ok


## 변환프로그램

In [12]:
import struct

In [18]:
def to_csv(name, maxdata):
    # open label files and image files
    lbl_f = open("./mnist/" + name + "-labels-idx1-ubyte", "rb")
    img_f = open("./mnist/" + name + "-images-idx3-ubyte", "rb")
    csv_f = open("./mnist/" + name + ".csv", "w", encoding="utf-8")
    
    # read header information
    # 데이터가 리틀 앤디안으로 저장되어 있기 때문에, struct 모듈로 리틀 앤디안 데이터를 읽기 위해 ">" 기호 사용
    # 각 파일에서 매직 넘버를 나타내는 32비트(4바이트)와 이미지 개수를 나타내는 32비트(4바이트) 정수를 읽음
    #
    mag, lbl_count = struct.unpack(">II", lbl_f.read(8))
    mag, img_count = struct.unpack('>II', img_f.read(8))
    rows, cols = struct.unpack('>II', img_f.read(8))
    pixels = rows * cols
    
    # read image data and save to csv
    # 이미지 개수만큼 반복해서 데이터 읽기
    #
    res = []
    for idx in range(lbl_count):
        if idx > maxdata: break
        label = struct.unpack('B', lbl_f.read(1))[0]
        bdata = img_f.read(pixels)
        sdata = list(map(lambda n: str(n), bdata))
        csv_f.write(str(label)+ ',')
        csv_f.write(','.join(sdata) + '\r\n')
            
        # test
        # PGM 형식으로 이미지 데이터 저장
        if idx < 10:
            s = 'P2 28 28 255\n'
            s += " ".join(sdata)
            iname = './mnist/{0}-{1}-{2}.pgm'.format(name, idx, label)
            with open(iname, 'w', encoding = 'utf-8') as f:
                f.write(s)
    csv_f.close()
    lbl_f.close()
    img_f.close()
    

In [27]:
# result
to_csv('train', 99999)
to_csv('t10k', 500)

## 이미지 데이터 학습시키기

In [20]:
from sklearn import model_selection, svm, metrics

In [28]:
# reading csv and processing
# CSV 파일 읽고 레이블과 이미지 데이터를 배열로 만든다. 
# 이미지 데이터의 각 픽셀은 0부터 255까지 integer인데, 이를 256으로 나누기 때문에 0 이상이고 1미만인 실수 벡터가 됨

def load_csv(fname):
    labels = []
    images = []
    
    with open(fname, 'r') as f:
        for line in f:
            cols = line.split(',')
            if len(cols) < 2: continue
            labels.append(int(cols.pop(0)))
            vals = list(map(lambda n: int(n) / 256, cols))
            images.append(vals)
    return {"labels": labels, "images": images}

In [29]:
data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

In [30]:
# training
# scikit-learn의 SVM(SVC) 알고리즘 이용
# fit() method로 학습

clf = svm.SVC()
clf.fit(data["images"], data["labels"])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
clf.fit

<bound method BaseLibSVM.fit of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>

In [31]:
# predicting
# predict() method로 예측

predict = clf.predict(test["images"])

In [32]:
# checking result
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("정답률 =", ac_score)
print("리포트 =")
print(cl_report)

정답률 = 0.946107784431
리포트 =
             precision    recall  f1-score   support

          0       0.91      0.98      0.94        42
          1       1.00      1.00      1.00        67
          2       0.96      0.95      0.95        55
          3       0.91      0.87      0.89        46
          4       0.98      0.98      0.98        55
          5       0.90      0.92      0.91        50
          6       0.98      0.93      0.95        43
          7       0.92      0.94      0.93        49
          8       0.93      0.97      0.95        40
          9       0.94      0.91      0.92        54

avg / total       0.95      0.95      0.95       501

