In [2]:
import struct
from tqdm import tqdm_notebook
from sklearn import svm, model_selection, metrics

In [1]:
def decoding_mnist_rawData(dataStyle='train', maxCount=0):
    label_f = open(f'../data/mnist/{dataStyle}-labels-idx1-ubyte','rb')
    image_f = open(f'../data/mnist/{dataStyle}-images-idx3-ubyte','rb')
    csv_f = open(f'../data/mnist/{dataStyle}.csv', 'w', encoding='utf-8')
    label_magic_number, label_count = struct.unpack('>II',label_f.read(4+4))
    image_magic_number, image_count, row, col = struct.unpack('>IIII',image_f.read(4+4+4+4))

    pixels = row * col

    for idx in tqdm_notebook(range(image_count)):
        if idx >= maxCount: break
        label_tmp = struct.unpack('B', label_f.read(1))
        label = label_tmp[0]

        binaryData = image_f.read(pixels)
        strData = list(map(lambda x : str(x), binaryData))
        csv_f.write(str(label)+',')
        csv_f.write(','.join(strData)+'\n')

        with open(f'../data/mnist/{label}.pgm','w',encoding='utf-8') as f:
            f.write('P2 28 28 255\n' + ' '.join(strData))
        
    label_f.close()
    image_f.close()
    csv_f.close()

In [3]:
def load_csv(dataType='train'):
    f = open(f'../data/mnist/{dataType}.csv', 'r')
    
    labels = list()
    images = list()
    
    while True:
        row = f.readline()
        if not row: break
        labels.append(int(row.split(',')[0]))
        images.append(list(map(lambda x: int(x)/256, row.split(',')[1:])))
    f.close()
    return { 'labels':labels, 'images':images }

In [4]:
clf = svm.SVC()
clf.fit(train['images'], train['labels'] )

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [5]:
predict = clf.predict(test['images'])

In [6]:
metrics.accuracy_score(test['labels'], predict)

0.9742

In [7]:
clf_report = metrics.classification_report(test['labels'], predict)
print(clf_report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       980
           1       0.99      0.99      0.99      1135
           2       0.97      0.98      0.97      1032
           3       0.97      0.98      0.97      1010
           4       0.97      0.98      0.97       982
           5       0.98      0.97      0.97       892
           6       0.98      0.98      0.98       958
           7       0.97      0.96      0.96      1028
           8       0.97      0.97      0.97       974
           9       0.97      0.95      0.96      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000



In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit( train['images'])
X_train_scaled = scaler.transform( train['images'] )
clf2 = svm.SVC()

In [12]:
clf2.fit( X_train_scaled, train['labels'] )

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
clf2.score( scaler.transform(test['images']), test['labels'] )

0.9737

In [4]:
result_ac_scores = list()
result_clf_reports = list()
for cnt in range(1,15):
    decoding_mnist_rawData('t10k',250*cnt)
    decoding_mnist_rawData('train',750*cnt)
    train = load_csv('train')
    test = load_csv('t10k')
    SEED = 2020
    clf = svm.SVC(random_state=SEED)
    clf.fit(train['images'], train['labels'])
    predict = clf.predict(test['images'])
    ac_score = metrics.accuracy_score(test['labels'], predict)
    print(f'테스트셋 {250*cnt}개일 때 정확도:',ac_score)
    result_ac_scores.append(ac_score)
    clf_report = metrics.classification_report(test['labels'], predict)
    result_clf_reports.append(clf_report)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))


테스트셋 250개일 때 정확도: 0.9


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))



테스트셋 500개일 때 정확도: 0.9


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))




테스트셋 750개일 때 정확도: 0.908


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))


테스트셋 1000개일 때 정확도: 0.922


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))

테스트셋 1250개일 때 정확도: 0.9216


HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=60000), HTML(value='')))




KeyboardInterrupt: 