In [20]:
import os
import struct
import numpy as np
import matplotlib.pyplot as plt


"""
Loosely inspired by http://abel.ee.ucla.edu/cvxopt/_downloads/mnist.py
which is GPL licensed.
"""

def read(dataset = "training", path = "."):
    """
    Python function for importing the MNIST data set.  It returns an iterator
    of 2-tuples with the first element being the label and the second element
    being a numpy.uint8 2D array of pixel data for the given image.

    Parameters
    ----------
    dataset : str
        Either "training" or "testing"; selects which pair of IDX files
        (train-* or t10k-*) is opened.
    path : str
        Directory that contains the four MNIST ``*.idx?-ubyte`` files.

    Yields
    ------
    (label, image)
        ``label`` is a numpy.int8 scalar, ``image`` a (rows, cols) uint8 array.

    Raises
    ------
    ValueError
        If ``dataset`` is neither "training" nor "testing".  Because this is a
        generator, the error surfaces when iteration starts, not at call time.
    """

    # Compare by value: `is` tests object identity and only appears to work
    # for string literals that the interpreter happens to intern.
    if dataset == "training":
        fname_img = os.path.join(path, 'train-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels.idx1-ubyte')
    elif dataset == "testing":
        fname_img = os.path.join(path, 't10k-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels.idx1-ubyte')
    else:
        raise ValueError("dataset must be 'testing' or 'training'")

    # Label file: 8-byte big-endian header (magic, count), then one byte per label.
    with open(fname_lbl, 'rb') as flbl:
        _magic, _num = struct.unpack(">II", flbl.read(8))
        lbl = np.fromfile(flbl, dtype=np.int8)

    # Image file: 16-byte big-endian header (magic, count, rows, cols), then
    # one byte per pixel; the image count is taken from the label array.
    with open(fname_img, 'rb') as fimg:
        _magic, _num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)

    # Hand out (label, image) pairs one at a time, in file order.
    yield from zip(lbl, img)

def show(image):
    """
    Render a given numpy.uint8 2D array of pixel data.

    Draws ``image`` on a new single-axes figure with the ``Greys`` colormap
    and nearest-neighbour interpolation, then calls ``pyplot.show()``.
    Returns nothing.
    """
    # The plotting module is `matplotlib.pyplot`; `matplotlib.plt` does not
    # exist, and the body below refers to it by the name `pyplot`.
    from matplotlib import pyplot
    import matplotlib as mpl
    fig = pyplot.figure()
    ax = fig.add_subplot(1,1,1)
    imgplot = ax.imshow(image, cmap=mpl.cm.Greys)
    imgplot.set_interpolation('nearest')
    ax.xaxis.set_ticks_position('top')
    ax.yaxis.set_ticks_position('left')
    # Must live inside the function: `pyplot` is only bound in this scope.
    pyplot.show()

In [7]:
# Drain the `read` generator for each split so the (label, image) pairs can be
# indexed repeatedly by the cells below.
tr = list(read(dataset="training", path=r"../MNIST"))
te = list(read(dataset="testing", path=r"../MNIST"))

In [9]:
import pandas as pd

# How many leading training samples to use.  This was a literal 1000 buried in
# the loop header (and flagged there as needing adjustment); raise it to use
# more of `tr`.
N_TRAIN_SAMPLES = 1000

# Build both matrices in a single pass.  Calling np.vstack once per sample
# re-copies everything accumulated so far, which is quadratic in the sample
# count.
tr_subset = tr[:N_TRAIN_SAMPLES]
tr_image = np.stack([image.reshape(-1) for _, image in tr_subset])   # (n, rows*cols)
tr_label = np.array([lbl for lbl, _ in tr_subset]).reshape(-1, 1)    # (n, 1)

# Build the training data frame: pixel columns first, label as the last column.
tr_data = np.hstack((tr_image, tr_label))
tr_data_df = pd.DataFrame(tr_data)
tr_data_df = tr_data_df.rename(columns={tr_image.shape[1]: "label"})

X_tr_data = tr_data_df.drop('label', axis=1)
y_tr_data = tr_data_df['label']

### Scaling

from sklearn.preprocessing import StandardScaler, MinMaxScaler

st_scarler = StandardScaler()
mm_scarler = MinMaxScaler()

# NOTE(review): both scalers are fit on every selected row, so when X_tr_st is
# later split into folds the held-out rows have already influenced the scaling.
X_tr_st = pd.DataFrame(st_scarler.fit_transform(X_tr_data))
X_tr_mm = pd.DataFrame(mm_scarler.fit_transform(X_tr_data))

In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# The two estimators compared below, both with library-default hyperparameters.
lr_clf, svm = LogisticRegression(), SVC()



def exec_Skfold(clf, folds=5, X=None, y=None) :
    """
    Evaluate ``clf`` with stratified k-fold cross-validation and print a report.

    For each fold the label distribution of the train and validation parts is
    printed, ``clf`` is fit on the train part and its accuracy on the
    validation part is printed.  The mean accuracy is printed at the end.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.  It is refit in
        place on every fold.
    folds : int
        Number of stratified splits.
    X, y : array-like, optional
        Features and labels.  When omitted, the notebook-level ``X_tr_st`` and
        ``y_tr_data`` are used, which is what this function did before these
        parameters existed.

    Returns
    -------
    float
        Mean accuracy over the folds.
    """
    # Fall back to the notebook globals so existing two-argument calls behave
    # exactly as before.
    if X is None:
        X = X_tr_st
    if y is None:
        y = y_tr_data
    X_values = np.asarray(X)
    y_values = np.asarray(y)

    Skfold=StratifiedKFold(n_splits=folds)
    scores=[]

    for iter_counts, (train_index, test_index) in enumerate(Skfold.split(X_values, y_values)):
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        # Per-fold report: fold number, then class counts of each part.
        print('## 교차검증 : {0}'.format(iter_counts))
        print('학습 레이블 데아터 분포 :\n', pd.Series(y_train).value_counts())
        print('검증 레이블 데이터 분포 :\n', pd.Series(y_test).value_counts())

        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print("교차 검증 {0} 정확도 :{1:.4f}".format(iter_counts, accuracy))

        print("\n")

    mean_score = np.mean(scores)
    print("평균 정확도 :{0:.4f}".format(mean_score))
    print("\n")

    # Returned so callers can compare models programmatically instead of
    # reading the printed report.
    return mean_score
    
# Same 5-fold stratified evaluation for each classifier, logistic regression first.
for model in (lr_clf, svm):
    exec_Skfold(model, folds=5)

## 교차검증 : 0
학습 레이블 데아터 분포 :
 7    93
1    92
4    84
9    80
2    79
0    77
6    75
3    74
5    73
8    69
dtype: int64
검증 레이블 데이터 분포 :
 7    24
1    24
4    21
9    20
2    20
0    20
6    19
5    19
3    19
8    18
dtype: int64




교차 검증 0 정확도 :0.7794


## 교차검증 : 1
학습 레이블 데아터 분포 :
 7    93
1    93
4    84
9    80
2    79
0    77
6    75
3    74
5    73
8    69
dtype: int64
검증 레이블 데이터 분포 :
 7    24
1    23
4    21
9    20
2    20
0    20
6    19
5    19
3    19
8    18
dtype: int64




교차 검증 1 정확도 :0.8522


## 교차검증 : 2
학습 레이블 데아터 분포 :
 7    94
1    93
4    84
9    80
2    79
0    78
6    75
5    74
3    74
8    70
dtype: int64
검증 레이블 데이터 분포 :
 7    23
1    23
4    21
9    20
2    20
6    19
3    19
0    19
5    18
8    17
dtype: int64




교차 검증 2 정확도 :0.8090


## 교차검증 : 3
학습 레이블 데아터 분포 :
 7    94
1    93
4    84
9    80
2    79
0    78
6    75
3    75
5    74
8    70
dtype: int64
검증 레이블 데이터 분포 :
 7    23
1    23
4    21
9    20
2    20
6    19
0    19
5    18
3    18
8    17
dtype: int64




교차 검증 3 정확도 :0.7778


## 교차검증 : 4
학습 레이블 데아터 분포 :
 7    94
1    93
4    84
9    80
2    80
0    78
6    76
3    75
5    74
8    70
dtype: int64
검증 레이블 데이터 분포 :
 7    23
1    23
4    21
9    20
2    19
0    19
6    18
5    18
3    18
8    17
dtype: int64




교차 검증 4 정확도 :0.7704


평균 정확도 :0.7978


## 교차검증 : 0
학습 레이블 데아터 분포 :
 7    93
1    92
4    84
9    80
2    79
0    77
6    75
3    74
5    73
8    69
dtype: int64
검증 레이블 데이터 분포 :
 7    24
1    24
4    21
9    20
2    20
0    20
6    19
5    19
3    19
8    18
dtype: int64




교차 검증 0 정확도 :0.8824


## 교차검증 : 1
학습 레이블 데아터 분포 :
 7    93
1    93
4    84
9    80
2    79
0    77
6    75
3    74
5    73
8    69
dtype: int64
검증 레이블 데이터 분포 :
 7    24
1    23
4    21
9    20
2    20
0    20
6    19
5    19
3    19
8    18
dtype: int64




교차 검증 1 정확도 :0.8768


## 교차검증 : 2
학습 레이블 데아터 분포 :
 7    94
1    93
4    84
9    80
2    79
0    78
6    75
5    74
3    74
8    70
dtype: int64
검증 레이블 데이터 분포 :
 7    23
1    23
4    21
9    20
2    20
6    19
3    19
0    19
5    18
8    17
dtype: int64




교차 검증 2 정확도 :0.8141


## 교차검증 : 3
학습 레이블 데아터 분포 :
 7    94
1    93
4    84
9    80
2    79
0    78
6    75
3    75
5    74
8    70
dtype: int64
검증 레이블 데이터 분포 :
 7    23
1    23
4    21
9    20
2    20
6    19
0    19
5    18
3    18
8    17
dtype: int64




교차 검증 3 정확도 :0.8687


## 교차검증 : 4
학습 레이블 데아터 분포 :
 7    94
1    93
4    84
9    80
2    80
0    78
6    76
3    75
5    74
8    70
dtype: int64
검증 레이블 데이터 분포 :
 7    23
1    23
4    21
9    20
2    19
0    19
6    18
5    18
3    18
8    17
dtype: int64




교차 검증 4 정확도 :0.8469


평균 정확도 :0.8578




In [12]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

# Iris data set and a default logistic-regression model for the
# cross_val_score examples that follow.
iris = load_iris()
logreg = LogisticRegression()

In [13]:
# 5-fold cross-validation of `logreg` on iris; `score` holds one value per fold.
score = cross_val_score(estimator=logreg, X=iris.data, y=iris.target, cv=5)
print('cross validation score : %s' % (score,))

cross validation score : [1.         0.96666667 0.93333333 0.9        1.        ]




In [28]:
from sklearn.model_selection import KFold
# Shuffled 6-fold split with a fixed seed so the fold assignment is reproducible.
kfold = KFold(n_splits=6, shuffle=True, random_state=0)
score = cross_val_score(logreg, iris.data, iris.target, cv=kfold)
print('cross validation score : {}'.format(score))
# `.format` must be called on the template string, and the spec is `.2f`.
# The previous `',format(...)` with `{:,2f}` printed the template verbatim.
print('mean score : {:.2f}'.format(score.mean()))

cross validation score : [0.96 0.92 0.92 0.96 1.   0.88]
mean score : {:,2f} 0.94




In [22]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10, cmap_cv=None, cmap_data=None):
    """Create a sample plot for indices of a cross-validation object.

    One horizontal row is drawn per split yielded by ``cv.split``, with each
    sample coloured by whether it falls in the train (0) or test (1) part.
    Two further rows show ``y`` (classes) and ``group`` for every sample.

    Parameters
    ----------
    cv : object with ``split(X=..., y=..., groups=...)`` yielding
        ``(train_indices, test_indices)`` pairs.
    X, y, group : sequences of equal length; only ``len(X)`` and the values of
        ``y`` / ``group`` are used here.
    ax : axes-like object providing ``scatter``, ``set`` and ``set_title``.
    n_splits : int
        Number of CV rows to label on the y axis.
    lw : line width of the ``'_'`` markers.
    cmap_cv, cmap_data : optional colormaps for the split rows and for the
        class/group rows.  When omitted, a module-level ``cmap_cv`` /
        ``cmap_data`` is used if one exists, otherwise the matplotlib colormap
        names ``"coolwarm"`` and ``"Paired"``.

    Returns
    -------
    The ``ax`` that was passed in.
    """
    # Resolve the colormaps.  The body used to read module-level names that are
    # not guaranteed to exist, so a call could fail with NameError.
    module_names = globals()
    if cmap_cv is None:
        cmap_cv = module_names.get("cmap_cv", "coolwarm")
    if cmap_data is None:
        cmap_data = module_names.get("cmap_data", "Paired")

    n_samples = len(X)
    sample_positions = range(n_samples)

    # Keeps `ii` bound (and the class/group rows at 0.5 / 1.5) even if the
    # splitter yields no splits.
    ii = -1

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # NaN marks samples that belong to neither part of this split.
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(sample_positions, [ii + .5] * n_samples,
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)

    # Plot the data classes and groups at the end
    ax.scatter(sample_positions, [ii + 1.5] * n_samples,
               c=y, marker='_', lw=lw, cmap=cmap_data)

    ax.scatter(sample_positions, [ii + 2.5] * n_samples,
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting: the x range follows the data instead of a fixed 100 samples;
    # the y axis is inverted so split 0 is drawn at the top.
    yticklabels = list(range(n_splits)) + ['class', 'group']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, n_samples])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [24]:
# Ten consecutive samples per group id, ids 0..9: a flat array of 100 labels.
groups = np.repeat(np.arange(10), 10)
# fig, ax = plt.subplots()
# plot_cv_indices(kfold, iris.data, iris.target, groups, ax, 2)

In [27]:
# Bare expression: the notebook renders the `groups` array as this cell's output.
groups

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])