In [1]:
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read data
# Change to data path on your computer
# train path
train_images_path = 'train-images-idx3-ubyte.gz'
train_labels_path = 'train-labels-idx1-ubyte.gz'
# test path
test_images_path = 't10k-images-idx3-ubyte.gz'
test_labels_path = 't10k-labels-idx1-ubyte.gz'


def get_mnist_data(images_path, labels_path, num_images, shuffle=False, _is=True, image_size=28):
    """
    This shuffle param is active when .gz is downloaded at:
    - 'http://yann.lecun.com/exdb/mnist/'
    - This function return random num_images in 60000 or 10000
    """
    # read data
    import gzip  # to decompress gz (zip) file
    # open file training to read training data
    f_images = gzip.open(images_path, 'r')
    # skip 16 first bytes because these are not data, only header infor
    f_images.read(16)
    # general: read num_images data samples if this parameter is set;
    # if not, read all (60000 training or 10000 test)
    real_num = num_images if not shuffle else (60000 if _is else 10000)
    # read all data to buf_images (28x28xreal_num)
    buf_images = f_images.read(image_size * image_size * real_num)
    # images
    images = np.frombuffer(buf_images, dtype=np.uint8).astype(np.float32)
    images = images.reshape(real_num, image_size, image_size, )
    # Read labels
    f_labels = gzip.open(labels_path, 'r')
    f_labels.read(8)
    labels = np.zeros((real_num)).astype(np.int64)
    # rearrange to correspond the images and labels
    for i in range(0, real_num):
        buf_labels = f_labels.read(1)
        labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)
    # shuffle to get random images data
    if shuffle is True:
        rand_id = np.random.randint(real_num, size=num_images)
        images = images[rand_id, :]
        labels = labels[rand_id,]
    # change images data to type of vector 28x28 dimentional
    images = images.reshape(num_images, image_size * image_size)
    return images, labels

In [3]:
train_images, train_labels = get_mnist_data(train_images_path,
                                            train_labels_path, 5000, shuffle=True)
test_images, test_labels = get_mnist_data(test_images_path,
                                          test_labels_path, 10000, _is=False, shuffle=True)


# Convert matrix to image
def get_image(image):
    return image.reshape(28, 28)

  labels[i] = np.frombuffer(buf_labels, dtype=np.uint8).astype(np.int64)


In [4]:
# Standardize data
train_image = StandardScaler().fit_transform(train_images)
# Dimensionality reduction
pca = PCA(n_components=100)
train_images_pca = pca.fit_transform(train_image)

In [5]:
# Multinomial Logistic Regression (Softmax) approach
def softmax_approach(x_train, x_test, y_train, y_test):
    # Train model
    start = time.perf_counter()  # start counting time
    softmax = LogisticRegression(max_iter=5000)
    softmax.fit(x_train, y_train)
    # Predict result
    y_predict = softmax.predict(x_test)
    end = time.perf_counter()  # end counting time
    print(f'Accuracy score: {accuracy_score(y_test, y_predict)}')
    print(f'Execution time: {end - start}s')

In [6]:
# Split training set and validation set
# Original data
x_train, x_test, y_train, y_test = train_test_split(train_images,
                                                    train_labels, test_size=0.3, random_state=42)
# Apply PCA method then split train - test set
x_train_pca_split, x_test_pca_split, y_train_pca_split, y_test_pca_split = (
    train_test_split(train_images_pca, train_labels, test_size=0.3, random_state=42))
# Split train - test then apply PCA
x_train_split_pca = pca.fit_transform(x_train)
x_test_split_pca = pca.fit_transform(x_test)

In [7]:
# Original data
softmax_approach(x_train, x_test, y_train, y_test)

Accuracy score: 0.8706666666666667
Execution time: 0.825858407000851s


In [8]:
# Apply PCA method then split train - test set
softmax_approach(x_train_pca_split, x_test_pca_split, y_train_pca_split, y_test_pca_split)

Accuracy score: 0.888
Execution time: 0.32638319400030014s


In [9]:
# Split train - test then apply PCA
softmax_approach(x_train_split_pca, x_test_split_pca, y_train, y_test)

Accuracy score: 0.29733333333333334
Execution time: 4.902660175001074s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- Độ chính xác của hướng tiếp cận giảm chiều dữ liệu rồi chia train - test khá gần so với thực hiện mô hình Hồi quy multinomial logistic (softmax) trên tập dữ liệu gốc. Tuy nhiên thời gian thực thi của hướng tiếp cận giảm chiều dữ liệu rồi chia train - test nhanh hơn rất nhiều so với thực hiện trên bộ dữ liệu gốc.
- Hướng tiếp cận chia tập train - test rồi thực hiện giảm chiều cho độ chính xác thấp hơn đáng kể so với hướng tiếp cận giảm chiều dữ liệu rồi mới chia train - test. Thời gian thực thi giữa 2 hướng tiếp cận cũng không có sự khác biệt quá lớn.
- Do đó, hướng tiếp cận giảm chiều dữ liệu rồi chia train - test là phù hợp với bài toán này.