In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

  warn(


dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [10]:
y.values

array([5, 0, 4, ..., 4, 5, 6], dtype=uint8)

In [11]:
enc.fit(y.values[:,np.newaxis])

In [12]:
Y = enc.transform(y.values[:,np.newaxis]).toarray()

In [13]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [14]:
X_train = X_train / 255
X_test = X_test / 255

In [15]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [16]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [25]:
def compute_cost(X, T, W, lmbda=0.01):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1))
    regularization = lmbda * np.linalg.norm(W)
    return cost + regularization

In [20]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [22]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W)
        if i % 1000 == 0:
            print(cost_history[i][0])

    return (cost_history, W)

In [23]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

Initial Cost is: 2.302485097993717 

2.2719144775656384
0.5581339617854693
0.36994635795019265
0.4378231715749116
0.4051113802075499
0.3296834200796072
0.38825484409740085
0.2583812132099521
0.3345958471344481
0.31185848022694457
0.4118494940602113
0.38286540452624435
0.299782641909314
0.4332830147807972
0.47185934497704435
0.26139482389151425
0.46356617355781726
0.2665022159124113
0.36315558267444137
0.42030793022242996
0.31663105117213525
0.3841232337705601
0.2461436780513172
0.26898146575920107
0.4529290936213825
0.3919619237912696
0.3121794668165689
0.25900451504139377
0.42351204606872694
0.5813083139018221
0.2128940011207524
0.25655905784218197
0.493615527335473
0.5786210490572523
0.3734278168530569
0.34130509788487245
0.2874839383521016
0.37461915956504327
0.45245585957358614
0.25245359532132317
0.20719249783379717
0.37958554020932267
0.29543326505011086
0.47833371300321537
0.32165249512141714
0.4782409881763794
0.40182837554678585
0.38183041920520444
0.38111871731313435
0.274922

In [24]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print(score)

0.9164


In [31]:
from sklearn.model_selection import train_test_split

def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    valid_cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    X_train, X_valid, y_train, y_valid = train_test_split(X_shuffled,T_shuffled, test_size=0.2, random_state=42)

    for i in range(iterations):
        j = i % N
        X_batch = X_train[j:j+batch_size]
        T_batch = y_train[j:j+batch_size]
        X_valid_batch = X_valid[j:j+batch_size]
        T_valid_batch = y_valid[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_train[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, y_train[:(batch_size - T_batch.shape[0])]))
            X_valid_batch = np.vstack((X_valid_batch, X_valid[:(batch_size - X_valid_batch.shape[0])]))
            T_valid_batch = np.vstack((T_valid_batch, y_valid[:(batch_size - T_valid_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W)
        valid_cost_history[i] = compute_cost(X_valid_batch, T_valid_batch, W)
        if i % 1000 == 0:
            print('train:',cost_history[i][0])
            print('valid:',valid_cost_history[i][0])

    return (cost_history, W)

In [32]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

initial_cost = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

Initial Cost is: 2.302485097993717 

train: 2.2777930549384697
valid: 2.292026605430034
train: 0.6089292390648721
valid: 0.7498610341437
train: 0.44824597833221513
valid: 0.4741166375608433
train: 0.48665880843778436
valid: 0.6316495378605207
train: 0.37404612703207557
valid: 0.3878211646949781
train: 0.46966442214305426
valid: 0.40924401690418455
train: 0.4402473956114526
valid: 0.5411806114040938
train: 0.269679294753382
valid: 0.3956813679790134
train: 0.25996253879236064
valid: 0.5369589723127759
train: 0.379854091253051
valid: 0.45614242927643606
train: 0.5018239164638626
valid: 0.5327533185378526
train: 0.3402549582786679
valid: 0.7386878762037165


ZeroDivisionError: division by zero