# Softmax Classification
여러 개의 클래스를 분류(multi-class classification)한다. 여기서는 logistic(sigmoid)을 쓰지 않고, score 값에 softmax를 적용해 클래스별 확률로 변환한다.


In [1]:
import numpy as np
from random import shuffle
from cifar10_data_util import load_CIFAR10
import matplotlib.pyplot as plt

%matplotlib inline
# Notebook-wide plotting defaults, applied in a single update call.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),      # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

$$L_i = -\log\left(\frac{e^{f_{y_i}}}{\sum_{j} e^{f_j}}\right)$$

$e^{f_{y_i}}$와 $\sum_{j} e^{f_j}$는 exponential이라 값이 급격히 커질 수 있다. 매우 큰 수끼리 나누면 수치적으로 불안정(overflow)해지므로 Normalization을 쓴다. 이는 분자와 분모에 같은 상수 $C$를 곱해도 값이 변하지 않는다는 점을 이용한 계산 트릭이며, exponential 계산을 다룰 때는 꼭 써야 한다.

$$\log C = -\max_j f_j$$

아래처럼 Normalization한다.

$$\frac{Ce^{f_{y_i}}}{C\sum_{j} e^{f_j}} = \frac{e^{f_{y_i}+\log C}}{\sum_{j} e^{f_j+\log C}}$$


In [2]:
def softmax_loss_naive(X, W, y, reg):  # a faster (fully vectorized) version exists
    """Compute the softmax (cross-entropy) loss and its gradient sample by sample.

    Per-sample loss: L_i = -log(e^{f_{y_i}} / sum_j e^{f_j}) with f = X[i] . W

    Inputs:
    - X : (N, D) data, one sample per row
    - W : (D, C) weights
    - y : (N,) integer class labels (class indices, not one-hot vectors,
          because the CIFAR data is used with index labels)
    - reg : (float) regularization strength

    Returns a tuple (loss, dW):
    - loss : data loss averaged over the N samples plus 0.5 * reg * sum(W * W)
    - dW : gradient of loss with respect to W, same shape as W
    """
    loss = 0.0
    dW = np.zeros_like(W)

    num_train = X.shape[0]

    for i in range(num_train):
        scores = np.dot(X[i], W)
        # Shift so the largest score is 0: the softmax value is unchanged,
        # but np.exp can no longer overflow.
        shift_scores = scores - np.max(scores)

        # Exponentials and their sum are the same for every class of this
        # sample, so compute them once instead of once per class.
        exp_scores = np.exp(shift_scores)
        exp_sum = np.sum(exp_scores)

        # -log(softmax of the correct class), written as log-sum-exp minus score
        loss += np.log(exp_sum) - shift_scores[y[i]]

        # dL_i/df_j = softmax_j - 1[j == y_i]
        grad_scores = exp_scores / exp_sum
        grad_scores[y[i]] -= 1.0
        # Column j of the outer product is grad_scores[j] * X[i].
        dW += np.outer(X[i], grad_scores)

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    # In the update W - learning_rate * dW, the extra reg * W term shrinks
    # the weights further (weight decay).
    dW = dW / num_train + reg * W
    return loss, dW

In [3]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500,
                     cifar10_dir='./cifar-10-batches-py'):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the linear classifier. These are the same steps as we used for the
    SVM, but condensed to a single function.

    Inputs:
    - num_training : number of leading training samples kept as the train set
    - num_validation : number of samples right after the train range used
      as the validation set
    - num_test : number of leading test samples kept
    - num_dev : number of samples drawn at random (without replacement) from
      the train set as a small development set
    - cifar10_dir : directory passed to load_CIFAR10

    Returns X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev.
    Each X is flattened to one row per sample, has the training mean
    subtracted, and carries a trailing column of ones.
    """
    # Load the raw CIFAR-10 data
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    # Validation set: indices num_training .. num_training + num_validation - 1
    mask = np.arange(num_training, num_training + num_validation)
    X_val, y_val = X_train[mask], y_train[mask]

    # Train set: the first num_training samples
    mask = np.arange(num_training)
    X_train, y_train = X_train[mask], y_train[mask]

    # Test set: the first num_test test samples
    mask = np.arange(num_test)
    X_test, y_test = X_test[mask], y_test[mask]

    # Dev set: a random subset of the (already trimmed) train set
    mask = np.random.choice(num_training, num_dev, replace=False)
    X_dev, y_dev = X_train[mask], y_train[mask]

    # Preprocessing: reshape the image data into rows.
    # Passing -1 lets reshape infer that dimension from the element count.
    splits = [np.reshape(X, (X.shape[0], -1)) for X in (X_train, X_val, X_test, X_dev)]

    # Normalize the data: subtract the mean image of the train set from every
    # split. The subtraction is in place, so the arrays presumably must
    # already be floating point -- TODO confirm against load_CIFAR10.
    mean_image = np.mean(splits[0], axis=0)
    for X in splits:
        X -= mean_image

    # Append a constant column of ones so the bias can be learned as one
    # extra row of the weight matrix (samples stay as rows).
    X_train, X_val, X_test, X_dev = [
        np.hstack([X, np.ones((X.shape[0], 1))]) for X in splits
    ]

    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev

In [4]:
# Load every split once, then report the shape of each array.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()
for caption, array in (
    ('Train data shape: ', X_train),
    ('Train labels shape: ', y_train),
    ('Validation data shape: ', X_val),
    ('Validation labels shape: ', y_val),
    ('Test data shape: ', X_test),
    ('Test labels shape: ', y_test),
    ('dev data shape: ', X_dev),
    ('dev labels shape: ', y_dev),
):
    print(caption, array.shape)

Train data shape:  (49000, 3073)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3073)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3073)
Test labels shape:  (1000,)
dev data shape:  (500, 3073)
dev labels shape:  (500,)


In [5]:
import time

# Start from small random weights and evaluate the unregularized loss on the
# dev set.
W = 0.0001 * np.random.randn(3073, 10)
loss, grad = softmax_loss_naive(X_dev, W, y_dev, reg=0.0)

# Rough sanity check: with near-zero weights the 10 classes get almost equal
# probability, so the loss should be close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))

loss: 2.398645
sanity check: 2.302585


In [6]:
def predit(X, W):
    """Return the predicted class index for every row of X.

    Inputs:
    - X : (N, D) data, one sample per row
    - W : (D, C) weights

    Returns an (N,) integer array holding, for each sample, the column index
    of its highest score in X.dot(W) (the first one on ties).
    """
    scores = X.dot(W)
    return np.argmax(scores, axis=1)

In [7]:
# Bracket one loss/gradient evaluation with timestamps (the elapsed time is
# not printed in this cell).
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(X_dev, W, y_dev, 0.000005)
toc = time.time()

# Plain gradient descent on the dev set.
learning_rate = 1e-3
max_itr = 1000
for i in range(max_itr):
    loss_naive, grad_naive = softmax_loss_naive(X_dev, W, y_dev, 0.000005)
    W = W - learning_rate * grad_naive

    # Accuracy is measured on X_dev, the same data used for the update.
    y_dev_pred = predit(X_dev, W)
    val_accuracy = np.mean(y_dev_pred == y_dev)

    # Report progress only on every 100th iteration.
    if i % 100 != 0:
        continue
    print("[%d]/[%d] loss is : " % (i, max_itr), loss_naive)
    print("accuracy : ", val_accuracy)

[0]/[1000] loss is :  2.398645070764766
accuracy :  0.264
[100]/[1000] loss is :  58.69969204185975
accuracy :  0.702
[200]/[1000] loss is :  0.12309558728507125
accuracy :  0.994
[300]/[1000] loss is :  5.015792823739109e-05
accuracy :  1.0
[400]/[1000] loss is :  4.342502281254835e-05
accuracy :  1.0
[500]/[1000] loss is :  4.0776202783114726e-05
accuracy :  1.0
[600]/[1000] loss is :  3.93303906573008e-05
accuracy :  1.0
[700]/[1000] loss is :  3.840774750409443e-05
accuracy :  1.0
[800]/[1000] loss is :  3.776163035016607e-05
accuracy :  1.0
[900]/[1000] loss is :  3.728041661094423e-05
accuracy :  1.0
