# Softmax regression for handwritten digits

Today, we'll implement a softmax classifier that recognizes handwritten digits.  We'll begin by using a relatively small collection (around 1800) of low-resolution (8 by 8 pixel) digits.  This can be easily acquired using scikit-learn.

In [None]:
import numpy as np
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits dataset; later cells read its
# .data, .target and .target_names attributes.
digits = load_digits()

In [None]:
import matplotlib.pyplot as plt
# Show the first sample next to its integer label.  Each row of digits.data
# is a flattened image, so it is reshaped to 8x8 before plotting.
first_pixels, first_label = digits.data[0], digits.target[0]
plt.imshow(first_pixels.reshape((8, 8)))
print(first_label)

In [None]:
# (number of images, number of flattened pixels per image)
digits.data.shape

The digits appear as an $m\times n$ array, where $m$ is the number of data instances and $n$ is the number of features.  It's important to recognize that for this problem, the number of features is $8\times8 = 64$: the instances are flattened.  If you want to plot a digit from the dataset using, for example, matplotlib's imshow, you'll need to reshape this.  

You'll also want to be careful to normalize the data, preferably by subtracting the mean and dividing by the standard deviation.  

In [None]:
#! Perform normalization
# Work on a float copy: `X = digits.data` would only alias the dataset, and the
# in-place operations below would then silently overwrite the raw pixels that
# other cells (e.g. the imshow of the first digit) read.
X = digits.data.astype(np.float64)  # astype returns a new array
# Standardise with the global mean/std taken over all pixels of all images.
# A single global scale also avoids dividing by a zero per-pixel std for any
# pixel that never varies.
X -= X.mean()
X /= X.std()
X

The labels appear as integers.  Write and apply a function that converts from this integer representation to a one-hot encoding.

In [None]:
#! Convert the labels to a one-hot encoding
def one_hot(y):
    """Convert integer class labels to a one-hot encoded matrix.

    Parameters
    ----------
    y : array-like of shape (m,)
        Class labels.  Entries must be non-negative integers or values that
        convert to them (e.g. the strings '0'..'9').

    Returns
    -------
    numpy.ndarray of shape (m, N)
        Float array whose row i is zero everywhere except for a 1 in column
        y[i].  N is ``max(y) + 1``, so a class that happens to be absent
        from ``y`` still gets its (all-zero) column instead of causing an
        out-of-bounds index.  An empty ``y`` yields an array of shape (0, 0).

    Raises
    ------
    ValueError
        If any label is negative.
    """
    labels = np.asarray(y).astype(int).reshape(-1)
    if labels.size == 0:
        return np.zeros((0, 0))
    if labels.min() < 0:
        raise ValueError("one_hot expects non-negative integer labels")

    encoded = np.zeros((labels.size, int(labels.max()) + 1))
    # Fancy indexing sets one entry per row without a Python-level loop.
    encoded[np.arange(labels.size), labels] = 1
    return encoded

# One-hot targets for the whole dataset: one row per image, one column per class.
z = one_hot(digits.target)
z

Another important step is to split the dataset into training and testing sets.  I like using the function sklearn.model_selection.train_test_split

In [None]:
#! Split the dataset into training and testing sets
from sklearn import model_selection as ms

# Called with default arguments only; per scikit-learn's documented defaults
# the rows are shuffled without a fixed seed, so the split differs between runs.
# X and z are split together, which keeps every image paired with its target.
x_train,x_test,z_train,z_test = ms.train_test_split(X,z)

With data in hand, we now need to implement the model.  Recall that our predictions will be computed as
$$
Y_{pred} = \mathrm{Softmax}(\Phi W)
$$
Implement the softmax method, generate the matrix $\Phi$ (I suggest a linear model, which is to say that all you need to do will be to prepend a column of ones to the $m\times n$ matrix of pixel values), and instantiate the parameter matrix $W$ (I suggest instantiating to an array of very small random numbers).  Your implementation of Softmax should be vectorized, in that it should take an $m \times N$ array of logits and output an $m \times N$ array without using a loop.  Make a prediction using this untrained model: a sensible result at this stage is that all classes are approximately equally likely.

In [None]:
# Implementation of softmax
def Softmax(a):
    """Row-wise softmax of a 2-D array of logits.

    Parameters
    ----------
    a : array-like of shape (m, N)
        One row of N logits per sample.

    Returns
    -------
    numpy.ndarray of shape (m, N)
        Non-negative entries; every row sums to 1.
    """
    logits = np.asarray(a)
    # Softmax does not change when a constant is added to a whole row.
    # Subtracting the row maximum makes the largest exponent exactly 0, so
    # exp() cannot overflow to inf (which would turn the ratio into NaN).
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)

# a = np.array([[1.,2.,3.], [2.,3.,4.]])
# Softmax(a)

In [None]:
# Build Phi: the training design matrix, i.e. x_train with a leading column
# of ones, so that the first row of W acts as a constant offset per class.
n_train_rows = x_train.shape[0]
bias_column = np.ones(n_train_rows, dtype=x_train.dtype)
Phi = np.c_[bias_column, x_train]
Phi

In [None]:
# Build W: very small random weights, so the untrained logits are all close
# to zero.
m, n_pixels = X.shape          # m counts every sample in X (train and test)
n = n_pixels + 1               # one extra feature for the column of ones in Phi
N = len(digits.target_names)   # number of classes (different digits)

W = np.random.randn(n, N) / 10000  # scale down to keep the initial logits tiny

# Logits of the untrained model on the training design matrix.
a = Phi @ W

# Sanity check: every row of the softmax output should sum to 1.
Softmax(a).sum(axis=1)

Now generate functions (or one function with multiple outputs) to compute the categorical cross entropy and its gradient.  These are given by 
$$
\mathcal{L}(W,\Phi,Y_{obs}) = -\frac{1}{mN} \sum_{i=1}^m \sum_{j=1}^N Y_{obs,ij} \ln \mathrm{Softmax}(\Phi W)_{ij}.
$$
and 
$$
\frac{ \partial \mathcal{L}}{\partial W} = -\frac{1}{mN} \sum_{i=1}^m \left[(Y_{obs,i} - \mathrm{Softmax}(\Phi W)_i)^T \Phi_i \right]^T. 
$$
As you implement these functions, consider how to do so in as efficient a manner as possible.  Note that it is possible to vectorize the sums.    

In [None]:
def L(Y,Phi,W):
    """Categorical cross-entropy of the softmax model, scaled by 1/(m*N).

    Parameters
    ----------
    Y : array-like of shape (m, N)
        Observed targets, one one-hot row per sample.
    Phi : array-like of shape (m, n)
        Design matrix (features with the leading column of ones).
    W : array-like of shape (n, N)
        Weight matrix.

    Returns
    -------
    float
        ``-(1/(m*N)) * sum_ij Y[i, j] * log(Softmax(Phi @ W)[i, j])``.
    """
    Y = np.asarray(Y, dtype=float)
    m, N = Y.shape
    logits = np.asarray(Phi) @ np.asarray(W)
    # Compute log-softmax directly (log-sum-exp with the row maximum removed).
    # Taking log() of an already-computed softmax breaks down once a
    # probability underflows to 0: log(0) = -inf and 0 * -inf = NaN.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -(Y * log_probs).sum() / (m * N)

def L_grad(Y,Phi,W):
    """Gradient of the categorical cross-entropy with respect to W.

    Parameters
    ----------
    Y : array-like of shape (m, N)
        Observed targets, one one-hot row per sample.
    Phi : array-like of shape (m, n)
        Design matrix (features with the leading column of ones).
    W : array-like of shape (n, N)
        Weight matrix.

    Returns
    -------
    numpy.ndarray of shape (n, N)
        ``-(1/(m*N)) * Phi.T @ (Y - Softmax(Phi @ W))``, i.e. the derivative
        of the 1/(m*N)-scaled cross-entropy, same shape as ``W``.
    """
    Y = np.asarray(Y, dtype=float)
    Phi = np.asarray(Phi)
    m, N = Y.shape
    logits = Phi @ np.asarray(W)
    # Softmax with the row maximum subtracted so exp() cannot overflow.
    weights = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = weights / weights.sum(axis=1, keepdims=True)
    # A single matrix product already sums the per-sample terms over all m
    # rows, so no Python loop over samples is needed (looping over it would
    # count every sample m times and inflate the gradient by a factor of m).
    return -(Phi.T @ (Y - probs)) / (m * N)

In [None]:
# Loss on the training set for the current W.  Right after initialisation the
# weights are near zero, every class probability is close to 1/N, and the
# value should therefore be close to ln(N)/N.
L(z_train,Phi,W)

In [None]:
# The gradient must have the same shape as W, i.e. (n, N).
L_grad(z_train,Phi,W).shape

Implement gradient descent and train this model.  Record the value of $\mathcal{L}$ as a function of gradient descent iteration, and produce a plot convincing yourself that the model is converging to a minimum.

In [None]:
# pick an eeta
# eeta is the gradient-descent step size (learning rate).
eeta = 0.02
# n_samples is the length of the recorded weight history: the initial W plus
# n_samples - 1 gradient-descent updates.
n_samples = 25

In [None]:
# gradient descent to train the W's
# w_vec[k] holds the weights after k updates (w_vec[0] is the initial W).
# W is rebound rather than updated in place so that the arrays already stored
# in w_vec are not modified by later steps.
w_vec = [W]
for _ in range(1, n_samples):
    W = W - eeta*L_grad(z_train,Phi,W)
    w_vec.append(W)

params = np.array(w_vec)


# use trained W's to get Losses for test data at each step
# The test design matrix gets its own name so that Phi remains the training
# matrix and this cell can be re-run without a shape mismatch.
Phi_test = np.c_[np.ones_like(x_test[:,0]),x_test]
# Evaluate every recorded weight matrix, so Losses[0] is the loss at the
# initial W and Losses[-1] the loss at the final W.
L_vec = [L(z_test, Phi_test, w) for w in params]

Losses = np.array(L_vec)

In [None]:
# params stacks one (n, N) weight matrix per recorded step; Losses holds one
# test loss per recorded step.
print(params.shape)
print(Losses.shape)

In [None]:
# Test-set loss for each recorded weight matrix.
Losses

In [None]:
# Plot the test loss against the gradient-descent iteration.  The curve should
# flatten out as the model approaches a minimum; that minimum of the
# cross-entropy does not have to be zero.
x_axis = np.arange(len(Losses))  # one point per recorded weight matrix
plt.plot(x_axis, Losses)
plt.xlabel("gradient descent iteration")
plt.ylabel("test loss")

One very interesting result of working with image data is that we can interpret the learned parameters as images (the weight matrix is $(1+n)\times N$.  If you get rid of the first row, which corresponds to a constant offset, the remaining $n \times N$ weights are each associated with a given input pixel for a given class).  Plot your weights as images (there should be ten of them).  Evaluate the pattern that you find.    

In [None]:
# Learned parameters after the last recorded step, without the first row
# (the constant offset): one column of pixel weights per class.
final_W = params[-1]
data = final_W[1:]

In [None]:
# One panel per class on a 2x5 grid (filled row by row): column k of `data`
# holds the 64 pixel weights of class k and is shown as an 8x8 image.
fig, axs = plt.subplots(2, 5)
for digit, ax in enumerate(axs.flat):
    ax.imshow(data[:, digit].reshape(8, 8))

**Pattern:** I see numbers 0 through 9!

Finally, once this task is complete, scale your method up to the larger (in both number of instances and resolution) dataset MNIST (you can get it using the command sklearn.datasets.fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)).  This will take substantial time to train!  Only do this once you are satisfied with your implementation on the digits dataset.  

In [1]:
import numpy as np
from sklearn import model_selection as ms
import matplotlib.pyplot as plt

In [2]:
import sklearn.datasets
# Fetch MNIST ('mnist_784', version 1) from OpenML.  With return_X_y=True the
# result is a (data, target) pair; as_frame=False asks for NumPy arrays.
digits2 = sklearn.datasets.fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

In [3]:
# Split the fetched (data, target) pair into pixel values and class labels.
data, labels = digits2

In [4]:
#! Perform normalization
# fetch_openml can return integer pixel values (this depends on the
# scikit-learn version/parser), and in-place float arithmetic on an integer
# array raises a casting error.  Convert to float64 first; copy=False avoids
# duplicating this large array when it already is float64.
X = data.astype(np.float64, copy=False)
# Standardise with the global mean/std over all pixels, in place to keep the
# memory footprint down (if no copy was made above, `data` changes as well).
X -= X.mean()
X /= X.std()
X.shape

(70000, 784)

In [5]:
#! Convert the labels to a one-hot encoding
def one_hot(y):
    """Convert integer class labels to a one-hot encoded matrix.

    Parameters
    ----------
    y : array-like of shape (m,)
        Class labels.  Entries must be non-negative integers or values that
        convert to them (e.g. the strings '0'..'9').

    Returns
    -------
    numpy.ndarray of shape (m, N)
        Float array whose row i is zero everywhere except for a 1 in column
        int(y[i]).  N is ``max(y) + 1``, so a class that happens to be absent
        from ``y`` still gets its (all-zero) column instead of causing an
        out-of-bounds index.  An empty ``y`` yields an array of shape (0, 0).

    Raises
    ------
    ValueError
        If any label is negative.
    """
    labels = np.asarray(y).astype(int).reshape(-1)
    if labels.size == 0:
        return np.zeros((0, 0))
    if labels.min() < 0:
        raise ValueError("one_hot expects non-negative integer labels")

    encoded = np.zeros((labels.size, int(labels.max()) + 1))
    # Fancy indexing sets one entry per row without a Python-level loop over
    # all samples.
    encoded[np.arange(labels.size), labels] = 1
    return encoded

# One-hot targets for every MNIST image: one row per image, one column per class.
z = one_hot(labels)
z

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
#! Split the dataset into training and testing sets
# Called with default arguments only; per scikit-learn's documented defaults
# the rows are shuffled without a fixed seed, so the split differs between runs.
# X and z are split together, which keeps every image paired with its target.
x_train,x_test,z_train,z_test = ms.train_test_split(X,z)

In [7]:
# Build Phis: prepend a column of ones to each split so that the first row of
# W acts as a constant offset per class.
train_bias = np.ones(x_train.shape[0], dtype=x_train.dtype)
test_bias = np.ones(x_test.shape[0], dtype=x_test.dtype)
Phi_train = np.c_[train_bias, x_train]
Phi_test = np.c_[test_bias, x_test]

In [8]:
# Implementation of softmax
def Softmax(a):
    """Row-wise softmax of a 2-D array of logits.

    Parameters
    ----------
    a : array-like of shape (m, N)
        One row of N logits per sample.

    Returns
    -------
    numpy.ndarray of shape (m, N)
        Non-negative entries; every row sums to 1.
    """
    logits = np.asarray(a)
    # Softmax does not change when a constant is added to a whole row.
    # Subtracting the row maximum makes the largest exponent exactly 0, so
    # exp() cannot overflow to inf (which would turn the ratio into NaN).
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)

In [9]:
# Build W: very small random weights, so the untrained logits are all close
# to zero.
m, n_pixels = X.shape          # m counts every sample in X (train and test)
n = n_pixels + 1               # one extra feature for the column of ones in Phi
N = len(np.unique(labels))     # number of classes (different digits)

W = np.random.randn(n, N) / 10000  # scale down to keep the initial logits tiny

# Logits of the untrained model on the training design matrix.
a = Phi_train @ W

# Sanity check: every row of the softmax output should sum to 1.
Softmax(a).sum(axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

In [10]:
# Loss function and gradient
def L(Y,Phi,W):
    """Categorical cross-entropy of the softmax model, scaled by 1/(m*N).

    Parameters
    ----------
    Y : array-like of shape (m, N)
        Observed targets, one one-hot row per sample.
    Phi : array-like of shape (m, n)
        Design matrix (features with the leading column of ones).
    W : array-like of shape (n, N)
        Weight matrix.

    Returns
    -------
    float
        ``-(1/(m*N)) * sum_ij Y[i, j] * log(Softmax(Phi @ W)[i, j])``.
    """
    Y = np.asarray(Y, dtype=float)
    m, N = Y.shape
    logits = np.asarray(Phi) @ np.asarray(W)
    # Compute log-softmax directly (log-sum-exp with the row maximum removed).
    # Taking log() of an already-computed softmax breaks down once a
    # probability underflows to 0: log(0) = -inf and 0 * -inf = NaN.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -(Y * log_probs).sum() / (m * N)

def L_grad(Y,Phi,W):
    """Gradient of the categorical cross-entropy with respect to W.

    Parameters
    ----------
    Y : array-like of shape (m, N)
        Observed targets, one one-hot row per sample.
    Phi : array-like of shape (m, n)
        Design matrix (features with the leading column of ones).
    W : array-like of shape (n, N)
        Weight matrix.

    Returns
    -------
    numpy.ndarray of shape (n, N)
        ``-(1/(m*N)) * Phi.T @ (Y - Softmax(Phi @ W))``, i.e. the derivative
        of the 1/(m*N)-scaled cross-entropy, same shape as ``W``.
    """
    Y = np.asarray(Y, dtype=float)
    Phi = np.asarray(Phi)
    m, N = Y.shape
    logits = Phi @ np.asarray(W)
    # Softmax with the row maximum subtracted so exp() cannot overflow.
    weights = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = weights / weights.sum(axis=1, keepdims=True)
    # A single matrix product already sums the per-sample terms over all m
    # rows, so no Python loop over samples is needed (looping over it would
    # count every sample m times and repeat a full-dataset product m times).
    return -(Phi.T @ (Y - probs)) / (m * N)

In [11]:
# pick an eeta
# eeta is the gradient-descent step size (learning rate).
# NOTE(review): this is several orders of magnitude larger than the 0.02 used
# for the small digits dataset above -- confirm the loss actually decreases.
eeta = 100
# n_samples is the length of the recorded weight history: the initial W plus
# n_samples - 1 gradient-descent updates.
n_samples = 7

In [None]:
# Evaluate the gradient once on the full training set for the current W; the
# result has the same shape as W.
L_grad(z_train,Phi_train,W)

In [None]:
# gradient descent to train the W's
# Keep the initial W plus the result of every update.  W is rebound rather
# than updated in place so the arrays already stored in w_vec stay unchanged.
# The Frobenius norm of W is printed after each step as a progress indicator.
w_vec = [W]
for _ in range(n_samples - 1):
    step = eeta * L_grad(z_train, Phi_train, W)
    W = W - step
    w_vec += [W]
    print(np.linalg.norm(W))

params = np.array(w_vec)

In [None]:
# use trained W's to get Losses for test data at each step
# Evaluate every recorded weight matrix, starting with the initial W at
# index 0, so that Losses[k] is the test loss after k updates (seeding the
# list with the loss of the final W would misplace the first point).
L_vec = []
for w in params:
    loss = L(z_test, Phi_test, w)
    print(loss)
    L_vec.append(loss)

Losses = np.array(L_vec)

In [None]:
# Plot the test loss against the gradient-descent iteration.  The curve should
# flatten out as the model approaches a minimum; that minimum of the
# cross-entropy does not have to be zero.
x_axis = np.arange(len(Losses))  # one point per recorded weight matrix
plt.plot(x_axis, Losses)
plt.xlabel("gradient descent iteration")
plt.ylabel("test loss")

In [None]:
# Learned parameters after the last recorded step, without the first row
# (the constant offset): one column of pixel weights per class.
final_W = params[-1]
data = final_W[1:]

In [None]:
# Plot learned digit images
# One panel per class on a 2x5 grid (filled row by row): column k of `data`
# holds the 784 pixel weights of class k and is shown as a 28x28 image.
fig, axs = plt.subplots(2, 5)
for digit, ax in enumerate(axs.flat):
    ax.imshow(data[:, digit].reshape(28, 28))