# MNIST DATA

We are working with MNIST data, a toy dataset to test algorithms on.

We have training data and test data.

**Basics**: 28 x 28 images of handwritten digits as well as the label of which digit $0 \leq label \leq 9.$

**Format**: label, pix-11, pix-12, pixe-13, pix-ij where the pixel is in the ith row and the jth column.

In [101]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [102]:
# let's first start w. logistic regression
# read in the data
train = np.asarray(pd.read_csv('mnist_train.csv', header = None))
test = np.asarray(pd.read_csv('mnist_test.csv', header = None))

In [103]:
trainT = np.asarray([i for i in train if i[0] == 0 or i[0] == 1])
testT = np.asarray([i for i in test if i[0] == 0 or i[0] == 1])

In [104]:
# split the features from the prediction values
xTrain = trainT[:,1:]
yTrain = trainT[:,0].reshape(trainT[:,0].shape[0],1)

someNum = int(trainT.shape[0]/2)
xtt = trainT[:someNum,1:]
ytt = trainT[:someNum,0].reshape(someNum,1)


xTest = testT[:,1:]
yTest = testT[:,0].reshape(testT[:,0].shape[0],1)



In [105]:
print("The number of samples in the training set for 0s and 1s is {}".format(trainT.shape))
print("The number of samples in the x training set for 0s and 1s is {}".format(xTrain.shape))
print("The number of samples in the y training set for 0s and 1s is {}".format(yTrain.shape))

print("The number of samples in the test set for 0s and 1s is {}".format(testT.shape))
print("The number of samples in the x test set for 0s and 1s is {}".format(xTest.shape))
print("The number of samples in the y test set for 0s and 1s is {}".format(yTest.shape))

The number of samples in the training set for 0s and 1s is (12665, 785)
The number of samples in the x training set for 0s and 1s is (12665, 784)
The number of samples in the y training set for 0s and 1s is (12665, 1)
The number of samples in the test set for 0s and 1s is (2115, 785)
The number of samples in the x test set for 0s and 1s is (2115, 784)
The number of samples in the y test set for 0s and 1s is (2115, 1)


Now, we create gradient descent from a log-likehood estimate, hence we find the equation
$$ \theta_j := \theta_j + \alpha \frac{\partial l}{\partial \theta}$$
where 
$$ \frac{\partial l}{\partial \theta} =  y-\sigma(\theta^T x)x_i + \alpha \theta$$
where 
$$ \sigma(\theta^T x) = \frac{1}{1+e^{-\theta^T x}}$$


In [106]:
# x is an np array
def sigmoid(x, thetas):
    return float(1) / (1 + np.exp(-x @ thetas))

In [107]:
def log_likelihood(X,y, thetas, alpha=1e-6):
    sig = sigmoid(X,thetas)
    first = y * np.log(sig)
    second = (1-y)* np.log(1-sig)
    llhood = np.sum(-first - second)
    reg = alpha * np.inner(thetas,thetas)/2
    nexty = llhood + reg
    return np.mean(nexty)

In [110]:
thetas = np.zeros((xTrain.shape[1],1)) # should be the same shape as the number of features
log_likelihood(xTrain,yTrain, thetas,1e-3)

8778.7090417917225

In [111]:
def grad_desc(thetas, X, y, lr=.001, converge_change=.001):
    X = (X - np.mean(X, axis=0))/ np.std(X, axis =0)
    cost_iter = []
    cost = log_likelihood(X, y, thetas)
    cost_iter.append([0, cost])
    change_cost = 1
    i = 1
    while (change_cost > converge_change):
        old_cost = cost
        thetas = thetas - (lr * grad_llhood(X, y, thetas))
        cost = log_likelihood(X,y,thetas)
        cost_iter.append([i, cost])
        change_cost = old_cost - cost
        i+=1
    return thetas, np.array(cost_iter)

In [112]:
def grad_llhood(X, y, thetas, alpha=1e-4):
    val = ( sigmoid(X,thetas) -y ).T @ X + alpha * thetas
    return val

In [115]:
a, b = grad_desc(thetas, xTrain, yTrain)
print(a)
print(b)

[[ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 ..., 
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]]
[[  0.  nan]
 [  1.  nan]]


In [26]:
grad_llhood(xTrain,yTrain, thetas)

array([[  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
       [  0.00000000e+00],
 

In [96]:
def gradient_desc(X,y, alpha=1e-4, multi = 1e-3,max_iters=10, tol=1e-6):
    
    thetas = np.zeros((xTrain.shape[1],1)) # should be the same shape as the number of features
    outputs = [log_likelihood(X,y, thetas, alpha=alpha)] # store the log_likelihood values
    gradient = grad_llhood(X, y, thetas, alpha=alpha) # takes the gradient of the likelihood for descent
    
    while len(outputs)-1  <= max_iters and \
        np.linalg.norm(gradient) > tol:
            gradient = grad_llhood(X, y, thetas, alpha=alpha)
            thetas = thetas - 1e-3 * gradient
            outputs.append(log_likelihood(X, y, thetas, alpha=alpha))
    return thetas, outputs

In [97]:
theta, outs = gradient_desc(xtt, ytt)

In [116]:
print(outs)

[4389.0079473055685, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]


In [99]:
thetas = np.zeros((xTrain.shape[1],1)) # should be the same shape as the number of features
log_likelihood(xTrain,yTrain.astype(bool), thetas, alpha=1e-6)

8778.7090417917225

In [100]:
grad_llhood(xTrain,yTrain.astype(bool), thetas, alpha=1e-6)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [86]:
def logistic_func(theta, x):
    return float(1) / (1 + math.e**(-x.dot(theta)))
def log_gradient(theta, x, y):
    first_calc = logistic_func(theta, x) - np.squeeze(y)
    final_calc = first_calc.T.dot(x)
    return final_calc
def cost_func(theta, x, y):
    log_func_v = logistic_func(theta,x)
    y = np.squeeze(y)
    step1 = y * np.log(log_func_v)
    step2 = (1-y) * np.log(1 - log_func_v)
    final = -step1 - step2
    return np.mean(final)
def grad_desc(theta_values, X, y, lr=.001, converge_change=.001):
    #normalize
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
    #setup cost iter
    cost_iter = []
    cost = cost_func(theta_values, X, y)
    cost_iter.append([0, cost])
    change_cost = 1
    i = 1
    while(change_cost > converge_change):
        old_cost = cost
        theta_values = theta_values - (lr * log_gradient(theta_values, X, y))
        cost = cost_func(theta_values, X, y)
        cost_iter.append([i, cost])
        change_cost = old_cost - cost
        i+=1
    return theta_values, np.array(cost_iter)
def pred_values(theta, X, hard=True):
    #normalize
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
    pred_prob = logistic_func(theta, X)
    pred_value = np.where(pred_prob >= .5, 1, 0)
    if hard:
        return pred_value
    return pred_prob

In [95]:
thetas = np.zeros(xTrain.shape[1])
a, b = grad_desc(thetas, xTrain, np.logical_not(yTrain))

print(a)
print(b)

[ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  

array([[0],
       [1],
       [1],
       ..., 
       [1],
       [0],
       [1]])