In [2]:
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
# Seed NumPy's global random number generator so later random draws are reproducible.
np.random.seed(42)

In [6]:
def logistic(z):
    '''
    Return the logistic (sigmoid) function 1 / (1 + e^(-z)) evaluated at z.

    z may be a scalar or a NumPy array; arrays are handled element-wise.
    '''
    denominator = 1 + np.exp(-z)
    return 1 / denominator


def log_prob(z, y_i):
    '''
    Return the Bernoulli log-probability of one label given its logit.

    Computes y_i * log(sigmoid(z)) + (1 - y_i) * log(1 - sigmoid(z)).

    - z: the logit (weights dotted with the features) for the point
    - y_i: the label for the point, 0/1 or False/True

    The two log terms are evaluated with np.logaddexp instead of taking
    np.log of the sigmoid. Once the sigmoid saturates to exactly 0 or 1,
    np.log returns -inf and the product 0 * -inf becomes nan; the
    log-sum-exp form stays finite for any finite z.
    '''
    # log(sigmoid(z))     = -log(1 + e^(-z))
    log_p = -np.logaddexp(0, -z)
    # log(1 - sigmoid(z)) = -log(1 + e^(z))
    log_not_p = -np.logaddexp(0, z)
    return y_i * log_p + (1 - y_i) * log_not_p


def neg_log_likelihood(X, w, y):
    '''
    Compute the negative log likelihood of the labels under the model.

    - X: feature matrix with one row per data point
    - w: weight vector
    - y: labels, one per row of X (0/1 or False/True)

    For each point with logit z = x . w the negative log-probability is
        -[y * log(sigmoid(z)) + (1 - y) * log(1 - sigmoid(z))]
      = log(1 + e^z) - y * z
    The right-hand form is evaluated for all points at once with
    np.logaddexp, which avoids a Python-level loop and stays finite even
    when the sigmoid saturates to exactly 0 or 1.
    '''
    z = np.asarray(X).dot(w)
    labels = np.asarray(y, dtype=float)
    return np.sum(np.logaddexp(0, z) - labels * z)


def fast_logistic(X, w):
    '''
    Evaluate the logistic function for every row of X in one shot.

    The logits are X.dot(w); the sigmoid is then applied element-wise
    to them.
    '''
    logits = X.dot(w)
    return 1 / (1 + np.exp(-1 * logits))


def grad(_X, w, _y, lambda_=.5):
    '''
    Return the gradient of the L2-regularized negative log likelihood.

    - _X: feature matrix with one row per data point
    - w: weight vector
    - _y: labels, one per row of _X
    - lambda_: weight of the squared-L2 penalty

    The data term is the sum over points of
    (sigmoid(x_i . w) - y_i) * x_i, and the penalty lambda_ * ||w||^2
    contributes 2 * lambda_ * w.

    - https://web.stanford.edu/~jurafsky/slp3/5.pdf
    '''
    # Per-point prediction error: sigmoid(x_i . w) - y_i
    residual = fast_logistic(_X, w) - _y

    # _X.T.dot(residual) sums residual_i * x_i over all points without
    # materializing an intermediate array the size of _X.
    return _X.T.dot(residual) + (lambda_ * 2 * w)


def squared_l2_norm(w):
    r'''
    Return the L2 norm of the weights, squared.

    Remember that we square the norm of the weights,
    to make the math easier when computing the gradients

    $\left(\sqrt{\sum_i w_i^2}\right)^2 = \sum_i w_i^2$

    The square root and the square cancel, so the sum of squares is
    returned directly; taking the root and squaring again would only add
    floating-point rounding error.
    '''
    return np.sum(np.square(w))


def grad_descent(_X, _y, eta = .0001, lambda_ = 0, tolerance=1e-4, verbose=True, batch_size=None, iters=None):
    '''
    Perform (mini-batch) gradient descent on the regularized loss.

    - _X: feature matrix with one row per data point
    - _y: labels, one per row of _X
    - eta: learning rate
    - lambda_: weight of the squared-L2 penalty on the weights
    - tolerance: stop once the squared L2 norm of the full-data gradient
      drops below this value
    - verbose: if True, print the loss and accuracy every iteration
    - batch_size: if None, every update uses the gradient over all of _X.
      Otherwise each update uses batch_size rows sampled with replacement,
      and that gradient is divided by batch_size.
    - iters: maximum number of iterations; None means no limit other than
      the hard cap of 1000 iterations

    Returns the tuple (w, losses): the final weights and the list of loss
    values recorded at the start of each iteration.
    '''
    max_iters = 1000  # hard upper bound; iters can only lower it

    n_samples, n_features = _X.shape

    # Size the weights from the data rather than from a module-level global.
    w = np.random.uniform(low=-5, high=2, size=n_features)

    losses = []
    for i in range(max_iters):
        # Check for None first: comparing an int with None raises TypeError.
        if iters is not None and i >= iters:
            break
        this_ll = neg_log_likelihood(_X, w, _y)
        loss = this_ll + lambda_ * squared_l2_norm(w)
        losses.append(loss)
        if verbose:
            print("iter: {}, loss: {}, accuracy: {}".format(i, loss, accuracy(_X, w, _y)))

        # The convergence test always uses the gradient over the full data.
        full_grad = grad(_X, w, _y, lambda_=lambda_)
        if squared_l2_norm(full_grad) < tolerance:
            break

        if batch_size is None:
            # Reuse the gradient already computed for the convergence test.
            w -= eta * full_grad
        else:
            idx = np.random.randint(n_samples, size=batch_size)
            w -= eta * grad(_X[idx], w, _y[idx], lambda_=lambda_)/batch_size

    return w, losses

def prediction(X, w, threshold=.5):
    '''
    Classify every row of X.

    - Returns a Boolean array with one entry per row of X.
    - An entry is True when the weights dotted with that row's features
      exceed `threshold` (.5 by default).

    NOTE(review): the threshold is compared against the raw score X.dot(w),
    not against a logistic probability - confirm this is intended.
    '''
    _n_rows, _n_cols = X.shape  # unpacking raises unless X is 2-D
    scores = X.dot(w)
    return scores > threshold

def accuracy(X, w, y):
    '''
    Return the fraction (between 0 and 1) of data points whose predicted
    label matches the true label in y.
    '''
    is_correct = prediction(X, w) == y
    return np.mean(is_correct)

def init_data(N, dim_):
    '''
    Generate a synthetic binary-classification data set.

    - X is an N x dim_ matrix of random 0/1 features.
    - y is a Boolean array of length N. It is produced from hidden weights
      drawn uniformly from [-1, 1): the features are dotted with those
      weights, uniform noise from [-1, 1) is added, and the label is True
      when the logistic function of the result is above .5.

    Returns the tuple (X, y); the hidden weights are not returned.
    '''
    true_w = np.random.uniform(low=-1, high=1, size=dim_)

    coin_flips = np.random.rand(dim_ * N) > .5
    X = coin_flips.astype(int).reshape(N, dim_)

    clean_scores = X.dot(true_w)
    noise = np.random.uniform(low=-1, high=1, size=clean_scores.size)
    z_ = clean_scores + noise

    probabilities = 1/(1 + np.exp(-1 * z_))
    y = probabilities > .5

    return X, y

# Re-seed so the data set and the training run below are reproducible.
np.random.seed(42)

# Problem size: number of data points and number of features.
N, dim_ = 10000, 10

# This draw is overwritten by grad_descent's return value below, but it
# still advances the random number generator before the data is created.
w = np.random.uniform(low=-5, high=2, size=dim_)

X, y = init_data(N, dim_)

# First half of the rows is the training set, second half the test set.
split = N // 2
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Strength of the squared-L2 penalty on the weights.
lambda_ = .1

w, losses = grad_descent(
    X_train,
    y_train,
    eta=1,
    tolerance=.0001,
    iters=100,
    verbose=False,
    lambda_=lambda_,
    batch_size=10,
)


### Questions: regularization
- Complete the L2 norm function

- What does the variable `lambda_` do in the code above?

- What happens if you set `lambda_` to a huge number? What happens if you set `lambda_` to a small number? What should you see in terms of accuracy and the norm of the weights? Try systematically varying `lambda_`.

### Questions: Stochastic gradient descent
- Print the loss and vary the batch size:
    - How do you think that varying eta will vary the amount of noise in the loss?
    - How do you think that varying batch size will vary the amount of noise in the loss?
    
- Test your answers to the previous two questions by making a plot. Your plot should show the loss each iteration, for different batch sizes. You should try batch sizes of 1, 10 and 100. What do you observe in your plot?