# Gradient Descent and Dataset Tools

In [24]:
%%capture
import random, math
import import_ipynb
from stats_and_prob import *
from linear_algebra import *

This notebook contains a few basic tools to help with data tasks (vector manipulation and the like), gradient descent algorithms, and PCA routines.

In [2]:
def shape(A):
    '''returns (number of rows, number of columns) of the matrix A;
    the column count is read from the first row, or is 0 when A is empty'''
    row_count = len(A)
    if A:
        col_count = len(A[0])
    else:
        col_count = 0
    return row_count, col_count

In [3]:
def scalar_multiply(c, v):
    '''multiplies every component of the vector v by the scalar c'''
    scaled = []
    for component in v:
        scaled.append(c * component)
    return scaled

def vector_subtract(v, w):
    '''componentwise difference v - w; pairs are formed with zip,
    so the result is as long as the shorter of the two vectors'''
    difference = []
    for left, right in zip(v, w):
        difference.append(left - right)
    return difference

def dot(v, w):
    '''dot product: v_1 * w_1 + ... + v_n * w_n'''
    total = 0
    for left, right in zip(v, w):
        total = total + left * right
    return total

def sum_of_squares(v):
    '''v_1 * v_1 + ... + v_n * v_n, computed as the dot product of v with itself'''
    squares_total = dot(v, v)
    return squares_total

def squared_distance(v, w):
    '''sum of the squared componentwise differences between v and w'''
    difference = vector_subtract(v, w)
    return sum_of_squares(difference)

def distance(v, w):
    '''square root of squared_distance(v, w)'''
    squared = squared_distance(v, w)
    return math.sqrt(squared)

In [21]:
def partial_difference_quotient(f, v, i, h):
    '''computes the ith partial difference quotient of f at v:
    (f(w) - f(v)) / h, where w equals v with h added to coordinate i'''
    # nudge only the ith coordinate by h; every other coordinate is unchanged
    w = [v_j + (h if j == i else 0) for j, v_j in enumerate(v)]
    return (f(w) - f(v)) / h

def estimate_gradient(f, v, h=0.00001):
    '''estimates the gradient of f at v as the list of partial difference
    quotients, one per coordinate of v, each computed with step h'''
    # the list has to be returned: without the return statement every
    # caller received None instead of the estimated gradient
    return [partial_difference_quotient(f, v, i, h) for i, _ in enumerate(v)]

In [20]:
def step(v, direction, step_size):
    """return the point reached from v after adding step_size times
    each component of direction to the matching component of v"""
    moved = []
    for coordinate, delta in zip(v, direction):
        moved.append(coordinate + step_size * delta)
    return moved

def sum_of_squares_gradient(v):
    '''returns the list whose ith entry is 2 * v_i'''
    doubled = []
    for component in v:
        doubled.append(2 * component)
    return doubled

def safe(f):
    """define a new function that wraps f and return it

    the wrapper forwards all positional and keyword arguments to f and
    returns f's result; if f raises an ordinary exception the wrapper
    returns float('inf') instead
    """
    def safe_f(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except Exception:
            # Exception rather than a bare except: KeyboardInterrupt and
            # SystemExit must still propagate so a running cell can be stopped
            return float('inf')         # this means "infinity" in Python
    return safe_f

In [6]:
# candidate step sizes, from largest to smallest
# NOTE(review): minimize_batch works from its own list with these same values
# rather than reading this module-level name -- confirm whether this cell is still needed
step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

### Gradient Descent

Gradient descent plays many roles in data science, but most notably it serves as a way to find the minimum or maximum of functions when fitting models.  The general idea is to calculate the gradient and move either with it or against it.  For instance, when finding the best fit of a model where we seek to minimize the sum of squares, the algorithm would step through the parameter space in a way that goes against the gradient to find the parameters that minimize the error function.

In [22]:
def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001,
                   step_sizes=None):
    """use gradient descent to find theta that minimizes target function

    target_fn   -- function of theta whose value is minimized; it is wrapped
                   with safe(), so an evaluation error counts as infinity
    gradient_fn -- function of theta returning the gradient at theta
    theta_0     -- starting value of theta
    tolerance   -- stop once two successive target values differ by less
                   than this amount
    step_sizes  -- candidate step sizes tried on every iteration; defaults to
                   [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

    returns the theta held at the moment the loop stops (the point the last
    step started from)
    """
    if step_sizes is None:
        step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

    theta = theta_0                           # set theta to initial value
    target_fn = safe(target_fn)               # safe version of target_fn
    value = target_fn(theta)                  # value we're minimizing

    while True:
        gradient = gradient_fn(theta)
        # one candidate per step size, each moving against the gradient
        next_thetas = [step(theta, gradient, -step_size)
                       for step_size in step_sizes]

        # choose the one that minimizes the error function
        next_theta = min(next_thetas, key=target_fn)
        next_value = target_fn(next_theta)

        # if even the best candidate is inf/nan there is nowhere useful to
        # go; without this check abs(inf - inf) is nan, the convergence test
        # below never passes, and the loop would run forever
        if not math.isfinite(next_value):
            return theta

        # stop if we're "converging"
        if abs(value - next_value) < tolerance:
            return theta

        theta, value = next_theta, next_value

In [23]:
def negate(f):
    """wrap f in a function that passes its arguments through to f
    and returns the negated result, -f(x)"""
    def negated(*args, **kwargs):
        result = f(*args, **kwargs)
        return -result
    return negated

def negate_all(f):
    """like negate, for an f that returns a list of numbers:
    the wrapper returns a new list with every entry negated"""
    def negated(*args, **kwargs):
        flipped = []
        for y in f(*args, **kwargs):
            flipped.append(-y)
        return flipped
    return negated

def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    """maximize target_fn by running minimize_batch on the negated
    target function and the negated gradient function"""
    flipped_target = negate(target_fn)
    flipped_gradient = negate_all(gradient_fn)
    return minimize_batch(flipped_target, flipped_gradient, theta_0, tolerance)

### Stochastic Gradient Descent

Calculating the error function over the entire dataset at every step (the batch approach) can be computationally expensive.  When errors are additive, we can use stochastic gradient descent (SGD) instead: we update the model parameters one training sample at a time, visiting the samples in random order, and stochastically converge on the optimal parameters.  Formally, this only approximates the best fit, but it is much faster and, for practical purposes, usually just as good as full gradient descent.

In [10]:
def in_random_order(data):
    '''generator that yields each element of data once, visiting the
    positions in a shuffled order; data itself is not modified'''
    order = [position for position, _ in enumerate(data)]
    random.shuffle(order)                        # shuffles the positions in place
    yield from (data[position] for position in order)

In [11]:
def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01,
                        max_iterations_with_no_improvement=100,
                        step_decay=0.9):
    '''stochastic gradient descent over the paired samples zip(x, y)

    target_fn   -- called as target_fn(x_i, y_i, theta); the values for all
                   samples are summed to get the quantity being minimized
    gradient_fn -- called as gradient_fn(x_i, y_i, theta); returns the
                   gradient contribution of one sample
    theta_0     -- initial guess for theta
    alpha_0     -- initial step size; restored whenever a new minimum is found
    max_iterations_with_no_improvement
                -- stop after this many consecutive passes without a new
                   minimum (default 100)
    step_decay  -- factor applied to the step size after each pass that
                   did not improve on the minimum (default 0.9)

    returns the theta with the smallest summed target value seen, or None
    if no pass ever produced a value below infinity
    '''
    data = list(zip(x, y))
    theta = theta_0                             # initial guess
    alpha = alpha_0                             # initial step size
    min_theta, min_value = None, float('inf')   # the minimum so far
    iterations_with_no_improvement = 0

    # stop once too many passes in a row have failed to improve
    while iterations_with_no_improvement < max_iterations_with_no_improvement:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)

        if value < min_value:
            # if we find a new minimum, remember it
            # and go back to original step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            # otherwise we're not improving, so try shrinking the step size
            iterations_with_no_improvement += 1
            alpha *= step_decay

        # and take a gradient step for each of the data points
        for x_i, y_i in in_random_order(data):
            gradient_i = gradient_fn(x_i, y_i, theta)
            theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i))

    return min_theta
    

In [12]:
def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
    '''maximize target_fn by running minimize_stochastic on the negated
    target function and the negated gradient function'''
    flipped_target = negate(target_fn)
    flipped_gradient = negate_all(gradient_fn)
    return minimize_stochastic(flipped_target, flipped_gradient,
                               x, y, theta_0, alpha_0)

## Data Tools

In [13]:
def correlation_matrix(data):
    '''builds the num_columns x num_columns matrix whose (i, j)th entry
    is correlation(column i of data, column j of data)'''
    num_columns = shape(data)[1]
    return make_matrix(
        num_columns,
        num_columns,
        lambda i, j: correlation(get_column(data, i), get_column(data, j)))

In [14]:
def scale(data_matrix):
    '''returns (means, stdevs): two lists holding the mean and the
    standard deviation of every column of data_matrix'''
    _, num_cols = shape(data_matrix)

    means = []
    for j in range(num_cols):
        means.append(mean(get_column(data_matrix, j)))

    stdevs = []
    for j in range(num_cols):
        stdevs.append(standard_deviation(get_column(data_matrix, j)))

    return means, stdevs

def rescale(data_matrix):
    '''returns a new matrix in which every entry has its column mean
    subtracted and is divided by its column standard deviation;
    columns whose standard deviation is not positive are copied unchanged'''
    means, stdevs = scale(data_matrix)
    num_rows, num_cols = shape(data_matrix)

    def rescaled(i, j):
        spread = stdevs[j]
        if not spread > 0:
            # nothing to divide by, keep the raw entry
            return data_matrix[i][j]
        return (data_matrix[i][j] - means[j]) / spread

    return make_matrix(num_rows, num_cols, rescaled)

In [15]:
def de_mean_matrix(A):
    '''returns a matrix shaped like A in which every entry
    has had the mean of its column subtracted'''
    nr, nc = shape(A)
    column_means = scale(A)[0]

    def centered(i, j):
        return A[i][j] - column_means[j]

    return make_matrix(nr, nc, centered)

def direction(w):
    '''divides every component of w by magnitude(w)'''
    length = magnitude(w)
    rescaled = []
    for component in w:
        rescaled.append(component / length)
    return rescaled

def directional_variance_i(x_i, w):
    '''the variance of the row x_i in the direction of w:
    the square of dot(x_i, direction(w))'''
    projection_length = dot(x_i, direction(w))
    return projection_length ** 2

def directional_variance(X, w):
    '''the variance of the data in the direction determined by w:
    the sum of directional_variance_i over the rows of X'''
    total = 0
    for row in X:
        total = total + directional_variance_i(row, w)
    return total

def directional_variance_gradient_i(x_i, w):
    '''the contribution of row x_i to the gradient of the direction-w variance:
    each component of x_i multiplied by 2 * dot(x_i, direction(w))'''
    projection_length = dot(x_i, direction(w))
    contribution = []
    for component in x_i:
        contribution.append(2 * projection_length * component)
    return contribution

def directional_variance_gradient(X, w):
    '''passes the per-row gradient contributions of X (as a generator)
    to vector_sum and returns the result'''
    per_row_gradients = (directional_variance_gradient_i(row, w) for row in X)
    return vector_sum(per_row_gradients)

### PCA

Principal component analysis (PCA) is a way to reduce the dimensionality of your data by transforming it through projections and removing dimensions that contain less information.  The principal components are identified as the directions with the highest variance, i.e. the eigenvectors with the largest eigenvalues.  This technique is also useful because it reprojects your data onto orthogonal directions.  For instance, consider a three-dimensional dataset that looks like an angled cigar.  The first principal component is the long direction of the cigar, and PCA would re-orient the data through a linear transformation so that its principal directions line up with the x, y, and z axes.

In [16]:
def first_principle_component(X):
    '''runs maximize_batch on directional_variance(X, w) as a function of w,
    starting from a vector of ones as long as the first row of X, and
    returns direction() of the resulting w'''
    guess = [1 for _ in X[0]]
    # closures over X are used instead of partial(...): partial is never
    # imported by name in this notebook, and the closures need no import
    unscaled_maximizer = maximize_batch(
        lambda w: directional_variance(X, w),           # as a function of w
        lambda w: directional_variance_gradient(X, w),  # as a function of w
        guess)
    return direction(unscaled_maximizer)

def first_principle_component_sgd(X):
    '''stochastic variant: runs maximize_stochastic on the per-row
    directional variance, starting from a vector of ones as long as the
    first row of X, and returns direction() of the resulting w'''
    def row_variance(x, _, w):
        return directional_variance_i(x, w)

    def row_gradient(x, _, w):
        return directional_variance_gradient_i(x, w)

    guess = [1 for _ in X[0]]
    # maximize_stochastic expects a y for every x; these are ignored placeholders
    placeholder_targets = [None for _ in X]
    unscaled_maximizer = maximize_stochastic(
        row_variance, row_gradient, X, placeholder_targets, guess)
    return direction(unscaled_maximizer)

In [17]:
def project(v, w):
    '''return the projection of v onto the direction w,
    computed as dot(v, w) times w'''
    # NOTE(review): presumably w is expected to have magnitude 1 -- confirm with callers
    coefficient = dot(v, w)
    return scalar_multiply(coefficient, w)

def remove_projection_from_vector(v, w):
    '''returns v minus its projection onto w'''
    projection = project(v, w)
    return vector_subtract(v, projection)

def remove_projection(X,w):
    '''returns a new list of rows: each row of X
    with its projection onto w subtracted'''
    remaining = []
    for row in X:
        remaining.append(remove_projection_from_vector(row, w))
    return remaining

In [18]:
def principal_component_analysis(X, num_components):
    '''finds num_components components one after another: each round takes
    the first component of the current data, records it, and removes the
    projection onto it from every row before the next round

    returns the list of components in the order they were found'''
    components = []
    for _ in range(num_components):
        # the function defined in this notebook is spelled "principle";
        # the "principal" spelling used here before raised a NameError
        component = first_principle_component(X)
        components.append(component)
        # rebinds the local name only; the caller's X is left untouched
        X = remove_projection(X, component)

    return components

In [19]:
def transform_vector(v, components):
    '''returns the list of dot products of v with each entry of components'''
    coordinates = []
    for component in components:
        coordinates.append(dot(v, component))
    return coordinates

def transform(X, components):
    '''applies transform_vector to every row of X and returns the list of results'''
    transformed_rows = []
    for row in X:
        transformed_rows.append(transform_vector(row, components))
    return transformed_rows