In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy.random as rnd
import bokeh.charts as bc
import bokeh.plotting as bp
from bokeh.palettes import Spectral11, Blues9

In [2]:
import bokeh as bk
bk.io.output_notebook()

## Logistic Regression, Softmax and Gradient Descent

### Generate some 2D data to work with

In [3]:
import data_gen as gen
gm = gen.GaussianMixture(n_class=2)

### Frequentist Logistic regression:

#### The model:
$$Y|X \sim \text{Bernolli}( \frac{1}{1+e^{-wX}})$$

#### MLE:
* log-likelihood: 

\[ 
\begin{align*}
    &\log{P(Y_1, Y_2, \ldots, Y_n\mid X_1, X_2, \ldots, X_n, w)}    \\
    =& \log{\prod_{i=1}^{n} {P(Y_i \mid X_i, w)}} \\
    =& \sum_{i=1}^{n}{\log{P(Y_i \mid X_i, w)}} \\
    & \text{(Assume \( Y_i\) is bernolli distrubuted with \( p=\frac{1}{1+e^{-wX}}\) )} \\
    =& \sum_{i=1}^{n}{\log{P(Y_i=1 \mid X_i, w)^{I_{Y_i=1}}P(Y_i=0 \mid X_i, w)^{I_{Y_i=0}}}} \\
    =& \sum_{i=1}^{n}{\log{p^{I_{Y_i=1}}(1-p)^{I_{Y_i=0}}}} \\
    =& \sum_{i=1}^{n}{{I_{Y_i=1}}\log{p} +{I_{Y_i=0}}\log{(1-p)}} \\
    =& \sum_{i=1}^{n}{Y_i\log{p} + (1-Y_i)\log{(1-p)}} \\
    =& \sum_{i=1}^{n}{Y_i\log{\frac{p}{1-p}} + \log{(1-p)}} \\
    =& \sum_{i=1}^{n}{Y_iwX_i + \log{\frac{e^{-wX_i}}{1+e^{-wX_i}}}} \\
    =& \sum_{i=1}^{n}{Y_iwX_i - wX_i - \log{(1+e^{-wX_i})}} \\
\end{align*}
\] 

* Maximize log-likelihood by gradient descent(ascent): 
Gradient: 

\[ 
\begin{align*}
    & \nabla_w{(\sum_{i=1}^{n}{Y_iwX_i - wX_i - \log{(1+e^{-wX_i})}})} \\
    =& \sum_{i=1}^{n} Y_iX_i - X_i + \frac{X_ie^{-wX_i}}{1+e^{-wX_i}}\\
%    =& \sum_{i=1}^{n} Y_iX_i - X_i\frac{1}{1+e^{-wX_i}}\\
    & \text{(Let}\hat{Y}_i = \frac{1}{1+e^{-wX_i}}) \\
    =& \sum_{i=1}^{n} (Y_i-\hat{Y}_i) X_i\\
\end{align*}
\] 

Find the maximum by: 

$$
    w_{t+1} = w_{t} + \sum_{i=1}^{n} (Y_i-\hat{Y}_i) X_i
$$

In [4]:
def logistic(_x, _w):  
    # _x : numpy 2DArray
    # _w : numpy 1DArray
    # _w.size = _x.shape[1]
    return 1/(1 + np.exp(-_x.dot(_w)))

max_iter = 1000
min_err_change = 0.0001

add_offset = lambda X: np.hstack((X, np.ones((X.shape[0], 1))))
X_with_offset = add_offset(gm.tr.X)

def regress(_x, _w):
    return logistic(add_offset(_x), _w)

w = rnd.uniform(-1, 1, X_with_offset.shape[1])

lr = 0.1  # Learning rate
beta = 0.8 # Backtracking
alpha = 0.5 # Backtracking

def log_likelihood(_X, _Y, _w):
    wX = _X.dot(_w)
    return (_Y - 1).dot(wX) - np.sum(np.logaddexp(0, -wX))
    
def log_likelihood2(_Y, _Yhat):
    _p = (1 - _Y)*(1 - _Yhat) + _Y*_Yhat
    return np.sum(np.log(_p))

err0 = None

for _ in range(max_iter):
    pred_lbs = logistic(X_with_offset, w)
    diff_lb = gm.tr.y - pred_lbs
    err = np.sum(np.abs(diff_lb))
    
    _lll = log_likelihood(X_with_offset, gm.tr.y, w)
    _lll2 = log_likelihood2(gm.tr.y, pred_lbs)
    # print((lr, err, _lll, _lll2, w))
    if err0:
        if abs(err - err0) < min_err_change:
            print(_)
            break
    err0 = err
    grad = diff_lb.dot(X_with_offset)
    lr = 0.1  # Learning rate
    while log_likelihood(X_with_offset, gm.tr.y, w+lr*grad) < _lll + lr*alpha*grad.dot(grad):
        lr *= beta
    w += lr*grad


In [5]:
gen.visualize_2D(gm.Classes, classifyF=lambda x: regress(x, w), res=80)

### Multi-class Logistic Regression

* Assumption:

$$
Y|X = \text{Categorical}(p_1, \ldots, p_n = \frac{e^{w_n X}}{\sum_{k=1}^{n}{e^{w_k X}} })
$$

* Log likelihood:

\[
\begin{align*}
    =& \log{\prod_{i=1}^{n} {P(Y_i \mid X_i, w_1, \ldots, w_n)}} \\
    =& \sum_{i=1}^{n}{\log{P(Y_i \mid X_i, w_1, \ldots, w_n)}} \\
    & (\text{Assume \( Y_i \) is catigorical distrubuted with } p_1, \ldots, p_n) \\
    =& \sum_{i=1}^{n}{\log{\prod_{k=1}^{n}{P(Y_i=1 \mid X_i, w_k)^{I_{Y_i=k}}}}} \\
    =& \sum_{i=1}^{n}{\sum_{k=1}^n {\log{p_k^{I_{Y_i=k}}}}} \\
    =& \sum_{i=1}^{n}{\sum_{k=1}^n {I_{Y_i=k}\log{p_k}}} \\
    =& \sum_{i=1}^{n}{\sum_{k=1}^n {Y_i\log{\frac{e^{w_n X_i}}{\sum_{k=1}^{n}{e^{w_k X_i}}}}}} \\
    =& \sum_{i=1}^{n}{\sum_{k=1}^n {Y_i(w_k X_i - \log{\sum_{k=1}^{n}{e^{w_k X_i}}})}} \\
\end{align*}
\]

* Gradient of log likelihood

\[
\begin{align*}
    &\nabla_W{\sum_{i=1}^{n}{\sum_{k=1}^n {Y_i(w_k X_i - \log{\sum_{k=1}^{n}{e^{w_k X_i}}})}}}\\
    =& \sum_{i=1}^{n}{Y_i X_i + \hat{Y}_i X_i} \\
    &(\text{where } \hat{Y}_i = \frac{e^{w_n X_i}}{\sum_{k=1}^{n}{e^{w_k X_i}}})
\end{align*}
\]


In [139]:
gm3 = gen.GaussianMixture(n_class=3)

In [6]:
def softmax(_x, _W):  
    # _x : numpy 2DArray
    # _w : numpy 1DArray
    # _w.size = _x.shape[1]
    
    return 1/np.sum(np.exp(_x.dot(_W)))

max_iter = 100
min_err_change = 0.0001

lr = 1  # Learning rate
beta = 0.8 # Backtracking
alpha = 0.5 # Backtracking
w0 = None

w = rnd.uniform(-1, 1, x_dim)

def log_likelihood(_X, _Y, _w):
    wX = _X.dot(w)
    return (_Y - 1).dot(wX) - np.sum(np.logaddexp(0, -wX))
    
err0 = None
for _ in range(max_iter):
    pred_lbs = logistic(tr_fts, w)
    diff_lb = tr_lbs - pred_lbs
    err = np.sum(np.abs(diff_lb))
    
    _lll = log_likelihood(tr_fts, tr_lbs, w)
    print((err, _lll, w))
    if err0:
        if abs(err - err0) < min_err_change:
            print(_)
            break
    err0 = err
    grad = diff_lb.dot(tr_fts)
    while log_likelihood(tr_fts, tr_lbs, w+lr*grad) > _lll + lr*alpha*grad.dot(grad):
        lr *= beta
    w += lr*grad

def regress(_x):
    return logistic(_x, w)

NameError: name 'x_dim' is not defined