ml_cheat_sheet.tex

% You should title the file with a .tex extension (hw1.tex, for example)
\documentclass[11pt]{article}

\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{fancyhdr}
\usepackage{listings}
\usepackage{algpseudocode}
\usepackage{color}
\usepackage{color}   %May be necessary if you want to color links
\usepackage{hyperref}


\oddsidemargin0cm
\topmargin-2cm     %I recommend adding these three lines to increase the 
\textwidth16.5cm   %amount of usable space on the page (and save trees)
\textheight23.5cm  


\begin{document}
\tableofcontents

\medskip                        % Skip a "medium" amount of space
                                % (latex determines what medium is)
                                % Also try: \bigskip, \littleskip


\section{Some definitions}
\begin{itemize}
\item overfitting: Given $H$, $h$ overfits if $\exists h' \in H$ such that $h'$ has smaller error over all the instances even though $h$ has a smaller error over the training examples.
\end{itemize}

\section{Lazy vs Eager}
\begin{itemize}
\item k-NN, locally weighted regression, and case-based reasoning are lazy
\item \textsc{BACKPROP}, RBF is eager (why?), ID3 eager
\item Lazy algorithms may use query instance $x_q$ when deciding how to generalize (can represent as a bunch of local functions). Eager methods have already developed what they think is the global function.
\end{itemize}

\section{Decision Trees}

\subsection{ID3 Algorithm}
\begin{itemize}
\item Constructs trees topdown. Greedy algorithm. Hypothesis space of ID3: set of decision trees. Complete space, maintains only a single hypothesis. Uses all traning examples at each step (reduced sensitivity to individual error).
\begin{itemize}
\item A $\leftarrow$ best attribute
\item assign A as decision attribute for Node
\item for each value of A, create a descendant of node
\item sort training examples to leaves
\item if examples perfectly classified, stop
\item else iterate over leaves
\end{itemize}
\item $Entropy(S) = \sum_{i=1}^{c} -p_i lg(p_i)$ ($p_i$ is proportion of $S$ belonging to class $i$, also base can vary--what would cause us to do that?)
\item $Gain(S,A) = Entropy(S) - \sum_{v \in values(A)} \frac{|S_v|}{|S|} Entropy(S_v)$
\begin{itemize}
\item $S_v$: subset of $S$ for which attribute $A$ has value $v$
\end{itemize}
\end{itemize}

\subsection{Inductive Bias of ID3}
\begin{itemize}
\item prefers shorter trees
\item highest info gain attributes
\end{itemize}

\subsection{Pruning}
\begin{itemize}
\item Reduced error  (?)
\item Rule post-pruning (?)
\begin{itemize}
\item grow the tree
\item convert tree into equivalent set of rules
\item prune (generalize) each rule by removing preconditions that result in improving its estimated accuracy
\item sort pruned rules by estimated accuracy. Consider them in this sequence when classifying subsequent instances.
\end{itemize}
\end{itemize}

\subsection{Adapting Decision Trees to Regression(?)}
\begin{itemize}
\item splitting criteria: variance
\item leaves: average local linear fit
\end{itemize}

\section{Regression and Classification}
\begin{itemize}
\item Least squared error:The objective consists of adjusting the parameters of a model function to best fit a data set. A simple data set consists of n points (data pairs) $(x_i,y_i)$, i = 1, ..., n, where $x_i$ is an independent variable and $y_i$ is a dependent variable whose value is found by observation. The model function has the form $f(x,\beta)$, where the m adjustable parameters are held in the vector $\boldsymbol \beta$. The goal is to find the parameter values for the model which "best" fits the data. The least squares method finds its optimum when the sum, S, of squared residuals

$S=\sum_{i=1}^{n}{r_i}^2$
is a minimum. A residual is defined as the difference between the actual value of the dependent variable and the value predicted by the model.

$r_i=y_i-f(x_i,\boldsymbol \beta)$
An example of a model is that of the straight line in two dimensions. Denoting the intercept as $\beta_0$ and the slope as $\beta_1$, the model function is given by $f(x,\boldsymbol \beta)=\beta_0+\beta_1 x$.
\end{itemize}

\section{Neural Networks}
\subsection{Perceptrons}
$$o(x_1...x_n) =
    \begin{cases}
            1, &         \text{if } w_0+w_1x_1+...+w_nx_n>0,\\
            0, &         \text{otherwise}.
    \end{cases}
$$
where $w_0,...,w_n$ is a real-valued weight. Note that $w_0$ is a threshold that must be surpassed for the perceptron to output 1. Alternatively: $o(\vec{x}) = sgn(\vec{w}\vec{x})$. $H =\{\vec{w} | \vec{w} \in \mathbb{R}^{n+1} \}$.

\subsection{Perceptron Training Rule vs Delta Rule}
\begin{itemize}
\item Perceptron training rule: begin with random weights, apply perceptron to each training example, update perceptron weights when it misclassifies. Iterates through training examples repeatedly until it classifies all examples correctly.
\begin{itemize}
\item $w_i \leftarrow w_i + \Delta w_i$
\item $\Delta w_i=\eta (t-o) x_i$, $t$: target output for current training example. $o$:output generated for current training example. $\eta$: learning rate.
\end{itemize}
\item To converge, Perceptron training rule needs data to be linearly separable( Decision for this hyperplane is $\vec{w}\vec{x} >0$) and for $\eta$ to be sufficiently small.
\item Delta rule uses \textit{gradient descent}.
\begin{itemize}
\item (?) task of training linear unit (1st stage of a perceptron without the threshold): $o(\vec{x}) = \vec{w}\vec{x}$
\item training error: $E(\vec{w})=\frac{1}{2} \sum_{d \in D} (t_d - o_d)^2$, where $D$: training examples, $t_d$: target output for training example $d$, and $o_d$: output of linear unit for training example $d$.
\item Gradient descent finds global minimum of $E$ by initializing weights, then repeatedly modifying until it hits the global min. Modification: alters in the direction that gives steepest descent. $\nabla E(\vec{w})= [\frac{\partial E}{\partial w_0},...,\frac{\partial E}{\partial w_n}]$
\item Training rule for gradient descent: $w_i \leftarrow w_i + \Delta w_i$\\
$\Delta \vec{w} = -\eta \nabla E(\vec{w})$
\item Training rule can also be written in its component form: $w_i \leftarrow w_i+\Delta w_i$\\ $\Delta w_i = -\eta \frac{\partial E}{\partial w_i}$
\item Efficient way of finding $\frac{\partial E}{\partial w_i} = \sum_{d \in D} (t_d-o_d)(-x_{id})$, where $x_{id}$ (?) represents single input component $x_i$ for training example $d$.
\item Rewrite: $\Delta w_i = \eta \sum_{d\in D} (t_d - o_d) (x_{id})$ (true gradient descent)
\item Problems: slow; possibly multiple local minima in error surface (?-I thought error function was smooth, and would always find the global minimum. Example why not?)
\item (?) Stochastic gradient descent: $\Delta w_i = \eta (t-o)x_i$ (known as delta rule). Error rule: $E_d(\vec{w}) = \frac{1}{2} (t_d - o_d)^2$ (?-relationship to the other gradient descent? Why don't we need to separate it by $x_{id}$ anymore? Is this a vector?)
\item Stochastic versus True gradient descent
\begin{itemize}
\item true: error summed over all examples before updating weights. stochastic: weights updated upon examining each training example
\item summing over multiple examples require more computation per weight update step. But using true gradient, so can use a larger step size
\item Stochastic avoids multiple local minima because it uses $\nabla E_d(\vec{w})$ not $\nabla E(\vec{w})$
\end{itemize}
\end{itemize}
\item The cost function for a neural network is non-convex, so it may have multiple minima. Which minimum you find with gradient descent depends on the initialization.
\end{itemize}

\subsection{Threshold Unit}
Unit for multilayer networks. Want a network that can represent highly nonlinear functions. Need unit whose output is nonlinear, but the output is also differentiable function of its inputs. $o = \sigma (\vec{w} \vec{x})$ where $\sigma(y) = \frac{1}{1-e^y}$
\subsection{\textsc{BACKPROP}}
$$E(\vec{w}) = \frac12 \sum_{d\in D} \sum_{k \in outputs}{(t_{kd} - o_{kd})^2}$$ where outputs: set of output units in network, $t_{kd}$ target, $o_{kd}$ output associated with $k^{th}$ output unit and training example $d$. (?)\\
{\color{red}Algorithm BACKPROP}
\begin{itemize}
\item until termination condition is met:
\item for i = 1 to m (m is the number of training examples)
\begin{itemize}
\item set $a^{(1)} = x^{(i)}$ ($i^{th}$ training example)
\item Perform forward propagation by computing $a^{(l)}$ for $l = 2,...,L$ ($L$ is total number of layers) $a^{(l)}= \sigma (w^{(l-1)}a^{(l-1)}) $ = output of the $l^{th}$ layer. 
\item Using $y^{(i)}$ compute $\delta^{(L)} = a^{(L)} - y^{(i)}$ ($y^{(i)}$ is the target for the $i^{th}$ training example)
\item Then calculate (??) $\delta^{(L-1)}$ up until $\delta^{(2)}$ ($\delta^{(l)}$ is the "error" of layer $l$ and $$\delta^{(l)} = w^{(l)}\delta^{(l+1)} .* \sigma'(w^{(l)} a^{(l)})$$
\item update $w^{(l)} = w^{(l)} + \Delta w^{(l)}$ (represents a vector of the weights of layer $l$) where  $$\Delta w^{(l)} = \eta \delta^{(l)}.*x^{(l)}$$
\end{itemize}
\end{itemize}

\subsection{Momentum}
$$\Delta w^{(l)}_n= \eta \delta^{(l)}.*x^{(l)} +\alpha w^{(l)} (n-1) $$\\ where $n$ is the iteration (adds a momentum $\alpha$)
\begin{itemize}
\item $E_d(\vec{w}) = \frac12 \sum_{k\in outputs}{(t_k - o_k)^2}$ error on training example $d$
\item How to derive the \textsc{BACKPROP} rule??
\item \textsc{BACKPROP} for multi-layer networks may converge only at a local minimum (because error surface for multi-layer networks may contain many different minima).
\item Alternative Error Functions?
\item Alternative Error Minimization Procedures
\end{itemize}

\subparagraph{Recurrent Networks}
What do I need to know about recurrent networks?

\subsection{Radial Basis Functions}
\begin{itemize}
\item $\hat{f}(x) = w_0+ \sum_{u=1}^k{w_uKern_u(d(x_u, x))}$
\item Equation can be thought of as training a 2-layer network. First layer computes $Kern_u$, second layer computes a linear combination of these first layer values.
\item Kernel is defined such that $d(x_u, x) \uparrow \implies Kern_u \downarrow$
\item RBF gives global approximation to target function represented by linear combinations of many local kernel functions (smooth linear combination).
\item Faster to train than \textsc{BACKPROP} because input and output layer are trained separately.
\item RBF is eager: represents global function as a linear combo of multiple local kernel functions. Local approximations RBF creates are not specifically targeted to the query.
\item A type of ANN constructed from spatially localized kernel functions. Sort of the `link' between k-NN and ANN?
\end{itemize}


\section{Instance Based Learning}
\subsection{k-NN}
\begin{itemize}
\item discrete: $$\hat{f} (x_q) =argmax_{v \in V} \sum_{i=1}^k{\delta(v, f(x_i))}$$ where $\delta(a,b) =1$ if $a=b$ and $0$ otherwise.
\item continuous (for a new value, $x_q$): $$\hat{f} (x_q) = \frac{\sum_{i=1}^{k}{f(x_i)}}{k}$$
\item distance-weighted: $w_i = \frac{1}{d(x_q, x_i)^2}$. If $x_q = x_i$ assign $\hat{f} (x_q) = f(x_i)$ (if more than one, do a majority).
\item real valued distance weighted: $$\hat{f} (x_q) = \frac{\sum_{i=1}^{k}{f(x_i)}}{\sum_{i=1}^{k}{w_i}}$$
\item  Inductive Bias of k-NN: assumption that nearest points are most similar
\item k-NN is sensitive to having many irrelevant attributes `curse of dimensionality' (can deal with it by `stretching the axes', add a weight to each attribute. Can even get rid of some of the attributes by setting the weight =0)
\end{itemize}

\subsection{Locally Weighted Linear Regression}
\begin{itemize}
\item $f$ approximated near $x_q$ using $\hat{f}(\vec{x}) = \vec{w} \cdot \vec{x}$ (is this appropriate notation?)
\item Error function using kernel: $E(x_q) = \frac12 \sum_{k\in K}{(f(x) - \hat{f}(x))^2 Kern(d(x_q, x))}$ where $K$ is the set of $k$ closest $x$ to $x_q$.
\end{itemize}

\section{Support Vector Machines}
Maximal Margin Hyperplanes: if data linearly separable, then $\exists (\vec{w}, b)$ such that $\vec{w}^T\vec{x} + b \geq 1$ $\forall \vec{x_i} \in P$ and $\vec{w}^T\vec{x} + b \leq -1$ $\forall \vec{x_i} \in N$ (N, P are the two classes). Want to minimize $\vec{w}^T\vec{w}$ subject to constraints of linear separability.\\ Or, maximize $\frac2{|w|}$ while $y_i(\vec{w}^T\vec{x_1}+b) \geq 1$ $\forall i$. Note $y_i =\{+1, -1\}$. Or minimize $\frac12 |w|^2$. This is quadratic programming problem.\\
$W(\alpha) = \sum_{i} \alpha_i - \frac12 \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^T x_j$. $w = \sum_i \alpha_i x_i y_i$. $\alpha_i$ mostly 0 $\implies$ only a few of the x's matter.
\subsection{Kernel Induced Feature Spaces}
Map to higher dimensional \textit{feature space}, construct a separating hyperplane. $X \rightarrow H$ is $\vec{x} \rightarrow \phi(\vec{x}).$\\
Decision function is $f(\vec{x}) = sgn (\phi(\vec{x}) w^*+b^*)$ (* means optimal weight and bias)\\
Kernel function: $K(\vec{x} \vec{z}) =\phi(\vec{x})^T\phi(\vec{z})$. If $K$ exists, we don't even need to know what $\phi$ is.\\
Mercer's condition: \\
What if data is not linearly separable? (slack variables?)\\
Lagrangian?\\
Mercer's Theorem? \\

\subsection{Relationship between SVMs and Boosting}
$H_{trial} (x) = \frac{sgn(\sum_i{\alpha_i x_i})}{\sum_i \alpha_i}$. As we use more and more weak learners, the error stays the same, but the confidence goes up. This equates to having a big margin (big margins tend to avoid overfitting). 


\section{Boosting}
Boosting problem: set of weak learners combined to produce a learner with an arbitrary high accuracy.\\
The original boosting problem asks whether a set
of weak learners can be combined to produce a learner with an arbitrary
high accuracy. A weak learner is a learner whose performance (at
classification or regression) is only slightly better than random guessing.
AdaBoost: trains multiple weak classifiers on training data, then combines into single boosted classifier. Weighted sum of weak classifiers with weights dependent on weak classifier accuracy.\\
$N$ training examples: $x_i$, $y_i \in \{-1, +1\}$. Each example $i$ has an observation weight $w_i$ (how important example $i$ is for our current learning task).\\
Classifier $G$: $err_S = \sum_{i=1}^N{w_i I(y_i \neq G(x_i))}$ \\
Using weights: $err = \frac{\sum_{i=1}^N{w_i I(y_i \neq G(x_i))}}{\sum_{i=1}^N w_i}$\\
In this way, our error metric is more sensitive to misclassified examples
that have a greater importance weight. Denominator is only for normalization (we want an answer between 0 and $N$).
Boosting: weights are sequentially updated. Algorithm:\\
\begin{itemize}
\item initialize $w_i = \frac1N$
\item for $m =1$ to $M$:
\begin{itemize}
\item fit $G_m(x)$ using $w_i$'s
\item compute $$err_m = \frac{\sum_{i=1}^N{w_i I(y_i \neq G_m(x_i))}}{\sum_{i=1}^N w_i}$$
\item $\alpha_m = \frac{log(1-err_m)}{err_m}$
\item $w_i \leftarrow w_i \cdot exp(\alpha_m I(y_i \neq G_m(x_i))$ for $i = 1 \dots N$
\end{itemize}
\item $G(x) = sign [ \sum_{m=1}^M \alpha_m G_m(x)]$ In this way, classifiers that have a poor accuracy (high error rate, low $\alpha_m$) are penalized in the final sum.
\end{itemize}

Question : where are these $G_m$'s coming from? Are they pre-set or are they created by the algorithm?

\section{Computational Learning Theory}
\subsection{Definitions}
\begin{itemize}
\item $H$--hypothesis space. $c \in H$--true hypothesis. $h \in H$--candidate hypothesis. $S \subseteq H$--training set.
\item Consistent learner: Learner outputs a hypothesis such that $h(x) = c(x)$ $\forall x \in S$
\item Version space: $VS(S) = \{ h \in H: $h$ \text{ consistent wrt to } S \}$ (ie, hypothesis consistent with training examples)
\item training error: fraction of training examples misclassified by $h$.
\item true error: fraction of examples that would be misclassified on sample drawn from $D$ (distribution over inputs). $error_D(h) = Pr_{x \sim D} [c(x) \neq h(x)]$
\item $C$ is PAC-learnable by learner $L$ using $H$ $\iff$ $L$ will output $h \in H$ (with probability $1-\delta$) such that $error_D(h) \leq \varepsilon$ in time and samples polynomial in $1/\varepsilon$, $1/ \delta$, $|H|$.
\item $\varepsilon$-exhausted version space: $VS(S)$ exhausted iff $\forall h \in VS(S)$ $error_D(h) \leq \varepsilon$.
\end{itemize}

\subsection{Haussler Theorem}
Bounds true error.\\
Let $error_D(h_i) > \varepsilon$ for $i = 1 \dots k$ (some $h_i$'s in $H$). How much data do we need to ``knock out'' all these hypotheses?\\
$Pr_{x \sim D} [h_i(x) = c(x)] \leq 1- \varepsilon$ (probability that $h_i$ matches true concept is low)\\
$Pr(h_i \text{ consistent with $c$ on $m$ examples}) \leq (1- \varepsilon)^m$ (independent).\\
$Pr(\exists h_i \text{ consistent with $c$ on $m$ examples}) = k \cdot (1- \varepsilon)^m \leq |H| \cdot (1-\varepsilon)^m$\\
$-\varepsilon \geq ln(1- \varepsilon) \implies (1- \varepsilon)^m \leq exp(-\varepsilon m)$\\
Upper bound that VS not $\varepsilon$-exhausted after $m$ samples: $|H|\cdot exp(-\varepsilon m)$.\\
Want: $|H| \cdot exp(-\varepsilon m) \leq \delta$ (solve for m).\\
$m \geq \frac{1}{\varepsilon} (ln(|H|) + ln (\frac{1}{\delta})$

\subsection{Infinite Hypotheses Spaces}
\begin{itemize}
\item Examples: linear separators, ANNs, decision trees (continuous inputs)
\item $m \geq \frac{1}{\varepsilon} (8VC(H)lg(\frac{13}{\varepsilon}) + 4 lg( \frac{2}{\delta})$
\item shatter: A set of instances $S$ is shattered by $H$ if every possible dichotomy of $S$ $\exists h \in H$ that is consistent with this dichotomy.
\item $VC(H)$ is size of largest finite subset of instance space that can be shattered by $H$.
\item $C$ PAC-learnable iff VC dimension is finite.
\end{itemize}


\section{Bayesian Learning}
\subsection{Equations and Definitions}
\begin{itemize}
\item $P(h)$ : probability that a hypothesis $h$ holds
\item $P(D)$: probability that training data $D$ will be observed
\item Bayes' Rule: $$P(h|D) = \frac{P(D|h)P(h)}{P(D)}$$
\item Find most probable $h \in H$ given $D$: $$h_{map} = argmax_{h \in H} P(h|D) = argmax_{h \in H} P(D|h)P(h)$$
\item if every $h\in H$ a priori equally probable: $$h_{ml} = argmax_{h \in H} P(D|h)$$
\end{itemize}

\subparagraph{BRUTE FORCE MAP learning algorithm}
Output $h_{map}$

Let's assume:
\begin{itemize}
\item $D$ is noise-free
\item Target function $c \in H$
\item all $h$ (a priori) are equally likely
\end{itemize}
Then $P(h) = \frac1{|H|}$\\
$$P(D|h) =
    \begin{cases}
            1, &         \text{if } d_i =h(x_i) \forall d_i \in D,\\
            0, &         \text{otherwise}.
    \end{cases}
$$
$$P(D) = \frac{|VS_{H,D}|}{|H|}$$ $|VS_{H,D}|$ is the set of hypotheses in $H$ that are consistent with $D$. Consistent learned outputs an $h$ with zero error over training examples.\\
Therefore $$P(h|D) = \begin{cases}
            \frac1{|VS_{H,D}|}, &         \text{if $h$ consistent with $D$}\\
            0, &         \text{otherwise}.
    \end{cases}
$$
Every consistent hypothesis is a MAP hypothesis (with these assumptions)!

\subsection{ML and Least-Squared Error}
Under certain assumptions any learner that minimizes squared error between the outputs of hypothesis $h$ and training data will output an ML hypothesis. No idea why. ?? ML hypothesis is the one that minimizes the sum of squared errors over the training data.

\subsection{Bayes Optimal Classifier}
$$P(v_j |D) = \sum_{h_j \in H}{P(v_j|h_i) P(h_i|D)}$$ (probability that correct classification is $v_j$)\\
$$v_{map} = argmax_{v_j \in V} P(v_j|D)$$

\subsection{Bayesian Belief Networks}
\subparagraph{Naive Bayes} Classify given attributes: $v_{map} = argmax_{v_j \in V} P(v_j|a_1,...,a_n)$. Rewrite using Bayes' rule and use naive assumption that all $a_i$ are conditionally independent given $v_j$. $v_{NB} = argmax_{v_j \in V} P(v_j) \prod_{i}{P(a_i|v_j)}$.\\
Whenever naive assumption is satisfied, $v_{NB}$ same as MAP classification.
\subparagraph{EM Algorithm}
\begin{itemize}
\item arbitrary initial hypothesis
\item repeatedly calculates expected values of the hidden variables
\item recalculates the ML hypothesis
\end{itemize}
This will converge to local ML hypothesis, along with estimated values for hidden variables (why?)

\section{Evaluating Hypotheses}

\section{Randomized Optimization}
\subsection{MIMIC}
Directly model distribution.\\
Algorithm:\\
\begin{itemize}
\item generate samples from $P^{\theta_t}(x)$
\item set $\theta_{t+1}$ to the n'th percentile
\item retain only those samples such that $f(x) \geq \theta_{t+1}$
\item estimate $P^{\theta_{t+1}}(x)$
\item repeat!
\end{itemize}

\subsection{Simulated Annealing}
Algorithm:\\
\begin{itemize}
\item for finite number of iterations:
\item sample new point $x_t$ in $N(x)$
\item Jump to new sample with probability $P(x, x_t, T)$
\item decrease $T$
\end{itemize}
$$P(x, x_t, T) =
    \begin{cases}
            1, &         \text{if } f(x_t) \geq f(x),\\
            exp(\frac{f(x_t)-f(x)}{T}), &         \text{otherwise}.
    \end{cases}
$$

\subparagraph{Genetic Algorithms}
WHAT IS??

\section{Information Theory}
\subparagraph{Definitions}
We'll use shorthand: Just write $x$ instead of $X= x$ for all the possible values that a random event $X$ could take on. (Am I using the terms correctly?)
\begin{itemize}
\item Mutual Information: $I(X,Y) = H(X) - H(X|Y)$
\item Entropy: $H(A) = - \sum_{s \in A} P(s) lg(P(s))$
\item Joint entropy: $H(X, Y) = - \sum_{x \in X} \sum_{y \in Y} P(x,y) lg(P(x,y))$
\item Conditional Entropy: $H(Y|X) = -\sum_{x \in X} \sum_{y \in Y} P(x,y) lg(P(y|x))$
\item If X independent of Y: $H(Y|X) = H(Y)$ and $H(Y,X) = H(Y) +H(X)$
\item Kullback-Leibler divergence: $KL(p||q) = - \sum_{x \in X} p(x) lg(\frac{p(x)}{q(x)})$ for two different distributions $p$, $q$.
\end{itemize}

\end{document}