In [1]:
import sklearn as skl
from algorithms.iris.Reader import IrisReader
from algorithms.tfAlgos.DataOps import IrisPrep
from typing import Dict, Tuple, Callable, List
import pandas as pd
import numpy as np
from functools import reduce
from cvxopt import matrix, solvers

iris_reader: IrisReader = IrisReader()
iris_reader.load()
raw_data: Dict[str, np.array] = iris_reader.data

raw_data

{'setosa': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],


In [2]:
def test_train_split(raw_data: Dict[str, np.array], labels: List[str], train_prop: float) -> Tuple[np.array, np.array]:
    
    def process_label_group(data: np.array, idx: int) -> np.array:
        n: int = len(data)
        lab_idx: np.array = np.repeat(idx, n).reshape(n, 1)
        return np.concatenate([lab_idx, data], axis=1)
    
    print("Label Mapping: ", list(enumerate(labels)))
    data: np.array = np.concatenate(list(
        map(lambda lab: process_label_group(raw_data[lab[1]], lab[0]), enumerate(labels))
    ))
    permuted: np.array = np.random.permutation(data)
    
    train_n: int = int(len(permuted) * 0.8)
    return (permuted[:train_n], permuted[train_n:])
    
train, test = test_train_split(raw_data, ["setosa", "versicolor", "virginica"], .8)

Label Mapping:  [(0, 'setosa'), (1, 'versicolor'), (2, 'virginica')]


For the dual soft margin SVM our task is to minimize the following relation with respect to $\alpha$.

\begin{align}
    L(\alpha) &= \Sigma_{n=1}^N - \frac{1}{2} \Sigma_{n=1}^N \Sigma_{m=1}^N a_n a_m y_n y_m k(\textbf{x_n}, \textbf{x_m}) \\
    &= \alpha^T - \frac{1}{2} \alpha^T Q \alpha \\
\end{align}
where $Q_{N \times N} = y_n^T y_m * K(\textbf{x_n}, \textbf{x_m})$ and subject to $0 \leq \alpha_n \leq C \hspace{3pt} \forall \hspace{3pt} n$ and $\alpha^T y = 0$. The matrix $K(\textbf{x_n}, \textbf{x_m})$ is commonly known as the Gram Matrix, and is the insertion point for the gaussian kernel (or any other kernel we might choose).

In [3]:
def process_data(data: np.array, pos_y_class: int) -> Tuple[np.array, np.array]:
    y: np.array = np.where(data[:,0] == pos_y_class, 1, -1)
    x_in: np.array = data[:, 1:]
    X: np.array = (x_in - x_in.mean(axis=0)) / x_in.std(axis=0)
    return y, X
    
y_train, X_train = process_data(train, 0)

In [13]:
def gram_matrix(X: np.array, Y: np.array, spread: float) -> np.array:
    x_rows, x_cols = X.shape
    y_rows, y_cols = Y.shape
    out: np.array = np.zeros((x_rows, y_rows))
    
    def g(x_0: np.array, x_n: np.array) -> float:
        normalization: float = 1 / ((np.sqrt(2*np.pi)*spread)**len(x_0))
        distance: float = (x_0-x_n).dot((x_0-x_n))
        exponential: float = np.exp((-0.5/spread**2) * distance)
        return normalization * exponential
    
    for i, x in enumerate(X):
        for j, y in enumerate(Y):
            out[i, j] = g(x, y)
            
    return out

gram_matrix(X_train, X_train, 0.3)

array([[3.12719703e+00, 1.76151224e-45, 1.06361760e-37, ...,
        5.21712840e-04, 1.29928410e-45, 1.41492147e+00],
       [1.76151224e-45, 3.12719703e+00, 9.35128473e-04, ...,
        3.75929694e-46, 1.13549401e+00, 4.00984034e-42],
       [1.06361760e-37, 9.35128473e-04, 3.12719703e+00, ...,
        9.30009902e-42, 9.05700485e-03, 2.00015430e-33],
       ...,
       [5.21712840e-04, 3.75929694e-46, 9.30009902e-42, ...,
        3.12719703e+00, 2.28038248e-45, 6.65528741e-06],
       [1.29928410e-45, 1.13549401e+00, 9.05700485e-03, ...,
        2.28038248e-45, 3.12719703e+00, 2.98163848e-42],
       [1.41492147e+00, 4.00984034e-42, 2.00015430e-33, ...,
        6.65528741e-06, 2.98163848e-42, 3.12719703e+00]])

Finding the optimal solution of $\alpha$ is actually a quadratic programming problem, so we will use a solver from an optimization library called [CVXOPT](https://cvxopt.org/userguide/intro.html). Specifically, we will use the [`qp`](https://cvxopt.org/userguide/coneprog.html#quadratic-programming) solver because it is designed for such problems.

In [20]:
%%html
<iframe src="https://cvxopt.org/userguide/coneprog.html#quadratic-programming" width="1200" height="1000"></iframe>

To understand what we are doing here, let's walk through each of the arguments and see how they correspond.

+ `x` is the parameter to be optimized over, so it is $\alpha$
+ `P` is our scaled Gram Matrix $Q_{N \times N} = y_n^T y_m * K(\textbf{x_n}, \textbf{x_m})$
+ `q` appears to be our initial guess for $\alpha$
+ Our first constraint has to be reworked a bit to fit the shape of the API (see $Gx \leq h$). Instead of expressing the bounds on $\alpha$ as the interval $0 \leq \alpha_n \leq C$, we will break it up into two expressions: $-\alpha_n \leq 0$ and $\alpha_n \leq C$. From here, we can stack the two composite constraints together.
    + `h` is the vertical concatenation of two $N \times 1$ vectors. The first is filled with zeros, capturing the lower bound of our initial constraint. The second is filled with values of $C$, the upper bound of the initial constraint. 
    + `G` is the cofficient matrix that enables us to compare each value of `x` (i.e. $\alpha$) to the constraints. In this case, we just want a direct value comparison, so we will stack two identity matrices on top of one another. Note, that each corresponds to each half of our initial constraint, so we must multiply the upper identity matrix by -1 to test $-\alpha_n \leq 0$.
+ `A` plays the role of the labels $y$ in the second constraint $\alpha^T y = 0$, and `b` plays the role of the zero.

In [17]:
def train(y, X, margin_error, bandwidth):
    n_rows, n_cols = X.shape
    y_2d: np.array = y.reshape(1, n_rows).astype(float)
    
    P: np.array = -np.dot(y_2d.T, y_2d) * gram_matrix(X, X, bandwidth)
    q: np.array = np.ones((n_rows, 1))
    h: np.array = np.concatenate([
        np.zeros((n_rows, 1)),
        np.ones((n_rows, 1)) * margin_error
    ], axis=0)
    G: np.array = np.concatenate([
        -np.eye((n_rows)),
        np.eye((n_rows))
    ], axis=0)
    A: np.array = y_2d
    b: np.array = np.zeros(1)
        
    for elem in [P, q, h, G, A, b]:
        print(np.ndim(elem))
        
    solvers.options['abstol'] = 1e-10
    solvers.options['reltol'] = 1e-10
    solvers.options['feastol'] = 1e-10
    
    return solvers.qp(
        matrix(P), 
        matrix(q), 
        matrix(G), 
        matrix(h), 
        matrix(A), 
        matrix(b)
    )

train(y_train, X_train, 10, 0.3)

2
2
2
2
2
1


ValueError: Rank(A) < p or Rank([P; A; G]) < n

In [39]:
??matrix