### David Herrera
### Estid Lozano

In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openml as oml
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import time

# Exercise 1

**2 pts**

Let $K$ be the Gaussian kernel and let $D = \{(x_1, y_1), ..., (x_n, y_n)\}$ be a dataset. Now consider the empirical reproducing kernel map $φ$ with $φ(x) = K(x, ·)$, and suppose that we use this kernel map to generate new features based on these “landmarks”. Formally, let $D' = \{(φ(x)(D), y) | (x, y) ∈ D\}$ be the dataset we obtain if we have one feature for each datapoint indicating the similarity as per the Gaussian kernel $K$. So $D' ∈ R^{n×n}$.

Show that it is wrong to say that kernels produce new features by showing that the dot product of two arbitrary points $x_i$, $x_j$ in $D'$ is not identical to the kernel value $K(x_i, x_j)$.

**proof**:

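$$
\begin{aligned}
K(x_i, x_j) &= \exp\left\{-\frac{\Vert x_i-x_j \Vert^2}{2\sigma^2}\right\} \\
\phi(x) &= (K(x_1,x),K(x_2,x),...,K(x_n,x))^T \\
\phi(x_i)^T \phi(x_j) &= \sum_{k=1}^n K(x_k,x_i)K(x_k,x_j) \\
&= \sum_{k=1}^n
\exp\left\{-\frac{\Vert x_k-x_i \Vert^2}{2\sigma^2}\right\}
\exp\left\{-\frac{\Vert x_k-x_j \Vert^2}{2\sigma^2}\right\} \\
&= \sum_{k=1}^n \exp\left\{-\frac{\Vert x_k-x_i \Vert^2+\Vert x_k-x_j \Vert^2}{2\sigma^2}\right\} \\
&\neq \exp\left\{-\frac{\Vert x_i-x_j \Vert^2}{2\sigma^2}\right\} \quad \text{in general.}
\end{aligned}
$$

For a concrete counterexample take $i = j$: then $K(x_i, x_i) = 1$, but $\phi(x_i)^T \phi(x_i) = \sum_{k=1}^n K(x_k, x_i)^2 = 1 + \sum_{k \neq i} K(x_k, x_i)^2 > 1$ whenever $n \geq 2$, because every Gaussian kernel value is strictly positive.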

# Exercise 2

**(Raw Empirical vs. corrected Empirical Kernel Map - 4 pts)**

**2.1.** Implement the polynomial kernel and the Gaussian kernel explicitly.

In [None]:
# polynomial
def polynomialKernel(x1, x2, c=0, q=1):
    """Evaluate the polynomial kernel ``(c + <x1, x2>) ** q``.

    Parameters
    ----------
    x1, x2 : array-like
        The two operands; their product is taken with ``np.matmul``, which
        for two 1-D vectors is the ordinary inner product.
    c : number, default 0
        Additive offset applied to the inner product.
    q : number, default 1
        Exponent (degree) of the kernel.

    Returns
    -------
    The value ``(c + np.matmul(x1, x2)) ** q``.
    """
    inner_product = np.matmul(x1, x2)
    return (c + inner_product) ** q

In [None]:
# gaussian
def gaussianKernel(x1, x2, sigma=1):
    """Evaluate the Gaussian (RBF) kernel ``exp(-||x1 - x2||^2 / (2 sigma^2))``.

    Parameters
    ----------
    x1, x2 : numpy.ndarray
        Operands; they are subtracted element-wise, so they must support ``-``.
    sigma : number, default 1
        Bandwidth of the kernel.

    Returns
    -------
    float
        The kernel value, computed with ``math.exp``.
    """
    distance = np.linalg.norm(x1 - x2)
    exponent = -(distance ** 2) / (2 * sigma ** 2)
    return math.exp(exponent)

**2.2.** Write a function **mapDataset(X, kernel, corrected=True)** that takes a dataset (only the attributes, not the labels) and explicitly computes the empirical feature map of the given kernel (passed as a function argument). If **corrected** is true, it should adjust the mapped data using the $K^{−1/2}$ matrix.

In [None]:
def mapDataset(X, kernel, corrected=True):
    """Map a dataset through the empirical kernel map of ``kernel``.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Attribute matrix with one instance per row (no labels). A DataFrame
        is converted with ``to_numpy()``.
    kernel : callable
        ``kernel(a, b)`` returning the kernel value of two rows of ``X``.
    corrected : bool, default True
        If False, return the raw empirical map, i.e. the Gram matrix ``K``
        with ``K[i, j] = kernel(X[i], X[j])``.
        If True, return ``K @ K^{-1/2}`` so that the dot product of rows
        ``i`` and ``j`` of the result reproduces ``K[i, j]``.

    Returns
    -------
    numpy.ndarray
        An ``(n, n)`` array whose ``i``-th row is the mapped instance ``i``.

    Notes
    -----
    The previous correction computed ``K @ pinv(K) @ K``, which is just ``K``
    again (Moore-Penrose identity), so nothing was corrected. ``K^{-1/2}`` is
    now obtained from the symmetric eigendecomposition of ``K``. Eigenvalues
    that are numerically zero (or negative) are dropped, i.e. a pseudo-inverse
    square root is used, so rank-deficient Gram matrices are handled too.
    """
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    n = len(X)
    # The Gram matrix is symmetric: evaluate the lower triangle (incl. the
    # diagonal) once and mirror it.
    K = np.empty((n, n))
    for i in range(n):
        for j in range(i + 1):
            K[i, j] = K[j, i] = kernel(X[i], X[j])
    if not corrected or n == 0:
        return K
    # K = V diag(lambda) V^T with orthonormal V.
    eigvals, eigvecs = np.linalg.eigh(K)
    # Same style of cut-off as a pseudo-inverse: ignore directions whose
    # eigenvalue is indistinguishable from zero at working precision.
    tol = np.finfo(K.dtype).eps * n * np.max(np.abs(eigvals))
    keep = eigvals > tol
    inv_sqrt = np.zeros_like(eigvals)
    inv_sqrt[keep] = 1.0 / np.sqrt(eigvals[keep])
    # K @ K^{-1/2} = V diag(lambda * lambda^{-1/2}) V^T. Forming the product
    # in the eigenbasis avoids multiplying by the badly scaled K^{-1/2}.
    return (eigvecs * (eigvals * inv_sqrt)) @ eigvecs.T

**2.3.** Explicitly test that using the corrected version, the kernel computes the correct dot product in the feature space when applying both polynomial and Gaussian kernel to the iris, amazon, and madelon dataset.

In [None]:
def getDataset(_id):
    """Fetch an OpenML dataset and return ``(name, X, y)`` as numpy arrays.

    ``get_data()`` is called without a target, so the labels it returns may
    be ``None``. In that case, if the last attribute is named "class"
    (case-insensitive), that last column is split off and used as ``y``.

    NOTE(review): if no labels are returned and the last attribute is not
    named "class", ``y`` stays ``None`` and the final ``to_numpy()`` call
    raises ``AttributeError``.
    """
    dataset = oml.datasets.get_dataset(_id)
    features, target, _, attribute_names = dataset.get_data()
    target_missing = target is None
    if target_missing and attribute_names[-1].lower() == "class":
        target = features.iloc[:, -1]
        features = features.iloc[:, :-1]
    return dataset.name, features.to_numpy(), target.to_numpy()

# OpenML dataset ids to load; presumably iris, amazon and madelon as named in
# the exercise text -- confirm against OpenML.
datasets = [getDataset(dataset_id) for dataset_id in (61, 1457, 1485)]
# Kernels exercised by the dot-product test below (default parameters).
kernels = [polynomialKernel, gaussianKernel]

In [None]:
# test
# For every dataset/kernel pair, check that dot products between rows of the
# corrected map reproduce the kernel values, i.e. mapped @ mapped.T == K.
for dtset in datasets:
    name, X, y = dtset
    print(name)
    startTime = time.monotonic()
    for kernel in kernels:
        print("- "+kernel.__name__)
        # Raw Gram matrix: the reference the dot products must match.
        K = mapDataset(X, kernel, corrected=False)
        mapped = mapDataset(X, kernel)
        print(mapped)
        gram = np.matmul(mapped, mapped.T)
        maxDeviation = np.max(np.abs(gram - K))
        # Absolute tolerance is scaled to the magnitude of K because
        # polynomial kernel values can be very large.
        matches = np.allclose(gram, K, atol=1e-8 * np.max(np.abs(K)))
        print("  dot products match kernel values:", matches,
              "| max abs deviation:", maxDeviation)
    print(round(time.monotonic() - startTime, 3), "seconds")

**2.4.** Report performance of Logistic Regression, Naive Bayes and Decision Trees when applied to these three (mapped) datasets using the linear kernel, the quadratic kernel (each of which with $c ∈ \{0, 1, 10\}$) and with the Gaussian kernel (for values of $σ ∈ \{0.5, 1, 10\}$).

In [None]:
# Classifier classes; a fresh instance is created for every experiment.
classifiers = [LogisticRegression, GaussianNB, DecisionTreeClassifier]


def linearKernel(x1, x2, c=0):
    """Polynomial kernel of degree 1 with offset ``c``."""
    return polynomialKernel(x1, x2, c, 1)


def quadraticKernel(x1, x2, c=0):
    """Polynomial kernel of degree 2 with offset ``c``."""
    return polynomialKernel(x1, x2, c, 2)


kernels = [linearKernel, quadraticKernel, gaussianKernel]
# One (parameter name, candidate values) pair per entry of `kernels`,
# in the same order.
params = 2 * [("c", [0, 1, 10])] + [("sigma", [0.5, 1, 10])]

In [None]:
# performance
# Report the accuracy of every classifier on every (dataset, kernel,
# kernel parameter) combination, together with timing information.
for dtset in datasets:
    dtsetName, X, y = dtset
    print(dtsetName)
    dtsetTime = time.monotonic()
    for ik, kernel in enumerate(kernels):
        print("* "+kernel.__name__)
        paramName, paramList = params[ik]
        kernelTime = time.monotonic()
        for param in paramList:
            print("  + "+paramName+": "+str(param))
            # Pass the current parameter value to the kernel by keyword.
            # Previously the bare kernel was used, so every value of
            # c / sigma produced exactly the same mapped dataset.
            kernelArgs = {paramName: param}
            XNew = mapDataset(X, lambda a, b: kernel(a, b, **kernelArgs))
            # NOTE(review): test_size=0.8 trains on only 20% of the data --
            # confirm this split is intended.
            XTrain, XTest, yTrain, yTest = train_test_split(XNew, y, test_size=0.8, random_state=0)
            for clf in classifiers:
                yPred = clf().fit(XTrain, yTrain).predict(XTest)
                # Accuracy over the test split only; dividing the error
                # count by len(y) (whole dataset) overstated the accuracy.
                accuracy = np.mean(yTest == yPred)
                print("    - "+clf.__name__+" performance: "+str(round(accuracy, 3)))
        print("  + "+kernel.__name__+" time", round(time.monotonic() - kernelTime, 3), "seconds")
    print("* "+dtsetName+" time", round(time.monotonic() - dtsetTime, 3), "seconds")

# Exercise 3

**2 pts**

Create an artificial dataset with two attributes and two classes. There should be 100 instances within a distance of 1 from the origin and 400 instances outside the unit sphere (but at a distance of less than 10), uniformly distributed. Visualize your data. Now map your data with the feature map corresponding to the quadratic kernel for different values of c into a 3D-space. Create 3D plots (with appropriate axes labels), explain what you observe, and describe to what extent linear separability changes.

In [None]:
# artificial dataset
# 500 points in polar form: 100 inside the unit disk (class 0) and 400 in the
# annulus with radius between 1 and 10 (class 1).
Angles = np.random.uniform(0, 2 * math.pi, 500)
# Draw the *squared* radius uniformly and take the square root. Drawing the
# radius itself uniformly would crowd the points towards the inner edge of
# each region instead of spreading them uniformly over its area.
Distances = np.sqrt(np.append(np.random.uniform(0, 1, 100), np.random.uniform(1, 100, 400)))
# Convert polar coordinates to the two Cartesian attributes.
X = np.column_stack((Distances * np.cos(Angles), Distances * np.sin(Angles)))
y = np.append(np.repeat(0, 100), np.repeat(1, 400))

In [None]:
# visualize
# Scatter plot of the two attributes, one colour per class label.
plt.figure(figsize=(10, 10))
colors = ["r", "g"]
for idx, label in enumerate(np.unique(y)):
    mask = y == label  # boolean selector for the instances of this class
    plt.scatter(X[mask, 0], X[mask, 1], s=10, c=colors[idx], alpha=0.5)
plt.show()

In [None]:
# map data

In [None]:
# 3D plots

**Answer:** We observe...