# Integrantes
* ### David Herrera
* ### Estid Lozano

In [None]:
# Imports
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import openml
import math
import pandas as pd
import scipy.stats

## Exercise 1
### 1.1 Write a probabilistic learner LDA that builds models for binary classification via the linear discriminant analysis. Prediction should be made assuming a (1-dimensional) normal distribution for each class with means and variances according to the built model. When returning probabilities, normalize the densities assigned to each class so that the vector sums up to 1.

In [None]:
class LDA():
    """Binary Fisher linear discriminant classifier with 1-D Gaussian class models.

    ``train`` finds the direction ``w`` that best separates the first two
    classes and models every class, after projection onto ``w``, as a
    one-dimensional normal distribution. ``predict_proba`` returns the
    per-class densities normalised to sum to 1; ``predict`` returns the label
    with the highest density.

    Attributes set by ``train``:
        data:  dict mapping each label to an ``(n_c, d)`` array of its samples.
        means: dict mapping each label to its mean vector.
        w:     unit-length projection direction.
    """

    def train(self, _X, _Y):
        """Fit the projection direction and the projected class distributions.

        Args:
            _X: iterable of feature vectors (one per sample).
            _Y: iterable of hashable labels, aligned with ``_X``.

        Returns:
            The unit-length projection vector ``w`` (also stored as ``self.w``).

        Raises:
            ValueError: if fewer than two distinct labels are present.
        """
        # Group samples by label; dict order (first appearance in _Y) fixes
        # which two classes define the mean difference below.
        grouped = {}
        for sample, label in zip(_X, _Y):
            grouped.setdefault(label, []).append(sample)
        if len(grouped) < 2:
            raise ValueError("LDA.train needs samples from at least two classes")

        self.data = {
            label: np.asarray(rows, dtype=float) for label, rows in grouped.items()
        }
        self.means = {label: rows.mean(axis=0) for label, rows in self.data.items()}

        # Within-class scatter: sum over classes of (X_c - m_c)^T (X_c - m_c).
        scatter = np.sum(
            [
                (rows - self.means[label]).T @ (rows - self.means[label])
                for label, rows in self.data.items()
            ],
            axis=0,
        )

        first_mean, second_mean = list(self.means.values())[:2]
        diff = first_mean - second_mean

        try:
            w = np.linalg.solve(scatter, diff)
        except np.linalg.LinAlgError:
            # Singular scatter. Because the between-class matrix is the
            # rank-one outer product diff diff^T, the leading eigenvector of
            # pinv(S) B is proportional to pinv(S) diff, so no eigen
            # decomposition (and no complex output) is needed.
            w = np.linalg.pinv(scatter) @ diff
        self.w = w / np.linalg.norm(w)

        # Cache the projected mean / standard deviation of every class so that
        # predict() does not re-project the training set for each query.
        # The std is floored at machine epsilon so that a class whose
        # projections coincide does not produce a zero scale.
        floor = np.finfo(float).eps
        self._class_stats = {}
        for label, rows in self.data.items():
            projected = rows @ self.w
            self._class_stats[label] = (
                float(np.dot(self.w, self.means[label])),
                max(float(np.std(projected)), floor),
            )
        return self.w

    def predict_proba(self, _X):
        """Return normalised class densities for every sample in ``_X``.

        Args:
            _X: iterable of feature vectors.

        Returns:
            Array of shape ``(len(_X), n_classes)`` whose rows sum to 1. Columns
            follow the order in which labels first appeared during training.
        """
        scores = np.array([np.dot(self.w, sample) for sample in _X], dtype=float)
        if scores.size == 0:
            return np.empty((0, len(self._class_stats)))
        # scipy's ``scale`` is the standard deviation, not the variance.
        # Working with log-densities and subtracting the row maximum before
        # exponentiating keeps the normalisation stable when densities underflow.
        log_density = np.column_stack(
            [
                scipy.stats.norm.logpdf(scores, loc=mu, scale=sigma)
                for mu, sigma in self._class_stats.values()
            ]
        )
        log_density -= log_density.max(axis=1, keepdims=True)
        density = np.exp(log_density)
        return density / density.sum(axis=1, keepdims=True)

    def predict(self, _X):
        """Return, for every sample in ``_X``, the label with the highest density.

        Args:
            _X: iterable of feature vectors.

        Returns:
            List of labels, one per sample (empty list for empty input).
        """
        labels = list(self._class_stats)
        return [labels[idx] for idx in self.predict_proba(_X).argmax(axis=1)]


# Binary iris problem: keep the first two columns as features and merge
# "virginica" into "versicolor" so only two labels remain.
df = pd.read_csv('iris.csv')
x = df.iloc[:, 0:2].to_numpy()
y = df.replace("virginica", "versicolor").values[:, -1]

# Fit the LDA and classify three rows (first, last and index 10).
model = LDA()
model.train(x, y)
model.predict(x[[0, -1, 10]])


### 1.2 Now implement the kernel-based logic in a KernelLDA classifier. The kernel should be passed as an argument kernel at initialization time, which accepts two elements of the input space and produces their similarity value.

In [None]:
def linearKernel(x1, x2):
    """Linear kernel: the dot product ``np.dot(x1, x2)`` of the two inputs."""
    similarity = np.dot(x1, x2)
    return similarity


class KernelLDA():
    """Binary kernel Fisher discriminant with 1-D Gaussian class models.

    The discriminant lives in the feature space induced by ``_kernel`` and is
    represented by dual coefficients ``a`` (one per training sample). A sample
    ``x`` is projected as ``sum_i a[i] * _kernel(x_i, x)``; each class is then
    modelled as a one-dimensional normal distribution over those projections.

    Args:
        _kernel: callable taking two input-space elements and returning their
            similarity value.
        _reg: relative ridge strength added to the within-class matrix before
            solving; the within-class matrix is rank deficient by
            construction, so some regularisation is required.
    """

    def __init__(self, _kernel, _reg=1e-8):
        self._kernel = _kernel
        self._reg = _reg

    def mapDataset(self, _X, _Y, corrected=True):
        """Build the kernel (Gram) matrix and its per-class column blocks.

        Args:
            _X: indexable collection of samples.
            _Y: labels aligned with ``_X``.
            corrected: when true, every returned matrix ``A`` is replaced by
                ``A @ pinv(A) @ A``. Mathematically this equals ``A``; in
                floating point it only discards singular values below the
                cutoff used by ``np.linalg.pinv``.

        Returns:
            ``(K, K_c)`` where ``K[i, j] = _kernel(_X[i], _X[j])`` and
            ``K_c[label]`` holds the columns of ``K`` that belong to the
            samples of ``label`` (shape ``(n, n_label)``). Labels are the
            sorted unique values of ``_Y``.
        """
        labels = np.asarray(_Y)
        n = len(_X)
        K = np.zeros((n, n))
        # The kernel is evaluated once per unordered pair and mirrored.
        for i in range(n):
            for j in range(i + 1):
                K[i, j] = K[j, i] = self._kernel(_X[i], _X[j])

        # Column j of K_c[label] must be k(., x_j) for the j-th member of that
        # class, i.e. exactly the matching columns of K.
        K_c = {cls: K[:, labels == cls] for cls in np.unique(labels)}

        if corrected:
            K_c = {
                cls: block @ np.linalg.pinv(block) @ block
                for cls, block in K_c.items()
            }
            K = K @ np.linalg.pinv(K) @ K
        return K, K_c

    def train(self, _X, _Y):
        """Fit the dual coefficients and the projected class distributions.

        Args:
            _X: indexable collection of training samples.
            _Y: labels aligned with ``_X``.

        Returns:
            The dual coefficient vector ``a`` (also stored as ``self.a``),
            scaled so that ``a @ K @ a == 1`` whenever that quantity is positive.

        Raises:
            ValueError: if fewer than two distinct labels are present.
        """
        # The pseudo-inverse round trip is skipped here: it leaves the
        # matrices mathematically unchanged and can amplify rounding noise
        # when the Gram matrix is rank deficient.
        K, K_c = self.mapDataset(_X, _Y, corrected=False)
        if len(K_c) < 2:
            raise ValueError("KernelLDA.train needs samples from at least two classes")
        labels = np.asarray(_Y)
        n_total = K.shape[0]

        class_means = {}
        N = np.zeros_like(K)
        for cls, block in K_c.items():
            n_c = block.shape[1]
            class_means[cls] = block.mean(axis=1)
            # Centering matrix I - (1/n_c) * ones((n_c, n_c)).
            centering = np.identity(n_c) - np.full((n_c, n_c), 1.0 / n_c)
            N += block @ centering @ block.T

        first_mean, second_mean = list(class_means.values())[:2]
        diff = first_mean - second_mean

        # Ridge proportional to the average diagonal of N keeps the strength
        # independent of the kernel's scale; fall back to an absolute ridge
        # when N vanishes entirely.
        scale = np.trace(N) / n_total
        ridge = self._reg * (scale if scale > 0 else 1.0)
        try:
            a = np.linalg.solve(N + ridge * np.identity(n_total), diff)
        except np.linalg.LinAlgError:
            a = np.linalg.pinv(N) @ diff

        norm_sq = float(a @ K @ a)
        if norm_sq > 0:
            a = a / np.sqrt(norm_sq)

        self.a = a
        self._train_X = _X

        # Projection of training sample i is sum_j a[j] K[j, i] = (K a)[i].
        train_scores = K @ a
        floor = np.finfo(float).eps
        self._class_stats = {}
        for cls in K_c:
            member_scores = train_scores[labels == cls]
            self._class_stats[cls] = (
                float(member_scores.mean()),
                max(float(member_scores.std()), floor),
            )
        return a

    def _project(self, sample):
        """Project one sample onto the discriminant via the kernel expansion."""
        similarities = [self._kernel(train_x, sample) for train_x in self._train_X]
        return float(np.dot(self.a, similarities))

    def predict_proba(self, _X):
        """Return normalised class densities for every sample in ``_X``.

        Returns:
            Array of shape ``(len(_X), n_classes)`` whose rows sum to 1, with
            columns in sorted label order.
        """
        scores = np.array([self._project(sample) for sample in _X], dtype=float)
        if scores.size == 0:
            return np.empty((0, len(self._class_stats)))
        # Log-densities shifted by the row maximum keep the normalisation
        # stable when the raw densities underflow.
        log_density = np.column_stack(
            [
                scipy.stats.norm.logpdf(scores, loc=mu, scale=sigma)
                for mu, sigma in self._class_stats.values()
            ]
        )
        log_density -= log_density.max(axis=1, keepdims=True)
        density = np.exp(log_density)
        return density / density.sum(axis=1, keepdims=True)

    def predict(self, _X):
        """Return, for every sample in ``_X``, the label with the highest density."""
        labels = list(self._class_stats)
        return [labels[idx] for idx in self.predict_proba(_X).argmax(axis=1)]

# Same binary iris setup (first two columns, "virginica" relabelled as
# "versicolor"), stored under xa / ya for the kernel model.
df = pd.read_csv('iris.csv')
xa = df.iloc[:, 0:2].to_numpy()
ya = df.replace("virginica", "versicolor").values[:, -1]

# Train the kernel variant with the linear kernel.
model = KernelLDA(_kernel=linearKernel)
model.train(xa, ya)


### 1.3 Empirically check that the two algorithms have the same behavior if you use the linear kernel.

## Exercise 2

### 2.1 Implement generators for the polynomial kernel and the Gaussian kernel (so that you can choose the parameters c, q and σ when producing the kernel function).

In [None]:
def gaussianKernel(x1, x2, sigma=1):
    """Gaussian (RBF) kernel: ``exp(-||x1 - x2||^2 / (2 * sigma^2))``."""
    distance = np.linalg.norm(x1 - x2)
    exponent = -(distance ** 2) / (2 * sigma ** 2)
    return math.exp(exponent)
# polynomial
def polynomialKernel(x1, x2, c=0, q=1):
    """Polynomial kernel: ``(c + np.matmul(x1, x2)) ** q``."""
    shifted = c + np.matmul(x1, x2)
    return shifted ** q

### 2.2 Write a function to show a projection line w for some given dataset. The intercept should be chosen so that the line passes through the mean of the data.

In [None]:
def showProjectionLine(_w, _x, _y):
    """Scatter a two-class 2-D dataset and draw the line with direction ``_w``.

    The line passes through the mean of ``_x`` and is clipped to the bounding
    box of the data.

    Args:
        _w: direction vector; only its first two components are used.
        _x: array whose first two columns are the plotted coordinates.
        _y: array of labels aligned with ``_x``; the first two unique labels
            are plotted.
    """
    fig, ax = plt.subplots()
    center = np.mean(_x, axis=0)
    for cls in list(np.unique(_y))[:2]:
        members = _x[_y == cls]
        ax.scatter(members[:, 0], members[:, 1])

    x_low, x_high = min(_x[:, 0]), max(_x[:, 0])
    y_low, y_high = min(_x[:, 1]), max(_x[:, 1])

    if _w[0] == 0:
        # Direction is parallel to the second axis: the line is vertical.
        ax.plot([center[0], center[0]], [y_low, y_high], color="black")
        return

    # A line along direction (w0, w1) rises w1 for every w0 it runs.
    slope = _w[1] / _w[0]
    intercept = center[1] - slope * center[0]

    if slope == 0:
        # Horizontal line: nothing to clip against the vertical extent.
        x_from, x_to = x_low, x_high
    else:
        # x positions where the line leaves the vertical extent of the data.
        q1 = (y_high - intercept) / slope
        q2 = (y_low - intercept) / slope
        x_from = max(x_low, min(q1, q2))
        x_to = min(x_high, max(q1, q2))

    domain = np.linspace(x_from, x_to, 100)
    ax.plot(domain, domain * slope + intercept, color="black")
    
# Binary iris problem again (first two columns, "virginica" relabelled as
# "versicolor"); train the LDA and plot its projection direction.
df = pd.read_csv('iris.csv')
x = df.iloc[:, 0:2].to_numpy()
y = df.replace("virginica", "versicolor").values[:, -1]

model = LDA()
w = model.train(x, y)
showProjectionLine(_w=w, _x=x, _y=y)



### 2.3 Implement the feature map belonging to the quadratic homogeneous kernel. Consider the PCA iris dataset with two classes. Explicitly transform the dataset with the feature map of the quadratic kernel, apply the LDA in the new dataset, and visualize the solution in a 3D plot

In [None]:
# %matplotlib inline
%matplotlib widget
df = pd.read_csv('iris_pca_notseparable.csv')


def quadraticMap(x1):
    """Feature map (x1^2, x2^2, sqrt(2) * x1 * x2) applied to a 2-D point."""
    return [np.power(x1[0], 2), np.power(x1[1], 2), np.sqrt(2) * x1[0] * x1[1]]


# Lift the 2-D points into the 3-D feature space and fit the LDA there.
X = df.iloc[:, 0:2].to_numpy()
Y = df.iloc[:, -1].to_numpy()
X = np.array([quadraticMap(point) for point in X])
model = LDA()
w = model.train(X, Y)

fig = plt.figure(figsize=plt.figaspect(0.5))
classes = np.unique(Y)
X1 = X[Y == classes[0]]
X2 = X[Y == classes[1]]

# Left panel: the transformed points, one scatter per class.
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X1[:, 0], X1[:, 1], X1[:, 2])
ax.scatter(X2[:, 0], X2[:, 1], X2[:, 2])

# Right panel: every point replaced by (point . w) * w, i.e. moved onto the
# line spanned by w.
ax = fig.add_subplot(1, 2, 2, projection='3d')
projectedX1 = np.array([np.dot(w, np.dot(point, w.T)) for point in X1])
projectedX2 = np.array([np.dot(w, np.dot(point, w.T)) for point in X2])
ax.scatter(projectedX1[:, 0], projectedX1[:, 1], projectedX1[:, 2])
ax.scatter(projectedX2[:, 0], projectedX2[:, 1], projectedX2[:, 2])

# Draw the line t * w for t in [-5, 15].
t = np.linspace(-5, 15, 50)
x, y, z = w[0] * t, w[1] * t, w[2] * t
ax.plot(x, y, z)

### 2.4 Create a function that takes a 2D dataset X with the ground truth labels y and a prediction vector $\hat{y}$. Create a scatter plot in which the different classes get different symbols, and the points are drawn in green if the prediction is correct and in red if the prediction is wrong. Get predictions for the standard LDA and the Kernel LDA with different kernels (also try different parameters for each kernel) and plot the predictions for the Iris PCA dataset. Which algorithm produces the best results?

In [None]:
%matplotlib inline
# %matplotlib widget
def plotDiff(_X, _Yt, _Yp):
    """Scatter a 2-D dataset, colouring each point by prediction correctness.

    Each class gets its own marker; a point is green when its prediction
    matches the ground truth and red otherwise.

    Args:
        _X: array whose first two columns are the plotted coordinates.
        _Yt: ground-truth labels aligned with ``_X``.
        _Yp: predicted labels aligned with ``_X`` (list or array).
    """
    truth = np.asarray(_Yt)
    # Convert explicitly so that a plain list of predictions is compared
    # element by element.
    correct = truth == np.asarray(_Yp)
    markers = ('*', '.', '^', 's', 'x', '+')

    fig, ax = plt.subplots()
    for position, cls in enumerate(np.unique(truth)):
        members = truth == cls
        colors = ["green" if hit else "red" for hit in correct[members]]
        ax.scatter(
            _X[members, 0],
            _X[members, 1],
            marker=markers[position % len(markers)],
            color=colors,
        )

# First two columns are the features, the last column is the label.
df = pd.read_csv('iris_pca_notseparable.csv')
X = df.iloc[:, 0:2].to_numpy()
Y = df.iloc[:, -1].to_numpy()

# Standard LDA: predict the training data and plot hits and misses.
model = LDA()
model.train(X, Y)
ypredicted = model.predict(X)
plotDiff(_X=X, _Yt=Y, _Yp=ypredicted)

# model = KernelLDA(gaussianKernel)
# model.train(X, Y)
# ypredicted = model.predict(X)
# plotDiff(X,Y,ypredicted)


# model = KernelLDA(polynomialKernel)
# model.train(X, Y)
# ypredicted = model.predict(X)
# plotDiff(X,Y,ypredicted)