In [None]:
%matplotlib notebook

import json
import matplotlib.pyplot as plt
import matplotlib.patches
import numpy as np
import sklearn as sk

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSCanonical


np.set_printoptions(precision=3, linewidth=100)


In [None]:
def rebuild_covariance_matrix(v, dim=6):
    """Rebuild a square matrix ``L @ L.T`` from a flattened lower triangle.

    ``v`` is read as the entries of a lower-triangular matrix ``L`` in
    row-major order (one value for row 0, two for row 1, ...).  The result
    ``L @ L.T`` is symmetric by construction.

    Parameters
    ----------
    v : array-like
        Flattened lower triangle.  Must contain at least
        ``dim * (dim + 1) // 2`` values; extra trailing values are ignored.
    dim : int, optional
        Number of rows/columns of the rebuilt matrix.  Defaults to 6.

    Returns
    -------
    numpy.ndarray
        The ``(dim, dim)`` product ``L @ L.T``.

    Raises
    ------
    ValueError
        If ``v`` holds fewer values than the triangle requires.
    """
    n_entries = dim * (dim + 1) // 2
    flat = np.asarray(v, dtype=float).ravel()

    # Fail loudly on a short vector: slice assignment would otherwise
    # broadcast a 1-element tail across a whole row without any error.
    if flat.size < n_entries:
        raise ValueError(
            'expected at least {} values for a {}x{} lower triangle, got {}'.format(
                n_entries, dim, dim, flat.size))

    factor = np.zeros((dim, dim))
    # tril_indices enumerates the lower triangle row by row, which is the
    # order in which the values are laid out in `v`.
    factor[np.tril_indices(dim)] = flat[:n_entries]

    return np.dot(factor, factor.T)

In [None]:
def plot_covariance_matrix(c, dims, ax, color='black'):
    """Draw the ellipse of a 2-D slice of a covariance matrix on ``ax``.

    The rows and columns listed in ``dims`` are extracted from ``c``; the
    ellipse is centred on the origin, its two axes are aligned with the
    eigenvectors of that sub-matrix and their full lengths are the square
    roots of the matching eigenvalues.

    Parameters
    ----------
    c : numpy.ndarray
        Square, symmetric covariance matrix.
    dims : sequence of two ints
        Indices of the two dimensions to display.
    ax : matplotlib axes
        Axes the ellipse is added to.
    color : matplotlib color, optional
        Edge color of the (unfilled) ellipse.

    Notes
    -----
    NOTE(review): ``Ellipse`` takes total axis lengths, so with
    ``sqrt(eigenvalue)`` as width/height the semi-axes are half a standard
    deviation.  The scale is left untouched here; confirm it is intended.
    """
    selected = list(dims)
    submatrix = np.asarray(c)[np.ix_(selected, selected)]

    # eigh is the solver for symmetric matrices: eigenvalues are real and
    # ascending, eigenvectors are orthonormal and stored as COLUMNS.
    eigvals, eigvecs = np.linalg.eigh(submatrix)

    # Round-off can push a zero eigenvalue slightly negative; clip so the
    # square root stays real.
    axis_lengths = np.sqrt(np.clip(eigvals, 0., None))

    # The ellipse "width" lies along the first eigenvector, i.e. column 0.
    # (Reading row 0 instead mixes components of both eigenvectors and
    # yields the wrong orientation.)
    angle = np.arctan2(eigvecs[1, 0], eigvecs[0, 0])

    # `angle` is passed by keyword: recent Matplotlib releases no longer
    # accept it positionally.
    ell = matplotlib.patches.Ellipse((0., 0.), axis_lengths[0], axis_lengths[1],
                                     angle=np.rad2deg(angle),
                                     linewidth=1.0, edgecolor=color, fill=False)

    ax.add_artist(ell)
    

In [None]:
def compare_covariance_matrices(c1, c2):
    """Show the ellipses of two covariance matrices on a single new figure.

    Only dimensions 0 and 1 of each matrix are displayed.  ``c1`` is drawn
    in black and ``c2`` in mid-grey so the two ellipses can be told apart.

    Parameters
    ----------
    c1, c2 : numpy.ndarray
        Covariance matrices with at least two rows/columns.
    """
    fig, ax = plt.subplots()
    plot_covariance_matrix(c1, (0,1), ax, color='0.0')
    plot_covariance_matrix(c2, (0,1), ax, color='0.5')

    # Artists added with add_artist do not take part in autoscaling, so the
    # axes would keep their default [0, 1] view and crop the ellipses, which
    # are centred on the origin.  sqrt(trace) bounds the largest axis length.
    extent = max(np.sqrt(np.trace(np.asarray(m)[:2, :2])) for m in (c1, c2))
    if extent > 0:
        ax.set_xlim(-extent, extent)
        ax.set_ylim(-extent, extent)

    # Equal scaling on both axes keeps the drawn angles and shapes faithful.
    ax.set_aspect('equal')
    plt.show()

In [None]:
def covariance_matrices_bar_plot(c1, c2, ax):
    """Plot the square roots of the diagonals of two matrices as paired bars.

    For each of the six positions, a black bar shows ``sqrt(c1[i, i])`` and
    a grey bar, shifted right by one bar width, shows ``sqrt(c2[i, i])``.
    The x ticks are placed midway between the two bars of each pair.

    Parameters
    ----------
    c1, c2 : numpy.ndarray
        Matrices whose diagonals have six entries.
    ax : matplotlib axes
        Axes receiving the bars.
    """
    bar_width = 0.2
    positions = np.arange(6)

    # (matrix, x positions, bar color) for each of the two series.
    series = (
        (c1, positions, 'black'),
        (c2, positions + bar_width, '0.6'),
    )
    for matrix, x_positions, shade in series:
        ax.bar(x_positions, np.sqrt(np.diagonal(matrix)), bar_width, color=shade)

    ax.set_xticks(positions + bar_width / 2)

In [None]:
def compute_loss(predicted, validation):
    """Return the mean Euclidean distance between matching rows.

    Parameters
    ----------
    predicted, validation : numpy.ndarray
        Arrays of the same shape, one sample per row.

    Returns
    -------
    float
        Average over rows of the L2 norm of ``predicted - validation``.
    """
    residuals = predicted - validation
    row_errors = np.linalg.norm(residuals, axis=1)
    return np.mean(row_errors)

# Creation of the two datasets

In [None]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists.
dataset_path = '/home/dlandry/dataset/recov/learning_sets/2018-01-15-overlapping.json'
with open(dataset_path) as dataset_file:
    dataset_dict = json.load(dataset_file)

# Print the three metadata fields that identify how the dataset was built.
meta = dataset_dict['metadata']
summary_fields = [meta[key] for key in ('combiner', 'binner', 'clustering')]
print('{},{},{}'.format(*summary_fields))

In [None]:
# Convert the 'xs' and 'ys' lists of the parsed JSON into NumPy arrays.
data_section = dataset_dict['data']
xs = np.array(data_section['xs'])
ys = np.array(data_section['ys'])

In [None]:
# Every row of xs must have its matching row in ys for the split to be valid.
assert len(xs) == len(ys), 'xs and ys must contain the same number of samples'

# Shuffle with a fixed seed so the 80/20 split, and therefore every loss and
# example shown afterwards, is identical on each Restart & Run All.  A local
# RandomState is used so the global NumPy random state is left untouched.
split_rng = np.random.RandomState(0)
idx = split_rng.permutation(len(xs))
training_size = int(len(xs) * 0.8)

training_idx = idx[:training_size]
validation_idx = idx[training_size:]

xs_training = xs[training_idx]
ys_training = ys[training_idx]

xs_validation = xs[validation_idx]
ys_validation = ys[validation_idx]

# PLS

In [None]:
# Fit a PLS model for each component count and report its validation loss.
# The loss comes from compute_loss so that PLS and KNN are scored by exactly
# the same code.
# NOTE(review): scikit-learn bounds n_components by the data dimensions;
# confirm that 9 components is valid for this dataset.
for n in range(1,10):
    pls = PLSCanonical(n_components=n, scale=True)
    pls.fit(xs_training, ys_training)

    ys_predicted = pls.predict(xs_validation)
    avg_loss = compute_loss(ys_predicted, ys_validation)

    print('{:.3f} average loss for {} components'.format(avg_loss, n))

In [None]:
# Refit a single-component PLS model and keep its validation predictions in
# `ys_predicted`.
pls = PLSCanonical(n_components=1, scale=True).fit(xs_training, ys_training)
ys_predicted = pls.predict(xs_validation)

# KNN

In [None]:
def test_kdtree(tree, training_ys, validation_xs, validation_ys, k):
    dist, indices = tree.query(validation_xs, k=k)
    response = np.zeros(validation_ys.shape)
    for i in range(len(validation_ys)):
        for j in range(k):
            ratio = dist[i,j] / np.sum(dist[i])
            response[i] += training_ys[indices[i,j]] * ratio
            
    return response

In [None]:
import sklearn.neighbors

# Index the training inputs once; the same tree is queried for every k.
tree = sklearn.neighbors.KDTree(xs_training)

# Report the average validation loss for k = 1 .. 14 neighbours.
for k in range(1, 15):
    predicted = test_kdtree(tree, ys_training, xs_validation, ys_validation, k)
    knn_loss = compute_loss(predicted, ys_validation)
    print('K {}. Avg Loss {}.'.format(k, knn_loss))





# Validation of examples

In [None]:
fig, ax = plt.subplots()

# Index of the validation sample to inspect.
to_compare = 30

# c1 is rebuilt from the validation target, c2 from the model prediction.
c1 = rebuild_covariance_matrix(ys_validation[to_compare])
c2 = rebuild_covariance_matrix(ys_predicted[to_compare])
covariance_matrices_bar_plot(c1, c2, ax)

# Label the figure so it can be read without the surrounding code: the bars
# are square roots of the covariance diagonals, black for c1 and grey for c2.
ax.set_xlabel('Dimension')
ax.set_ylabel('Standard deviation')
ax.set_title('Validation (black) vs. predicted (grey), example {}'.format(to_compare));

In [None]:
fig, ax = plt.subplots()

# Index of the validation sample to inspect.
to_compare = 45

# Black: matrix rebuilt from the validation target.
# Grey: matrix rebuilt from the model prediction.
plot_covariance_matrix(rebuild_covariance_matrix(ys_validation[to_compare]), (0,1), ax, color='0.0')
plot_covariance_matrix(rebuild_covariance_matrix(ys_predicted[to_compare]), (0,1), ax, color='0.5')

ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)

# Without equal scaling the non-square axes stretch the ellipses, which
# distorts both their shape and their apparent orientation.
ax.set_aspect('equal')

ax.set_xlabel('Dimension 0')
ax.set_ylabel('Dimension 1')
ax.set_title('Validation (black) vs. predicted (grey), example {}'.format(to_compare));

