In [None]:
%matplotlib notebook

import functools
import json
import matplotlib.pyplot as plt
import matplotlib.patches
import numpy as np
import pathlib
import sklearn as sk

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSCanonical


# Notebook-wide display setting: print floats with 3 digits of precision and
# wrap printed arrays at 100 characters per line.
np.set_printoptions(precision=3, linewidth=100)

In [None]:
def rebuild_covariance_matrix(v, dim=6):
    """Rebuild a covariance matrix from the flattened entries of its triangular factor.

    The first ``dim * (dim + 1) / 2`` values of ``v`` are read row by row into a
    lower-triangular matrix ``L`` (row ``i`` receives ``i + 1`` values). The
    function returns ``L @ L.T``, which is symmetric positive semi-definite by
    construction.

    Parameters
    ----------
    v : array-like
        Flat sequence holding the lower-triangular entries in row-major order.
        Values past the first ``dim * (dim + 1) / 2`` are ignored.
    dim : int, optional
        Size of the square matrix to rebuild. Defaults to 6.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dim, dim)``.

    Raises
    ------
    ValueError
        If ``v`` holds fewer values than the lower triangle requires.
    """
    values = np.asarray(v, dtype=float).ravel()
    n_entries = dim * (dim + 1) // 2
    if values.size < n_entries:
        # Without this check a trailing slice of length 1 would be silently
        # broadcast over a whole row instead of being reported.
        raise ValueError(
            'Expected at least {} values to fill a {}x{} lower triangle, got {}'.format(
                n_entries, dim, dim, values.size))

    factor = np.zeros((dim, dim))
    cursor = 0
    for row in range(dim):
        row_length = row + 1
        factor[row, :row_length] = values[cursor:cursor + row_length]
        cursor += row_length

    return np.dot(factor, factor.T)

In [None]:
def plot_covariance_matrix(c, dims, ax, color='black'):
    """Draw the ellipse of a two-dimensional marginal of covariance matrix ``c``.

    The 2x2 block of ``c`` selected by ``dims`` is decomposed into principal
    axes and an unfilled ellipse centred on the origin is added to ``ax``. The
    full lengths of the ellipse axes are the square roots of the eigenvalues
    (so each semi-axis is half a standard deviation along its principal
    direction), as in the original implementation.

    Parameters
    ----------
    c : array-like
        Square covariance matrix.
    dims : sequence of int
        The two row/column indices of ``c`` to plot.
    ax : matplotlib.axes.Axes
        Axes receiving the ellipse. Its limits are not modified.
    color : str, optional
        Edge colour of the ellipse.
    """
    dims = list(dims)
    submatrix = np.asarray(c)[dims][:, dims]

    # A covariance block is symmetric, so use eigh: it returns real
    # eigenvalues in ascending order and orthonormal eigenvectors stored as
    # the *columns* of eigvecs.
    eigvals, eigvecs = np.linalg.eigh(submatrix)
    # Round-off can push a zero eigenvalue slightly negative; clamp before sqrt.
    axis_lengths = np.sqrt(np.maximum(eigvals, 0.0))

    # Direction of the eigenvector paired with eigvals[0], i.e. column 0. This
    # is the axis that the ellipse "width" is measured along.
    angle = np.arctan2(eigvecs[1, 0], eigvecs[0, 0])

    # `angle` is passed by keyword because recent Matplotlib releases no
    # longer accept it positionally.
    ell = matplotlib.patches.Ellipse((0., 0.), axis_lengths[0], axis_lengths[1],
                                     angle=np.rad2deg(angle),
                                     linewidth=1.0, edgecolor=color, fill=False)

    ax.add_artist(ell)
    

In [None]:
def compare_covariance_matrices(c1, c2):
    """Overlay the ellipses of the first two dimensions of ``c1`` and ``c2``.

    A new figure is created; ``c1`` is drawn in black and ``c2`` in grey so
    that the two ellipses can be told apart, and the axes are framed around
    the origin so that both ellipses are fully visible.

    Parameters
    ----------
    c1, c2 : array-like
        Covariance matrices with at least two dimensions each.
    """
    fig, ax = plt.subplots()
    plot_covariance_matrix(c1, (0, 1), ax, color='0.0')
    plot_covariance_matrix(c2, (0, 1), ax, color='0.5')

    # plot_covariance_matrix adds origin-centred ellipses with add_artist,
    # which does not update the axes limits, so frame them explicitly. Every
    # semi-axis is half the square root of an eigenvalue of the 2x2 block, and
    # the trace of that block bounds its largest eigenvalue from above.
    traces = [np.trace(np.asarray(c)[:2, :2]) for c in (c1, c2)]
    half_extent = 0.55 * np.sqrt(max(max(traces), 0.0))
    if half_extent > 0:
        ax.set_xlim(-half_extent, half_extent)
        ax.set_ylim(-half_extent, half_extent)
    # Equal scaling keeps the ellipse orientation visually meaningful.
    ax.set_aspect('equal')

    plt.show()

In [None]:
def covariance_matrices_bar_plot(c1, c2, ax):
    """Plot the standard deviations of two covariance matrices as paired bars.

    For every dimension, the square root of the diagonal entry of ``c1`` is
    drawn in black and that of ``c2`` in grey right next to it.

    Parameters
    ----------
    c1, c2 : array-like
        Square covariance matrices of identical size.
    ax : matplotlib.axes.Axes
        Axes on which the bars are drawn.

    Raises
    ------
    ValueError
        If the two matrices do not have the same number of diagonal entries.
    """
    std_first = np.sqrt(np.diagonal(c1))
    std_second = np.sqrt(np.diagonal(c2))
    if std_first.shape != std_second.shape:
        raise ValueError('Covariance matrices must have the same size, got {} and {}'.format(
            std_first.shape, std_second.shape))

    # One group of bars per dimension, whatever the size of the matrices.
    indices = np.arange(len(std_first))

    width = 0.2
    ax.bar(indices, std_first, width, color='black')
    ax.bar(indices + width, std_second, width, color='0.6')
    # Put each tick halfway between the two bar positions of its group.
    ax.set_xticks(indices + width / 2)

In [None]:
def compute_loss(predicted, validation):
    """Return the mean Euclidean distance between predicted and reference rows.

    Both arguments are two-dimensional arrays with one sample per row; the
    distance is computed row by row and then averaged over all samples.
    """
    residuals = predicted - validation
    per_sample_error = np.linalg.norm(residuals, axis=1)
    return np.mean(per_sample_error)

In [None]:
def create_validation_set(xs, ys, proportion, rng=None):
    """Randomly split paired samples into a training and a validation set.

    Parameters
    ----------
    xs, ys : array-like
        Inputs and targets, indexed by sample along the first axis. They must
        contain the same number of samples.
    proportion : float
        Fraction of the samples, between 0 and 1, assigned to the training
        set. The training size is ``int(len(xs) * proportion)``.
    rng : numpy.random.RandomState or numpy.random.Generator, optional
        Source of randomness for the shuffle. When omitted, NumPy's global
        generator is used, so ``np.random.seed`` still controls the split.

    Returns
    -------
    tuple
        ``(xs_training, ys_training, xs_validation, ys_validation)``.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` differ in length or ``proportion`` is outside
        ``[0, 1]``.
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    if len(xs) != len(ys):
        raise ValueError('xs and ys must have the same number of samples, got {} and {}'.format(
            len(xs), len(ys)))
    if not 0.0 <= proportion <= 1.0:
        raise ValueError('proportion must lie in [0, 1], got {}'.format(proportion))

    idx = np.arange(len(xs))
    shuffler = np.random if rng is None else rng
    shuffler.shuffle(idx)

    training_size = int(len(xs) * proportion)
    # The same shuffled indices are applied to xs and ys to keep pairs aligned.
    training_idx = idx[:training_size]
    validation_idx = idx[training_size:]

    return xs[training_idx], ys[training_idx], xs[validation_idx], ys[validation_idx]

In [None]:
def cross_validation(xs, ys, algorithm, n=30, proportion=0.7):
    """Estimate the loss of ``algorithm`` over repeated random train/validation splits.

    Parameters
    ----------
    xs, ys : numpy.ndarray
        Inputs and targets, one sample per row.
    algorithm : callable
        Called as ``algorithm(xs_training, ys_training, xs_validation)`` and
        expected to return the predictions for ``xs_validation``.
    n : int, optional
        Number of random splits to evaluate. Defaults to 30.
    proportion : float, optional
        Fraction of the samples used for training in each split. Defaults to
        0.7, the value that was previously hard-coded.

    Returns
    -------
    tuple of float
        ``(mean, standard deviation)`` of the loss over the ``n`` splits.
    """
    losses = np.zeros(n)
    for i in range(n):
        # A fresh random split is drawn for every repetition.
        xs_training, ys_training, xs_validation, ys_validation = create_validation_set(
            xs, ys, proportion)

        predicted = algorithm(xs_training, ys_training, xs_validation)
        losses[i] = compute_loss(predicted, ys_validation)

    return np.mean(losses), np.std(losses)

# Creation of the training and validation datasets

In [None]:
# Location of the JSON dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
dataset_file = pathlib.Path('/home/dlandry/dataset/recov/tst.json')

with dataset_file.open() as f:
    dataset_dict = json.load(f)
    
# Print the file name followed by three metadata fields as one comma-separated line.
meta = dataset_dict['metadata']
print('{},{},{},{}'.format(dataset_file.name, meta['combiner'], meta['binner'], meta['clustering']))

In [None]:
# Inputs (xs) and targets (ys) of the dataset as NumPy arrays.
xs = np.array(dataset_dict['data']['xs'])
ys = np.array(dataset_dict['data']['ys'])

# One random split with 70 % of the samples used for training; the remainder is kept for validation.
xs_training, ys_training, xs_validation, ys_validation = create_validation_set(xs,ys,0.7)

# PLS

In [None]:
def pls(xs_training, ys_training, xs_validation, n=3):
    """Fit a canonical PLS model on the training pairs and predict the validation targets.

    ``n`` is the number of components kept by the model; inputs and targets
    are scaled by the estimator (``scale=True``).
    """
    model = PLSCanonical(scale=True, n_components=n)
    model.fit(xs_training, ys_training)
    predictions = model.predict(xs_validation)

    return predictions

In [None]:
# Sweep the number of PLS components from 1 to 4 and print, for each value,
# the (mean, standard deviation) of the loss over 100 random splits.
for n in range(1,5):
    # Bind the component count so that the callable has the three-argument
    # signature cross_validation expects.
    partial_pls = functools.partial(pls, n=n)
    
    result = cross_validation(xs, ys, partial_pls, n=100)
    print('Components {}. Loss {}'.format(n, result))

# KNN

In [None]:
import sklearn.neighbors

def kd_tree(xs_training, ys_training, xs_validation, k=3):
    """Predict targets as an inverse-distance weighted mean of the k nearest neighbours.

    A KD-tree is built on ``xs_training``; for every validation sample the
    targets of its ``k`` nearest training samples are averaged with weights
    proportional to the inverse of their distance, so that closer neighbours
    contribute more.

    Parameters
    ----------
    xs_training : numpy.ndarray
        Training inputs, one sample per row.
    ys_training : numpy.ndarray
        Two-dimensional array of training targets, one sample per row.
    xs_validation : numpy.ndarray
        Inputs for which predictions are wanted.
    k : int, optional
        Number of neighbours to average. Defaults to 3.

    Returns
    -------
    numpy.ndarray
        Predictions of shape ``(len(xs_validation), ys_training.shape[1])``.
    """
    tree = sklearn.neighbors.KDTree(xs_training)
    dist, indices = tree.query(xs_validation, k=k)

    # Weight by 1/distance. Distances are floored at machine epsilon so that an
    # exact match (distance 0) dominates the average instead of producing a
    # 0/0 division.
    weights = 1.0 / np.maximum(dist, np.finfo(float).eps)
    weights /= weights.sum(axis=1, keepdims=True)

    # Fancy indexing gathers the neighbours' targets into an array of shape
    # (n_validation, k, n_targets); the weighted sum runs over the k axis.
    neighbour_targets = np.asarray(ys_training, dtype=float)[indices]
    predicted = (weights[:, :, np.newaxis] * neighbour_targets).sum(axis=1)

    return predicted

In [None]:
# Sweep the number of neighbours from 1 to 14 and print, for each value, the
# (mean, standard deviation) of the loss over 100 random splits.
for k in range(1, 15):
    # Bind k so that the callable has the three-argument signature
    # cross_validation expects.
    kd_tree_partial = functools.partial(kd_tree, k=k)
    distribution = cross_validation(xs, ys, kd_tree_partial, n=100)
    
    print('K {}. Avg Loss {}.'.format(k, distribution))

# CCA

In [None]:
from sklearn.cross_decomposition import CCA



In [None]:
def cca(xs_training, ys_training, xs_validation, n=3):
    """Fit a CCA model with ``n`` components and predict the validation targets.

    Inputs and targets are scaled by the estimator (``scale=True``).
    """
    model = CCA(n_components=n, scale=True)
    model.fit(xs_training, ys_training)
    predictions = model.predict(xs_validation)
    return predictions

In [None]:
# Sweep the number of CCA components from 1 to 5 and print, for each value,
# the (mean, standard deviation) of the loss over 100 random splits.
for n in range(1, 6):
    # Bind the component count so that the callable has the three-argument
    # signature cross_validation expects.
    cca_partial = functools.partial(cca, n=n)
    distribution = cross_validation(xs, ys, cca_partial, n=100)
    
    print('N {}. Avg Loss {}.'.format(n, distribution))

# SVM

In [None]:
from sklearn.svm import SVR

In [None]:
def support_vector_regression(xs_training, ys_training, xs_validation):
    """Predict every target column with its own default-parameter SVR.

    One independent ``SVR`` is trained per column of ``ys_training``; the
    per-column predictions are assembled into an array with one row per
    validation sample and one column per target.
    """
    n_samples = xs_validation.shape[0]
    n_targets = ys_training.shape[1]
    predicted = np.zeros((n_samples, n_targets))

    # Iterating over the transpose yields the target columns one at a time.
    for column, targets in enumerate(ys_training.T):
        regressor = SVR()
        regressor.fit(xs_training, targets)
        predicted[:, column] = regressor.predict(xs_validation)

    return predicted

    

In [None]:
# (mean, standard deviation) of the SVR loss over 100 random splits; shown as the cell output.
cross_validation(xs, ys, support_vector_regression, n=100)

# Gaussian Process

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor

In [None]:
def gp_regression(xs_training, ys_training, xs_validation):
    """Predict every target column with its own Gaussian process regressor.

    A separate ``GaussianProcessRegressor`` with ``normalize_y=True`` is
    fitted on each column of ``ys_training``; the result has one row per
    validation sample and one column per target.
    """
    n_targets = ys_training.shape[1]
    predicted = np.zeros((xs_validation.shape[0], n_targets))

    for target in range(n_targets):
        regressor = GaussianProcessRegressor(normalize_y=True)
        regressor.fit(xs_training, ys_training[:, target])
        predicted[:, target] = regressor.predict(xs_validation)

    return predicted

In [None]:
# (mean, standard deviation) of the Gaussian process loss, using cross_validation's default number of splits.
cross_validation(xs,ys,gp_regression)

# Perceptron

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
def mlp_regression(xs_training, ys_training, xs_validation, configuration=(100,)):
    """Fit a multi-layer perceptron regressor and predict the validation targets.

    Parameters
    ----------
    xs_training, ys_training : numpy.ndarray
        Training inputs and targets, one sample per row.
    xs_validation : numpy.ndarray
        Inputs for which predictions are wanted.
    configuration : tuple of int, optional
        Width of each hidden layer, passed to ``MLPRegressor`` as
        ``hidden_layer_sizes``. The default is a single hidden layer of 100
        units; it is written ``(100,)`` because ``(100)`` is just the integer
        100, not a one-element tuple.

    Returns
    -------
    numpy.ndarray
        Predictions returned by the fitted regressor for ``xs_validation``.
    """
    mlp = MLPRegressor(hidden_layer_sizes=configuration)
    mlp.fit(xs_training, ys_training)
    return mlp.predict(xs_validation)

In [None]:
# Hidden-layer layouts to evaluate: one tuple per network, one width per hidden
# layer. Commented-out entries are layouts that are currently skipped; note
# that a single-layer layout needs a trailing comma to be a tuple.
configurations = [
#    (100, 50, 10, 50, 100),
#    (100, 100, 100, 100, 100),
#    (100,),
#    (50,),
#    (200,),
#    (300,),
#    (200, 100, 50, 100, 200),
#    (500, 250, 125, 250, 500),
#    (1000, 500, 250, 500, 1000),
    (500, 400, 300, 200, 100, 200, 300, 400, 500),
    # The eighth width was 80, which broke the symmetric 1000..200..1000 shape;
    # it is restored to 800.
    (1000, 800, 600, 400, 200, 400, 600, 800, 1000),
#    (500, 250, 125),
#    (200, 150, 100, 50)
]

# Print the (mean, standard deviation) of the loss of every layout, using
# cross_validation's default number of splits.
for configuration in configurations:
    partial_mlp_regression = functools.partial(mlp_regression, configuration=configuration)
    distribution = cross_validation(xs,ys, partial_mlp_regression)
    print('Configuration: {}. Distribution: {}.'.format(configuration, distribution))

In [None]:
def hourglass_configuration(min_neurons, max_neurons, n_layers):
    """Build hidden-layer widths that shrink to a bottleneck and grow back.

    The widths decrease linearly from ``max_neurons`` towards ``min_neurons``
    over the first ``n_layers // 2`` layers, reach ``min_neurons`` at layer
    ``n_layers // 2``, then increase back to ``max_neurons``. Fractional
    widths are truncated. The layout is symmetric when ``n_layers`` is odd;
    for an even ``n_layers`` the increasing side has one layer fewer.

    Parameters
    ----------
    min_neurons : int
        Width of the bottleneck layer.
    max_neurons : int
        Width of the first layer (and of the last one when it exists).
    n_layers : int
        Total number of hidden layers, at least 1.

    Returns
    -------
    tuple of int
        One width per hidden layer.

    Raises
    ------
    ValueError
        If ``n_layers`` is smaller than 1.
    """
    if n_layers < 1:
        raise ValueError('n_layers must be at least 1, got {}'.format(n_layers))

    middle = n_layers // 2
    n_after = n_layers - middle - 1
    configuration = np.zeros(n_layers)

    # endpoint=False keeps min_neurons out of both ramps: it is only used for
    # the bottleneck layer itself.
    configuration[:middle] = np.linspace(max_neurons, min_neurons, num=middle, endpoint=False)
    configuration[middle] = min_neurons
    configuration[middle + 1:] = np.flip(
        np.linspace(max_neurons, min_neurons, num=n_after, endpoint=False), axis=0)

    # np.int was removed from NumPy; the builtin int truncates the same way and
    # yields plain Python integers.
    return tuple(int(width) for width in configuration)

In [None]:
# Example layout. The signature is (min_neurons, max_neurons, n_layers), so the
# bottleneck width comes first; the arguments were previously passed in the
# opposite order, which produced a layout that widens towards the middle.
hourglass_configuration(100, 500, 5)

In [None]:
# Grid of hourglass layouts: bottleneck widths, outer widths and layer counts.
min_values = [100, 200, 300, 400, 500]
max_values = [250, 500, 750, 1000]
n_layers = [3, 5, 7, 9, 11]

# hourglass_configuration expects (min_neurons, max_neurons, n_layers). Pairs
# whose bottleneck is not narrower than the outer layers are skipped, because
# they would produce flat or inverted layouts rather than hourglasses.
configs = [(a,b,c) for a in min_values for b in max_values for c in n_layers if a < b]

for config in configs:
    nn_configuration = hourglass_configuration(*config)

    partial_mlp_regression = functools.partial(mlp_regression, configuration=nn_configuration)
    # (mean, standard deviation) of the loss over 30 random splits.
    distribution = cross_validation(xs,ys, partial_mlp_regression, n=30)

    print('Configuration: {}. Distribution: {}.'.format(nn_configuration, distribution))
    

# Validation of examples

In [None]:
# New random split for the visual checks below: 80 % of the samples for training, the rest for validation.
# This overwrites the split variables created earlier in the notebook.
xs_training, ys_training, xs_validation, ys_validation = create_validation_set(xs, ys, 0.8)

In [None]:
# Predict the validation targets with a nine-hidden-layer MLP trained on the current split.
ys_predicted = mlp_regression(xs_training, ys_training, xs_validation, configuration=(500, 400, 300, 200, 100, 200, 300, 400, 500)) 
# Alternative predictor (disabled): nearest-neighbour regression with k=8.
#ys_predicted = kd_tree(xs_training, ys_training, xs_validation, k=8)

In [None]:
# Bar plot comparing, for one validation sample, the square roots of the
# diagonals of the reference and predicted covariance matrices.
fig, ax = plt.subplots()
# Index of the validation sample to inspect.
# NOTE(review): assumes the validation set holds at least 19 samples - confirm for the dataset in use.
to_compare = 18


# c1 is rebuilt from the reference target (black bars), c2 from the prediction (grey bars).
c1 = rebuild_covariance_matrix(ys_validation[to_compare])
c2 = rebuild_covariance_matrix(ys_predicted[to_compare])
covariance_matrices_bar_plot(c1, c2, ax)

In [None]:
# Overlay the ellipses of dimensions 0 and 1 of the reference and predicted
# covariance matrices of one validation sample.
fig, ax = plt.subplots()

# Index of the validation sample to inspect.
# NOTE(review): assumes the validation set holds at least 46 samples - confirm for the dataset in use.
to_compare = 45

# Reference covariance in black ('0.0'), predicted covariance in grey ('0.5').
plot_covariance_matrix(rebuild_covariance_matrix(ys_validation[to_compare]), (0,1), ax, color='0.0')
plot_covariance_matrix(rebuild_covariance_matrix(ys_predicted[to_compare]), (0,1), ax, color='0.5')

# The ellipses are centred on the origin, so frame the view around it explicitly.
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)