# KERNEL DATA CHALLENGE
## AUTHORS : Thibault Desfontaines, Rémi Leluc, Gauthier Tallec

In [1]:
import Classifiers.SVMClassifier as svm
import Preprocessers.PCA as pca
import ModelTesters.KernelMethodOptimizer as kmo

import Kernels.LinearKernel as lker
import Kernels.GaussianKernel as gker
import Kernels.KSpectrumKernel as kspker
import Kernels.MismatchKernel as misker


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
def bound_normalisation(x):
    """Map a label equal to 0 onto -1; return any other value unchanged."""
    return -1 if x == 0 else x

In [3]:
def string_to_array(string):
    """Return a NumPy array whose entries are the successive items of ``string``."""
    characters = [*string]
    return np.array(characters)

## DATA IMPORTS

In [4]:
filepath = 'challenge-dataset/'

# Training sets
# Inputs: space-separated files read without a header row, kept as raw arrays.
Xtr0_mat100, Xtr1_mat100, Xtr2_mat100 = (
    pd.read_csv(filepath + csv_name, delimiter=' ', header=None).values
    for csv_name in ('Xtr0_mat100.csv', 'Xtr1_mat100.csv', 'Xtr2_mat100.csv')
)

# Comma-separated files whose first row is the header; only the 'seq'
# column is kept.
Xtr0, Xtr1, Xtr2 = (
    pd.read_csv(filepath + csv_name, delimiter=',', header=0)['seq'].values
    for csv_name in ('Xtr0.csv', 'Xtr1.csv', 'Xtr2.csv')
)

### Labels
# Only `delimiter` is passed to read_csv: it is an alias of `sep`, and recent
# pandas versions raise a ValueError when both are specified. The 'Bound'
# column is remapped through bound_normalisation (0 becomes -1) and exposed
# as a NumPy array.
Ytr0 = (pd.read_csv(filepath + 'Ytr0.csv', delimiter=',', header=0)['Bound']
          .map(bound_normalisation)
          .values
       )

Ytr1 = (pd.read_csv(filepath + 'Ytr1.csv', delimiter=',', header=0)['Bound']
          .map(bound_normalisation)
          .values
       )
Ytr2 = (pd.read_csv(filepath + 'Ytr2.csv', delimiter=',', header=0)['Bound']
          .map(bound_normalisation)
          .values
       )

## Testing sets
# Space-separated files read without a header row.
Xte0_mat100 = pd.read_csv(filepath + 'Xte0_mat100.csv', delimiter=' ', header=None).values
Xte1_mat100 = pd.read_csv(filepath + 'Xte1_mat100.csv', delimiter=' ', header=None).values
# Dataset 2 must come from its own file (it previously re-read Xte0_mat100.csv).
Xte2_mat100 = pd.read_csv(filepath + 'Xte2_mat100.csv', delimiter=' ', header=None).values

# Comma-separated files with a header row; only the 'seq' column is kept.
Xte0 = pd.read_csv(filepath + 'Xte0.csv', delimiter=',', header=0)['seq'].values
Xte1 = pd.read_csv(filepath + 'Xte1.csv', delimiter=',', header=0)['seq'].values
Xte2 = pd.read_csv(filepath + 'Xte2.csv', delimiter=',', header=0)['seq'].values

In [5]:
# Settings handed positionally to the mismatch kernel: k, m and the
# four-letter alphabet.
k, m = 8, 2
alphabet = list('ATGC')
kernel_mismatch = misker.MismatchKernel(k, m, alphabet)

## MODEL TESTS

### SVM CLASSIFIER WITH MISMATCH KERNEL

In [10]:
# KERNEL PART
# Kernel parameters (presumably the k-mer length and the number of tolerated
# mismatches -- confirm against Kernels.MismatchKernel).
k_mer, mismatch = 8, 2

# Kernel instantiation
k_mismatch_kernel = misker.MismatchKernel(
    k=k_mer,
    m=mismatch,
    alphabet=list('ATGC'),
)

In [11]:
# SVM PART
# Regularisation grid-search candidates: a (19, 1) column holding
# 10**-1, 10**-2, ..., 10**-19, one candidate per row.
hyper_parameters_list = np.array(
    [10 ** (-exponent) for exponent in range(1, 20)]
).reshape(-1, 1)

# SVM instantiation with the constructor's default arguments.
svm_classifier = svm.SVMClassifier()

In [None]:
# Run the optimizer on dataset 1: Xtr1/Ytr1 as inputs and labels, Xte1 as the
# test inputs, trying every row of the hyper-parameter grid with k = 5.
k_fold = 5
model_optimizer = kmo.KernelMethodOptimizer(svm_classifier, k_mismatch_kernel)
prediction = model_optimizer.make_optimal_prediction(
    k=k_fold,
    inputs=Xtr1,
    tests=Xte1,
    labels=Ytr1,
    hyperParametersList=hyper_parameters_list,
    dataset='1',
    load_train=False,
    load_test=False,
)


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))

test routine  0  :  0.7175 %
test routine  1  :  0.745 %
test routine  2  :  0.7175 %
test routine  3  :  0.72 %
test routine  4  :  0.74 %
accuracy list [0.7175, 0.745, 0.7175, 0.72, 0.74]
test routine  0  :  0.7175 %
test routine  1  :  0.745 %
test routine  2  :  0.7175 %
test routine  3  :  0.72 %
test routine  4  :  0.74 %
accuracy list [0.7175, 0.745, 0.7175, 0.72, 0.74]
test routine  0  :  0.7175 %
test routine  1  :  0.745 %
test routine  2  :  0.7175 %
test routine  3  :  0.72 %
test routine  4  :  0.74 %
accuracy list [0.7175, 0.745, 0.7175, 0.72, 0.74]
test routine  0  :  0.7175 %
test routine  1  :  0.745 %
test routine  2  :  0.7175 %
test routine  3  :  0.72 %
test routine  4  :  0.74 %
accuracy list [0.7175, 0.745, 0.7175, 0.72, 0.74]
test routine  0  :  0.7175 %
test routine  1  :  0.745 %
test routine  2  :  0.7175 %
test routine  3  :  0.72 %
test routine  4  :  0.74 %
accuracy list [0.7175, 0.745, 0.7175, 0.72, 0.74]
test routine  0  :  0.7175 %
test routine  1  :  0

In [15]:
# Scratch check of list slicing: the head and tail around index 2.
L = list(range(1, 6))
L[:2]
L[2:]

[3, 4, 5]

In [None]:
# Disabled cell: the string literal below is never executed. The snippet it
# holds would project `inputs_al` onto 3 dimensions with a linear-kernel PCA
# and split the result at row 1900 into train and test parts. `inputs_al` is
# not defined in the visible part of this notebook.
"""
#Perform PCA with linear Kernel
pca_kernel = lker.LinearKernel()
pca_agent = pca.PCA()
pca_inputs_al = pca_agent.perform_PCA(inputs = inputs_al,
                                      kernel = pca_kernel,
                                      dim_num =3)

pca_inputs_tr = pca_inputs_al[:1900]
pca_inputs_te = pca_inputs_al[1900:]
"""

In [None]:
# Disabled cell: the string literal below is never executed. The snippet it
# holds would tile `pca_inputs_al` n times along a new leading axis, subtract
# the copy with its first two axes swapped, and average the norms of the
# resulting differences. `pca_inputs_al` only exists if the (also disabled)
# PCA cell is run.
"""
#Computing bandwidth for gaussian kernel as mean norms of all distances between vectors
n = pca_inputs_al.shape[0]
Inputs_dup = np.tile(pca_inputs_al[np.newaxis,:,:], reps = (n,1,1))
mean_distance = np.mean(np.linalg.norm(Inputs_dup - np.einsum('pnm-> npm', Inputs_dup), axis = 2))
del Inputs_dup
"""

In [None]:
# Disabled cell: the string literal below is never executed. The snippet it
# holds would build a Gaussian kernel whose bandwidth is gamma * mean_distance.
# NOTE(review): the literal 10e-4 evaluates to 1e-3, not 1e-4 -- confirm which
# value was intended before re-enabling.
"""
lam=10e-4
gamma = 1
bandwidth = gamma*mean_distance
kernel_svm = gker.GaussianKernel(bandwidth)
"""

In [None]:
# Disabled cell: the string literal below is never executed. The snippet it
# holds would construct an SVMClassifier from `lam` and `kernel_svm` and fit it
# on `pca_inputs_tr` with `labels_tr` cast to double. `labels_tr` is not
# defined in the visible part of this notebook.
"""
svmClassifier = svm.SVMClassifier(lam = lam, kernel = kernel_svm)
svmClassifier.fit(pca_inputs_tr, labels_tr.astype(np.double))
"""

In [None]:
# Disabled cell: the string literal below is never executed. The snippet it
# holds would call compute_test_accuracy on `pca_inputs_te` and `labels_te`;
# `labels_te` is not defined in the visible part of this notebook.
"""
accuracy = svmClassifier.compute_test_accuracy(pca_inputs_te, labels_te)
"""

In [None]:
# Scratch check: stacking the values of a dict of equal-length arrays gives a
# 2-D array (one row per key).
D = {
    'coucou': np.array([1, 2]),
    'bonjour': np.array([3, 4]),
}
stacked_values = np.array([*D.values()])
print(stacked_values.shape)

In [None]:
# Scratch check: repeat the column 0..15 five times side by side -> (16, 5).
np.tile(np.arange(16).reshape(-1, 1), reps=(1, 5))
