# Imports

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from itertools import product
import functools 
import operator 
import regex as re
import time

from classifiers import *
from metrics import *
from kernels import *

from sklearn.model_selection import train_test_split # lui il va partir mais pour l'instant c'est pratique


In [20]:
from classifiers import accuracy

# Load the data

In [2]:
# Label tables for the three datasets, read from CSV with the header row skipped.
# (Loading the raw features through features_into_array stays disabled here.)
Ytr0, Ytr1, Ytr2 = (
    np.genfromtxt(csv_path, delimiter=',', skip_header=1)
    for csv_path in ("data/Ytr0.csv", "data/Ytr1.csv", "data/Ytr2.csv")
)

In [3]:
# Saved train / validation split indices, one file per dataset, cast to integers.
train_idx_0, train_idx_1, train_idx_2 = (
    np.load(idx_path).astype(int)
    for idx_path in (
        "train_test_split/train_idx_0.npy",
        "train_test_split/train_idx_1.npy",
        "train_test_split/train_idx_2.npy",
    )
)
val_idx_0, val_idx_1, val_idx_2 = (
    np.load(idx_path).astype(int)
    for idx_path in (
        "train_test_split/val_idx_0.npy",
        "train_test_split/val_idx_1.npy",
        "train_test_split/val_idx_2.npy",
    )
)

# Indices of datasets 1 and 2 are shifted by 2000 and 4000 before indexing the labels
# (presumably the stored ids are global across the three datasets -- TODO confirm).
ytr0, yval0 = Ytr0[train_idx_0], Ytr0[val_idx_0]
ytr1, yval1 = Ytr1[train_idx_1 - 2000], Ytr1[val_idx_1 - 2000]
ytr2, yval2 = Ytr2[train_idx_2 - 4000], Ytr2[val_idx_2 - 4000]

## Load the kernels

In [4]:
# Pre-computed spectrum kernels for k = 5 to k = 10,
# loaded as (train, validation, test) triples, one triple per dataset.

# Dataset zero
K_tr0_spectrum, K_val0_spectrum, K_te0_spectrum = (
    np.load(kernel_path)
    for kernel_path in ("spectrum/K_train0.npy", "spectrum/K_val0.npy", "spectrum/K_te0.npy")
)

# Dataset one
K_tr1_spectrum, K_val1_spectrum, K_te1_spectrum = (
    np.load(kernel_path)
    for kernel_path in ("spectrum/K_train1.npy", "spectrum/K_val1.npy", "spectrum/K_te1.npy")
)

# Dataset two
K_tr2_spectrum, K_val2_spectrum, K_te2_spectrum = (
    np.load(kernel_path)
    for kernel_path in ("spectrum/K_train2.npy", "spectrum/K_val2.npy", "spectrum/K_te2.npy")
)

In [5]:
# Pre-computed mismatch kernels for k = 5 to k = 10 and m = 1,
# loaded as (train, validation, test) triples, one triple per dataset.

# Dataset zero
K_tr0_mismatch, K_val0_mismatch, K_te0_mismatch = (
    np.load(kernel_path)
    for kernel_path in ("mismatch/K_train0.npy", "mismatch/K_val0.npy", "mismatch/K_te0.npy")
)

# Dataset one
K_tr1_mismatch, K_val1_mismatch, K_te1_mismatch = (
    np.load(kernel_path)
    for kernel_path in ("mismatch/K_train1.npy", "mismatch/K_val1.npy", "mismatch/K_te1.npy")
)

# Dataset two
K_tr2_mismatch, K_val2_mismatch, K_te2_mismatch = (
    np.load(kernel_path)
    for kernel_path in ("mismatch/K_train2.npy", "mismatch/K_val2.npy", "mismatch/K_te2.npy")
)

# Get the alphas

### First, we set the best C found by a grid search for each kernel

In [6]:
# Hand-picked regularisation values for the mismatch kernels, one list per dataset.
# Entry i is the value handed to SVM together with the i-th kernel of that dataset,
# i.e. the kernel for k = i + 5 in the training loop (SVM's log prints it as "lambda").
C_mismatch_0 = [0.0004, 0.0002, 0.0001, 0.0004, 0.0003, 0.0006]
C_mismatch_1 = [0.0003, 0.0004, 0.0002, 0.0005, 0.0004, 0.0003]
C_mismatch_2 = [0.0002, 0.0005, 0.0005, 0.0004, 0.0004, 0.001]

# Same layout for the spectrum kernels (entry i <-> k = i + 5).
C_spectrum_0 = [0.002, 0.003, 0.009, 0.01, 0.02, 0.02]
C_spectrum_1 = [0.005, 0.004, 0.007, 0.009, 0.01, 0.02]
C_spectrum_2 = [0.006, 0.008, 0.01, 0.08, 0.02, 0.02]


In [7]:
# Per-dataset collections of the alphas returned by SVM. The mismatch runs are
# stored first and the spectrum runs second; later cells slice the leading
# entries (e.g. [:6]) to pick the mismatch classifiers.
alphas_0, alphas_1, alphas_2 = [], [], []

# --- Mismatch kernels -------------------------------------------------------
for i in range(6):
    print("*"*15 + f"Treating mismatch kernels for k = {i+5} " + "*"*15)

    print("\n************* SVM for dataset 0*************\n")
    alphas_tr0_mismatch, _, _, _, _ = SVM(
        K_tr0_mismatch[i], ytr0[:,1],
        K_val0_mismatch[i], yval0[:,1],
        [C_mismatch_0[i]],
    )

    print("\n************* SVM for dataset 1 *************\n")
    alphas_tr1_mismatch, _, _, _, _ = SVM(
        K_tr1_mismatch[i], ytr1[:,1],
        K_val1_mismatch[i], yval1[:,1],
        [C_mismatch_1[i]],
    )

    print("\n************* SVM for dataset 2 *************\n")
    alphas_tr2_mismatch, _, _, _, _ = SVM(
        K_tr2_mismatch[i], ytr2[:,1],
        K_val2_mismatch[i], yval2[:,1],
        [C_mismatch_2[i]],
    )
    print("\n")

    # Only store the results once all three datasets have been fitted for this k.
    alphas_0.extend(alphas_tr0_mismatch)
    alphas_1.extend(alphas_tr1_mismatch)
    alphas_2.extend(alphas_tr2_mismatch)

# --- Spectrum kernels -------------------------------------------------------
for i in range(6):
    print("*"*15 + f"Treating spectrum kernels for k = {i+5} " + "*"*15)

    print("\n************* SVM for dataset 0*************\n")
    alphas_tr0_spectrum, _, _, _, _ = SVM(
        K_tr0_spectrum[i], ytr0[:,1],
        K_val0_spectrum[i], yval0[:,1],
        [C_spectrum_0[i]],
    )

    print("\n\n************* SVM for dataset 1 *************\n")
    alphas_tr1_spectrum, _, _, _, _ = SVM(
        K_tr1_spectrum[i], ytr1[:,1],
        K_val1_spectrum[i], yval1[:,1],
        [C_spectrum_1[i]],
    )

    print("\n\n************* SVM for dataset 2 *************\n")
    alphas_tr2_spectrum, _, _, _, _ = SVM(
        K_tr2_spectrum[i], ytr2[:,1],
        K_val2_spectrum[i], yval2[:,1],
        [C_spectrum_2[i]],
    )
    print("\n")

    alphas_0.extend(alphas_tr0_spectrum)
    alphas_1.extend(alphas_tr1_spectrum)
    alphas_2.extend(alphas_tr2_spectrum)

    

***************Treating mismatch kernels for k = 5 ***************

************* SVM for dataset 0*************

---------------  lambda = 0.0004  ---------------
Training: loss = 0.632025, accuracy = 0.718750
Validation: loss = 0.791007, accuracy = 0.617500

************* SVM for dataset 1 *************

---------------  lambda = 0.0003  ---------------
Training: loss = 0.655350, accuracy = 0.730000
Validation: loss = 0.818788, accuracy = 0.610000

************* SVM for dataset 2 *************

---------------  lambda = 0.0002  ---------------
Training: loss = 0.553185, accuracy = 0.779375
Validation: loss = 0.655252, accuracy = 0.727500


***************Treating mismatch kernels for k = 6 ***************

************* SVM for dataset 0*************

---------------  lambda = 0.0002  ---------------
Training: loss = 0.605278, accuracy = 0.761875
Validation: loss = 0.777157, accuracy = 0.672500

************* SVM for dataset 1 *************

---------------  lambda = 0.0004  --------

# Majority Voting

First, we need to compute the accuracy of each classifier on the training set (and on the validation set).

In [8]:
def to_binary(y):
    """
    Threshold a 1-D array of real-valued scores at zero.

    Entries strictly greater than 0 become 1; every other entry (0 included)
    becomes 0. The result is a float array with one entry per input score.
    """
    return np.where(y > 0, 1.0, 0.0)

def error(y_true, y_pred):
    """
    Misclassification rate of real-valued scores against reference labels.

    `y_pred` is first thresholded with `to_binary`; the returned value is one
    minus the fraction of thresholded predictions equal to `y_true`.
    """
    matches = to_binary(y_pred) == y_true
    return 1 - np.mean(matches)


In [9]:
def no_weighted_mv(K_train, K_val, alphas, y_train, y_val, K_test=None, gamma=1/2):
    """
    Combine several kernel classifiers by averaging their scores with equal weights.

    Parameters
    ----------
    K_train : sequence (list or stacked array) of training kernel matrices
    K_val : sequence of the matching validation kernel matrices
    alphas : sequence holding the alpha vector learned for each training kernel
    y_train, y_val : 1-D label arrays; they are compared by `error` against the
        0/1 thresholded votes
    K_test : optional sequence of the matching test kernel matrices
    gamma : unused; kept so the signature stays parallel to `weighted_mv`

    Returns
    -------
    The averaged test scores (one per row of the test kernels) when `K_test`
    is given, otherwise None. The training and validation scores are printed.

    Raises
    ------
    ValueError if no kernel is supplied.
    """
    # len() works for both a Python list and a stacked ndarray of kernels.
    n_kernels = len(K_train)
    if n_kernels == 0:
        raise ValueError("no_weighted_mv needs at least one kernel")

    prob = 1 / n_kernels
    y_tr_pred = np.zeros(y_train.shape[0])
    y_val_pred = np.zeros(y_val.shape[0])

    y_te_pred = None
    if K_test is not None:
        # Size the accumulator from the data instead of assuming 1000 test points.
        y_te_pred = np.zeros(K_test[0].shape[0])

    for i in range(n_kernels):
        alpha_i = alphas[i]

        y_tr_pred += prob * (K_train[i] @ alpha_i)
        y_val_pred += prob * (K_val[i] @ alpha_i)

        if K_test is not None:
            y_te_pred += prob * (K_test[i] @ alpha_i)

    print(f"Training score : {1 - error(y_train, y_tr_pred)}")
    print(f"Validation score : {1 - error(y_val, y_val_pred)}")

    return y_te_pred

## Our prediction

In [13]:
# Unweighted vote over the mismatch classifiers only (first six alphas of each dataset).
y_te0_pred_mv = no_weighted_mv(K_tr0_mismatch,
            K_val0_mismatch,
            alphas_0[:6], ytr0[:,1], yval0[:,1],
            K_test =K_te0_mismatch)

y_te1_pred_mv = no_weighted_mv(K_tr1_mismatch,
            K_val1_mismatch,
            alphas_1[:6], ytr1[:,1], yval1[:,1],
            K_test =K_te1_mismatch)

# Dataset 2 leaves out the first kernel (k = 5). Its test scores must come from
# dataset 2's own test kernels, not from dataset 0's.
y_te2_pred_mv = no_weighted_mv(K_tr2_mismatch[1:],
            K_val2_mismatch[1:],
            alphas_2[1:6], ytr2[:,1], yval2[:,1],
            K_test =K_te2_mismatch[1:])

Training score : 0.955625
Validation score : 0.6675
Training score : 0.97125
Validation score : 0.675
Training score : 0.989375
Validation score : 0.7525


Compute the mean training and validation scores over the three datasets

In [14]:
# Mean training / validation score of the unweighted vote over the three datasets.
# The numbers are copied by hand from the output of the prediction cell above.
# Both means are printed: a bare expression that is not the last line of a cell
# is silently discarded.
mv_train_mean = (0.955625 + 0.97125 + 0.989375)/3
mv_val_mean = (0.6675 +0.675 + 0.7525)/3
print(mv_train_mean)
print(mv_val_mean)

0.6983333333333333


# Weighted Majority voting

In [15]:
def weighted_mv(K_train, K_val, alphas, y_train, y_val, K_test=None, gamma=1/2):
    """
    Combine several kernel classifiers with weights derived from their training error.

    Classifier i receives the weight gamma * log((1 - err_i) / err_i), where
    err_i is its training error as measured by `error`. A perfect classifier
    (err_i == 0) gets the fixed weight 10 and, symmetrically, an always-wrong
    one (err_i == 1) gets -10, so the log never produces an infinite weight.

    Parameters
    ----------
    K_train : sequence (list or stacked array) of training kernel matrices
    K_val : sequence of the matching validation kernel matrices
    alphas : sequence holding the alpha vector learned for each training kernel
    y_train, y_val : 1-D label arrays; they are compared by `error` against the
        0/1 thresholded votes
    K_test : optional sequence of the matching test kernel matrices
    gamma : multiplier applied to the log-odds weight

    Returns
    -------
    The weighted test scores (one per row of the test kernels) when `K_test`
    is given, otherwise None. The weights and the training / validation scores
    are printed.

    Raises
    ------
    ValueError if no kernel is supplied.
    """
    # len() works for both a Python list and a stacked ndarray of kernels.
    n_kernels = len(K_train)
    if n_kernels == 0:
        raise ValueError("weighted_mv needs at least one kernel")

    weights = []
    y_tr_pred = np.zeros(y_train.shape[0])
    y_val_pred = np.zeros(y_val.shape[0])

    y_te_pred = None
    if K_test is not None:
        # Size the accumulator from the data instead of assuming 1000 test points.
        y_te_pred = np.zeros(K_test[0].shape[0])

    for i in range(n_kernels):
        alpha_i = alphas[i]
        y_tr_i = K_train[i] @ alpha_i
        y_val_i = K_val[i] @ alpha_i

        err = error(y_train, y_tr_i)
        if err == 0:
            # log((1 - 0) / 0) is undefined: cap the weight.
            weight = 10
        elif err == 1:
            # log(0) would be -inf and turn the whole weighted sum into inf/nan.
            weight = -10
        else:
            weight = float(gamma * np.log((1 - err) / err))

        weights.append(weight)

        y_tr_pred += weight * y_tr_i
        y_val_pred += weight * y_val_i

        if K_test is not None:
            y_te_pred += weight * (K_test[i] @ alpha_i)

    print("Assigned Weights : ", weights)
    print(f"Training score : {1 - error(y_train, y_tr_pred)}")
    print(f"Validation score : {1 - error(y_val, y_val_pred)}")

    return y_te_pred

## Use only mismatch kernel

In [16]:
# Weighted vote over the mismatch classifiers only (first six alphas of each dataset).
y_te0_pred_wmv = weighted_mv(K_tr0_mismatch,
            K_val0_mismatch,
            alphas_0[:6], ytr0[:,1], yval0[:,1],
            K_test =K_te0_mismatch)

y_te1_pred_wmv = weighted_mv(K_tr1_mismatch,
            K_val1_mismatch,
            alphas_1[:6], ytr1[:,1], yval1[:,1],
            K_test =K_te1_mismatch)

# Dataset 2 leaves out the first kernel (k = 5). Its test scores must come from
# dataset 2's own test kernels, not from dataset 0's.
y_te2_pred_wmv = weighted_mv(K_tr2_mismatch[1:],
            K_val2_mismatch[1:],
            alphas_2[1:6], ytr2[:,1], yval2[:,1],
            K_test =K_te2_mismatch[1:])


Assigned Weights :  [0.4691348192964651, 0.5814933771772852, 0.627386509711696, 1.8717222088886272, 2.051639445477934, 2.7137320804273655]
Training score : 0.9875
Validation score : 0.68
Assigned Weights :  [0.49731128757203097, 0.8039693308892296, 0.9287273642467249, 2.6466524123622457, 3.341680472883148, 10]
Training score : 0.998125
Validation score : 0.69
Assigned Weights :  [1.0550190114895286, 1.5203004320351203, 1.978816758340098, 2.7911211950610486, 3.688566856416988]
Training score : 0.9975
Validation score : 0.7725


In [18]:
# Mean training score, then mean validation score, of the weighted vote over the
# three datasets (values copied by hand from the cell output above).
for first, second, third in ((0.9875, 0.998125, 0.9975), (0.68, 0.69, 0.7725)):
    print((first + second + third)/3)


0.9943750000000001
0.7141666666666667


In [None]:
import numpy as np
import pandas as pd

def write_predictions_csv_good2(y_tests, path, mode="SVM", n=1000):
    """
    Threshold per-dataset test scores to 0/1 labels and save them as one CSV file.

    Parameters
    ----------
    y_tests : sequence of score arrays, one per dataset, each holding `n` scores
    path : destination of the CSV file (columns "Id" and "Bound")
    mode : with "SVM" a score is labelled 0 when it is negative and 1 otherwise;
        with any other value a score is labelled 1 when it is >= 0.5 and 0 otherwise
    n : number of test points per dataset (defaults to 1000)

    Returns
    -------
    Integer array of length len(y_tests) * n with the labels of all datasets
    concatenated in order.
    """
    print(n)
    predictions = np.zeros(len(y_tests) * n, dtype=int)

    for i, y_pred in enumerate(y_tests):
        y_pred = np.asarray(y_pred)
        if mode == 'SVM':
            print("entered mode SVM")
            y_pred_ = np.ones(n)
            y_pred_[y_pred < 0] = 0
        else:
            y_pred_ = np.zeros(n)
            y_pred_[y_pred >= 0.5] = 1

        # Dataset i fills rows [n*i, n*(i+1)) of the submission.
        predictions[n*i:n*(i+1)] = y_pred_

    pred = pd.DataFrame({"Bound" : predictions})
    print("saving predictions")
    # The DataFrame's default 0..N-1 index is written out as the "Id" column.
    pred.to_csv(path, index=True, index_label="Id")
    print("saved predictions")
    return predictions

In [None]:
# Build the submission from the weighted-majority-vote test scores computed above.
# NOTE(review): the un-suffixed names y_te0_pred / y_te1_pred / y_te2_pred are not
# defined by any cell of this notebook, so a fresh "Run All" raised NameError here.
# Swap in the *_mv variables to submit the unweighted vote instead.
y_tests = [y_te0_pred_wmv, y_te1_pred_wmv, y_te2_pred_wmv]
aa = write_predictions_csv_good2(y_tests, "data/Ytest_mj_mismatch12_allbut52.csv", mode="SVM")

In [None]:
# Sanity check: show the first label of each 1000-row dataset slice.
print(*(aa[offset] for offset in (0, 1000, 2000)))