In [1]:
# Walk up from the current working directory until we are inside the
# "data-challenge-kernel-methods" directory, so that the relative paths used
# later in the notebook (e.g. "data/training_data.pkl") resolve.
import os

while os.path.basename(os.getcwd()) != "data-challenge-kernel-methods":
    parent_dir = os.path.dirname(os.getcwd())
    # At a filesystem root the parent of the directory is the directory itself
    # ("/" on POSIX, a drive root such as "C:\\" on Windows). Stop there rather
    # than looping forever.
    if parent_dir == os.getcwd():
        raise ValueError(
            "Could not find a 'data-challenge-kernel-methods' directory "
            "above the initial working directory."
        )
    os.chdir(parent_dir)

import numpy as np
import kernels
import matplotlib.pyplot as plt
from time import time
from tqdm import tqdm
import pickle
import networkx as nx
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score


from molecules import Molecule
from kernel_methods import KernelSVC, KernelPCA, SVM
from auc import auc_score

In [2]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


raw_training_graphs = _read_pickle("data/training_data.pkl")
# Stored labels are rescaled as 2 * y - 1 (labels in {-1, 1}).
training_labels = 2 * _read_pickle("data/training_labels.pkl") - 1
raw_test_graphs = _read_pickle("data/test_data.pkl")

# Wrap every loaded graph in the project's Molecule class.
training_data = list(map(Molecule, raw_training_graphs))
test_data = list(map(Molecule, raw_test_graphs))

In [15]:
# Precomputed kernel matrices loaded from disk (K is used below with
# kernel='precomputed' estimators).
K = np.load('kernels/wlsk.npy')
K_test = np.load('kernels/wlsk_test.npy')

# Size the split from the data instead of hard-coding the sample count.
n_samples = len(training_labels)
assert K.shape == (n_samples, n_samples), (
    "expected a square {0}x{0} training kernel, got shape {1}".format(n_samples, K.shape)
)
n_train = 5000  # samples used for fitting; the remainder is held out for evaluation
assert 0 < n_train < n_samples, "n_train must leave at least one evaluation sample"

# Fixed seed so the train / eval split is reproducible.
np.random.seed(44)
idx = np.random.permutation(n_samples)
train_idx, eval_idx = idx[:n_train], idx[n_train:]

# np.ix_ selects the (rows x columns) sub-grid in a single indexing step,
# avoiding the intermediate copy made by K[rows, :][:, cols].
K_train = K[np.ix_(train_idx, train_idx)]  # train rows vs. train columns
K_eval = K[np.ix_(eval_idx, train_idx)]    # eval rows vs. train columns

labels_train = training_labels[train_idx]
labels_eval = training_labels[eval_idx]

In [4]:
# Both candidate classifiers are instantiated; the index picks the one to fit
# (0 -> SVC on a precomputed kernel, 1 -> SVM from kernel_methods).
classifier_choices = (SVC(kernel='precomputed', C=1), SVM(C=1))
svc = classifier_choices[1]  # change index here to change type of SVC
svc.fit(K_train, labels_train)

In [17]:
# Sweep the C parameter and report the AUC on the training split and on the
# held-out evaluation split for each value.
for C in [1e-3, 3e-3, 1e-2, 3e-2, 1e-1]:

    # Both candidates receive the swept C, so switching the index still runs
    # a real sweep (the SVM entry was previously pinned to C=1).
    svc = [SVC(kernel='precomputed', C=C), SVM(C=C)][0]  # change index here to change type of SVC
    svc.fit(K_train, labels_train)

    # Decision scores for the fitted model on both splits.
    scores_train = svc.decision_function(K_train)
    scores_eval = svc.decision_function(K_eval)

    print("C", C)
    print("AUC train:", auc_score(labels_train, scores_train))
    print("AUC eval:", auc_score(labels_eval, scores_eval))

C 0.001
AUC train: 0.9526826329010795
AUC eval: 0.899359192254913
C 0.003
AUC train: 0.9553139686634832
AUC eval: 0.9008195792574687
C 0.01
AUC train: 0.9704211645959219
AUC eval: 0.9074668580277221
C 0.03
AUC train: 0.9854203913669933
AUC eval: 0.9132454583223175
C 0.1
AUC train: 0.9925454130551218
AUC eval: 0.9116717654316324


In [6]:
# Train the classifier on the whole training set.
svc = [SVC(kernel='precomputed', C=1), SVM(C=1)][1]  # change index here to change type of SVC
svc.fit(K, training_labels)

# Print the AUC on the training data; the decision scores are computed once
# and reused instead of calling decision_function(K) a second time.
scores_train = svc.decision_function(K)
print(auc_score(training_labels, scores_train))

# Decision scores (not hard labels) for the test set. K_test is transposed
# before scoring -- presumably it is stored as (train, test); TODO confirm.
pred = svc.decision_function(K_test.T)




0.9946736819464093


In [7]:
# Build the submission table: a single 'Predicted' column whose index is
# shifted to start at 1 and written out under the 'Id' label.
submission = pd.DataFrame({'Predicted': pred})
submission.index = submission.index + 1
submission.to_csv('test_pred_2.csv', index_label='Id')