In [113]:
import numpy as np
import math
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit, cross_val_score

In [24]:
def read_label_matrix(file_path):
    """Load one integer label per line from a text file.

    Each line of ``file_path`` is stripped of surrounding whitespace and
    converted with ``int``; a line that is not a valid integer (including an
    empty line) raises ``ValueError``.

    Returns a 1-D ``numpy`` array holding the labels in file order.
    """
    with open(file_path, "r") as label_file:
        labels = [int(raw_line.strip()) for raw_line in label_file]
    return np.array(labels)

In [25]:
def read_kernel_matrix(file_path):
    """Load a whitespace-separated numeric matrix from a text file.

    Every non-blank line of ``file_path`` is one matrix row; its cells are
    separated by any run of whitespace.

    Returns a 2-D ``numpy`` array of ``float`` (an empty 1-D float array if
    the file holds no rows).

    Raises ``ValueError`` if a cell is not a valid number or if the rows do
    not all have the same length.
    """
    with open(file_path, 'r') as f:
        # split() without an argument tolerates tabs as well as repeated or
        # trailing spaces, which split(' ') would turn into empty cells.
        # Blank lines (e.g. a trailing newline at end of file) are skipped.
        rows = [line.split() for line in f if line.strip()]
    # Parse to float here so callers receive numbers instead of strings.
    return np.array(rows, dtype=float)

In [None]:
# Datasets finished so far: MUTAG, PROTEINS, NCI1

In [155]:
# Name of the dataset to evaluate; it is substituted into both paths below.
DATASET = 'MUTAG'

# Path templates. NOTE(review): these are absolute, machine-specific paths;
# the notebook will only run where this directory layout exists.
_LABEL_PATH_TEMPLATE = '/Users/Fabian/Documents/HPI/Master/18SS/smart_representations/datasets/{}/{}_graph_labels.txt'
_KERNEL_PATH_TEMPLATE = '/Users/Fabian/Documents/HPI/Master/18SS/smart_representations/algorithms/MLGkernel/data/results/output_{}.txt'

# The label template names the dataset twice: once as directory, once as file prefix.
LABEL_PATH = _LABEL_PATH_TEMPLATE.format(DATASET, DATASET)
KERNEL_PATH = _KERNEL_PATH_TEMPLATE.format(DATASET)

In [156]:
# Load the per-graph class labels and the kernel matrix for DATASET.
label_matrix = read_label_matrix(LABEL_PATH)
kernel_matrix = read_kernel_matrix(KERNEL_PATH)

# The kernel is later passed to an SVM with kernel='precomputed', which needs a
# square (n_samples, n_samples) matrix with one row/column per label.
# Fail here with a clear message rather than deep inside scikit-learn.
assert kernel_matrix.ndim == 2 and kernel_matrix.shape[0] == kernel_matrix.shape[1], \
    "kernel matrix must be square, got shape {}".format(kernel_matrix.shape)
assert kernel_matrix.shape[0] == label_matrix.shape[0], \
    "kernel has {} rows but there are {} labels".format(kernel_matrix.shape[0], label_matrix.shape[0])

In [157]:
def score_n_fold(train, test, n, c, random_state=None):
    """Score a precomputed-kernel SVM over ``n`` random train/test splits.

    Parameters
    ----------
    train : array-like
        The precomputed kernel matrix handed to ``cross_val_score`` as ``X``.
        Despite the parameter name this is the full kernel, not a training
        subset; the splitting is done by the cross-validator.
    test : array-like
        The class labels handed to ``cross_val_score`` as ``y`` (again, the
        full label vector rather than a test subset).
    n : int
        Number of random splits (``ShuffleSplit(n_splits=n)``); each split
        holds out 33% of the samples for testing.
    c : float
        Penalty parameter ``C`` of the SVM.
    random_state : int or None, optional
        Seed for the random splits. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an integer for reproducible scores.

    Returns
    -------
    tuple
        ``(mean_score, c)`` -- the mean of the per-split scores together with
        the penalty that produced it.
    """
    # Use the arguments instead of the notebook-level globals so the function
    # scores the data it is actually given.
    # Coerce to float because the kernel may have been read from text as strings.
    kernel = np.asarray(train, dtype=float)
    labels = np.asarray(test)

    cv = ShuffleSplit(n_splits=n, test_size=0.33, random_state=random_state)
    clf = svm.SVC(kernel='precomputed', C=c, class_weight='balanced')
    return cross_val_score(clf, kernel, labels, cv=cv).mean(), c

In [158]:
# Single evaluation: 10 random splits with penalty C=1000.
score_n_fold(kernel_matrix, label_matrix, n=10, c=1000)

(0.6079365079365079, 1000)

In [159]:
# Penalty values (SVM C) to compare, from 0.001 up to 1000 in factors of ten.
penalties = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Evaluate each penalty with 10 random splits; every entry is (mean_score, penalty).
scores = []
for penalty in penalties:
    scores.append(score_n_fold(kernel_matrix, label_matrix, 10, penalty))
scores

[(0.4841269841269841, 0.001),
 (0.43650793650793646, 0.01),
 (0.4746031746031747, 0.1),
 (0.5936507936507937, 1),
 (0.619047619047619, 10),
 (0.6238095238095238, 100),
 (0.6174603174603175, 1000)]