In [1]:
import sys
import os

In [2]:
# Make the parent directory importable (the `utils` import in the next cell is
# presumably resolved from there). The membership guard keeps the cell
# idempotent: re-running it no longer appends duplicate '..' entries.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
from utils import load_data, nx2gt

In [4]:
from graph_tool.all import *

In [5]:
import networkx as nx

In [6]:
import numpy as np

# Load graphs (MUTAG)

In [7]:
# Load the MUTAG dataset through the project helper. Only element [0] of the
# returned value is used in this notebook: it is iterated as a sequence of graph
# objects that expose `.g`, `.label` and `.node_features`.
mutag_graphs = load_data("MUTAG", degree_as_tag=False)

loading data
# classes: 2
# maximum node tag: 7
# data: 188


In [8]:
# Sanity check: report whether the 4-node trees generated by networkx are directed.
for tree in nx.generators.nonisomorphic_trees(4):
    directed = tree.is_directed()
    print(directed)

False
False


# Test counting trees

In [9]:
# Convert the first MUTAG graph (its `.g` attribute, presumably a networkx
# graph — TODO confirm) with the project's nx2gt helper so it can be passed to
# motifs() in the cells below.
test_g = nx2gt(mutag_graphs[0][0].g)

In [10]:
# All non-isomorphic trees on 4 vertices, converted with nx2gt.
trees_4 = list(map(nx2gt, nx.generators.nonisomorphic_trees(4)))

In [11]:
%%time
# Count occurrences of each 4-vertex tree in test_g. p=1.0 asks for the full
# (non-sampled) count; the result is a pair (motif graphs, counts).
motifs(g=test_g, k=4, p=1.0, motif_list=trees_4)

CPU times: user 994 µs, sys: 0 ns, total: 994 µs
Wall time: 997 µs


([<Graph object, undirected, with 4 vertices and 3 edges at 0x7f6a9b7d9b38>,
  <Graph object, undirected, with 4 vertices and 3 edges at 0x7f6a9b7d9a58>],
 [10, 63])

In [12]:
# All non-isomorphic trees on 6 vertices, converted with nx2gt.
trees_6 = list(map(nx2gt, nx.generators.nonisomorphic_trees(6)))

In [13]:
%%time
# Same count as above for the 6-vertex trees; the result is again a pair
# (motif graphs, counts) with one count per tree in trees_6.
motifs(g=test_g, k=6, p=1.0, motif_list=trees_6)

CPU times: user 1.86 ms, sys: 0 ns, total: 1.86 ms
Wall time: 1.87 ms


([<Graph object, undirected, with 6 vertices and 5 edges at 0x7f6a9b7f5400>,
  <Graph object, undirected, with 6 vertices and 5 edges at 0x7f6a9b7f52b0>,
  <Graph object, undirected, with 6 vertices and 5 edges at 0x7f6a9b7effd0>,
  <Graph object, undirected, with 6 vertices and 5 edges at 0x7f6a9b7efe80>,
  <Graph object, undirected, with 6 vertices and 5 edges at 0x7f6a9b7f5160>,
  <Graph object, undirected, with 6 vertices and 5 edges at 0x7f6a9b7efda0>],
 [0, 0, 10, 70, 77, 113])

In [14]:
# The single tree on 2 vertices (one edge), converted with nx2gt.
trees_2 = list(map(nx2gt, nx.generators.nonisomorphic_trees(2)))

In [15]:
%%time
# With k=2 the only motif is a single edge, so the one count returned here is
# the number of 2-vertex motifs found in test_g.
motifs(g=test_g, k=2, p=1.0, motif_list=trees_2)

CPU times: user 295 µs, sys: 18 µs, total: 313 µs
Wall time: 315 µs


([<Graph object, undirected, with 2 vertices and 1 edge at 0x7f6a9b7f56d8>],
 [27])

In [16]:
# Class label of the first MUTAG graph (the one converted to test_g above).
mutag_graphs[0][0].label

0

# Run homomorphism profile for MUTAG

In [17]:
# Build the converted tree families for 2..7 vertices in one pass; each name
# keeps the list of all non-isomorphic trees of that size.
(trees_2, trees_3, trees_4,
 trees_5, trees_6, trees_7) = (
    [nx2gt(tree) for tree in nx.generators.nonisomorphic_trees(n_vertices)]
    for n_vertices in range(2, 8)
)

In [18]:
# Ordered by tree size: position i holds the trees with i + 2 vertices. The
# profile loop below relies on this ordering when it passes k=i+2 to motifs().
all_trees = [trees_2, trees_3, trees_4, trees_5, trees_6, trees_7]

In [19]:
%%time
# Build one feature vector per MUTAG graph: for every tree size, append the
# counts returned by motifs() for the trees of that size.
X = []
y = []
for sample in mutag_graphs[0]:
    gt_graph = nx2gt(sample.g)
    counts_by_size = []
    # all_trees[0] holds the 2-vertex trees, hence the enumeration starts at 2.
    for size, same_size_trees in enumerate(all_trees, start=2):
        found = motifs(g=gt_graph, k=size, p=1.0, motif_list=same_size_trees)
        counts_by_size.extend(found[1])
    X.append(counts_by_size)
    y.append(sample.label)

CPU times: user 978 ms, sys: 313 µs, total: 978 ms
Wall time: 978 ms


# Test simple ML approach

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [21]:
# Hold out 33% of the graphs for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
# Normalizer rescales each sample (row) to unit norm independently; it is
# stateless, so fit() learns nothing from X_train beyond validating the input.
normalizer = Normalizer().fit(X_train)

In [23]:
# The normalizer is already fitted in the cell above, so only transform here
# instead of fitting a second time. Normalizer is stateless, so values are unchanged.
X_train = normalizer.transform(X_train)

In [24]:
# Never fit a preprocessor on test data: apply the transformer fitted on the
# training split. Normalizer is stateless, so the numbers are unchanged, but
# transform() stays correct if the preprocessor is later swapped for a stateful
# one (e.g. StandardScaler) and matches the cross-validation cells further down.
X_test = normalizer.transform(X_test)

# (E1) Default SVM

In [25]:
%%time
# Baseline: SVC with library defaults, except gamma which is set explicitly.
clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)  

CPU times: user 2.52 ms, sys: 107 µs, total: 2.63 ms
Wall time: 1.59 ms


In [26]:
# F1 with f1_score's default averaging, evaluated on the training split.
train_f1 = f1_score(y_true=y_train, y_pred=clf.predict(X_train))
print("Train: ", train_f1)

Train:  0.853932584269663


In [27]:
# F1 with f1_score's default averaging, evaluated on the held-out split.
test_f1 = f1_score(y_true=y_test, y_pred=clf.predict(X_test))
print("Test: ", test_f1)

Test:  0.8095238095238095


# (E2) Tune SVM

In [19]:
%%time
# SVC with every hyper-parameter spelled out so individual values can be tuned.
# NOTE: per the scikit-learn docs, `degree` is only used by the 'poly' kernel
# and `coef0` only by 'poly'/'sigmoid'; with kernel='rbf' both are ignored.
clf = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', 
              coef0=0.0, shrinking=True, probability=False, tol=0.001, 
              cache_size=200, class_weight=None, verbose=False, max_iter=-1, 
              decision_function_shape='ovr', random_state=None)
clf.fit(X_train, y_train)  

CPU times: user 2.1 ms, sys: 148 µs, total: 2.25 ms
Wall time: 1.48 ms


In [20]:
# Micro-averaged F1 on both splits, one prediction pass per split.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)
print("Train: ", f1_score(y_true=y_train, y_pred=train_pred, average='micro'))
print("Test: ", f1_score(y_true=y_test, y_pred=test_pred, average='micro'))

Train:  0.8960000000000001
Test:  0.8730158730158731


# (E3) Set up MUTAG with a 10% hold-out (19 of 188 graphs) for test

In [21]:
# Re-split from the full X / y with a 10% hold-out; this overwrites the
# normalized arrays produced by the earlier 33% split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [22]:
# Fit once on the training split, then only transform. The original called
# fit_transform() on both splits, i.e. refitted on the test data. Normalizer is
# stateless so the values are the same, but transform() is the correct pattern
# and matches the cross-validation cells further down.
normalizer = Normalizer().fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

In [23]:
%%time
# RBF-kernel SVC with a large C (weak regularization).
# NOTE(review): degree=6 has no effect here — scikit-learn ignores `degree`
# for every kernel except 'poly'. The next cell repeats this with kernel='poly'.
clf = svm.SVC(C=1000.0, kernel='rbf', degree=6, gamma='scale', 
              coef0=0.0, shrinking=True, probability=False, tol=0.001, 
              cache_size=200, class_weight=None, verbose=False, max_iter=-1, 
              decision_function_shape='ovr', random_state=None)
clf.fit(X_train, y_train)  

CPU times: user 3.34 ms, sys: 0 ns, total: 3.34 ms
Wall time: 2.55 ms


In [24]:
# Micro-averaged F1 on both splits, one prediction pass per split.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)
print("Train: ", f1_score(y_true=y_train, y_pred=train_pred, average='micro'))
print("Test: ", f1_score(y_true=y_test, y_pred=test_pred, average='micro'))

Train:  0.9230769230769231
Test:  0.7894736842105263


In [25]:
%%time
# Degree-6 polynomial-kernel SVC on the 10% hold-out split.
clf = svm.SVC(
    C=1000.0,
    kernel='poly',
    degree=6,
    gamma='scale',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)
clf.fit(X_train, y_train)

# Micro-averaged F1 on both splits.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)
print("Train: ", f1_score(y_true=y_train, y_pred=train_pred, average='micro'))
print("Test: ", f1_score(y_true=y_test, y_pred=test_pred, average='micro'))

Train:  0.9763313609467456
Test:  0.7894736842105263
CPU times: user 28.5 ms, sys: 2.99 ms, total: 31.5 ms
Wall time: 28.3 ms


# (E4) Use GIN splits

In [28]:
%%time
# 10-fold evaluation on the pre-computed split files (one train/test index file
# per fold, numbered 1..10). Reports macro-F1 per fold and the mean test score.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines (e.g. a trailing newline)
        # are skipped instead of crashing int().
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # Row-wise unit-norm scaling; fitted on the training fold only.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    clf = svm.SVC(C=1000.0, kernel='poly', degree=6, gamma='scale',
                  coef0=0.0, shrinking=True, probability=False, tol=0.001,
                  cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                  decision_function_shape='ovr', random_state=None)
    clf.fit(X_train, y_train)

    # Score each split once and reuse the value for printing and averaging.
    train_f1 = f1_score(y_true=y_train, y_pred=clf.predict(X_train), average='macro')
    test_f1 = f1_score(y_true=y_test, y_pred=clf.predict(X_test), average='macro')
    print("Train: ", train_f1)
    print("Test: ", test_f1)
    test_avg.append(test_f1)

# Average over the folds actually evaluated rather than a hard-coded 10.
print("Final avg: ", sum(test_avg) / len(test_avg))

Train:  0.96040987424313
Test:  0.625
Train:  0.9616483681756656
Test:  0.9113300492610837
Train:  0.9610627576729273
Test:  0.925925925925926
Train:  0.9736065828287533
Test:  0.7749999999999999
Train:  0.974041838448618
Test:  0.8036363636363635
Train:  0.9671497584541062
Test:  0.7662337662337663
Train:  0.9668628903356593
Test:  0.8193979933110367
Train:  0.9671497584541062
Test:  0.8193979933110367
Train:  0.9665604469097918
Test:  0.888888888888889
Train:  0.9613636363636364
Test:  0.8392857142857143
Final avg:  0.8174096694853816
CPU times: user 114 ms, sys: 10.2 ms, total: 124 ms
Wall time: 121 ms


In [29]:
%%time
# Report accuracies
# Same 10-fold protocol as the cell above, with a degree-3 polynomial SVC
# (C=100) and accuracy as the metric.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines are skipped.
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # Row-wise unit-norm scaling; fitted on the training fold only.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    clf = svm.SVC(C=100.0, kernel='poly', degree=3, gamma='scale',
                  coef0=0.0, shrinking=True, probability=False, tol=0.001,
                  cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                  decision_function_shape='ovr', random_state=None)
    clf.fit(X_train, y_train)

    # Score each split once and reuse the value for printing and averaging.
    train_acc = accuracy_score(y_true=y_train, y_pred=clf.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))
    print("Train: ", train_acc)
    print("Test: ", test_acc)
    test_avg.append(test_acc)

# Average over the folds actually evaluated rather than a hard-coded 10.
print("Final avg: ", sum(test_avg) / len(test_avg))

Train:  0.9235294117647059
Test:  0.7777777777777778
Train:  0.9176470588235294
Test:  0.9444444444444444
Train:  0.9058823529411765
Test:  1.0
Train:  0.9235294117647059
Test:  0.8333333333333334
Train:  0.9235294117647059
Test:  0.7777777777777778
Train:  0.9235294117647059
Test:  0.7777777777777778
Train:  0.9176470588235294
Test:  0.8333333333333334
Train:  0.9235294117647059
Test:  0.8333333333333334
Train:  0.9058823529411765
Test:  0.9444444444444444
Train:  0.9117647058823529
Test:  1.0
Final avg:  0.8722222222222221
CPU times: user 38.1 ms, sys: 48 µs, total: 38.1 ms
Wall time: 34.5 ms


In [30]:
# Spread of the per-fold test scores collected in test_avg by the cell above
# (np.std defaults to the population standard deviation, ddof=0).
np.std(test_avg)

0.08624541497922233

# (E5) More trees

In [55]:
# Converted tree families for 8 and 9 vertices.
# NOTE(review): trees_9 is built here but no later cell in this notebook uses it.
trees_8, trees_9 = (
    list(map(nx2gt, nx.generators.nonisomorphic_trees(n_vertices)))
    for n_vertices in (8, 9)
)

In [70]:
# Extend the families up to 8 vertices. Position i must keep holding the trees
# with i + 2 vertices because the profile loop passes k=i+2 to motifs().
all_trees = [trees_2, trees_3, trees_4, trees_5, trees_6, trees_7, trees_8]

In [71]:
%%time
# Rebuild the feature vectors with the extended tree families. This overwrites
# X and y, so every cell executed after this one sees the larger profile.
X = []
y = []
for sample in mutag_graphs[0]:
    gt_graph = nx2gt(sample.g)
    counts_by_size = []
    # all_trees[0] holds the 2-vertex trees, hence the enumeration starts at 2.
    for size, same_size_trees in enumerate(all_trees, start=2):
        found = motifs(g=gt_graph, k=size, p=1.0, motif_list=same_size_trees)
        counts_by_size.extend(found[1])
    X.append(counts_by_size)
    y.append(sample.label)

CPU times: user 1.86 s, sys: 2.03 ms, total: 1.86 s
Wall time: 1.86 s


In [72]:
# Report accuracies
# 10-fold evaluation of an RBF SVC (C=1000) on the current X; prints accuracy
# and macro-F1 per fold and averages the test accuracy.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines are skipped.
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # Row-wise unit-norm scaling; fitted on the training fold only.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # `degree` is ignored by scikit-learn for the rbf kernel.
    clf = svm.SVC(C=1000.0, kernel='rbf', degree=3, gamma='scale',
                  coef0=0.0, shrinking=True, probability=False, tol=0.001,
                  cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                  decision_function_shape='ovr', random_state=None)
    clf.fit(X_train, y_train)

    # Predict once per split and reuse the predictions for every metric.
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)
    test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
    print("Train acc: ", accuracy_score(y_true=y_train, y_pred=train_pred))
    print("Test acc: ", test_acc)
    print("Train f1m: ", f1_score(y_true=y_train, y_pred=train_pred, average='macro'))
    print("Test f1m: ", f1_score(y_true=y_test, y_pred=test_pred, average='macro'))
    test_avg.append(test_acc)

# Average over the folds actually evaluated rather than a hard-coded 10.
print("Final avg: ", sum(test_avg) / len(test_avg))

Train acc:  0.9529411764705882
Test acc:  0.7777777777777778
Train f1m:  0.9472131656575065
Test f1m:  0.75
Train acc:  0.9588235294117647
Test acc:  0.9444444444444444
Train f1m:  0.95541568318909
Test f1m:  0.9113300492610837
Train acc:  0.9647058823529412
Test acc:  0.8888888888888888
Train f1m:  0.9610627576729273
Test f1m:  0.8392857142857142
Train acc:  0.9764705882352941
Test acc:  0.8333333333333334
Train f1m:  0.9736065828287533
Test f1m:  0.8285714285714285
Train acc:  0.9647058823529412
Test acc:  0.7777777777777778
Train f1m:  0.9610627576729273
Test f1m:  0.7230769230769231
Train acc:  0.9588235294117647
Test acc:  0.8333333333333334
Train f1m:  0.9540096618357488
Test f1m:  0.8193979933110367
Train acc:  0.9705882352941176
Test acc:  0.8888888888888888
Train f1m:  0.9668628903356593
Test f1m:  0.8831168831168831
Train acc:  0.9705882352941176
Test acc:  0.7777777777777778
Train f1m:  0.9671497584541062
Test f1m:  0.7662337662337663
Train acc:  0.9647058823529412
Test acc:

# (E6) Gaussian Process as classifier

In [31]:
from sklearn.gaussian_process import GaussianProcessClassifier

In [38]:
%%time
# Report accuracies
# 10-fold evaluation of a Gaussian-process classifier (default kernel) on X.
# NOTE(review): the recorded execution count of this cell is lower than that of
# the E5 cells that rebuild X, so the saved output may predate the extended
# profile — restart and run all to confirm the numbers.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines are skipped.
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # Row-wise unit-norm scaling; fitted on the training fold only.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    clf = GaussianProcessClassifier(kernel=None, optimizer='fmin_l_bfgs_b',
                                    n_restarts_optimizer=0, max_iter_predict=200,
                                    warm_start=False, copy_X_train=True,
                                    random_state=42, multi_class='one_vs_rest',
                                    n_jobs=4)
    clf.fit(X_train, y_train)

    # Predict once per split and reuse the predictions for every metric.
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)
    test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
    print("Train acc: ", accuracy_score(y_true=y_train, y_pred=train_pred))
    print("Test acc: ", test_acc)
    print("Train f1m: ", f1_score(y_true=y_train, y_pred=train_pred, average='macro'))
    print("Test f1m: ", f1_score(y_true=y_test, y_pred=test_pred, average='macro'))
    test_avg.append(test_acc)

# Average over the folds actually evaluated rather than a hard-coded 10.
print("Final avg: ", sum(test_avg) / len(test_avg))

Train acc:  0.7705882352941177
Test acc:  0.5
Train f1m:  0.703209633376606
Test f1m:  0.3333333333333333
Train acc:  0.7352941176470589
Test acc:  0.8888888888888888
Train f1m:  0.6669713987201253
Test f1m:  0.8
Train acc:  0.7411764705882353
Test acc:  0.8333333333333334
Train f1m:  0.6721598877980365
Test f1m:  0.7339901477832513
Train acc:  0.7588235294117647
Test acc:  0.6111111111111112
Train f1m:  0.6833401481077643
Test f1m:  0.5418181818181819
Train acc:  0.7529411764705882
Test acc:  0.7222222222222222
Train f1m:  0.6826666666666668
Test f1m:  0.6296296296296297
Train acc:  0.7411764705882353
Test acc:  0.7777777777777778
Train f1m:  0.6626984126984127
Test f1m:  0.723076923076923
Train acc:  0.7352941176470589
Test acc:  0.8333333333333334
Train f1m:  0.6470588235294117
Test f1m:  0.8193979933110367
Train acc:  0.7470588235294118
Test acc:  0.7222222222222222
Train f1m:  0.6727695957742066
Test f1m:  0.6296296296296295
Train acc:  0.7588235294117647
Test acc:  0.611111111111

# (E7) MLP as classifier

In [39]:
from sklearn.neural_network import MLPClassifier

In [51]:
%%time
# Report accuracies
# 10-fold evaluation of a single-hidden-layer MLP (200 units) on X.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines are skipped.
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # Row-wise unit-norm scaling; fitted on the training fold only.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # random_state is fixed (was None) so weight initialization and batch
    # shuffling are reproducible across runs; expect the numbers to differ
    # slightly from earlier unseeded runs.
    # Per the scikit-learn docs, learning_rate, power_t, momentum and
    # nesterovs_momentum only apply to solver='sgd', and validation_fraction
    # only when early_stopping=True; they are inert with these settings.
    clf = MLPClassifier(hidden_layer_sizes=(200,), activation='relu',
                        solver='adam', alpha=0.00001, batch_size=16,
                        learning_rate='adaptive', learning_rate_init=0.001,
                        power_t=0.5, max_iter=500, shuffle=True,
                        random_state=42, tol=0.0001, verbose=False,
                        warm_start=False, momentum=0.9,
                        nesterovs_momentum=True, early_stopping=False,
                        validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                        epsilon=1e-08, n_iter_no_change=10)
    clf.fit(X_train, y_train)

    # Predict once per split and reuse the predictions for every metric.
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)
    test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
    print("Train acc: ", accuracy_score(y_true=y_train, y_pred=train_pred))
    print("Test acc: ", test_acc)
    print("Train f1m: ", f1_score(y_true=y_train, y_pred=train_pred, average='macro'))
    print("Test f1m: ", f1_score(y_true=y_test, y_pred=test_pred, average='macro'))
    test_avg.append(test_acc)

# Average over the folds actually evaluated rather than a hard-coded 10.
print("Final avg: ", sum(test_avg) / len(test_avg))

Train acc:  0.8941176470588236
Test acc:  0.7777777777777778
Train f1m:  0.8822352216748769
Test f1m:  0.7662337662337663
Train acc:  0.8588235294117647
Test acc:  0.9444444444444444
Train f1m:  0.8454545454545455
Test f1m:  0.9113300492610837
Train acc:  0.8705882352941177
Test acc:  1.0
Train f1m:  0.8572301114673996
Test f1m:  1.0
Train acc:  0.8764705882352941
Test acc:  0.8888888888888888
Train f1m:  0.8582151793160968
Test f1m:  0.8875000000000001
Train acc:  0.8823529411764706
Test acc:  0.7222222222222222
Train f1m:  0.8691502463054187
Test f1m:  0.6296296296296297
Train acc:  0.8823529411764706
Test acc:  0.8333333333333334
Train f1m:  0.8680329141437664
Test f1m:  0.8193979933110367
Train acc:  0.8764705882352941
Test acc:  0.8333333333333334
Train f1m:  0.859553877021126
Test f1m:  0.8193979933110367
Train acc:  0.8941176470588236
Test acc:  0.7777777777777778
Train f1m:  0.8822352216748768
Test f1m:  0.75
Train acc:  0.8823529411764706
Test acc:  0.8888888888888888
Train f1

# (E8) Preprocess with StandardScaler

In [54]:
from sklearn.preprocessing import StandardScaler

In [62]:
%%time
# Report accuracies
# 10-fold evaluation of a two-hidden-layer MLP (64, 32) on X, with per-feature
# standardization instead of row-wise normalization.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines are skipped.
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # StandardScaler IS stateful: mean/variance come from the training fold
    # only and are then applied unchanged to the test fold.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # random_state is fixed (was None) so weight initialization and batch
    # shuffling are reproducible across runs; expect the numbers to differ
    # slightly from earlier unseeded runs.
    # Per the scikit-learn docs, learning_rate, power_t, momentum and
    # nesterovs_momentum only apply to solver='sgd', and validation_fraction
    # only when early_stopping=True; they are inert with these settings.
    clf = MLPClassifier(hidden_layer_sizes=(64,32), activation='relu',
                        solver='adam', alpha=0.001, batch_size=16,
                        learning_rate='adaptive', learning_rate_init=0.001,
                        power_t=0.5, max_iter=500, shuffle=True,
                        random_state=42, tol=0.0001, verbose=False,
                        warm_start=False, momentum=0.9,
                        nesterovs_momentum=True, early_stopping=False,
                        validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                        epsilon=1e-08, n_iter_no_change=10)
    clf.fit(X_train, y_train)

    # Predict once per split and reuse the predictions for every metric.
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)
    test_acc = accuracy_score(y_true=y_test, y_pred=test_pred)
    print("Train acc: ", accuracy_score(y_true=y_train, y_pred=train_pred))
    print("Test acc: ", test_acc)
    print("Train f1m: ", f1_score(y_true=y_train, y_pred=train_pred, average='macro'))
    print("Test f1m: ", f1_score(y_true=y_test, y_pred=test_pred, average='macro'))
    test_avg.append(test_acc)

# Average over the folds actually evaluated rather than a hard-coded 10.
print("Final avg: ", sum(test_avg) / len(test_avg))

Train acc:  0.9411764705882353
Test acc:  0.7777777777777778
Train f1m:  0.9356060606060606
Test f1m:  0.7662337662337663
Train acc:  0.9235294117647059
Test acc:  0.8888888888888888
Train f1m:  0.9177674418604652
Test f1m:  0.8392857142857142
Train acc:  0.9352941176470588
Test acc:  1.0
Train f1m:  0.9299389307257127
Test f1m:  1.0
Train acc:  0.9294117647058824
Test acc:  0.8888888888888888
Train f1m:  0.9214901477832513
Test f1m:  0.8875000000000001
Train acc:  0.9352941176470588
Test acc:  0.8333333333333334
Train f1m:  0.9304186046511629
Test f1m:  0.8036363636363635
Train acc:  0.9294117647058824
Test acc:  0.8888888888888888
Train f1m:  0.9227272727272726
Test f1m:  0.8831168831168831
Train acc:  0.9411764705882353
Test acc:  0.8333333333333334
Train f1m:  0.9351045961215453
Test f1m:  0.8193979933110367
Train acc:  0.9352941176470588
Test acc:  0.7777777777777778
Train f1m:  0.929431299294313
Test f1m:  0.7662337662337663
Train acc:  0.9294117647058824
Test acc:  0.83333333333

# (E9) With features

In [63]:
import copy

In [65]:
# One extra feature vector per graph: builtin sum() adds up the entries of
# node_features, then the result is converted to a plain list.
# NOTE(review): presumably node_features is a 2-D one-hot tag tensor, making
# this a per-tag node count — confirm against load_data.
features = [
    list(sum(graph.node_features).numpy())
    for graph in mutag_graphs[0]
]

In [67]:
# Work on a deep copy so X itself stays untouched, then append each graph's
# extra features to its copied profile row.
Xf = copy.deepcopy(X)
for row_idx, row in enumerate(Xf):
    row.extend(features[row_idx])

In [78]:
%%time
# 10-fold evaluation of an RBF SVC (C=10) on Xf (tree profile plus the extra
# per-graph features). Collects micro-F1 per fold and reports mean and std.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/MUTAG/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/MUTAG/10fold_idx/train_idx-{}.txt".format(fold)
    # Files are only needed to read the indices; close them before training.
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        # One integer index per line; blank lines are skipped.
        train_idx = [int(line) for line in train_f if line.strip()]
        test_idx = [int(line) for line in test_f if line.strip()]

    X_train = [Xf[j] for j in train_idx]
    X_test = [Xf[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]

    # Row-wise unit-norm scaling; fitted on the training fold only.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # `degree` is ignored by scikit-learn for the rbf kernel.
    clf = svm.SVC(C=10.0, kernel='rbf', degree=2, gamma='scale',
                  coef0=0.0, shrinking=True, probability=False, tol=0.001,
                  cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                  decision_function_shape='ovr', random_state=None)
    clf.fit(X_train, y_train)

    test_avg.append(f1_score(y_true=y_test, y_pred=clf.predict(X_test), average='micro'))

print("Final avg: ", np.mean(test_avg), np.std(test_avg))

Final avg:  0.8833333333333334 0.0840708108356753
CPU times: user 21 ms, sys: 3.28 ms, total: 24.3 ms
Wall time: 23.3 ms
