In [1]:
import sys
import os
sys.path.append('..')
from utils import load_data, nx2gt
from graph_tool.all import motifs, Graph
import networkx as nx
import numpy as np
from random import shuffle

# Load PTC

In [2]:
# Load the PTC dataset through the project's load_data helper.
# Later cells index the result as ptc_graphs[0] and iterate over it as the
# collection of graph samples (each exposing .g, .label and .node_features).
ptc_graphs = load_data("PTC", degree_as_tag=False)

loading data
# classes: 2
# maximum node tag: 19
# data: 344


In [19]:
# Pick sample 6 of the loaded graphs and convert its .g attribute with nx2gt
# (presumably networkx -> graph-tool; the result is used as a graph-tool Graph below).
test_g = nx2gt(ptc_graphs[0][6].g)
# Workaround to allow parallel motifs
# Copy the vertex index into an int-valued property map, shuffle its underlying
# array in place, and rebuild the graph with that map as the vertex order.
idx = test_g.vertex_index.copy("int")
# random.shuffle with no seed set in this notebook: the order differs between runs.
shuffle(idx.a)
test_g = Graph(test_g, vorder=idx)

In [20]:
# Display the re-ordered test graph (its repr shows directedness and vertex/edge counts).
test_g

<Graph object, undirected, with 29 vertices and 31 edges at 0x7fa9b2613e48>

In [13]:
# Every non-isomorphic tree on 5 vertices, converted with nx2gt for use as a motif list.
trees_5 = list(map(nx2gt, nx.generators.nonisomorphic_trees(5)))

In [14]:
%%time
# Count occurrences of the 5-vertex trees in test_g, restricted to motif_list.
# The displayed result is a pair: (list of motif graphs, list of their counts).
motifs(g=test_g, k=5, p=1.0, motif_list=trees_5)

CPU times: user 1.16 ms, sys: 0 ns, total: 1.16 ms
Wall time: 1.17 ms


([<Graph object, undirected, with 5 vertices and 4 edges at 0x7fa9b2613d68>,
  <Graph object, undirected, with 5 vertices and 4 edges at 0x7fa9b27dea90>,
  <Graph object, undirected, with 5 vertices and 4 edges at 0x7fa9b27de860>],
 [1, 77, 108])

In [15]:
# Every non-isomorphic tree on 7 vertices, converted with nx2gt for use as a motif list.
trees_7 = list(map(nx2gt, nx.generators.nonisomorphic_trees(7)))

In [16]:
%%time
# Count occurrences of the 7-vertex trees in test_g, restricted to motif_list.
# The displayed result is a pair: (list of motif graphs, list of their counts).
motifs(g=test_g, k=7, p=1.0, motif_list=trees_7)

CPU times: user 6.07 ms, sys: 133 µs, total: 6.21 ms
Wall time: 5.74 ms


([<Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2622470>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606cf8>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606710>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606240>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606b00>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606358>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606518>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b260e048>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b26060b8>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b2606908>,
  <Graph object, undirected, with 7 vertices and 6 edges at 0x7fa9b260e470>],
 [0, 0, 1, 4, 0, 30, 114, 123, 264, 52, 140])

# Build homomorphism profile (tree motif counts for k = 2..7)

In [21]:
# Build the converted non-isomorphic tree families for orders 2 through 7 in
# one pass; the generator yields exactly six lists, unpacked into the
# per-order names that other cells refer to.
trees_2, trees_3, trees_4, trees_5, trees_6, trees_7 = (
    list(map(nx2gt, nx.generators.nonisomorphic_trees(order)))
    for order in range(2, 8)
)

# Position p in all_trees holds the trees on p + 2 vertices.
all_trees = [trees_2, trees_3, trees_4, trees_5, trees_6, trees_7]

In [91]:
%%time
# Build one feature row per graph: the motif counts of every tree family in
# all_trees, concatenated in order of increasing tree size. Labels go to y.
X, y = [], []
for sample in ptc_graphs[0]:
    converted = nx2gt(sample.g)
    counts = []
    # all_trees starts at 2-vertex trees, so the motif size k is enumerated from 2.
    for size, family in enumerate(all_trees, start=2):
        family_counts = motifs(g=converted, k=size, p=1.0, motif_list=family)[1]
        counts.extend(family_counts)
    X.append(counts)
    y.append(sample.label)

CPU times: user 2.64 s, sys: 3.34 ms, total: 2.64 s
Wall time: 2.69 s


# (E1) GIN Splits with SVM

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [26]:
%%time
# (E1) Cross-validate a degree-6 polynomial SVM on the motif-count rows in X,
# using the fold index files under ../data/PTC/10fold_idx, and report the
# macro-averaged F1 on the train and test split of every fold.
test_avg = []
for fold in range(1, 11):
    # Each index file is parsed as one integer row index (into X / y) per line.
    with open("../data/PTC/10fold_idx/test_idx-{}.txt".format(fold), "r") as test_f,\
         open("../data/PTC/10fold_idx/train_idx-{}.txt".format(fold), "r") as train_f:
        train_idx = [int(line) for line in train_f]
        test_idx = [int(line) for line in test_f]
    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]
    # Normalizer rescales each sample (row) on its own, so fitting on the
    # training split only keeps the usual fit/transform shape of the pipeline.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)
    clf = svm.SVC(C=100.0, kernel='poly', degree=6, gamma='scale',
              coef0=0.0, shrinking=True, probability=False, tol=0.001,
              cache_size=200, class_weight=None, verbose=False, max_iter=-1,
              decision_function_shape='ovr', random_state=None)
    clf.fit(X_train, y_train)
    # Score each split once and reuse the value for both printing and averaging.
    train_f1 = f1_score(y_pred=clf.predict(X_train), y_true=y_train, average='macro')
    test_f1 = f1_score(y_pred=clf.predict(X_test), y_true=y_test, average='macro')
    print("Train: ", train_f1)
    print("Test: ", test_f1)
    test_avg.append(test_f1)
# Average over however many folds were actually evaluated.
print("Final avg: ", sum(test_avg)/len(test_avg))

Train:  0.8325548883169701
Test:  0.4631578947368421
Train:  0.8440643863179076
Test:  0.6621499548328816
Train:  0.8448200573004867
Test:  0.6739319965126417
Train:  0.8539552015075592
Test:  0.6146469049694856
Train:  0.8438876998615134
Test:  0.6146469049694856
Train:  0.8459488892883198
Test:  0.6091954022988506
Train:  0.8316132536664856
Test:  0.5882352941176471
Train:  0.8187616263619453
Test:  0.6739319965126417
Train:  0.8600657155754312
Test:  0.5490716180371353
Train:  0.8243736623442024
Test:  0.6693191865605659
Final avg:  0.6118287153548178
CPU times: user 817 ms, sys: 2.77 ms, total: 820 ms
Wall time: 813 ms


In [37]:
%%time
# (E1) Same degree-6 polynomial SVM evaluation over the fold index files, but
# reporting the micro-averaged F1 on the train and test split of every fold.
test_avg = []
for fold in range(1, 11):
    # Each index file is parsed as one integer row index (into X / y) per line.
    with open("../data/PTC/10fold_idx/test_idx-{}.txt".format(fold), "r") as test_f,\
         open("../data/PTC/10fold_idx/train_idx-{}.txt".format(fold), "r") as train_f:
        train_idx = [int(line) for line in train_f]
        test_idx = [int(line) for line in test_f]
    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]
    # Normalizer rescales each sample (row) on its own, so fitting on the
    # training split only keeps the usual fit/transform shape of the pipeline.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)
    clf = svm.SVC(C=100.0, kernel='poly', degree=6, gamma='scale',
              coef0=0.0, shrinking=True, probability=False, tol=0.001,
              cache_size=200, class_weight=None, verbose=False, max_iter=-1,
              decision_function_shape='ovr', random_state=None)
    clf.fit(X_train, y_train)
    # Score each split once and reuse the value for both printing and averaging.
    train_f1 = f1_score(y_pred=clf.predict(X_train), y_true=y_train, average='micro')
    test_f1 = f1_score(y_pred=clf.predict(X_test), y_true=y_test, average='micro')
    print("Train: ", train_f1)
    print("Test: ", test_f1)
    test_avg.append(test_f1)
# Average over however many folds were actually evaluated.
print("Final avg: ", sum(test_avg)/len(test_avg))

Train:  0.8354838709677419
Test:  0.47058823529411764
Train:  0.8451612903225807
Test:  0.6764705882352942
Train:  0.8483870967741935
Test:  0.6764705882352942
Train:  0.8580645161290322
Test:  0.6176470588235294
Train:  0.8451612903225807
Test:  0.6176470588235294
Train:  0.8483870967741935
Test:  0.6176470588235294
Train:  0.8354838709677419
Test:  0.5882352941176471
Train:  0.8225806451612904
Test:  0.6764705882352942
Train:  0.8612903225806452
Test:  0.5588235294117647
Train:  0.8258064516129032
Test:  0.6764705882352942
Final avg:  0.6176470588235294
CPU times: user 822 ms, sys: 46 µs, total: 822 ms
Wall time: 816 ms


# (E2) GIN split with neural net

In [27]:
from sklearn.neural_network import MLPClassifier

In [46]:
%%time
# Report accuracies
# (E2) Cross-validate a two-hidden-layer MLP (128, 64) on the motif-count rows
# in X over the fold index files. Accuracy and macro-F1 are printed per fold;
# the final average is over test accuracy.
# NOTE: random_state=None, so the printed numbers vary from run to run.
test_avg = []
for fold in range(1, 11):
    # Each index file is parsed as one integer row index (into X / y) per line.
    with open("../data/PTC/10fold_idx/test_idx-{}.txt".format(fold), "r") as test_f,\
         open("../data/PTC/10fold_idx/train_idx-{}.txt".format(fold), "r") as train_f:
        train_idx = [int(line) for line in train_f]
        test_idx = [int(line) for line in test_f]
    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]
    # Normalizer rescales each sample (row) on its own, so fitting on the
    # training split only keeps the usual fit/transform shape of the pipeline.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)
    clf = MLPClassifier(hidden_layer_sizes=(128,64), activation='relu',
                        solver='adam', alpha=0.001, batch_size=16,
                        learning_rate='adaptive', learning_rate_init=0.01,
                        power_t=0.5, max_iter=500, shuffle=True,
                        random_state=None, tol=0.0001, verbose=False,
                        warm_start=False, momentum=0.9,
                        nesterovs_momentum=True, early_stopping=False,
                        validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                        epsilon=1e-08, n_iter_no_change=10)
    clf.fit(X_train, y_train)
    # Predict each split once; every metric below reuses these predictions.
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    test_acc = accuracy_score(y_pred=pred_test, y_true=y_test)
    print("Train acc: ", accuracy_score(y_pred=pred_train, y_true=y_train))
    print("Test acc: ", test_acc)
    print("Train f1m: ", f1_score(y_pred=pred_train, y_true=y_train, average='macro'))
    print("Test f1m: ", f1_score(y_pred=pred_test, y_true=y_test, average='macro'))
    test_avg.append(test_acc)
# Average over however many folds were actually evaluated.
print("Final avg: ", sum(test_avg)/len(test_avg))

Train acc:  0.632258064516129
Test acc:  0.5882352941176471
Train f1m:  0.5674418604651162
Test f1m:  0.5041666666666667
Train acc:  0.6645161290322581
Test acc:  0.7058823529411765
Train f1m:  0.6305463879721306
Test f1m:  0.6458333333333334
Train acc:  0.6516129032258065
Test acc:  0.6764705882352942
Train f1m:  0.6054864253393666
Test f1m:  0.6693191865605659
Train acc:  0.6774193548387096
Test acc:  0.5
Train f1m:  0.6518575086474103
Test f1m:  0.48894783377541995
Train acc:  0.6741935483870968
Test acc:  0.6176470588235294
Train f1m:  0.6388321740434417
Test f1m:  0.5252416756176155
Train acc:  0.632258064516129
Test acc:  0.5294117647058824
Train f1m:  0.5674418604651164
Test f1m:  0.39555555555555555
Train acc:  0.6741935483870968
Test acc:  0.5294117647058824
Train f1m:  0.6515652299713997
Test f1m:  0.5277777777777778
Train acc:  0.6483870967741936
Test acc:  0.6470588235294118
Train f1m:  0.6378000021438295
Test f1m:  0.6458333333333333
Train acc:  0.6709677419354839
Test acc

# (E3) Ensemble model (AdaBoost)

In [48]:
from sklearn.ensemble import AdaBoostClassifier

In [63]:
%%time
# Report accuracies
# (E3) Cross-validate an AdaBoost ensemble (SAMME, 50 estimators) of RBF SVMs
# on the motif-count rows in X over the fold index files. Accuracy and
# macro-F1 are printed per fold; the final average is over test accuracy.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; the keyword is kept as-is for the version this notebook ran on.
test_avg = []
for fold in range(1, 11):
    # Each index file is parsed as one integer row index (into X / y) per line.
    with open("../data/PTC/10fold_idx/test_idx-{}.txt".format(fold), "r") as test_f,\
         open("../data/PTC/10fold_idx/train_idx-{}.txt".format(fold), "r") as train_f:
        train_idx = [int(line) for line in train_f]
        test_idx = [int(line) for line in test_f]
    X_train = [X[j] for j in train_idx]
    X_test = [X[j] for j in test_idx]
    y_train = [y[j] for j in train_idx]
    y_test = [y[j] for j in test_idx]
    # Normalizer rescales each sample (row) on its own, so fitting on the
    # training split only keeps the usual fit/transform shape of the pipeline.
    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)
    clf = AdaBoostClassifier(base_estimator=svm.SVC(C=1000.0, kernel='rbf', degree=3, gamma=4.0,
              coef0=0.0, shrinking=True, probability=False, tol=0.001,
              cache_size=200, class_weight=None, verbose=False, max_iter=-1,
              decision_function_shape='ovr', random_state=None), n_estimators=50,
                             learning_rate=1.0, algorithm='SAMME', random_state=None)
    clf.fit(X_train, y_train)
    # Predict each split once; every metric below reuses these predictions.
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    test_acc = accuracy_score(y_pred=pred_test, y_true=y_test)
    print("Train acc: ", accuracy_score(y_pred=pred_train, y_true=y_train))
    print("Test acc: ", test_acc)
    print("Train f1m: ", f1_score(y_pred=pred_train, y_true=y_train, average='macro'))
    print("Test f1m: ", f1_score(y_pred=pred_test, y_true=y_test, average='macro'))
    test_avg.append(test_acc)
# Average over however many folds were actually evaluated.
print("Final avg: ", sum(test_avg)/len(test_avg))

Train acc:  0.6903225806451613
Test acc:  0.6470588235294118
Train f1m:  0.6726648774693124
Test f1m:  0.6263736263736264
Train acc:  0.7129032258064516
Test acc:  0.7352941176470589
Train f1m:  0.6883577503925178
Test f1m:  0.6713211600429645
Train acc:  0.7387096774193549
Test acc:  0.5882352941176471
Train f1m:  0.7259062776304156
Test f1m:  0.5882352941176471
Train acc:  0.7387096774193549
Test acc:  0.5
Train f1m:  0.7205622141354788
Test f1m:  0.48894783377541995
Train acc:  0.7516129032258064
Test acc:  0.6470588235294118
Train f1m:  0.744703151905368
Test f1m:  0.6136363636363635
Train acc:  0.7516129032258064
Test acc:  0.6470588235294118
Train f1m:  0.7394417700931111
Test f1m:  0.5968379446640317
Train acc:  0.7258064516129032
Test acc:  0.5294117647058824
Train f1m:  0.7131972789115646
Test f1m:  0.5294117647058824
Train acc:  0.7161290322580646
Test acc:  0.6176470588235294
Train f1m:  0.6958751393534002
Test f1m:  0.5888372093023256
Train acc:  0.7548387096774194
Test acc

# (E4) SVM on motif counts plus node features

In [93]:
import copy

In [97]:
from sklearn.preprocessing import StandardScaler

In [92]:
# One feature list per graph: the builtin sum over the graph's node_features,
# converted via .numpy() into a plain list.
# Presumably node_features is a per-node tensor, so this aggregates over nodes - TODO confirm.
features = list(
    map(lambda sample: list(sum(sample.node_features).numpy()), ptc_graphs[0])
)

In [94]:
# Independent copy of the motif-count rows, so that extending the rows of Xf
# leaves X untouched. X is a list of flat lists of counts, so copying each
# row gives the same result as a deep copy.
Xf = [list(row) for row in X]

In [95]:
# Append each graph's node-feature vector to its motif-count row.
# Xf is rebuilt from X and features rather than extended in place, so
# re-running this cell cannot append the features a second time.
Xf = [list(X[i]) + list(features[i]) for i in range(len(X))]

In [111]:
%%time
# Degree-4 polynomial SVM on the combined rows in Xf, evaluated over the fold
# index files; prints mean and standard deviation of the per-fold micro-F1.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/PTC/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/PTC/10fold_idx/train_idx-{}.txt".format(fold)
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        train_idx = list(map(int, train_f.readlines()))
        test_idx = list(map(int, test_f.readlines()))

    # Slice rows and labels for this fold.
    X_train = [Xf[row] for row in train_idx]
    X_test = [Xf[row] for row in test_idx]
    y_train = [y[row] for row in train_idx]
    y_test = [y[row] for row in test_idx]

    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    clf = svm.SVC(
        C=100.0,
        kernel='poly',
        degree=4,
        gamma='scale',
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        random_state=None,
    )
    clf.fit(X_train, y_train)

    # Only the test score is kept; per-fold values remain available in test_avg.
    fold_score = f1_score(y_pred=clf.predict(X_test), y_true=y_test, average='micro')
    test_avg.append(fold_score)
print("Final avg: ", np.mean(test_avg), np.std(test_avg))

Final avg:  0.65 0.0713803594088918
CPU times: user 133 ms, sys: 0 ns, total: 133 ms
Wall time: 132 ms


In [130]:
%%time
# RBF SVM on the combined rows in Xf, evaluated over the fold index files;
# prints mean and standard deviation of the per-fold micro-F1.
test_avg = []
for fold in range(1, 11):
    test_path = "../data/PTC/10fold_idx/test_idx-{}.txt".format(fold)
    train_path = "../data/PTC/10fold_idx/train_idx-{}.txt".format(fold)
    with open(test_path, "r") as test_f, open(train_path, "r") as train_f:
        train_idx = list(map(int, train_f.readlines()))
        test_idx = list(map(int, test_f.readlines()))

    # Slice rows and labels for this fold.
    X_train = [Xf[row] for row in train_idx]
    X_test = [Xf[row] for row in test_idx]
    y_train = [y[row] for row in train_idx]
    y_test = [y[row] for row in test_idx]

    normalizer = Normalizer().fit(X_train)
    X_train = normalizer.transform(X_train)
    X_test = normalizer.transform(X_test)

    # degree is passed through unchanged even though the kernel here is 'rbf'.
    clf = svm.SVC(
        C=400.0,
        kernel='rbf',
        degree=4,
        gamma=1.0,
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        random_state=None,
    )
    clf.fit(X_train, y_train)

    # Only the test score is kept; per-fold values remain available in test_avg.
    fold_score = f1_score(y_pred=clf.predict(X_test), y_true=y_test, average='micro')
    test_avg.append(fold_score)
print("Final avg: ", np.mean(test_avg), np.std(test_avg))

Final avg:  0.6352941176470589 0.06197443384031026
CPU times: user 115 ms, sys: 32 µs, total: 115 ms
Wall time: 114 ms
