In [12]:
import numpy as np
import pickle as pkl
import sys
sys.path.append("../..")

from utils import load_data, load_precompute, save_precompute
from sklearn.model_selection import train_test_split as tts, GridSearchCV, KFold
from sklearn.metrics import f1_score as f1, accuracy_score as acc
from sklearn.preprocessing import StandardScaler
from homomorphism import get_hom_profile

# Optional progress bar: fall back to a pass-through when tqdm is not installed.
# Only ImportError is caught so that Ctrl-C and genuine errors are not silenced.
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

In [2]:
from sklearn.svm import SVC

# Load PTC - (SOTA: ~0.646 ± 0.07)

In [3]:
# Load the PTC dataset via the project helper; the result is unpacked into the
# graph collection `ptc` (iterated in later cells) and `nclass`.
# NOTE(review): the meaning of the second positional argument (False) is
# defined in utils.load_data and is not visible here — confirm.
ptc, nclass = load_data("PTC", False)

loading data
# classes: 2
# maximum node tag: 19
# data: 344


In [4]:
# Per-graph targets as a NumPy vector, plus each graph's node-feature matrix
# converted to a NumPy array.
y = np.asarray([graph.label for graph in ptc])
node_features = [graph.node_features.numpy() for graph in ptc]

In [5]:
# Build the homomorphism-profile feature matrix X (one row per graph).
hom_type = "tree"
hom_size = 6
try:
    # Reuse a previously stored profile when one can be loaded.
    X = load_precompute("PTC", hom_type, hom_size)
except Exception:
    # Loading failed, so recompute the profile for every graph.
    # `Exception` (not a bare except) keeps Ctrl-C / SystemExit able to stop
    # this potentially long loop instead of silently triggering a recompute.
    profile_func = get_hom_profile(hom_type)
    X = np.array(
        [profile_func(d.g, size=hom_size) for d in tqdm(ptc)],
        dtype=float,
    )
    # NOTE(review): save_precompute is imported but never called, so the
    # result is recomputed on every fresh run — consider caching it here.

In [6]:
# Augment the homomorphism profile with each graph's node features summed
# over axis 0, appended as extra columns.
node_feature_sums = np.array([graph.node_features.sum(0).numpy() for graph in ptc])
Xf = np.concatenate((X, node_feature_sums), axis=1)

In [7]:
# Shape of the combined matrix (homomorphism profile + summed node features).
Xf.shape

(344, 32)

In [8]:
# Shape of the homomorphism-profile matrix alone.
X.shape

(344, 13)

In [9]:
# Shape of the label vector; its length should match the row count of X.
y.shape

(344,)

# SVC with a polynomial kernel

In [11]:
%%time
# Repeated hold-out evaluation: 10 random 90/10 splits of (X, y), each scored
# with a degree-3 polynomial-kernel SVC; reports mean and std of test accuracy.
accuracies = []
for i in tqdm(range(10)):
    # Seed each split with the repeat index: every repeat gets a different
    # split, yet re-running the cell reproduces the same numbers.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1, random_state=i)
    # Fit the scaler on the training part only so no test statistics leak in.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    clf = SVC(C=100.0, kernel='poly', degree=3, gamma=1.0)
    clf.fit(X_train, y_train)
    accuracies.append(acc(y_pred=clf.predict(X_test), y_true=y_test))
# "+/-" replaces "+\-", which is an invalid escape sequence in a normal string.
print("{:.4f} +/- {:.4f}".format(np.mean(accuracies), np.std(accuracies)))

100%|██████████| 10/10 [06:08<00:00, 36.82s/it]

0.6229 +\- 0.0775
CPU times: user 6min 8s, sys: 18.2 ms, total: 6min 8s
Wall time: 6min 8s





In [None]:
%%time
# Re-run of the hold-out evaluation from the previous cell: 10 random 90/10
# splits of (X, y), each scored with a degree-3 polynomial-kernel SVC.
accuracies = []
for i in tqdm(range(10)):
    # Seed each split with the repeat index so the run is reproducible while
    # every repeat still uses a different split.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1, random_state=i)
    # Fit the scaler on the training part only so no test statistics leak in.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    clf = SVC(C=100.0, kernel='poly', degree=3, gamma=1.0)
    clf.fit(X_train, y_train)
    accuracies.append(acc(y_pred=clf.predict(X_test), y_true=y_test))
# "+/-" replaces "+\-", which is an invalid escape sequence in a normal string.
print("{:.4f} +/- {:.4f}".format(np.mean(accuracies), np.std(accuracies)))

 60%|██████    | 6/10 [03:19<02:03, 30.78s/it]

## KFold

In [15]:
%%time 
# 10-fold cross-validation of a degree-2 polynomial-kernel SVC on X;
# reports mean and std of the per-fold test accuracy.
accuracies = []
# Fixed seed so the shuffled fold assignment is reproducible across runs.
kf = KFold(n_splits=10, random_state=0, shuffle=True)
# Materialise the folds: kf.split() is a generator without a length, so tqdm
# could only show a bare counter; a list gives it a total to work with.
for train_idx, test_idx in tqdm(list(kf.split(X))):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # Fit the scaler on the training fold only so no test statistics leak in.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    clf = SVC(C=100.0, kernel='poly', degree=2, gamma=1.0)
    clf.fit(X_train, y_train)
    accuracies.append(acc(y_pred=clf.predict(X_test), y_true=y_test))
# "+/-" replaces "+\-", which is an invalid escape sequence in a normal string.
print("{:.4f} +/- {:.4f}".format(np.mean(accuracies), np.std(accuracies)))

10it [00:49,  5.00s/it]

0.5637 +\- 0.0668
CPU times: user 50 s, sys: 6.66 ms, total: 50 s
Wall time: 50 s





In [21]:
%%time 
# 10-fold cross-validation of a degree-3 polynomial-kernel SVC on X;
# reports mean and std of the per-fold test accuracy.
accuracies = []
# Fixed seed so the shuffled fold assignment is reproducible across runs.
kf = KFold(n_splits=10, random_state=0, shuffle=True)
# Materialise the folds: kf.split() is a generator without a length, so tqdm
# could only show a bare counter; a list gives it a total to work with.
for train_idx, test_idx in tqdm(list(kf.split(X))):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # Fit the scaler on the training fold only so no test statistics leak in.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    clf = SVC(C=100.0, kernel='poly', degree=3, gamma=1.0)
    clf.fit(X_train, y_train)
    accuracies.append(acc(y_pred=clf.predict(X_test), y_true=y_test))
# "+/-" replaces "+\-", which is an invalid escape sequence in a normal string.
print("{:.4f} +/- {:.4f}".format(np.mean(accuracies), np.std(accuracies)))

10it [04:58, 29.82s/it]

0.5522 +\- 0.0520
CPU times: user 4min 57s, sys: 16.5 ms, total: 4min 57s
Wall time: 4min 58s



