In [1]:
from torch_geometric.datasets import Planetoid

citeseer_dataset = Planetoid(root = "./data", name = "CiteSeer")
cora_dataset = Planetoid(root = "./data", name = "Cora")
pubmed_dataset = Planetoid(root = "./data", name = "PubMed")

In [2]:
import torch_geometric.transforms as T


def pre_process(dataset):
    dataset.transform = T.NormalizeFeatures()
    return dataset

In [3]:
CORA = "Cora"
PUBMED = "PubMed"
CITESEER = "Citeseer"

name_to_dataset = dict({})
name_to_dataset[CORA] = pre_process(cora_dataset)
name_to_dataset[PUBMED] = pre_process(pubmed_dataset)
name_to_dataset[CITESEER] = pre_process(citeseer_dataset)

In [4]:
name_to_sets = dict({})

In [5]:
from torch_geometric.utils import add_self_loops

def add_set(set_name):
    global name_to_sets, name_to_dataset
    name_to_sets[set_name] = dict({})
    
    dataset = name_to_dataset[set_name]
    X =  dataset[0].x 
    y =  dataset[0].y 
    
    test =  dataset[0].test_mask
    train = dataset[0].train_mask 
    val =  dataset[0].val_mask
    
    edge_index = add_self_loops(dataset[0].edge_index)[0]

    name_to_sets[set_name]["X"] = X
    name_to_sets[set_name]["y"] = y
    name_to_sets[set_name]["test"] = test
    name_to_sets[set_name]["train"] = train
    name_to_sets[set_name]["val"] = val
    name_to_sets[set_name]["edge_index"] = edge_index

In [6]:
def create_sets():
    for set_name in name_to_dataset.keys():
        add_set(set_name)
create_sets()

In [7]:
name_to_dataset[CITESEER].train_mask.sum()

tensor(120)

In [9]:
from hyperopt import fmin, tpe, hp,STATUS_OK, SparkTrials, space_eval 
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from EnsembleFramework import Framework
from torch.nn.functional import normalize

class Data():
    def __init__(self, X, y, edge_index):
        self.X = X
        self.y = y
        self.edge_index = edge_index
        self.train = None
        self.val = None
        self.test = None
        self.X_train = None
        self.X_val = None
        self.X_test = None
    
    def set_train(self, train):
        self.train = train

    def set_test(self, test):
        self.test = test

    def set_val(self, val):
        self.val = val

    def set_X_train(self, X):
        self.X_train = X

    def set_X_val(self, X):
        self.X_val = X

    def set_X_test(self, X):
        self.X_test = X

class SparkTune():
    def __init__(self, data,clf, evals = 10):
        self.evals = evals
        self.data = data
        self.clf = clf
        
    def objective(self, params):
        model = self.clf(**params)
        model.fit(self.data.X_train, self.data.y[self.data.train])
        y_pred = model.predict(self.data.X_val)
        score = accuracy_score(self.data.y[self.data.val], y_pred)
        return {'loss': -score, 'status': STATUS_OK}
    
    def search(self, space):
        spark_trials = SparkTrials()
        best_params = fmin(self.objective, space, algo=tpe.suggest, max_evals=self.evals, trials=spark_trials)
        return best_params
    

def get_data(set_name):
    dataset = name_to_sets[set_name]
    data = Data(dataset["X"], dataset["y"], dataset["edge_index"])
    data.set_test(dataset["test"])
    data.set_val(dataset["val"])
    data.set_train(dataset["train"])
    return data
    
def search_hop_clf_attention_config(set_name,evals, hop, clf, user_function, attention_config, space):
    data:Data = get_data(set_name)
    framework = Framework([user_function], 
                     hops_list=[hop],
                     clfs=[],
                     gpu_idx=0,
                     handle_nan=0.0,
                    attention_configs=[attention_config])
    data.set_X_train(framework.get_features(data.X, data.edge_index, data.train)[0].cpu())
    data.set_X_val(framework.get_features(data.X, data.edge_index, data.val)[0].cpu())
    data.set_X_test(framework.get_features(data.X, data.edge_index, data.test)[0].cpu())
    
    sparkTune = SparkTune(data, clf, evals = evals)
    params = sparkTune.search(space)
    
    params = space_eval(space, params)

    model = clf(**params)
    kwargs={"eval_set":[(data.X_val, data.y[data.val])], "early_stopping_rounds":5} if model.__class__.__name__ == 'XGBClassifier' else {}
    
    model.fit(data.X_train,data.y[data.train],**kwargs)
    train_pred = model.predict(data.X_train)
    val_pred = model.predict(data.X_val)
    test_pred = model.predict(data.X_test)
    
    train_acc = accuracy_score(data.y[data.train], train_pred)
    val_acc = accuracy_score(data.y[data.val], val_pred)
    test_acc = accuracy_score(data.y[data.test], test_pred)
    search_dict = dict({})
    search_dict["train_acc"] = train_acc
    search_dict["val_acc"] = val_acc
    search_dict["test_acc"] = test_acc
    search_dict["model"] = model
    search_dict["user_function"] = user_function
    return search_dict
    
def search(set_name,evals, clfs, hops, user_functions, clfs_space, attention_configs):
    store = dict({})
    for clf in tqdm(clfs):
        clf_name = clf().__class__.__name__
        space = clfs_space[clf_name]
        store[clf_name] = dict({})
        for hop in tqdm(hops):
            best_search_dict = None
            best_val = float("-inf")
            for attention_config in tqdm(attention_configs):
                for user_function in user_functions:
                    print(user_function)
                    search_dict = search_hop_clf_attention_config(set_name,evals, hop, clf, user_function, attention_config, space)
                    if search_dict["val_acc"] >= best_val:
                        best_val = search_dict["val_acc"]
                        best_search_dict = search_dict
                        best_search_dict["attention_config"] = attention_config
            store[clf_name][hop] = best_search_dict
    return store

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from AutoTune2 import AutoSearch
from hyperopt import fmin, tpe, hp,STATUS_OK, SparkTrials, space_eval 

penalty = ["l1","l2", None, "elasticnet"]
penalty = [None]
max_iter = [2**i for i in range(6, 15)]

lr_choices = {
    'penalty': penalty,
    'max_iter': max_iter,
    
}

space_lr = {
    **{key: hp.choice(key, value) for key, value in lr_choices.items()},
    'tol': hp.loguniform('tol', -11, -3),
    'C': hp.uniform('C', 0.0, 20),
    # 'l1_ratio': hp.uniform('l1_ratio', 0.0, 1.0)
}


def norm_user_function(kwargs):
    return  normalize(kwargs["original_features"] + kwargs["summed_neighbors"], p=2.0, dim = 1)
    
def user_function(kwargs):
    return  kwargs["original_features"] + kwargs["summed_neighbors"]
    
clfs = [LogisticRegression]
hops = [0,3,8]
clfs_space = dict({})
clfs_space["LogisticRegression"] = space_lr
attention_configs = [None,{'inter_layer_normalize': False,
                     'use_pseudo_attention':True,
                     'cosine_eps':.01,
                     'dropout_attn': None}, 
                     {'inter_layer_normalize': True,
                     'use_pseudo_attention':True,
                     'cosine_eps':.01,
                     'dropout_attn': None},
                     {'inter_layer_normalize': True,
                     'use_pseudo_attention':True,
                     'cosine_eps':.001,
                     'dropout_attn': None}]
user_functions = [norm_user_function, user_function]

searcher = AutoSearch(name_to_sets[PUBMED], max_evals=300, pred_metric = accuracy_score)
store = searcher.search(clfs, clfs_space, hops=[0,3,8])
# pubmed_store = search(CORA,clfs, hops, user_functions, clfs_space, attention_configs)

trial task 240 failed, exception is [Errno 28] No space left on device.
 None
trial task 241 failed, exception is [Errno 28] No space left on device.
 None
trial task 242 failed, exception is [Errno 28] No space left on device.
 None


In [None]:
store

In [12]:
from EnsembleFramework import Framework

hops_list=[3]
name_to_model = dict({})
def fit_dataset(set_name,user_functions=user_functions, hops_list= hops_list, clfs = clfs, attention_configs= attention_configs):
    dataset = name_to_sets[set_name]
    y = dataset["y"]
    
    framework = Framework(user_functions, 
                     hops_list=hops_list, ## to obtain best for local neighborhood
                     clfs=clfs,
                     gpu_idx=0,
                     handle_nan=0.0,
                    attention_configs=attention_configs)
    vals = framework.get_features(dataset["X"], dataset["edge_index"], dataset["val"])
    vals = [val.cpu() for val in vals]
    print([clf.__class__.__name__ for clf in clfs])
    kwargs_list=[{"eval_set":[(vals[i], y[dataset["val"]])], "early_stopping_rounds":5} if clf.__class__.__name__ == 'XGBClassifier' else {} for i, clf in enumerate(clfs)]
    print(kwargs_list)
    framework.fit(dataset["X"], dataset["edge_index"], y, dataset["train"], kwargs_list)
    name_to_model[set_name] = framework
    return framework

In [13]:
from sklearn.metrics import accuracy_score

def predict_dataset(set_name,framework):
    dataset = name_to_sets[set_name]
    
    y = dataset["y"]
    framework = name_to_model[set_name]
    pred = framework.predict(dataset["X"], dataset["edge_index"], dataset["test"]) 
    pred_val = framework.predict(dataset["X"], dataset["edge_index"], dataset["val"]) 
    y_test = y[dataset["test"]]
    y_val = y[dataset["val"]]
    print(set_name)
    print(accuracy_score(y_val, pred_val))
    print(accuracy_score(y_test, pred))

    

In [28]:
store = store
user_functions = [store["LogisticRegression"][0]["user_function"], store["LogisticRegression"][3]["user_function"], store["LogisticRegression"][8]["user_function"]]
clfs = [store["LogisticRegression"][0]["model"], store["LogisticRegression"][3]["model"], store["LogisticRegression"][8]["model"]]
attention_configs = [store["LogisticRegression"][0]["attention_config"], store["LogisticRegression"][3]["attention_config"], store["LogisticRegression"][8]["attention_config"]]

# user_functions = [store["LogisticRegression"][3]["user_function"]]
# clfs = [store["LogisticRegression"][3]["model"]]
# attention_configs = [store["LogisticRegression"][3]["attention_config"]]
framework = fit_dataset(PUBMED, user_functions=user_functions, hops_list= [3, 8], clfs = clfs, attention_configs= attention_configs)
predict_dataset(PUBMED, framework)

['LogisticRegression', 'LogisticRegression', 'LogisticRegression']
[{}, {}, {}]




PubMed
0.822
0.79


In [479]:
citeseer_store

{'LogisticRegression': {0: {'train_acc': 1.0,
   'val_acc': 0.594,
   'test_acc': 0.615,
   'model': LogisticRegression(C=3.9057765563512103, l1_ratio=0.3841472539205063,
                      max_iter=512, tol=0.00037394547447174774),
   'user_function': <function __main__.user_function(kwargs)>,
   'attention_config': {'inter_layer_normalize': True,
    'use_pseudo_attention': True,
    'cosine_eps': 0.01,
    'dropout_attn': None}},
  3: {'train_acc': 0.95,
   'val_acc': 0.716,
   'test_acc': 0.721,
   'model': LogisticRegression(C=3.161729301367482, l1_ratio=0.41400664400322995,
                      max_iter=128, tol=0.0022154364103027916),
   'user_function': <function __main__.user_function(kwargs)>,
   'attention_config': {'inter_layer_normalize': False,
    'use_pseudo_attention': True,
    'cosine_eps': 0.01,
    'dropout_attn': None}},
  8: {'train_acc': 0.9583333333333334,
   'val_acc': 0.738,
   'test_acc': 0.721,
   'model': LogisticRegression(C=4.484384767955908, l1_rati

In [480]:
store = citeseer_store
user_functions = [store["LogisticRegression"][0]["user_function"], store["LogisticRegression"][3]["user_function"], store["LogisticRegression"][8]["user_function"]]
clfs = [store["LogisticRegression"][0]["model"], store["LogisticRegression"][3]["model"], store["LogisticRegression"][8]["model"]]
attention_configs = [store["LogisticRegression"][0]["attention_config"], store["LogisticRegression"][3]["attention_config"], store["LogisticRegression"][8]["attention_config"]]
framework = fit_dataset(CITESEER,user_functions=user_functions, hops_list= [0,3,8], clfs = clfs, attention_configs= attention_configs)
predict_dataset(CITESEER, framework)

['LogisticRegression', 'LogisticRegression', 'LogisticRegression']
[{}, {}, {}]




Citeseer
0.738
0.72


In [24]:
from sklearn.inspection import permutation_importance

for model_i in range(len(new_train_features)):
    r = permutation_importance(model, new_val_features[model_i].cpu(), val_set.y,
                            n_repeats=30,
                            random_state=0)
    print(r.importances_mean[i])

NameError: name 'new_train_features' is not defined

In [12]:
fit_dataset(PUBMED)
predict_dataset(PUBMED)

['XGBClassifier', 'XGBClassifier', 'SVC']
[{'eval_set': [(tensor([[0.0000, 0.0000, 0.0159,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0133,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0533,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1080, 0.0000, 0.0075,  ..., 0.0000, 0.0000, 0.0000]]), tensor([2, 2, 1, 2, 1, 1, 2, 1, 0, 1, 0, 2, 0, 2, 1, 2, 2, 0, 0, 0, 0, 0, 1, 2,
        1, 2, 1, 2, 1, 1, 0, 2, 1, 2, 0, 0, 1, 2, 1, 1, 1, 0, 2, 2, 2, 1, 0, 2,
        2, 2, 0, 2, 1, 1, 1, 0, 2, 2, 1, 1, 1, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2,
        0, 0, 1, 2, 0, 2, 1, 0, 1, 0, 0, 2, 2, 2, 2, 1, 2, 2, 1, 0, 2, 0, 1, 0,
        2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 0, 1, 1, 1, 0, 0, 2, 2, 2, 1, 2,
        1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 1, 2, 2, 2, 2, 1, 1,
        1, 2, 0, 0, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1



[0]	validation_0-mlogloss:0.98416
[1]	validation_0-mlogloss:0.92272
[2]	validation_0-mlogloss:0.88664
[3]	validation_0-mlogloss:0.84306
[4]	validation_0-mlogloss:0.82333
[5]	validation_0-mlogloss:0.81140
[6]	validation_0-mlogloss:0.78062
[7]	validation_0-mlogloss:0.77776
[8]	validation_0-mlogloss:0.76773
[9]	validation_0-mlogloss:0.75974
[10]	validation_0-mlogloss:0.74930
[11]	validation_0-mlogloss:0.75331
[12]	validation_0-mlogloss:0.75462
[13]	validation_0-mlogloss:0.73632
[14]	validation_0-mlogloss:0.73620
[15]	validation_0-mlogloss:0.72862
[16]	validation_0-mlogloss:0.73010
[17]	validation_0-mlogloss:0.73777
[18]	validation_0-mlogloss:0.74450
[19]	validation_0-mlogloss:0.74320
[0]	validation_0-mlogloss:1.01912
[1]	validation_0-mlogloss:0.89476
[2]	validation_0-mlogloss:0.81348
[3]	validation_0-mlogloss:0.75490
[4]	validation_0-mlogloss:0.70351
[5]	validation_0-mlogloss:0.67500
[6]	validation_0-mlogloss:0.65043
[7]	validation_0-mlogloss:0.64049
[8]	validation_0-mlogloss:0.62562
[9]	