# Evaluation graph awareness along different algorithms

## Read data

In [35]:
from torch_geometric.datasets import Planetoid
from EnsembleFramework import Framework
import torch_geometric.transforms as T
from torch_geometric.utils import add_self_loops

dataset_name = 'Cora'
split = "public"
dataset = Planetoid(root='/tmp/Cora', name=dataset_name, split=split)
dataset.transform = T.NormalizeFeatures()

features =  dataset[0].x
y =  dataset[0].y

test =  dataset[0].test_mask
train = dataset[0].train_mask
val =  dataset[0].val_mask

edge_index = dataset[0].edge_index 
edge_index = add_self_loops(edge_index)[0]

## Define Hyperparameter spaces

### Logistic regression hyperparameter space

In [60]:
from hyperopt import hp

lr_choices = {
    'penalty': ["l2"],
    'max_iter': [2**i for i in range(6, 15)],
}

lr_space = {
    **{key: hp.choice(key, value) for key, value in lr_choices.items()},
    'tol': hp.loguniform('tol', -11, -3),
    'C': hp.uniform('C', 0.0, 10)
}

### Support Vector classfiier hyperparameter space

In [61]:
from hyperopt import hp

svc_choices = {
    'gamma': ["scale", "auto"],
    "probability": [True]
}

svc_space = {
    **{key: hp.choice(key, value) for key, value in svc_choices.items()},
    'C': hp.uniform('C', 0.0, 150)
}

### Decision tree hyperparameter space

In [62]:
from hyperopt import hp

dt_choices = {
    'criterion': ["gini"],
    'max_depth': [None, *[i**2 for i in range(5, 10)]]
}

dt_space = {
    **{key: hp.choice(key, value) for key, value in dt_choices.items()},
    'min_samples_split': hp.uniform('min_samples_split', 0.0, 1.0),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.0, .5),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0.0, 1.0),
    'max_features': hp.uniform('max_features', 0.0, 1.0),
}

### XGBoost hyperparameter space

In [63]:
from hyperopt import hp

booster_self = ["gbtree"]
n_estimators_self = [1_400, 1_600, 1_800, 2_000]
max_depth_self = [None,2, 3,4]
max_delta_step_self = [1,2,3]
min_child_weight_self = [None, *list(range(1,5,1))]

xb_choices = {
    'booster': booster_self,
    'n_estimators': n_estimators_self,
    'max_depth': max_depth_self,
    'max_delta_step': max_delta_step_self,
    'min_child_weight': min_child_weight_self,
    # 'device': ["cuda:2"],
    "tree_method": ["hist"]
}
 
xb_space = {
    **{key: hp.choice(key, value) for key, value in xb_choices.items()},
    'eta': hp.loguniform('eta', -3, -.4),
    'subsample': hp.uniform('subsample', 0.6, 1),
    'reg_lambda': hp.loguniform('reg_lambda',-5, 5),
    'reg_alpha': hp.loguniform('reg_alpha',-3, 1),
    'gamma': hp.uniform('gamma', 0, .8),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1),
}

### Random forest hyperparameter space

In [64]:
from hyperopt import hp

rf_choices = {
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'max_depth':  [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'min_samples_split': [2, 5, 10],
    "criterion":  ["gini", "entropy", "log_loss"]
}

rf_space = {
    **{key: hp.choice(key, value) for key, value in rf_choices.items()},
    'max_samples': hp.uniform('max_samples', 0.0, 1),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.0, 1.0),
    'min_samples_split': hp.uniform('min_samples_split', 0.0, 1.0),
     'max_features': hp.uniform('max_features', 0.0, 1.0),
}

In [65]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

clfs_space = dict({})
clfs_space["RandomForestClassifier"] = rf_space
clfs_space["LogisticRegression"] = lr_space
clfs_space["DecisionTreeClassifier"] = dt_space
clfs_space["XGBClassifier"] = xb_space
clfs_space["SVC"] = svc_space

clfs = [RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, XGBClassifier, SVC]

## Convert data in format for AutoTune

In [66]:
cora_set = dict({})
cora_set["X"] = features
cora_set["y"] = y
cora_set["test"] = test
cora_set["train"] = train
cora_set["val"] = val
cora_set["edge_index"] = edge_index

## Start AutoTune search

In [None]:
from AutoTune2 import AutoSearch
from sklearn.metrics import accuracy_score
from torch.nn.functional import normalize

def user_function(kwargs):
    return  normalize(kwargs["original_features"] + kwargs["summed_neighbors"], p=2.0, dim = 1)

searcher = AutoSearch(cora_set, max_evals=10, pred_metric = accuracy_score, parallelism=50)
hops = [3]
store = searcher.search(clfs, clfs_space, hops=hops, user_functions= [user_function],
                        attention_configs = [None])

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Total Trials: 10: 10 succeeded, 0 failed, 0 cancelled.                          


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Total Trials: 10: 10 succeeded, 0 failed, 0 cancelled.                          


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Traceback (most recent call last):) / 1][Stage 865:>                (0 + 1) / 1]
  File "/home/dwalke/.local/lib/python3.10/site-packages/hyperopt/spark.py", line 467, in run_task_on_executor
    result = domain.evaluate(
  File "/home/dwalke/.local/lib/python3.10/site-packages/hyperopt/base.py", line 892, in evaluate
    rval = self.fn(pyll_rval)
  File "/home/dwalke/git/graph_aware_ml/AutoTune2.py", line 91, in objective
    framework.fit(auto_search.data.X, auto_search.data.edge_index,
  File "/home/dwalke/git/graph_aware_ml/EnsembleFramework.py", line 195, in fit
    clf.fit(aggregated_train_features.cpu().numpy(), y_train,**transformed_kwargs)
  File "/home/dwalke/.local/lib/python3.10/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/dwalke/.local/lib/python3.10/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/dwalke/.local/lib/python3.10/site-packages/sklearn/utils/_

## Print results

In [54]:
store

{'RandomForestClassifier': {3: {'train_acc': 0.7214285714285714,
   'val_acc': 0.542,
   'test_acc': 0.578,
   'model': RandomForestClassifier(criterion='entropy', max_depth=60,
                          max_features=0.5386735923044993,
                          max_samples=0.8232521612995202,
                          min_samples_leaf=0.22776932723998056,
                          min_samples_split=0.1432446178498361, n_estimators=1800),
   'user_function': <function __main__.user_function(kwargs)>,
   'attention_config': None}},
 'LogisticRegression': {3: {'train_acc': 0.9928571428571429,
   'val_acc': 0.806,
   'test_acc': 0.824,
   'model': LogisticRegression(C=6.266904433373178, l1_ratio=0.8445600388364138,
                      max_iter=64, tol=0.0048069207391340085),
   'user_function': <function __main__.user_function(kwargs)>,
   'attention_config': None}},
 'DecisionTreeClassifier': {3: {'train_acc': 0.14285714285714285,
   'val_acc': 0.122,
   'test_acc': 0.13,
   'model': D