In [1]:
from torch_geometric.datasets import IMDB, DBLP
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
import torch
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, f1_score, accuracy_score
from HeteroNestedCV import NestedTransductiveCV
from tqdm.notebook import tqdm
import time
import copy
from hyperopt import hp

  _torch_pytree._register_pytree_node(


## Future improvement ideas
- Link the representations of similar entities of the same node type with each other as additional information, after information has been aggregated over the meta-paths
 -> New pseudo-attention (e.g., Sylvester Stallone is similar to Arnold Schwarzenegger - action-based)
- Do not expand on the semantic level; instead, only expand each vector to the maximum vector size to reduce the number of trainable parameters

In [2]:
# Instantiate the IMDB and DBLP heterogeneous graph datasets, rooted at ./data.
imdb = IMDB(root= "./data/IMDB")
dblp = DBLP(root= "./data/DBLP")
# The rest of the notebook works on the first (index 0) graph of the DBLP dataset.
data = dblp[0]

In [3]:
# Display the label tensor `y` stored on the "author" nodes.
data["author"].y

tensor([2, 2, 3,  ..., 0, 0, 0])

In [4]:
# Display the summary of the heterogeneous graph object (node and edge stores).
data

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={ num_nodes=20 },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)

## Add features (categorical encoding for now) to nodes without features

In [5]:
# "conference" nodes only carry `num_nodes` and no feature matrix, so give every
# conference its running index as a single-column feature (shape: num_nodes x 1).
data["conference"].x = torch.arange(data["conference"].num_nodes).unsqueeze(-1)
# Show the resulting feature matrix.
data["conference"].x

tensor([[ 0],
        [ 1],
        [ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 6],
        [ 7],
        [ 8],
        [ 9],
        [10],
        [11],
        [12],
        [13],
        [14],
        [15],
        [16],
        [17],
        [18],
        [19]])

## Node type feature padding for preserving semantic differences between individual node types

In [6]:
from torch.nn.functional import pad

# Give every node type its own, non-overlapping column range inside one shared
# feature space: the features of node type i are left-padded with as many zero
# columns as all preceding node types have features, and right-padded with as
# many zero columns as all following node types have features.
#
# NOTE: this cell overwrites data[node_type].x and derives the padding from the
# current feature widths, so running it twice pads the features a second time.
node_types, edge_types = data.metadata()
# Feature width of each node type, in the order of `node_types`.
node_type_shapes = torch.tensor([data[node_type].x.shape[-1] for node_type in node_types])

ident = torch.ones(len(node_types), len(node_types))
# Strictly upper-triangular 0/1 matrix: row i selects the node types that come after i.
right_pad_mat = torch.triu(ident, diagonal=1)
# Its transpose is strictly lower-triangular: row i selects the node types before i.
left_pad_mat = right_pad_mat.t()
# Row i of pad_matrix is [left padding width, right padding width] for node type i.
pad_matrix = torch.cat([left_pad_mat.type(torch.long) @node_type_shapes.unsqueeze(1), right_pad_mat.type(torch.long) @node_type_shapes.unsqueeze(1)], dim = -1) 

for i, node_type in enumerate(node_types):
    # NOTE(review): the cast to torch.long truncates any non-integer feature values;
    # confirm that every feature matrix is integer-valued, otherwise cast to float instead.
    data[node_type].x = pad(data[node_type].x, pad_matrix[i].tolist(), "constant", 0).type(torch.long)

## Input data file

In [7]:
class HeteroGraphData:
    """Placeholder for an input-data container; it currently holds no state."""

    def __init__(self):
        """Create an instance without setting any attributes."""
        return None
    

## Graph for graph schema with DFS algorithm

In [8]:
class Edge:
    """Directed, typed connection between a source node and a target node."""

    def __init__(self, source_node, edge_type, target_node):
        """Remember both endpoints and the relation name that links them."""
        self.source_node = source_node
        self.edge_type: str = edge_type
        self.target_node = target_node

    def __repr__(self):
        """Render as ``source->edge_type->target``, framed by a newline and eight spaces on both sides."""
        frame = "\n" + " " * 8
        arrow_form = "->".join(
            f"{part}" for part in (self.source_node, self.edge_type, self.target_node)
        )
        return frame + arrow_form + frame

class Node:
    """Named vertex that keeps track of its outgoing and incoming edges."""

    def __init__(self, name):
        """Create a node called ``name`` with no edges attached yet."""
        self.name = name
        self.targets = list()  # outgoing edges, in insertion order
        self.sources = list()  # incoming edges, in insertion order

    def add_target_edge(self, target_edge):
        """Register ``target_edge`` as an outgoing edge of this node."""
        self.targets += [target_edge]

    def add_source_edge(self, source_edge):
        """Register ``source_edge`` as an incoming edge of this node."""
        self.sources += [source_edge]

    def __repr__(self):
        """Render as ``<name>()``."""
        return "{}()".format(self.name)

class Graph:
    """Directed multigraph over named nodes with depth-limited DFS path enumeration.

    A path is a list of edges. ``dfs_paths`` always holds the result of the most
    recently completed enumeration.
    """

    def __init__(self):
        """Start with an empty graph."""
        self.nodes = []
        self.nodes_index = {}  # node name -> Node
        self.edges = []
        self.dfs_paths = []

    def add_node(self,node_name):
        """Create a node called ``node_name`` and register it (replacing the index entry of that name)."""
        created = Node(node_name)
        self.nodes.append(created)
        self.nodes_index[node_name] = created

    def add_edge(self, source_node_name, edge_type, target_node_name):
        """Connect two nodes by name with an edge of ``edge_type``, creating missing nodes on the fly."""
        for endpoint_name in (source_node_name, target_node_name):
            if endpoint_name not in self.nodes_index:
                self.add_node(endpoint_name)
        origin = self.nodes_index[source_node_name]
        destination = self.nodes_index[target_node_name]

        connection = Edge(origin, edge_type, destination)
        self.edges.append(connection)
        origin.add_target_edge(connection)
        destination.add_source_edge(connection)

    def dfs_per_node(self, start_node_name, max_depth):
        """Enumerate every path of exactly ``max_depth`` edges that starts at ``start_node_name``.

        Paths are produced in depth-first order, following each node's outgoing
        edges in insertion order. Walks that run into a node without outgoing
        edges before reaching ``max_depth`` contribute nothing.

        Returns the list of paths, which is also stored in ``self.dfs_paths``.
        Raises AssertionError if the start node is unknown.
        """
        assert start_node_name in self.nodes_index, f"Node {start_node_name} is not in the Graph ({self})"
        found_paths = []
        self.dfs_paths = found_paths

        def walk(current, path_so_far):
            # A path is complete as soon as it contains max_depth edges.
            if current is None or len(path_so_far) == max_depth:
                found_paths.append(path_so_far)
                return
            for out_edge in current.targets:
                walk(out_edge.target_node, path_so_far + [out_edge])

        walk(self.nodes_index[start_node_name], [])
        return self.dfs_paths

    def dfs_per_node_include_all_paths(self, start_node_name, max_depth):
        """Enumerate the paths of every length 1..``max_depth`` starting at ``start_node_name``."""
        self.dfs_paths = [
            path
            for depth_limit in range(1, max_depth + 1)
            for path in self.dfs_per_node(start_node_name, depth_limit)
        ]
        return self.dfs_paths

    def dfs(self, max_depth):
        """Enumerate the paths of exactly ``max_depth`` edges from every node, in node insertion order."""
        self.dfs_paths = [
            path
            for start in self.nodes
            for path in self.dfs_per_node(start.name, max_depth)
        ]
        return self.dfs_paths

    def dfs_include_all_paths(self, max_depth):
        """Enumerate the paths of every length 1..``max_depth`` from every node."""
        self.dfs_paths = [
            path
            for start in self.nodes
            for path in self.dfs_per_node_include_all_paths(start.name, max_depth)
        ]
        return self.dfs_paths

    def __repr__(self):
        """Multi-line dump of the nodes, the edges and the last DFS result."""
        indent = " " * 8
        rows = (
            f"self.nodes: {self.nodes},",
            f"self.edges: {self.edges},",
            f"self.dfs_paths: {self.dfs_paths},",
        )
        return "\n" + "".join(f"{indent}{row}\n" for row in rows) + indent

## Metapath creation or automatic construction

In [9]:
class MetaPaths:
    """Collection of meta-paths that all end in one target node type, grouped by hop count.

    A meta-path is a list of ``(source_type, edge_type, target_type)`` triples in
    which the target type of each triple is the source type of the next one.
    ``meta_path_dict`` maps the number of hops to the list of meta-paths of that length.
    """

    def __init__(self, target_node_type):
        """
        target_node_type: node type in which every registered meta-path has to end
        """
        self.meta_path_dict = {}
        self.target_node_type = target_node_type

    @staticmethod
    def graph_meta_path_to_triple_meta_path(meta_path):
        """
        Convert a path of edge objects (each exposing ``source_node.name``, ``edge_type``
        and ``target_node.name``) into a list of (source, edge type, target) name triples.
        """
        return [
            (
                meta_path_part.source_node.name,
                meta_path_part.edge_type,
                meta_path_part.target_node.name,
            )
            for meta_path_part in meta_path
        ]

    def construct_meta_paths(self, metadata, max_depth):
        """
        Automatically register every meta-path of length 1..max_depth that ends in the target node type.

        metadata: pair whose second element is an iterable of (source_type, edge_type, target_type) triples
        max_depth: maximum number of hops (>= 1)

        Returns the meta-path dictionary (hop -> list of meta-paths).
        Note: meta-paths are appended, so calling this twice registers them twice.
        """
        assert max_depth >= 1, "max_depth needs to be greater or equal to one to construct meta_paths since a distance of less than one does not consider the graph."
        _, edge_types = metadata
        graph_schema = Graph()
        for edge_type in edge_types:
            graph_schema.add_edge(*edge_type)
        for max_depth_i in range(1, max_depth + 1):
            meta_paths = graph_schema.dfs(max_depth_i)
            # Keep only the paths whose last edge points at the target node type.
            meta_paths_for_target_node_triples = [
                MetaPaths.graph_meta_path_to_triple_meta_path(meta_path)
                for meta_path in meta_paths
                if meta_path[-1].target_node.name == self.target_node_type
            ]
            self.add_meta_paths(max_depth_i, meta_paths_for_target_node_triples)
        return self.meta_path_dict

    def assert_meta_path(self, hop, meta_path):
        """
        Validate a meta-path; raises AssertionError if
        - it is empty or its length differs from ``hop``,
        - one of its parts is not a triple,
        - it does not end in the target node type, or
        - two consecutive parts are not connected (target type != next source type).
        """
        assert len(meta_path) >= 1, "A meta path needs at least one meta path part (source_type, edge_type, target_type)."
        assert len(meta_path) == hop, f"Metapath with length {len(meta_path)} should have the same length as the number of hops in the graphs: {hop}."
        valid_len_meta_path_parts = [len(meta_path_part) == 3 for meta_path_part in meta_path]
        assert all(valid_len_meta_path_parts), f"All meta path parts should have length 3 (source_type, edge_type, target_type) but positions {np.where(np.logical_not(valid_len_meta_path_parts))[0].tolist()} have an invalid length."
        assert meta_path[-1][-1] == self.target_node_type, f"Target type of the last metapath part should be desired target node type but targte node type of the last metapath part is {meta_path[-1][-1]} desired target node type is {self.target_node_type}."

        # Every part has to start where the previous part ended. The check must use
        # all(...): asserting the list itself would only test that it is non-empty.
        valid_meta_path_parts = [
            previous_part[-1] == next_part[0]
            for previous_part, next_part in zip(meta_path, meta_path[1:])
        ]
        assert all(valid_meta_path_parts), "The target type of each meta path part (triplet) have s´to be the source type of the next meta path part (triplet) in a metapath"

    def add_meta_path(self, hop, meta_path):
        """
        meta_path: List of tuples where each tuple has three parts: source node type, edge type and target node type
        hop: number of hops, i.e. the expected length of meta_path

        Raises AssertionError if the meta-path is invalid (see assert_meta_path).
        """
        self.assert_meta_path(hop, meta_path)
        self.meta_path_dict.setdefault(hop, []).append(meta_path)

    def add_meta_paths(self, hop, meta_paths):
        """
        meta_paths: List of List of tuples where each tuple has three parts: source node type, edge type and target node type
        """
        for meta_path in meta_paths:
            self.add_meta_path(hop, meta_path)

    def get_meta_path_dict(self):
        """Return the dictionary hop -> list of meta-paths."""
        return self.meta_path_dict

    def get_meta_paths(self):
        """Return all meta-paths as one flat list, in the insertion order of the hops."""
        meta_paths = []
        for hop in self.meta_path_dict:
            meta_paths.extend(self.meta_path_dict[hop])
        return meta_paths

    def __repr__(self):
        return f"\n        Metapaths: \n        {self.meta_path_dict}\n        "

## Usage

1. Automated construction
```
meta_paths = MetaPaths("author")
depth = 1
meta_paths.construct_meta_paths(data.metadata(), depth)
```

2. Custom meta-path definitions
```
meta_paths = MetaPaths("movie")
meta_paths.add_meta_path(1 ,[('actor', 'to', 'movie')])
meta_paths.add_meta_path(1 ,[('director', 'to', 'movie')])
meta_paths.add_meta_path(2 ,[('movie', 'to', 'director'), ('director', 'to', 'movie')])
meta_paths.add_meta_path(2 ,[("movie", "to", "actor"), ("actor", "to", "movie")])
meta_paths.meta_path_dict
```

## HeteroGraphAware Framework for metapath based aggregation and classifications

In [10]:
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression

class HeteroGraphAware(MetaPaths):
    """
    Meta-path based feature aggregation with one base classifier per meta-path.

    For every registered meta-path the node features are propagated along the
    path to the target node type; one instance of ``model`` is trained on each of
    the resulting feature matrices (optionally also on the original features).
    The class probabilities of all base models are concatenated and used as
    input of a final LogisticRegression.
    """

    def __init__(self, target_node_type, model, model_hyperparams, add_skip_connection = True, aggregator = "sum", add_self_loops = True, include_original_features = True):
        """
        target_node_type: node type that is classified
        model: classifier class (not an instance); it is instantiated as model(**model_hyperparams)
               and has to offer fit and predict_proba
        model_hyperparams: dict with keyword arguments for the classifier class
        add_skip_connection: add the target node features to the aggregated features after every hop
        aggregator: reduce mode that is handed to torch.scatter_reduce (e.g. "sum", "mean")
        add_self_loops: add a (target, "to", target) self-loop edge type to the data
        include_original_features: additionally train one base model on the original target node features
        """
        ## include_original_features: seems stupid and I would remove it since this can represented by the graph structure (e.g., adding movie -> to -> movie self loops)
        super().__init__(target_node_type)

        self.target_node_type = target_node_type # String
        self.model = model ## sklearn or xgboost model class!
        self.models = []
        self.features = []
        self.add_skip_connection = add_skip_connection
        self.aggregator = aggregator
        self.model_hyperparams = model_hyperparams
        self.final_model = LogisticRegression(class_weight="balanced")
        self.add_self_loops = add_self_loops
        self.include_original_features = include_original_features

    def add_self_loops_fun(self, data):
        """
        Write an edge type (target, "to", target) that connects every target node with
        itself into ``data`` (the object is modified in place) and return ``data``.
        """
        node_ids = torch.arange(data[self.target_node_type].x.shape[0]).unsqueeze(0)
        data[self.target_node_type, "to", self.target_node_type].edge_index = torch.cat([node_ids, node_ids], dim = 0)
        return data

    @staticmethod
    def aggregate(source_features, target_features, edge_index, aggregator = "sum"):
        """
        Reduce the features of the source nodes onto the target nodes along the edges.

        source_features: 2-D tensor, indexed along dim 0 by edge_index[0]
        target_features: tensor whose shape and dtype define the output
        edge_index: tensor with the source indices in row 0 and the target indices in row 1
        aggregator: reduce mode of torch.scatter_reduce

        Returns a new tensor shaped like target_features; target nodes without an
        incoming edge keep zeros.
        """
        messages = torch.index_select(source_features, 0, edge_index[0])
        aggregated = torch.zeros_like(target_features)
        scatter_index = edge_index[1].unsqueeze(-1).expand_as(messages)
        # include_self=False keeps the zero initialisation out of the reduction. For "sum"
        # the result is unchanged, but for "mean", "prod", "amax" and "amin" the zeros
        # would otherwise be treated like an additional message.
        return torch.scatter_reduce(aggregated, 0, scatter_index, messages, reduce = aggregator, include_self = False)

    def generate_features(self, data, metapath):
        """
        Propagate the features of the first source node type along ``metapath`` and
        return the resulting features of the last target node type.
        """
        source, _, target = metapath[0]
        new_features = data[source].x

        for metapath_part in metapath:
            source, _, target = metapath_part
            new_features = HeteroGraphAware.aggregate(new_features, data[target].x, data[metapath_part].edge_index, self.aggregator)
            if self.add_skip_connection:
                new_features = new_features + data[target].x
        return new_features

    def include_original_features_fit(self, data, train_mask):
        """Train one base model on the original features of the target nodes and append it to self.models."""
        model_instance = self.model(**self.model_hyperparams)
        features = data[self.target_node_type].x[train_mask]
        model_instance.fit(features, data[self.target_node_type].y[train_mask])
        self.models.append(model_instance)
        self.features.append(features)

    def include_original_features_predict_proba(self, model, data, test_mask):
        """Class probabilities of ``model`` on the original target node features, restricted to test_mask."""
        return model.predict_proba(data[self.target_node_type].x)[test_mask]

    def fit(self, data, train_mask):
        """
        Train one base model per meta-path (plus one on the original features if
        include_original_features is set) and afterwards the final model on the
        concatenated class probabilities of the training nodes.

        data: heterogeneous graph with x for every node type and y for the target node type
        train_mask: mask over the target nodes that selects the training nodes

        Returns the list of base models. If include_original_features is set, the
        model for the original features is the first entry, followed by one model
        per meta-path in the order of get_meta_paths().
        """
        # Start from scratch so that a second call of fit does not mix old and new models.
        self.models = []
        self.features = []

        if self.add_self_loops:
            data = self.add_self_loops_fun(data)
        if self.include_original_features:
            self.include_original_features_fit(data, train_mask)

        for metapath in self.get_meta_paths():
            new_features = self.generate_features(data, metapath)

            model_instance = self.model(**self.model_hyperparams)
            model_instance.fit(new_features[train_mask], data[self.target_node_type].y[train_mask])
            self.models.append(model_instance)
            self.features.append(new_features)
        pred_probas = self.get_pred_proba(data, train_mask) # data["movie"].val_mask

        self.final_model.fit(np.concatenate(pred_probas, axis = -1), data[self.target_node_type].y[train_mask])
        return self.models

    def get_pred_proba(self, data, test_mask):
        """
        Class probabilities of every base model for the nodes selected by test_mask.

        Returns an array with one entry per base model, in the order of self.models.
        self.models is only read, so the method can be called any number of times.
        """
        if self.add_self_loops:
            data = self.add_self_loops_fun(data)

        pred_probas = []
        meta_path_models = self.models
        if self.include_original_features:
            # The first model belongs to the original features. It must stay in
            # self.models, otherwise the next call would pair the models with the wrong meta-paths.
            pred_probas.append(self.include_original_features_predict_proba(self.models[0], data, test_mask))
            meta_path_models = self.models[1:]

        for model, metapath in zip(meta_path_models, self.get_meta_paths()):
            new_features = self.generate_features(data, metapath)
            pred_probas.append(model.predict_proba(new_features)[test_mask])
        return np.array(pred_probas)

    def predict_proba(self, data, test_mask):
        """Class probabilities of the final model for the nodes selected by test_mask."""
        pred_probas = self.get_pred_proba(data, test_mask)
        return self.final_model.predict_proba(np.concatenate(pred_probas, axis = -1))

    def __repr__(self):
        return f"""
        HeteroGraphAware()
        """

## Usage

1) Training
```
from xgboost import XGBClassifier
hetero_graph_aware = HeteroGraphAware("author", XGBClassifier, {"alpha": 10}, add_skip_connection=True, include_original_features=False, aggregator="sum")
hetero_graph_aware.construct_meta_paths(data.metadata(), 4)
hetero_graph_aware.fit(data, data["author"].train_mask)
```

2) Evaluation
```
from sklearn.metrics import f1_score
f1_score(data["author"].y[data["author"].test_mask], hetero_graph_aware.predict_proba(data, data["author"].test_mask).argmax(1), average="micro")
```

## Hyperparameter space definitions for different machine learning models

In [11]:
class ModelSpace():
    """Builder for a hyperopt search space that is pre-filled with the framework-level options."""

    def __init__(self):
        """Create the space and populate it with the framework choices."""
        self.space = None
        self.initialize_space()

    def initialize_space(self):
        """(Re)create ``self.space`` containing only the framework-level choices."""
        framework_choices = {
            'hops': [3],  # further candidates: 2, 3, 4
            'aggregator': ["sum"],  # further candidate: "mean"
            'add_skip_connection': [True],  # further candidate: False
        }

        space = {}
        for name, options in framework_choices.items():
            space[name] = hp.choice(name, options)
        self.space = space

    def add_choice(self, key, items):
        """Register ``key`` as a categorical choice among ``items``."""
        self.space[key] = hp.choice(key, items)

    def add_uniform(self, key, limits: tuple):
        """Register ``key`` as uniformly distributed between ``limits[0]`` and ``limits[1]``."""
        lower, upper = limits[0], limits[1]
        self.space[key] = hp.uniform(key, lower, upper)

    def add_loguniform(self, key, limits: tuple):
        """Register ``key`` as log-uniform; the limits are given on the original (not the log) scale."""
        log_lower, log_upper = np.log(limits[0]), np.log(limits[1])
        self.space[key] = hp.loguniform(key, log_lower, log_upper)

    def add_qloguniform(self, key, limits, q):
        """Register ``key`` as quantised log-uniform with step ``q``; limits are on the original scale."""
        log_lower, log_upper = np.log(limits[0]), np.log(limits[1])
        self.space[key] = hp.qloguniform(key, low=log_lower, high=log_upper, q=q)

class LogitsticRegressionSpace(ModelSpace):
    """Search space for a logistic-regression base model on top of the framework choices."""

    # No __init__ of its own: the inherited constructor already builds the framework space.

    def get_space(self):
        """Add the logistic-regression parameters (``n_jobs``, ``C``) and return the space dict."""
        logistic_regression_parameters = (
            (self.add_choice, "n_jobs", [-1]),
            # `tol` is currently not searched:
            # (self.add_loguniform, 'tol', [6e-3, 4e-2]),
            (self.add_uniform, 'C', [0, 10]),
        )
        for register, key, values in logistic_regression_parameters:
            register(key, values)
        return self.space

## Evaluation on nested Cross validation

In [12]:
def train_val_masks(train_mask, manual_seed = None, train_size = 0.8):
    """
    Randomly split the nodes selected by a 1-D mask into a training and a validation part.

    train_mask: 1-D mask tensor; its non-zero entries are the nodes to be split
    manual_seed: optional seed for torch's global random generator; every value
                 other than None is applied, including 0
    train_size: fraction of the selected nodes that goes into the new training mask

    Returns (train_mask, val_mask): two new tensors with the shape and dtype of the
    input; the input mask is not modified.
    """
    if manual_seed is not None:
        torch.manual_seed(manual_seed)
    # squeeze(-1) only drops the column that nonzero() adds, so a mask with a single
    # selected node still yields a 1-D index tensor.
    train_index = train_mask.nonzero().squeeze(-1)
    num_train = int(train_size*train_index.shape[0])
    shuffled_positions = torch.randperm(train_index.shape[0])

    new_train_idx = train_index[shuffled_positions[:num_train]]
    new_val_idx = train_index[shuffled_positions[num_train:]]

    new_train_mask = torch.zeros_like(train_mask)
    val_mask = torch.zeros_like(train_mask)
    new_train_mask[new_train_idx] = 1
    val_mask[new_val_idx] = 1
    return new_train_mask, val_mask

def space_to_spaces(space, hops):
    """Return one independent deep copy of ``space`` for every element of ``hops``."""
    return [copy.deepcopy(space) for _ in hops]

class HeteroGraphAwareNestedCVEvaluation:
    """
    Evaluates HeteroGraphAware with a base model class by means of NestedTransductiveCV.

    The wall-clock duration of every HeteroGraphAware.fit call is printed and
    collected in ``training_times``.
    """

    def __init__(self,target_node_type, device_id, model, data, minimize = True, max_evals = 100, parallelism = 1):
        """
        target_node_type: node type that is classified
        device_id: stored on the instance; not used by the code of this class
        model: classifier class that is handed to HeteroGraphAware as base model
        data: heterogeneous graph the cross-validation runs on
        minimize: forwarded to NestedTransductiveCV as ``minimalize``
        max_evals: forwarded to NestedTransductiveCV
        parallelism: forwarded to NestedTransductiveCV
        """
        self.target_node_type = target_node_type
        self.device_id = device_id
        self.model = model
        self.training_times = []
        self.minimize = minimize
        self.data = data
        self.nested_transd_cv = None
        self.max_evals = max_evals
        self.parallelism = parallelism

    def nested_cross_validate(self, k_outer, k_inner, space):
        """
        Run the nested cross-validation and return the NestedTransductiveCV object.

        k_outer: forwarded to NestedTransductiveCV (number of outer folds)
        k_inner: forwarded to NestedTransductiveCV (number of inner folds)
        space: hyperparameter space; it has to contain "hops", "aggregator" and
               "add_skip_connection", every other key is handed to the base model
        """
        framework_keys = ("aggregator", "hops", "add_skip_connection")

        def evaluate_fun(fitted_model, data, mask):
            # Micro-averaged F1 score of the predicted classes on the masked target nodes.
            pred_proba = fitted_model.predict_proba(data, mask)
            return f1_score(data[self.target_node_type].y[mask], pred_proba.argmax(1), average="micro")

        def train_fun(data, inner_train_mask, hyperparameters):
            hops = hyperparameters["hops"]
            aggregator = hyperparameters["aggregator"]
            add_skip_connection = hyperparameters["add_skip_connection"]

            # Everything that is not a framework option is a keyword argument of the base model.
            model_hyperparams = {key: value for key, value in hyperparameters.items() if key not in framework_keys}

            hetero_graph_aware = HeteroGraphAware(self.target_node_type, self.model, model_hyperparams, add_skip_connection=add_skip_connection, include_original_features=False, aggregator=aggregator)
            ## TODO test all bool combination
            hetero_graph_aware.construct_meta_paths(data.metadata(), hops)
            start_time = time.perf_counter()
            hetero_graph_aware.fit(data, inner_train_mask)
            elapsed_seconds = time.perf_counter() - start_time
            # Keep the duration instead of only printing it; training_times was never filled before.
            self.training_times.append(elapsed_seconds)
            print(elapsed_seconds)
            return hetero_graph_aware

        self.nested_transd_cv = NestedTransductiveCV(self.data, self.target_node_type, k_outer, k_inner, train_fun, evaluate_fun,max_evals = self.max_evals, parallelism = self.parallelism, minimalize = self.minimize)
        self.nested_transd_cv.outer_cv(space)
        return self.nested_transd_cv

In [13]:
# Evaluate LogisticRegression as base model for the "author" node type; 2 is the device_id
# argument. minimize=False is forwarded to the nested CV as `minimalize`
# (presumably because the micro-F1 score is to be maximised - confirm against NestedTransductiveCV).
hetero_graph_aware_evaluation = HeteroGraphAwareNestedCVEvaluation("author", 2, LogisticRegression, data, minimize = False, max_evals = 3, parallelism = 1)
# Framework choices plus the logistic-regression parameters (n_jobs, C).
space = LogitsticRegressionSpace().get_space()
# k_outer = 3, k_inner = 3
hetero_graph_aware_evaluation.nested_cross_validate(3,3, space)


        Using a 3 x 3 nested StratifiedKFold Cross-Validation, we obtain:
        0.8802 +- 0.0166.

        self.outer_scores: [0.86252772 0.87573964 0.90236686]

        self.best_params_per_fold: [{'C': 2.115256507027895, 'add_skip_connection': True, 'aggregator': 'sum', 'hops': 3, 'n_jobs': -1}, {'C': 1.8462062927881318, 'add_skip_connection': True, 'aggregator': 'sum', 'hops': 3, 'n_jobs': -1}, {'C': 0.13506876976748128, 'add_skip_connection': True, 'aggregator': 'sum', 'hops': 3, 'n_jobs': -1}]

        self.best_models: []

        