In [None]:
!pip install networkx
!pip install node2vec
!pip install stellargraph

## Node2Vec

In [None]:
import networkx as nx
import pandas as pd
import altair as alt
from node2vec import Node2Vec
# Turn off Altair's max-rows limit on chart data.
alt.data_transformers.disable_max_rows()

# Read the input graph from its GraphML file.
graph_path = "drive/My Drive/graph/graph_1_10.graphml"
G = nx.read_graphml(graph_path)

In [None]:
from node2vec import Node2Vec
# Set up node2vec on G with 10-dimensional embeddings; the walk settings
# (walk_length, num_walks, p, q) and worker count are listed one per line.
n2v_obj = Node2Vec(
    G,
    dimensions=10,
    walk_length=5,
    num_walks=10,
    p=1,
    q=1,
    workers=4,
)

In [None]:
# Fit the embedding model from the configured Node2Vec object and keep the result.
n2v_model = n2v_obj.fit(
    window=3,
    min_count=1,
    batch_words=4,
)

In [None]:
# Show the fitted model object as this cell's output.
n2v_model

## GraphSAGE

In [None]:
def process_time(t):
    """Convert a duration string such as "30m" into a number of seconds.

    The last character of ``t`` is the unit ("s", "m", "h" or "d") and
    everything before it is parsed as an integer count of that unit.

    Raises:
        ValueError: if the part before the unit is not an integer.
        KeyError: if the unit character is not one of s/m/h/d.
    """
    # Parse the count first so a malformed number is reported before a bad unit.
    magnitude = int(t[:-1])
    unit = t[-1]
    seconds_per_unit = dict(s=1, m=60, h=3600, d=86400)
    return magnitude * seconds_per_unit[unit]

# Integer codes for the categorical node attributes. Every table maps a name to
# its position in the list it is built from, so the order of the names matters.

# Unit suffix of a duration string.
time_enum = {unit: code for code, unit in enumerate(["s", "m", "h", "d"])}

system_criticality_enum = {
    level: code
    for code, level in enumerate([
        "NOT_CRITICAL",
        "CRITICAL_CAN_CAUSE_S0_OUTAGE",
        "CRITICAL_SIGNIFICANT_RUN_RATE",
        "CRITICAL_OTHER",
    ])
}

env_enum = {
    env_name: code
    for code, env_name in enumerate([
        "DEVELOPMENT_ENV",
        "PERSONAL_ENV",
        "PRODUCTION_ENV",
        "STAGING_ENV",
        "TESTING_ENV",
        "UNKNOWN_ENV",
    ])
}

impact_enum = {
    impact: code
    for code, impact in enumerate([
        "DOWN",
        "SEVERELY_DEGRADED",
        "DEGRADED",
        "OPPORTUNITY_LOSS",
        "NONE",
    ])
}

freshness_enum = {
    freshness: code
    for code, freshness in enumerate([
        "IMMEDIATE",
        "DAY",
        "WEEK",
        "EVENTUALLY",
        "NEVER",
    ])
}

def get_dataset_features(node_attributes):
    """Build the 10-element feature list for a "dataset" node.

    Reads the "regex_grouping", "node_name", "id", "slo", "env" and
    "description" entries of ``node_attributes``.

    Returns, in order: regex_grouping length in characters, number of
    '.'-separated parts of regex_grouping, node_name length, whether str(id)
    occurs in node_name, number of whitespace-separated words in node_name,
    the SLO in seconds, the SLO unit code, the environment code, description
    length, and number of words in the description.
    """
    regex = node_attributes["regex_grouping"]
    regex_stats = [len(regex), len(regex.split('.'))]

    name = node_attributes["node_name"]
    name_len = len(name)
    has_id = str(node_attributes["id"]) in name
    name_words = len(name.split())

    # The SLO is a duration string: encode both its length in seconds and its unit.
    slo = node_attributes["slo"]
    slo_seconds = process_time(slo)
    slo_unit = time_enum[slo[-1]]

    env_code = env_enum[node_attributes["env"]]

    description = node_attributes["description"]
    desc_len = len(description)
    desc_words = len(description.split())

    return [*regex_stats, name_len, has_id, name_words,
            slo_seconds, slo_unit, env_code, desc_len, desc_words]

def get_system_features(node_attributes):
    """Build the 9-element feature list for a "system" node.

    Reads the "regex_grouping", "node_name", "id", "env", "description" and
    "system_critic" entries of ``node_attributes``.

    Returns, in order: regex_grouping length in characters, number of
    '.'-separated parts of regex_grouping, node_name length, whether str(id)
    occurs in node_name, number of words in node_name, the environment code,
    description length, the system-criticality code, and number of words in
    the description.
    """
    regex = node_attributes["regex_grouping"]
    regex_stats = [len(regex), len(regex.split('.'))]

    name = node_attributes["node_name"]
    name_len = len(name)
    has_id = str(node_attributes["id"]) in name
    name_words = len(name.split())

    env_code = env_enum[node_attributes["env"]]

    description = node_attributes["description"]
    desc_len = len(description)
    desc_words = len(description.split())

    criticality_code = system_criticality_enum[node_attributes["system_critic"]]

    # Note the output order: the criticality code sits between the two
    # description statistics.
    return [*regex_stats, name_len, has_id, name_words,
            env_code, desc_len, criticality_code, desc_words]

def get_collection_features(node_attributes):
    """Build the 3-element feature list for a collection-type node.

    Returns ``[node_name length, whether str(id) occurs in node_name,
    number of whitespace-separated words in node_name]``.
    """
    name = node_attributes["node_name"]
    name_len = len(name)
    has_id = str(node_attributes["id"]) in name
    return [name_len, has_id, len(name.split())]

def get_processing_features(node_attributes):
    """Build the 2-element feature list for a "processing" node.

    Returns ``[impact code, freshness code]``, looked up from the node's
    "impact" and "freshness" entries.
    """
    impact_code = impact_enum[node_attributes["impact"]]
    freshness_code = freshness_enum[node_attributes["freshness"]]
    return [impact_code, freshness_code]

def get_data_integrity_features(node_attributes):
    """Build the 7-element feature list for a "data_integrity" node.

    For each of the three duration entries (rec, rest, reg, in that order) the
    list holds the duration in seconds followed by its unit code; the final
    element is the raw "data_integrity_volat" value.
    """
    duration_keys = (
        "data_integrity_rec_time",
        "data_integrity_rest_time",
        "data_integrity_reg_time",
    )

    features = []
    for key in duration_keys:
        duration = node_attributes[key]
        features.append(process_time(duration))
        features.append(time_enum[duration[-1]])

    features.append(node_attributes["data_integrity_volat"])
    return features

In [None]:
# Names of the node types handled in this notebook.
node_types = [
    "collection",
    "dataset_collection",
    "system_collection",
    "dataset",
    "system",
    "data_integrity",
    "processing",
]

In [None]:
# Feature extractor for each node type that has its own one. Any other type
# falls back to get_collection_features.
feature_extractors = {
    "dataset": get_dataset_features,
    "system": get_system_features,
    "data_integrity": get_data_integrity_features,
    "processing": get_processing_features,
}

# (node id, {"label": node type, "feature": feature list}) pairs, one per node of G.
stellar_nodes = []

for node, node_attributes in G.nodes(data=True):
    extract = feature_extractors.get(node_attributes["type"], get_collection_features)
    extracted_features = extract(node_attributes)

    stellar_node_attributes = {
        "label": node_attributes["type"],
        "feature": extracted_features,
    }
    stellar_nodes.append((node, stellar_node_attributes))

In [None]:
# Start an empty directed graph and add every (node, attribute-dict) pair
# collected in stellar_nodes.
preprocessed_graph = nx.DiGraph()
preprocessed_graph.add_nodes_from(stellar_nodes)

In [None]:
import time

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system clock changes.
start = time.perf_counter()
# Copy every edge of G into preprocessed_graph, keeping only its 'label'
# attribute. Iterating with data=True yields each edge's attribute dict
# directly, instead of building a new G.edges() view and indexing it per edge.
for source, target, edge_attributes in G.edges(data=True):
    preprocessed_graph.add_edge(source, target, label=edge_attributes['label'])
# Elapsed seconds for the copy.
print(time.perf_counter() - start)

3.434558868408203


In [None]:
from stellargraph import StellarGraph
# Convert the networkx graph to a StellarGraph, reading each node's feature
# vector from its "feature" attribute.
stellar_graph = StellarGraph.from_networkx(
    preprocessed_graph,
    node_features="feature",
)

In [None]:
# Print the graph summary: node/edge counts and, per node type, the feature
# vector length and outgoing edge types.
print(stellar_graph.info())

StellarDiGraph: Directed multigraph
 Nodes: 315887, Edges: 369510

 Node types:
  dataset: [212368]
    Features: float32 vector, length 10
    Edge types: dataset-INPUTS->processing
  processing: [53633]
    Features: float32 vector, length 2
    Edge types: processing-INPUTS->system, processing-OUTPUTS->dataset
  dataset_collection: [20028]
    Features: float32 vector, length 3
    Edge types: dataset_collection-CONTAINS->dataset, dataset_collection-HAS->data_integrity
  data_integrity: [20028]
    Features: float32 vector, length 7
    Edge types: none
  system: [8415]
    Features: float32 vector, length 9
    Edge types: system-OUTPUTS->processing
  system_collection: [1405]
    Features: float32 vector, length 3
    Edge types: system_collection-CONTAINS->system
  collection: [10]
    Features: float32 vector, length 3
    Edge types: collection-CONTAINS->dataset_collection, collection-CONTAINS->system_collection

 Edge types:
    dataset_collection-CONTAINS->dataset: [212368]
 

In [None]:
# Generators from stellargraph.mapper. Each name needs a trailing comma;
# without one after CorruptedGenerator the cell fails with a SyntaxError.
from stellargraph.mapper import (
    CorruptedGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
)
from stellargraph import StellarGraph
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE

from stellargraph import datasets
from stellargraph.utils import plot_history

import pandas as pd
from matplotlib import pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from IPython.display import display, HTML

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Model

In [None]:
# Batch generator over the StellarGraph with "dataset" as the head node type.
hinsage_generator = HinSAGENodeGenerator(
    stellar_graph,
    batch_size=1000,
    num_samples=[5],
    head_node_type="dataset",
)

# Single-layer HinSAGE (one layer of size 128, ReLU activation) fed by the
# generator above.
hinsage_model = HinSAGE(
    layer_sizes=[128],
    activations=["relu"],
    generator=hinsage_generator,
)