In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import pandas as pd

from stellargraph import StellarGraph
from stellargraph.data import UnsupervisedSampler, EdgeSplitter, BiasedRandomWalk
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_classification

2022-02-05 13:43:12.583036: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-05 13:43:12.593160: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-05 13:43:12.593383: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-05 13:43:12.594293: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

# Load the data

In [3]:
from shared.schema import DatasetSchema, GraphSchema
from shared.graph.loading import pd_from_entity_schema


In [4]:
# Load the dataset schema registered under the name 'star-wars'.
DATASET = DatasetSchema.load_schema('star-wars')
# Derive the graph schema from it; its `.nodes` and `.edges` mappings are what
# the loading cell iterates over.
schema = GraphSchema.from_dataset(DATASET)

In [5]:
# Options forwarded unchanged to `pd_from_entity_schema` for every node and edge type.
explicit_label = False
explicit_timestamp = True
unix_timestamp = True
prefix_id = None


def include_properties(cs):
    """Return the property columns to load: every `feat_*` column plus `name`."""
    return [c for c in cs if c.startswith('feat_') or c == 'name']


def _load_entity_frame(entity_schema):
    """Load one node/edge entity as a DataFrame using the shared options above."""
    return pd_from_entity_schema(
        entity_schema,
        explicit_label=explicit_label,
        explicit_timestamp=explicit_timestamp,
        include_properties=include_properties,
        unix_timestamp=unix_timestamp,
        prefix_id=prefix_id,
    )


# One DataFrame per node type, indexed (and sorted) by node id, without the `type` column.
nodes_dfs = {
    label: (
        _load_entity_frame(entity_schema)
        .set_index('id')
        .drop(columns=['type'])
        .sort_index()
    )
    for label, entity_schema in schema.nodes.items()
}

# One DataFrame per edge type, with duplicate (src, dst, timestamp) rows removed.
edges_dfs = {
    label: (
        _load_entity_frame(entity_schema)
        .reset_index()
        .drop(columns=['type'])
        .drop_duplicates(subset=['src', 'dst', 'timestamp'])
        # `drop_duplicates` leaves gaps in the index. Re-number to 0..n-1 so the
        # per-type offset applied below really yields disjoint edge-id ranges.
        .reset_index(drop=True)
    )
    for label, entity_schema in schema.edges.items()
}

# Shift each edge frame's index by the number of edges seen so far, so that
# edge ids are unique across all edge types.
cursor = 0
for df in edges_dfs.values():
    df.index += cursor
    cursor += len(df)

In [6]:
# Build the full graph. The `name_` column is removed from every node frame
# before it is handed to StellarGraph; all remaining columns are passed through.
graph = StellarGraph(
    nodes=dict(
        (node_type, frame.drop(columns=['name_']))
        for node_type, frame in nodes_dfs.items()
    ),
    edges=edges_dfs,
    source_column='src',
    target_column='dst',
)
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 113, Edges: 2078

 Node types:
  Character: [113]
    Features: float32 vector, length 32
    Edge types: Character-INTERACTIONS->Character, Character-MENTIONS->Character

 Edge types:
    Character-MENTIONS->Character: [1120]
        Weights: all 1 (default)
        Features: float32 vector, length 2
    Character-INTERACTIONS->Character: [958]
        Weights: all 1 (default)
        Features: float32 vector, length 2


# Split dataset

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out a fraction p=0.05 of the edges as positive test examples, together
# with sampled negative (non-edge) examples. `graph_sub_test` is the reduced
# graph returned by the splitter.
# A fixed seed keeps the split identical across kernel restarts.
edge_splitter_test = EdgeSplitter(graph)
graph_sub_test, examples_test, labels_test = edge_splitter_test.train_test_split(
    p=0.05, method="global", seed=42
)

print(graph_sub_test.info())

** Sampled 103 positive and 103 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 113, Edges: 1975

 Node types:
  Character: [113]
    Features: float32 vector, length 32
    Edge types: Character-INTERACTIONS->Character, Character-MENTIONS->Character

 Edge types:
    Character-MENTIONS->Character: [1067]
        Weights: all 1 (default)
        Features: none
    Character-INTERACTIONS->Character: [908]
        Weights: all 1 (default)
        Features: none


In [9]:
# Share of the sampled link examples used for training vs. validation.
train_size = 0.75
val_size = 0.25

# Sample a further fraction p=0.1 of the edges of the test graph. `graph_train`
# is the reduced graph returned by the splitter; `examples` / `labels` are the
# sampled positive and negative links.
# A fixed seed keeps the split identical across kernel restarts.
edge_splitter_train = EdgeSplitter(graph_sub_test)
graph_train, examples, labels = edge_splitter_train.train_test_split(
    p=0.1, method="global", seed=42
)

# Divide the sampled links into training and validation examples. The
# `random_state` makes this shuffle repeatable as well.
(
    examples_train,
    examples_val,
    labels_train,
    labels_val,
) = train_test_split(
    examples,
    labels,
    train_size=train_size,
    test_size=val_size,
    random_state=42,
)

print(graph_train.info())

** Sampled 197 positive and 197 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 113, Edges: 1778

 Node types:
  Character: [113]
    Features: float32 vector, length 32
    Edge types: Character-INTERACTIONS->Character, Character-MENTIONS->Character

 Edge types:
    Character-MENTIONS->Character: [959]
        Weights: all 1 (default)
        Features: none
    Character-INTERACTIONS->Character: [819]
        Weights: all 1 (default)
        Features: none


In [10]:
# Overview of the three example sets: size, the graph the examples are hidden
# from, the graph they were picked from, and what each set is used for.
pd.DataFrame(
    {
        "Split": ["Training Set", "Validation Set", "Test set"],
        "Number of Examples": [
            len(examples_train),
            len(examples_val),
            len(examples_test),
        ],
        "Hidden from": ["Train Graph", "Train Graph", "Test Graph"],
        "Picked from": ["Test Graph", "Test Graph", "Full Graph"],
        "Use": [
            "Train the Link Classifier",
            "Validate the Link Classifier",
            "Evaluate Link Classifier",
        ],
    }
).set_index("Split")


Unnamed: 0_level_0,Number of Examples,Hidden from,Picked from,Use
Split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Training Set,295,Train Graph,Test Graph,Train the Link Classifier
Validation Set,99,Train Graph,Test Graph,Validate the Link Classifier
Test set,206,Test Graph,Full Graph,Evaluate Link Classifier


# Train HinSAGE

In [11]:
from tensorflow import keras

In [12]:
# Training schedule: mini-batch size and maximum number of epochs.
batch_size, epochs = 30, 60
# Two-level dimension setting (not referenced by the other visible cells).
dimensions = [128] * 2
# Sample sizes handed to the HinSAGE link generator, one entry per layer.
num_samples = [10, 5]
# Random-walk settings for the unsupervised sampler: walk length and walks per node.
walk_length, walk_number = 5, 1

In [13]:
# Random walker over the training graph, configured from the walk settings above.
random_walker = BiasedRandomWalk(
    graph_train, n=walk_number, length=walk_length, p=1, q=1
)

# Sampler that starts from every node of the training graph and uses the walker
# above to produce training examples.
unsupervised_samples = UnsupervisedSampler(
    graph_train,
    nodes=list(graph_train.nodes()),
    walker=random_walker,
)

# Link generator over the training graph; both link endpoints are "Character" nodes.
generator = HinSAGELinkGenerator(
    graph_train,
    batch_size,
    num_samples,
    head_node_types=["Character", "Character"],
)

In [14]:
# One HinSAGE layer size per entry in `num_samples`.
hinsage_layer_sizes = [32] * 2
assert len(hinsage_layer_sizes) == len(num_samples)

hinsage = HinSAGE(
    layer_sizes=hinsage_layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
)

# Input and output tensors of the HinSAGE stack.
x_inp, x_out = hinsage.in_out_tensors()

# Link head: combines the node embeddings with the 'ip' method and emits a
# single sigmoid output.
link_head = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)
prediction = link_head(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [15]:
# End-to-end link prediction model: HinSAGE inputs -> link probability.
model = keras.Model(inputs=x_inp, outputs=prediction)

adam = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=adam,
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [16]:
# Print the layer-by-layer architecture of the compiled link prediction model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 10, 32)]     0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 10, 32)]     0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 50, 32)]     0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 50, 32)]     0                                            
______________________________________________________________________________________________

In [17]:
# Training batches come from the unsupervised sampler; validation uses the
# labelled validation links.
train_flow = generator.flow(unsupervised_samples)
val_flow = generator.flow(examples_val, labels_val)

# Stop once `val_loss` has not improved for 10 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)

model.fit(
    train_flow,
    validation_data=val_flow,
    epochs=epochs,
    verbose=2,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
    callbacks=[early_stopping],
)

2022-02-05 13:43:14.763673: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/60


2022-02-05 13:43:17.861946: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


31/31 - 6s - loss: 0.7255 - binary_accuracy: 0.4989 - val_loss: 0.7218 - val_binary_accuracy: 0.5152
Epoch 2/60
31/31 - 2s - loss: 0.7198 - binary_accuracy: 0.4923 - val_loss: 0.7246 - val_binary_accuracy: 0.5051
Epoch 3/60
31/31 - 2s - loss: 0.7091 - binary_accuracy: 0.5066 - val_loss: 0.7167 - val_binary_accuracy: 0.5051
Epoch 4/60
31/31 - 2s - loss: 0.7030 - binary_accuracy: 0.5122 - val_loss: 0.7113 - val_binary_accuracy: 0.5051
Epoch 5/60
31/31 - 2s - loss: 0.6980 - binary_accuracy: 0.5277 - val_loss: 0.7115 - val_binary_accuracy: 0.5253
Epoch 6/60
31/31 - 2s - loss: 0.6967 - binary_accuracy: 0.5321 - val_loss: 0.7042 - val_binary_accuracy: 0.5859
Epoch 7/60
31/31 - 2s - loss: 0.6872 - binary_accuracy: 0.5498 - val_loss: 0.7078 - val_binary_accuracy: 0.5455
Epoch 8/60
31/31 - 2s - loss: 0.6983 - binary_accuracy: 0.5332 - val_loss: 0.7069 - val_binary_accuracy: 0.5051
Epoch 9/60
31/31 - 2s - loss: 0.6979 - binary_accuracy: 0.5354 - val_loss: 0.7050 - val_binary_accuracy: 0.5253
Epo

<keras.callbacks.History at 0x7f7a18440d30>

# Extract and cluster the embeddings

In [18]:
# Model over the same HinSAGE input tensors, but exposing the HinSAGE outputs
# (`x_out`) instead of the link prediction.
embedding_model = keras.Model(inputs=x_inp, outputs=x_out)

In [19]:
graph_node_list = graph.nodes()

# The generator works on links, so every node is paired with itself; the first
# of the returned prediction outputs is kept as the node embedding matrix.
self_links = [(node_id, node_id) for node_id in graph_node_list]
node_embeddings = embedding_model.predict(generator.flow(self_links))[0]

In [20]:
from sklearn.cluster import KMeans

# Cluster the node embeddings into 5 groups; the fixed random_state keeps the
# clustering repeatable for identical embeddings.
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans = kmeans.fit(node_embeddings)

In [21]:
from shared.constants import BENCHMARKS_RESULTS

# Output directory for this run's artefacts; created (with parents) if missing.
save_path = BENCHMARKS_RESULTS.joinpath('analysis').joinpath(
    'hinsage-linkprediction-kmeans'
)
save_path.mkdir(exist_ok=True, parents=True)

In [32]:
from shared.graph import CommunityAssignment

# Map every node id to its k-means cluster label; the values are named "cid".
labeling = pd.Series(kmeans.labels_, index=graph_node_list, name="cid")
# Name the index "nid".
# NOTE(review): if `graph_node_list` is a pandas Index, the Series may reuse that
# very object, in which case this also renames `graph_node_list` - confirm.
labeling.index.name = 'nid'
# Wrap the labelling in the project's community-assignment type.
comlist = CommunityAssignment(labeling)

In [33]:
# Persist the community assignment as `schema.comlist` inside `save_path`.
comlist.save_comlist(save_path.joinpath('schema.comlist'))

In [None]:
from datasets.scripts import export_to_visualization

# Export the 'base' version of the 'star-wars' dataset for visualization,
# passing the results directory of this run as the only run path.
export_args = export_to_visualization.Args(
    dataset='star-wars',
    version='base',
    run_paths=[str(save_path)],
)
export_to_visualization.run(export_args)

[2022-02-05 13:47:46,308][export_to_visualization.py][INFO] Processing /data/pella/projects/University/Thesis/Thesis/code/storage/datasets/processed/star-wars/schema.yaml
[2022-02-05 13:47:46,372][export_to_visualization.py][INFO] Adding prediction /data/pella/projects/University/Thesis/Thesis/code/storage/results/analysis/hinsage-linkprediction-kmeans/schema.comlist as prediction_0
[2022-02-05 13:47:46,376][export_to_visualization.py][INFO] Writing /data/pella/projects/University/Thesis/Thesis/code/storage/datasets/export/star-wars/versions/base/schema.graphml


# Calculate Evaluation Metrics

In [38]:
from shared.graph import DataGraph
from benchmarks.evaluation import get_metric_list

[2022-02-05 13:57:37,740][git.cmd][DEBUG] Popen(['git', 'version'], cwd=/data/pella/projects/University/Thesis/Thesis/code/experiments/notebooks, universal_newlines=False, shell=None, istream=None)
[2022-02-05 13:57:37,816][git.cmd][DEBUG] Popen(['git', 'version'], cwd=/data/pella/projects/University/Thesis/Thesis/code/experiments/notebooks, universal_newlines=False, shell=None, istream=None)


In [35]:
# Build the project's DataGraph from the graph schema; it is the graph argument
# passed to each metric's `calculate` call.
G = DataGraph.from_schema(schema)

In [40]:
# Metric classes requested with ground_truth=False and overlapping=False.
metrics = get_metric_list(ground_truth=False, overlapping=False)

# Evaluate every metric on the clustering and collect one row per metric.
metric_rows = []
for metric_cls in metrics:
    metric_rows.append(
        {
            'metric': metric_cls.metric_name(),
            'value': metric_cls.calculate(G, comlist),
        }
    )

results = pd.DataFrame(metric_rows)
results