In [1]:
# %load_ext autoreload
# %autoreload 2

In [31]:
import pandas as pd

from stellargraph import StellarGraph
from stellargraph.data import UnsupervisedSampler, EdgeSplitter, BiasedRandomWalk
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_classification

# Load the data

In [3]:
from shared.schema import DatasetSchema, GraphSchema
from shared.graph.loading import pd_from_entity_schema


In [4]:
# Load the 'star-wars' dataset schema, then build the graph schema from it.
dataset_name = 'star-wars'
DATASET = DatasetSchema.load_schema(dataset_name)
schema = GraphSchema.from_dataset(DATASET)

In [110]:
# Options forwarded unchanged to every pd_from_entity_schema call in this cell.
explicit_label = False
explicit_timestamp = True
unix_timestamp = True
prefix_id = None


def include_properties(cs):
    """Return the column names in `cs` that are 'name' or start with 'feat_'."""
    return [c for c in cs if c.startswith('feat_') or c == 'name']


def _load_entity_frame(entity_schema):
    """Load one entity schema into a DataFrame using the cell-level options above."""
    return pd_from_entity_schema(
        entity_schema,
        explicit_label=explicit_label,
        explicit_timestamp=explicit_timestamp,
        include_properties=include_properties,
        unix_timestamp=unix_timestamp,
        prefix_id=prefix_id,
    )


# One frame per node type, indexed (and sorted) by node id, without the 'type' column.
nodes_dfs = {
    label: _load_entity_frame(entity_schema)
    .set_index('id')
    .drop(columns=['type'])
    .sort_index()
    for label, entity_schema in schema.nodes.items()
}

# One frame per edge type, de-duplicated on (src, dst, timestamp).
edges_dfs = {
    label: (
        _load_entity_frame(entity_schema)
        .reset_index()
        .drop(columns=['type'])
        .drop_duplicates(subset=['src', 'dst', 'timestamp'])
        # drop_duplicates keeps the surviving rows' old labels, leaving gaps.
        # Renumber to 0..len-1 so the offset loop below, which advances by
        # len(df), cannot hand the same id to two different edge types.
        .reset_index(drop=True)
    )
    for label, entity_schema in schema.edges.items()
}

# Shift each edge frame's index so edge ids are unique across all edge types.
cursor = 0
for df in edges_dfs.values():
    df.index += cursor
    cursor += len(df)

In [112]:
# The 'name' column is dropped from every node frame before building the graph,
# leaving only the remaining columns as node data.
node_frames = {}
for node_type, frame in nodes_dfs.items():
    node_frames[node_type] = frame.drop(columns=['name'])

graph = StellarGraph(
    nodes=node_frames,
    edges=edges_dfs,
    source_column='src',
    target_column='dst',
)
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 113, Edges: 2078

 Node types:
  Character: [113]
    Features: float32 vector, length 32
    Edge types: Character-INTERACTIONS->Character, Character-MENTIONS->Character

 Edge types:
    Character-MENTIONS->Character: [1120]
        Weights: all 1 (default)
        Features: float32 vector, length 2
    Character-INTERACTIONS->Character: [958]
        Weights: all 1 (default)
        Features: float32 vector, length 2


# Split dataset

In [113]:
from sklearn.model_selection import train_test_split

In [114]:
# Hold out a test set of link examples sampled from the full graph (p=0.05).
# `graph_sub_test` is the reduced graph returned by the splitter; the examples
# and their labels are kept for the final evaluation.
# The seed is pinned so the held-out edges are the same on every
# Restart & Run All (consistent with the fixed random_state used for KMeans).
edge_splitter_test = EdgeSplitter(graph)
graph_sub_test, examples_test, labels_test = edge_splitter_test.train_test_split(
    p=0.05, method="global", seed=0
)

print(graph_sub_test.info())

** Sampled 103 positive and 103 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 113, Edges: 1975

 Node types:
  Character: [113]
    Features: float32 vector, length 32
    Edge types: Character-INTERACTIONS->Character, Character-MENTIONS->Character

 Edge types:
    Character-MENTIONS->Character: [1069]
        Weights: all 1 (default)
        Features: none
    Character-INTERACTIONS->Character: [906]
        Weights: all 1 (default)
        Features: none


In [115]:
# Fractions of the sampled link examples used for training and validation.
train_size = 0.75
val_size = 0.25

# Sample a second set of link examples (p=0.1) from the already reduced test
# graph; `graph_train` is the graph returned by this second split.
# Seeded so the split is reproducible across kernel restarts.
edge_splitter_train = EdgeSplitter(graph_sub_test)
graph_train, examples, labels = edge_splitter_train.train_test_split(
    p=0.1, method="global", seed=0
)

# Partition the sampled examples into train / validation subsets.
# random_state is pinned for the same reproducibility reason as above.
(
    examples_train,
    examples_val,
    labels_train,
    labels_val,
) = train_test_split(
    examples,
    labels,
    train_size=train_size,
    test_size=val_size,
    random_state=0,
)

print(graph_train.info())

** Sampled 197 positive and 197 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 113, Edges: 1778

 Node types:
  Character: [113]
    Features: float32 vector, length 32
    Edge types: Character-INTERACTIONS->Character, Character-MENTIONS->Character

 Edge types:
    Character-MENTIONS->Character: [964]
        Weights: all 1 (default)
        Features: none
    Character-INTERACTIONS->Character: [814]
        Weights: all 1 (default)
        Features: none


In [116]:
# Overview of the three example sets: how many examples each holds, which graph
# they were sampled from, which graph they are hidden from, and their purpose.
summary_columns = ("Split", "Number of Examples", "Hidden from", "Picked from", "Use")
summary_rows = [
    ("Training Set", len(examples_train), "Train Graph", "Test Graph", "Train the Link Classifier"),
    ("Validation Set", len(examples_val), "Train Graph", "Test Graph", "Validate the Link Classifier"),
    ("Test set", len(examples_test), "Test Graph", "Full Graph", "Evaluate Link Classifier"),
]
pd.DataFrame(summary_rows, columns=summary_columns).set_index("Split")


Unnamed: 0_level_0,Number of Examples,Hidden from,Picked from,Use
Split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Training Set,295,Train Graph,Test Graph,Train the Link Classifier
Validation Set,99,Train Graph,Test Graph,Validate the Link Classifier
Test set,206,Test Graph,Full Graph,Evaluate Link Classifier


# Train HinSAGE

In [117]:
from tensorflow import keras

In [124]:
# Training loop settings: batch size for the link generator, max epochs for fit.
batch_size, epochs = 30, 60

# Neighbour sample sizes passed to HinSAGELinkGenerator; the HinSAGE cell asserts
# there is one entry per layer.
num_samples = [10, 5]

# Random-walk settings passed to BiasedRandomWalk (length and number of walks).
walk_length, walk_number = 5, 1

# NOTE(review): `dimensions` is not referenced by any visible cell; the layer
# sizes actually used are set separately in `hinsage_layer_sizes`.
dimensions = [128, 128]

In [125]:
# Unsupervised sampler over every node of the training graph, driven by a
# biased random walk with p = q = 1.
walk_start_nodes = list(graph_train.nodes())
random_walker = BiasedRandomWalk(
    graph_train, n=walk_number, length=walk_length, p=1, q=1
)
unsupervised_samples = UnsupervisedSampler(
    graph_train,
    nodes=walk_start_nodes,
    walker=random_walker,
)

# Link generator over the training graph; both link endpoints are 'Character' nodes.
generator = HinSAGELinkGenerator(
    graph_train,
    batch_size,
    num_samples,
    head_node_types=["Character", "Character"],
)

In [126]:
# One output size per HinSAGE layer; must line up with the per-layer sample sizes.
hinsage_layer_sizes = [32, 32]
assert len(hinsage_layer_sizes) == len(num_samples)

hinsage = HinSAGE(
    layer_sizes=hinsage_layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
)
x_inp, x_out = hinsage.in_out_tensors()

# Link head: combines the node embeddings with the 'ip' method and applies a
# sigmoid to a single output unit.
link_head = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="ip",
)
prediction = link_head(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [127]:
# End-to-end link classifier: HinSAGE inputs -> link prediction output.
model = keras.Model(inputs=x_inp, outputs=prediction)

link_optimizer = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=link_optimizer,
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [128]:
# Print the layer-by-layer overview of the compiled link classifier.
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_101 (InputLayer)          [(None, 10, 32)]     0                                            
__________________________________________________________________________________________________
input_102 (InputLayer)          [(None, 10, 32)]     0                                            
__________________________________________________________________________________________________
input_105 (InputLayer)          [(None, 50, 32)]     0                                            
__________________________________________________________________________________________________
input_106 (InputLayer)          [(None, 50, 32)]     0                                            
___________________________________________________________________________________________

In [129]:
# NOTE(review): training batches come from the unsupervised random-walk sampler,
# while validation uses the labelled link examples; `examples_train` and
# `labels_train` are not passed to fit here - confirm this is intended.
train_flow = generator.flow(unsupervised_samples)
val_flow = generator.flow(examples_val, labels_val)

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

model.fit(
    train_flow,
    validation_data=val_flow,
    epochs=epochs,
    verbose=2,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
    callbacks=[early_stopping],
)

Epoch 1/60
31/31 - 5s - loss: 0.7388 - binary_accuracy: 0.4945 - val_loss: 0.7798 - val_binary_accuracy: 0.4242
Epoch 2/60
31/31 - 2s - loss: 0.7429 - binary_accuracy: 0.4978 - val_loss: 0.7719 - val_binary_accuracy: 0.4242
Epoch 3/60
31/31 - 2s - loss: 0.7348 - binary_accuracy: 0.5000 - val_loss: 0.7568 - val_binary_accuracy: 0.4141
Epoch 4/60
31/31 - 2s - loss: 0.7206 - binary_accuracy: 0.5077 - val_loss: 0.7479 - val_binary_accuracy: 0.4141
Epoch 5/60
31/31 - 2s - loss: 0.7188 - binary_accuracy: 0.5022 - val_loss: 0.7412 - val_binary_accuracy: 0.4545
Epoch 6/60
31/31 - 2s - loss: 0.7062 - binary_accuracy: 0.5144 - val_loss: 0.7385 - val_binary_accuracy: 0.4141
Epoch 7/60
31/31 - 2s - loss: 0.7046 - binary_accuracy: 0.5310 - val_loss: 0.7215 - val_binary_accuracy: 0.4646
Epoch 8/60
31/31 - 2s - loss: 0.6994 - binary_accuracy: 0.5332 - val_loss: 0.7120 - val_binary_accuracy: 0.4646
Epoch 9/60
31/31 - 2s - loss: 0.6986 - binary_accuracy: 0.5144 - val_loss: 0.7241 - val_binary_accuracy:

<keras.callbacks.History at 0x7f5c2c381640>

# Extract and cluster the embeddings

In [130]:
# Second model over the same HinSAGE input tensors that outputs `x_out` (the
# tensors the link prediction head was applied to) instead of the prediction.
embedding_model = keras.Model(inputs=x_inp, outputs=x_out)

In [131]:
graph_node_list = graph.nodes()

# The generator yields batches for (source, target) pairs, so every node of the
# full graph is paired with itself to obtain one row per node.
self_pairs = list(zip(graph_node_list, graph_node_list))
pair_embeddings = embedding_model.predict(generator.flow(self_pairs))

# Keep the first of the predicted outputs as the per-node embedding matrix.
node_embeddings = pair_embeddings[0]

In [132]:
from sklearn.cluster import KMeans

# Cluster the node embeddings into 5 groups; random_state is fixed for repeatability.
n_clusters = 5
clusterer = KMeans(n_clusters=n_clusters, random_state=0)
kmeans = clusterer.fit(node_embeddings)

In [133]:
from shared.constants import BENCHMARKS_RESULTS

# Output directory for this run, created (with parents) if it does not exist yet.
analysis_dir = BENCHMARKS_RESULTS.joinpath('analysis')
save_path = analysis_dir.joinpath('hinsage-linkprediction-kmeans')
save_path.mkdir(parents=True, exist_ok=True)

In [134]:
from shared.graph import CommunityAssignment

# Map every node id to its KMeans cluster label ("cid") and wrap the mapping in
# a CommunityAssignment.
cluster_ids = kmeans.labels_
labeling = pd.Series(data=cluster_ids, index=graph_node_list, name="cid")
comlist = CommunityAssignment(labeling)

In [135]:
# Write the community assignment to 'schema.comlist' inside the run directory.
comlist.save_comlist(save_path.joinpath('schema.comlist'))

In [None]:
from datasets.scripts import export_to_visualization

# Export the 'base' version of the 'star-wars' dataset together with this run's
# output directory for visualization.
export_args = export_to_visualization.Args(
    dataset='star-wars',
    version='base',
    run_paths=[str(save_path)],
)
export_to_visualization.run(export_args)