In [1]:
# Evaluate the kernel-provided `spark` handle so its repr is displayed; per the
# output below, running this cell is also what starts the Spark application.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
59,application_1598227897403_0022,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f042e9134a8>

In [2]:
from maggy import Searchspace
from hops import featurestore

In [3]:
# Fetch the two feature groups from the feature store as pandas DataFrames:
# `edges` comes from "transaction_features", `nodes` from "node_features".
edges = featurestore.get_featuregroup(
    "transaction_features",
    dataframe_type="pandas",
)
nodes = featurestore.get_featuregroup(
    "node_features",
    dataframe_type="pandas",
)


Running sql: use paysim_featurestore against offline feature store
SQL string for the query created successfully
Running sql: SELECT * FROM transaction_features_1 against offline feature store
Running sql: use paysim_featurestore against offline feature store
SQL string for the query created successfully
Running sql: SELECT * FROM node_features_1 against offline feature store

In [4]:
# Hyperparameter search space for the embedding experiment.
# Per the Maggy Searchspace documentation, an 'INTEGER' feasible region is a
# [lower, upper] pair of inclusive bounds rather than a list of choices, so
# walk_length is written as [2, 4] to cover 2, 3 and 4 (a third list element
# would not be used as a bound).
# NOTE(review): by the same rule walk_number spans every integer in 100..200
# and emb_size every integer in 64..128 -- confirm that is intended; use
# 'DISCRETE' if only the listed endpoints should be tried.
sp = Searchspace(
    walk_number=('INTEGER', [100, 200]),
    walk_length=('INTEGER', [2, 4]),
    emb_size=('INTEGER', [64, 128]),
)

Hyperparameter added: walk_number
Hyperparameter added: walk_length
Hyperparameter added: emb_size

In [5]:
def embeddings_computer(walk_number, walk_length, emb_size, edges = edges, nodes = nodes):
    """Train a Node2Vec link-prediction model on the graph and return its accuracy.

    A directed StellarGraph is built from ``nodes`` and ``edges``, biased random
    walks are sampled from it, and a Keras model made of a Node2Vec layer plus a
    dot-product link classifier is fitted on those samples.

    Args:
        walk_number: Number of random walks started from each node
            (passed to ``BiasedRandomWalk`` as ``n``).
        walk_length: Maximum length of each random walk
            (passed to ``BiasedRandomWalk`` as ``length``).
        emb_size: Dimension of the learned node embeddings.
        edges: pandas DataFrame of edges; must contain an ``action`` column,
            which is used as the edge type. Presumably it also carries the
            ``source``/``target`` columns StellarGraph expects by default --
            TODO confirm against the feature group schema.
        nodes: pandas DataFrame of nodes with at least ``id`` and ``type``
            columns.

    Returns:
        float: ``binary_accuracy`` recorded for the last training epoch.
    """
    # Imports are kept local to the function; presumably so that they are
    # resolved on the worker that runs the trial -- verify against the Maggy
    # setup. Only the names that are actually used are imported.
    from stellargraph import StellarDiGraph
    from stellargraph.data import BiasedRandomWalk, UnsupervisedSampler
    from stellargraph.layer import Node2Vec, link_classification
    from stellargraph.mapper import Node2VecLinkGenerator

    from tensorflow import keras

    batch_size = 32
    epochs = 10

    # Index the 'type' feature by node id. Building this with
    # pd.DataFrame(nodes['type'], index=nodes['id']) would re-index the Series
    # by label (yielding NaN wherever an id is not already an index label)
    # instead of relabelling the rows, so set_index is used to keep every
    # 'type' value attached to its own id.
    # NOTE(review): StellarGraph needs numeric node features -- confirm that
    # 'type' is already numerically encoded in the feature group.
    node_data = nodes.set_index('id')[['type']]

    G = StellarDiGraph(
        node_data,
        edges=edges,
        edge_type_column="action",
    )

    # Distinct name so the `nodes` argument is not shadowed by the id list.
    graph_nodes = list(G.nodes())

    walker = BiasedRandomWalk(
        G,
        n=walk_number,
        length=walk_length,
        p=0.5,  # defines probability, 1/p, of returning to source node
        q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
    )
    unsupervised_samples = UnsupervisedSampler(G, nodes=graph_nodes, walker=walker)

    generator = Node2VecLinkGenerator(G, batch_size)

    node2vec = Node2Vec(emb_size, generator=generator)

    x_inp, x_out = node2vec.in_out_tensors()

    # Combine the two node embeddings of each sampled pair with a dot product
    # and squash the score to a probability.
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="dot"
    )(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    history = model.fit(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=False,
        workers=4,
        shuffle=True,
    )

    # Return a plain Python float so the metric is always a simple number,
    # whatever numeric type Keras stored in the history.
    binary_accuracy = float(history.history['binary_accuracy'][-1])
    return binary_accuracy



In [None]:
from maggy import experiment

# Launch the Maggy experiment: random search over `sp`, 10 trials, maximising
# the value returned by `embeddings_computer`. The outcome is kept in `result`.
result = experiment.lagom(
    embeddings_computer,
    searchspace=sp,
    optimizer='randomsearch',
    direction='max',
    num_trials=10,
    name='EMBEDDINGS',
    hb_interval=5,
    es_interval=5,
    es_min=5,
)

HBox(children=(FloatProgress(value=0.0, description='Maggy experiment', max=10.0, style=ProgressStyle(descript…

0: link_classification: using 'dot' method to combine node embeddings into edge embeddings
0: 
0: Epoch 1/10
0: 
0: 
0: Epoch 2/10
0: Epoch 3/10
0: Epoch 4/10
0: Epoch 5/10
0: Epoch 6/10
0: Epoch 7/10
0: Epoch 8/10
0: Epoch 7/10


In [None]:
import json
from hops import hdfs

# Serialise the best hyperparameters found by the experiment to JSON and write
# them to the "Resources/" directory via hops.hdfs.
EMBEDDINGS_HYPERPARAMS_FILE = 'embeddings_best_hp.json'
hdfs.dump(
    json.dumps(result['best_hp']),
    "Resources/{}".format(EMBEDDINGS_HYPERPARAMS_FILE),
)