In [1]:
# Evaluating `spark` makes the notebook start its Spark application; the recorded
# output below shows the application being started and the SparkSession object.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
64,application_1598227897403_0028,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fe077a8d4a8>

In [2]:
from hops import featurestore

In [3]:
def embeddings_computer(walk_number, walk_length, emb_size):
    """Train Node2Vec node embeddings on the transaction graph and export the model.

    The function reads the latest versions of the ``transaction_features_csv``
    (edges) and ``node_features_csv`` (nodes) training datasets, builds a
    directed StellarGraph, trains a Node2Vec link-classification model on
    biased random walks, and then:

    * writes one embedding vector per node to ``embeddings_features.csv`` in
      the working directory and copies that file to the ``Resources`` dataset;
    * saves the trained Keras model under ``<cwd>/model-<uuid4>`` and exports it
      with ``hops_model.export`` under the name ``graph_embeddings``.

    The parameter names must stay as they are: the function is passed to
    ``experiment.launch`` together with a dict keyed by these names.

    Args:
        walk_number: Passed as ``n`` to ``BiasedRandomWalk``.
        walk_length: Passed as ``length`` to ``BiasedRandomWalk``.
        emb_size: Embedding size passed to ``Node2Vec``.

    Returns:
        dict: ``{'accuracy': <binary accuracy of the last training epoch>}``.

    Raises:
        FileNotFoundError: If a training dataset directory contains no path
            with ``.csv`` in its name.
    """
    # Imports are local to the function, presumably so that it is self-contained
    # when executed by ``experiment.launch`` — TODO confirm.
    import os
    import uuid

    import pandas as pd

    from hops import hdfs, featurestore
    from hops import pandas_helper as pandas
    from hops import model as hops_model

    from stellargraph import StellarDiGraph
    from stellargraph.data import BiasedRandomWalk, UnsupervisedSampler
    from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
    from stellargraph.layer import Node2Vec, link_classification

    import tensorflow as tf
    from tensorflow import keras

    def _read_training_dataset(dataset_name):
        """Read the first CSV found in the latest version of a training dataset."""
        dataset_path = featurestore.get_training_dataset_path(
            dataset_name,
            featurestore=featurestore.project_featurestore(),
            training_dataset_version=featurestore.get_latest_training_dataset_version(dataset_name),
        )
        csv_paths = [path for path in hdfs.ls(dataset_path) if ".csv" in path]
        if not csv_paths:
            raise FileNotFoundError(
                "No .csv file found in training dataset '{}' at {}".format(dataset_name, dataset_path)
            )
        return pandas.read_csv(hdfs.get_plain_path(csv_paths[0]))

    edges = _read_training_dataset("transaction_features_csv")
    nodes_df = _read_training_dataset("node_features_csv")

    # Fixed training settings (not part of the tuned hyperparameters).
    batch_size = 32
    epochs = 20
    embeddings_file = 'embeddings_features.csv'

    # NOTE(review): pd.DataFrame(series, index=...) aligns the Series to the given
    # index instead of relabelling its rows, so 'type' is only carried over where
    # the ids coincide with the frame's existing row labels. Confirm this is
    # intended; otherwise consider nodes_df.set_index('id')[['type']].
    node_data = pd.DataFrame(nodes_df['type'], index=nodes_df['id'])

    G = StellarDiGraph(node_data, edges=edges, edge_type_column="action")

    # Keep the node ids in one list so the embedding rows can be labelled with
    # the same ordering that is fed to the node generator below.
    node_ids = list(G.nodes())

    walker = BiasedRandomWalk(
        G,
        n=walk_number,
        length=walk_length,
        p=0.5,  # defines probability, 1/p, of returning to source node
        q=2.0,  # defines probability, 1/q, for moving to a node away from the source node
    )
    unsupervised_samples = UnsupervisedSampler(G, nodes=list(node_ids), walker=walker)

    generator = Node2VecLinkGenerator(G, batch_size)
    node2vec = Node2Vec(emb_size, generator=generator)
    x_inp, x_out = node2vec.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="dot"
    )(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        # `learning_rate` replaces the deprecated `lr` alias.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    history = model.fit(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=False,
        workers=4,
        shuffle=True,
    )

    # The embedding model maps the first model input to the first Node2Vec output.
    embedding_model = keras.Model(inputs=x_inp[0], outputs=x_out[0])
    node_gen = Node2VecNodeGenerator(G, batch_size).flow(node_ids)
    node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)

    # Persist the embeddings locally, then publish them to the Resources dataset.
    embeddings_features = pd.DataFrame(node_embeddings, index=node_ids)
    embeddings_features.to_csv(embeddings_file, index=True)
    hdfs.copy_to_hdfs(embeddings_file, "Resources", overwrite=True)

    # Report the training accuracy of the final epoch.
    metrics = {'accuracy': history.history['binary_accuracy'][-1]}

    # save to the model registry
    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    print('Exporting trained model to: {}'.format(export_path))
    tf.saved_model.save(model, export_path)
    print('Done exporting!')

    hops_model.export(export_path, 'graph_embeddings', metrics=metrics)

    return metrics

In [4]:
import json

from hops import experiment
from hops import hdfs

# Read the stored hyperparameters (a JSON object) from the Resources dataset.
best_hyperparams_path = "Resources/embeddings_best_hp.json"
best_hyperparams = json.loads(hdfs.load(best_hyperparams_path))

# Wrap every hyperparameter value in a one-element list, keyed by its name,
# which is the shape handed to experiment.launch below.
args_dict = {key: [value] for key, value in best_hyperparams.items()}

experiment.launch(embeddings_computer, args_dict, name='graph_embeddings_compute', metric_key='accuracy')

Finished Experiment 

('hdfs://rpc.namenode.service.consul:8020/Projects/paysim/Experiments/application_1598227897403_0028_1', {'accuracy': 0.8619130253791809, 'log': 'Experiments/application_1598227897403_0028_1/walk_number=200&walk_length=2&emb_size=128/output.log'})