In [104]:
import yaml
import pandas as pd
from graphdatascience import GraphDataScience

In [105]:
with open('config.yaml', 'r') as file:
    PARAM = yaml.safe_load(file)


url = "bolt://localhost:7687"
gds = GraphDataScience(url, auth=(PARAM["neo4j_username"], PARAM["neo4j_password"]))

In [106]:
G, result = gds.graph.project(
    "contextGraph",                  #  Graph name
    {
    "Known": { "properties":['protection_status_one_hot', 'site_area_in_hectares', 'site_length_km', 'valuation_methods_one_hot', 'value_year', 'beneficiary_unit_one_hot', 'int__per_hectare_per_year'] },
    "Unknown": {"properties":['protection_status_one_hot', 'site_area_in_hectares', 'site_length_km', 'valuation_methods_one_hot', 'value_year', 'beneficiary_unit_one_hot']},
    "Country": {},
    "Seea": {},
    "Ecosystem": {},
    "Ecozone": {},
    "Biome": {},
    "Cices_class": {},
    "Cices_division": {},
    "Cices_group": {},
    "Cices_section": {},
    "Teeb_service": {},
    "Teeb_subservice": {},
    "Teeb_category": {},
    "Subregion": {},
    "Region": {}
  },                 #  Node projection
  {
    "IS_LOCATE_IN": {"orientation": 'UNDIRECTED'},
    "IS_AN_SEEA_OF": {"orientation": 'UNDIRECTED'},
    "IS_AN_ECOSYSTEM_OF": {"orientation": 'UNDIRECTED'},
    "ECOSYSTEM_BELONGS_TO": {"orientation": 'UNDIRECTED'},
    "IS_AN_ECOZONE_OF": {"orientation": 'UNDIRECTED'},
    "ECOZONE_BELONGS_TO": {"orientation": 'UNDIRECTED'},
    "IS_A_BIOME_OF": {"orientation": 'UNDIRECTED'},
    "BELONGS_TO_CLASS_OF_CICES": {"orientation": 'UNDIRECTED'},
    "IS_A_GROUP_OF_CICES": {"orientation": 'UNDIRECTED'},
    "IS_A_DIVISION_OF_CICES": {"orientation": 'UNDIRECTED'},
    "IS_A_CLASS_OF_CICES": {"orientation": 'UNDIRECTED'},
    "PROVIDES_SERVICE": {"orientation": 'UNDIRECTED'},
    "PROVIDES_SUBSERVICE": {"orientation": 'UNDIRECTED'},
    "COUNTRY_IS_LOCATE_IN": {"orientation": 'UNDIRECTED'},
    "SUBREGION_IS_LOCATE_IN": {"orientation": 'UNDIRECTED'},
    "IS_A_SUBSERVICE_OF": {"orientation": 'UNDIRECTED'},
    "IS_A_SERVICE_OF":  {"orientation": 'UNDIRECTED'}     
    },              #  Relationship projection
    readConcurrency=4           #  Configuration parameters
)

In [107]:
model, res = gds.beta.graphSage.train(G,
modelName="multiLabelModel",
featureProperties=['protection_status_one_hot', 'site_area_in_hectares', 'site_length_km', 'valuation_methods_one_hot', 'value_year', 'beneficiary_unit_one_hot'],
projectedFeatureDimension=8,
embeddingDimension=32,
epochs=5,
maxIterations=20
)

GraphSageTrain:   0%|          | 0/100 [00:00<?, ?%/s]

In [108]:
gds.beta.graphSage.mutate(
    G,
    mutateProperty='graph_sage_embedding',
    modelName='multiLabelModel'
)

nodePropertiesWritten                                                10284
mutateMillis                                                             0
nodeCount                                                            10284
preProcessingMillis                                                      0
computeMillis                                                          224
configuration            {'mutateProperty': 'graph_sage_embedding', 'mo...
Name: 0, dtype: object

In [109]:
gds.graph.nodeProperties.stream(
    G,
    ['graph_sage_embedding'],
    ["Known", "Unknown"]
)

Unnamed: 0,nodeId,nodeProperty,propertyValue,nodeLabels
0,0,graph_sage_embedding,"[0.12962182754561222, 0.4584301229353667, 5.68...",[]
1,1,graph_sage_embedding,"[0.00874604450310037, 0.1917266839496676, 1.54...",[]
2,2,graph_sage_embedding,"[0.011528794235358358, 0.10632605382155381, 0....",[]
3,3,graph_sage_embedding,"[0.005313134142373653, 0.18024365713093346, 2....",[]
4,4,graph_sage_embedding,"[0.011528794235358358, 0.10632605382155381, 0....",[]
...,...,...,...,...
9495,10342,graph_sage_embedding,"[0.004774445765994023, 0.3312014974529382, 2.6...",[]
9496,10343,graph_sage_embedding,"[0.01903641498178452, 0.5562542213035176, 9.92...",[]
9497,10344,graph_sage_embedding,"[0.0135571982876406, 0.5693939152247499, 6.913...",[]
9498,10345,graph_sage_embedding,"[0.011890505049983183, 0.2567768228325833, 2.2...",[]


In [110]:
pipe_with_context = gds.nr_pipe("pipe-with-context")

In [111]:
pipe_with_context.selectFeatures(["graph_sage_embedding"])

name                                            pipe-with-context
nodePropertySteps                                              []
featureProperties                          [graph_sage_embedding]
splitConfig           {'testFraction': 0.3, 'validationFolds': 3}
autoTuningConfig                                {'maxTrials': 10}
parameterSpace       {'LinearRegression': [], 'RandomForest': []}
Name: 0, dtype: object

In [112]:
pipe_with_context.addRandomForest(numberOfDecisionTrees=10)

name                                                 pipe-with-context
nodePropertySteps                                                   []
featureProperties                               [graph_sage_embedding]
splitConfig                {'testFraction': 0.3, 'validationFolds': 3}
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'LinearRegression': [], 'RandomForest': [{'ma...
Name: 0, dtype: object

In [113]:
pipe_with_context.configureAutoTuning(maxTrials=5)

name                                                 pipe-with-context
nodePropertySteps                                                   []
featureProperties                               [graph_sage_embedding]
splitConfig                {'testFraction': 0.3, 'validationFolds': 3}
autoTuningConfig                                      {'maxTrials': 5}
parameterSpace       {'LinearRegression': [], 'RandomForest': [{'ma...
Name: 0, dtype: object

In [114]:
model, stats = pipe_with_context.train(
    G,
    targetNodeLabels=["Known"],
    modelName="nr-pipeline-model-contextual",
    targetProperty="int__per_hectare_per_year",
    metrics=["ROOT_MEAN_SQUARED_ERROR"],
    concurrency=4
)

Node Regression Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

Without the context Node labels: {'test': 40443.38886354173, 'validation': {'min': 43493.441360938836, 'max': 51896.71613829475, 'avg': 48193.77188901938}, 'outerTrain': 39749.880691939696, 'train': {'min': 38593.61942880581, 'max': 44602.9439280394, 'avg': 41371.3436473497}}


In [115]:
print (stats["modelInfo"]["metrics"]["ROOT_MEAN_SQUARED_ERROR"])

{'test': 43688.224366944305, 'validation': {'min': 48354.99737931862, 'max': 51913.82123589156, 'avg': 49929.191374667105}, 'outerTrain': 42873.92092326766, 'train': {'min': 40491.977359513825, 'max': 45575.02190090115, 'avg': 42705.07810630917}}


In [116]:
predicted = model.predict_stream(
    G, modelName="nr-pipeline-model-contextual", targetNodeLabels=["Unknown"]
)
     


In [117]:
predicted["name"] = [x["name"] for x in gds.util.asNodes(predicted["nodeId"].to_list())]

In [118]:
predicted.rename(columns={"predictedValue": "GraphSAGE_rf"})

Unnamed: 0,nodeId,GraphSAGE_rf,name
0,23,129734.28849,12270007
1,31,26955.16982,12250001
2,35,28565.12394,12240002
3,41,129734.28849,12230006
4,46,25697.00822,12210004
...,...,...,...
3471,10291,3810.09997,12410007
3472,10293,88182.90784,12400002
3473,10308,13696.17102,12360001
3474,10315,987.89758,12350007


In [119]:
predicted.to_csv("predicted_GraphSAGE.tsv", sep="\t", index=False)