In [22]:
import pandas as pd
import fornax
from sqlalchemy import create_engine
from sqlalchemy.orm.session import Session

# Tutorial 2 - Executing a Query

## Introduction

In this tutorial we will
* query the graph for:
    - a node label containing hulk 
    - joining to a node label containing Lady 
    - joining to a node label containing storm 
    
Critically, there is no such subgraph in the dataset.
However, fornax will correctly deduce that She-Hulk and the Invisible Woman are both members of the Lady Liberators, and that this is the best match because the Invisible Woman is also known as Sue Storm.

## Database Initialisation

In [23]:
# Populate the database with the target graph, as in the previous tutorial:
# node uids come from nodes.csv and (start, end) uid pairs come from edges.csv.
nodes_df = pd.read_csv('./nodes.csv')
edges_df = pd.read_csv('./edges.csv')
target_node_ids = nodes_df['uid']
target_edge_pairs = zip(edges_df['start'], edges_df['end'])
target_graph = fornax.Graph.create(target_node_ids, target_edge_pairs)

## Preliminaries

Since the database only contains the ids of nodes, let's build a lookup table to convert node ids into labels.

In [24]:
# nodes_df was already read from './nodes.csv' when the database was populated,
# so preview that frame directly rather than loading the file a second time.
nodes_df.head()

Unnamed: 0,label,type,uid
0,Selene,0,3467757555
1,Doctor Doom,0,3667190797
2,Viper,0,1383997739
3,Rhino,0,1455651050
4,Sin,0,1790395687


In [25]:
# Lookup table from node uid to its human-readable label.
look_up = dict(zip(nodes_df['uid'], nodes_df['label']))
# Spot-check a single entry of the table.
look_up[3667190797]

'Doctor Doom'

## Building a Query

The first step is to create a query graph.

We need three nodes and two edges. 
Add them to the database just like target nodes.

In [26]:
# Ids of the three query nodes, numbered from zero.
nodes = [*range(3)]
nodes

[0, 1, 2]

In [27]:
# Two query edges: one joining node 0 to node 1 and one joining node 1 to node 2.
edges = [(start, start + 1) for start in range(2)]
edges

[(0, 1), (1, 2)]

In [28]:
# Create the query graph from the node ids and edge pairs defined above,
# using the same fornax.Graph.create call that built the target graph.
query_graph = fornax.Graph.create(nodes, edges)

Now we add matching edges between:
* query node 0 and all target nodes containing the substring hulk
* query node 1 and all target nodes containing the substring lady
* query node 2 and all target nodes containing the substring storm

The weight of each match is 1. Had we used a non-binary matching function, the weight could be anywhere in the range $0 < weight \le 1$.

In [29]:
# Case-insensitive substring that a target label must contain to match each query node.
query_terms = [(0, 'hulk'), (1, 'lady'), (2, 'storm')]
# The matching function is binary, so every match carries the same weight.
MATCH_WEIGHT = 1

# Each match is a (query node id, target node uid, weight) triple.
matches = []
for query_id, term in query_terms:
    # na=False treats a missing label as "no match"; without it a NaN in the
    # mask would make the boolean selection below raise.
    is_hit = nodes_df['label'].str.contains(term, case=False, na=False)
    matches.extend(
        (query_id, uid, MATCH_WEIGHT) for uid in nodes_df.loc[is_hit, 'uid']
    )
query = fornax.Query.create(query_graph, target_graph, matches)

In [30]:
# Run the search and keep the result dictionary for inspection below.
# NOTE(review): n=3 presumably limits the search to the three best matches, and
# edges=False presumably leaves query edges out of the result — confirm against the fornax docs.
results = query.execute(n=3, edges=False)


This code may break in numpy 1.15 because this will return a view instead of a copy -- see release notes for details.
  return obj.view(dtype=(self.dtype.type, obj.dtype))


In [31]:
# Display the raw result dictionary returned by query.execute.
results

{'iterations': 2,
 'subgraph_matches': [{'subgraph_match': [(0, 72820223),
    (1, 37085608),
    (2, -2147483648)],
   'total_score': 0.07324992213398218,
   'individual_scores': [0.013037520460784435,
    0.025322148576378822,
    0.034890253096818924]},
  {'subgraph_match': [(0, 72820223), (1, 37085608), (2, 1799668242)],
   'total_score': 0.07324992213398218,
   'individual_scores': [0.013037520460784435,
    0.025322148576378822,
    0.034890253096818924]},
  {'subgraph_match': [(0, 72820223), (1, 37085608)],
   'total_score': 1.0383596690371633,
   'individual_scores': [0.013037520460784435, 0.025322148576378822]}],
 'query_nodes': [0, 1, 2],
 'query_edges': None,
 'target_edges': [(37085608, 72820223)],
 'target_nodes': [37085608, -2147483648, 1799668242, 72820223]}

In [19]:
# Replace each matched target id with its label so the matches are readable.
# look_up.get yields None for any target id that is not in the lookup table.
[
    [
        (query_node, look_up.get(target_uid))
        for query_node, target_uid in found['subgraph_match']
    ]
    for found in results['subgraph_matches']
]

[[(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, None)],
 [(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, 'Susan Storm')],
 [(0, 'She-Hulk'), (1, 'Lady Liberators')]]

In [21]:
# Total score of every returned subgraph match, in result order.
[found['total_score'] for found in results['subgraph_matches']]

[0.07324992213398218, 0.07324992213398218, 1.0383596690371633]