In [1]:
import pandas as pd
import fornax
from sqlalchemy import create_engine
from sqlalchemy.orm.session import Session

# Tutorial 2 - Executing a Query

## Introduction

In this tutorial we will
* query the graph for:
    - a node label containing hulk 
    - joining to a node label containing Lady 
    - joining to a node label containing storm 
    
Critically, there is no such subgraph in the dataset.
However, fornax will correctly deduce that She-Hulk and the Invisible Woman are both members of the Lady Liberators, and that this is the best match because the Invisible Woman is also known as Sue Storm.

## Database Initialisation

In [2]:
# Populate the database the same way as in the previous tutorial (Tutorial 1):
# load the node and edge tables from CSV, then create the target graph from the
# node uids and the (start, end) pair of each edge.
nodes_df = pd.read_csv('./nodes.csv')
edges_df = pd.read_csv('./edges.csv')
target_graph = fornax.Graph.create(nodes_df['uid'], zip(edges_df['start'], edges_df['end']))

## Preliminaries

Since the database only contains the ids of nodes, let's build a lookup table to convert node ids into labels.

In [3]:
# Re-read the node table (the same file loaded during database initialisation)
# and preview its first rows; the `uid` and `label` columns are used below.
nodes_df = pd.read_csv('./nodes.csv')
nodes_df.head()

Unnamed: 0,label,type,uid
0,Selene,0,3467757555
1,Doctor Doom,0,3667190797
2,Viper,0,1383997739
3,Rhino,0,1455651050
4,Sin,0,1790395687


In [4]:
# Map every node uid to its human-readable label.
look_up = dict(zip(nodes_df['uid'], nodes_df['label']))
# Sanity check: this uid appears in the preview above as 'Doctor Doom'.
look_up[3667190797]

'Doctor Doom'

## Building a Query

The first step is to create a query graph.

We need three nodes and two edges. 
Add them to the database just like target nodes.

In [5]:
# Ids of the three query nodes.
nodes = [0, 1, 2]
nodes

[0, 1, 2]

In [6]:
# Chain the query nodes into a path 0 - 1 - 2: one edge per consecutive pair.
edges = [(start, start + 1) for start in range(2)]
edges

[(0, 1), (1, 2)]

In [7]:
# Create the query graph from the node ids and edges defined above, using the
# same fornax.Graph.create call that built the target graph.
query_graph = fornax.Graph.create(nodes, edges)

Now we add matching edges between:
* query node 0 and all target nodes containing the substring hulk
* query node 1 and all target nodes containing the substring lady
* query node 2 and all target nodes containing the substring storm

The weight of each match is 1. Had we used a non-binary matching function, the weight could be anywhere in the range $0 < \text{weight} \le 1$.

In [8]:
# Substring that a target label must contain (ignoring case) to be a candidate
# match for each query node id.
substrings = {0: 'hulk', 1: 'lady', 2: 'storm'}

# Build the candidate matches as (query node id, target node uid, weight)
# triples. Every match gets weight 1 because substring matching is binary.
matches = []
for query_id, substring in substrings.items():
    # case=False replaces the inline "(?i)" regex flag; na=False treats a
    # missing label as "no match" so the boolean mask never contains NaN.
    is_match = nodes_df['label'].str.contains(substring, case=False, na=False)
    matches.extend((query_id, uid, 1) for uid in nodes_df.loc[is_match, 'uid'])

# Bundle the query graph, the target graph and the candidate matches into a query.
query = fornax.Query.create(query_graph, target_graph, matches)

In [9]:
# Run the query. NOTE(review): n=3 presumably caps the number of results
# returned (the output below holds three) — confirm against the fornax docs.
results = query.execute(n=3)


This code may break in numpy 1.15 because this will return a view instead of a copy -- see release notes for details.
  return obj.view(dtype=(self.dtype.type, obj.dtype))


In [10]:
# Display the raw results: a list of dicts, each with the keys 'graph', 'score',
# 'matches', 'query_nodes', 'query_edges', 'target_nodes' and 'target_edges'.
results

[{'graph': [(0, 72820223), (1, 37085608), (2, -2147483648)],
  'score': 0.07324992213398218,
  'matches': {(0, 72820223): 0.01303752,
   (1, 37085608): 0.025322149,
   (2, -2147483648): 0.034890253},
  'query_nodes': [0, 1, 2],
  'query_edges': [],
  'target_nodes': [72820223, 37085608, -2147483648],
  'target_edges': []},
 {'graph': [(0, 72820223), (1, 37085608), (2, 1799668242)],
  'score': 0.07324992213398218,
  'matches': {(0, 72820223): 0.01303752,
   (1, 37085608): 0.025322149,
   (2, 1799668242): 0.034890253},
  'query_nodes': [0, 1, 2],
  'query_edges': [],
  'target_nodes': [72820223, 37085608, 1799668242],
  'target_edges': []},
 {'graph': [(0, 72820223), (1, 37085608)],
  'score': 1.0383596690371633,
  'matches': {(0, 72820223): 0.01303752, (1, 37085608): 0.025322149},
  'query_nodes': [0, 1],
  'query_edges': [],
  'target_nodes': [72820223, 37085608],
  'target_edges': []}]

In [11]:
# Translate each result's (query node, target uid) pairs into readable labels.
# look_up.get yields None for a target id that has no entry in the lookup table.
# NOTE(review): the unresolved id in the first result is -2147483648, the minimum
# signed 32-bit integer, while the uids in nodes.csv exceed the int32 range —
# this looks like an integer overflow somewhere upstream; confirm.
[
    [(q_node, look_up.get(t_node)) for q_node, t_node in match['graph']]
    for match in results
]

[[(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, None)],
 [(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, 'Susan Storm')],
 [(0, 'She-Hulk'), (1, 'Lady Liberators')]]

In [12]:
# Collect the score of every result, in the order the results were returned.
# NOTE(review): the partial (two-node) match scores higher than the complete
# ones, so a lower score presumably means a better match — confirm against the
# fornax docs.
[match['score'] for match in results]

[0.07324992213398218, 0.07324992213398218, 1.0383596690371633]