In [1]:
import pandas as pd
import fornax
from sqlalchemy import create_engine
from sqlalchemy.orm.session import Session

# Tutorial 3 - Executing a Query

## Introduction

In this tutorial we will
* reproduce the results of tutorial 2
* query the graph for:
    - a node label containing hulk 
    - joining to a node label containing Lady 
    - joining to a node label containing storm 
    
Critically there is no such subgraph in the dataset.
However, fornax will correctly deduce that She-Hulk and the Invisible Woman are both members of the Lady Liberators, and that this is the best match because the Invisible Woman is also known as Sue Storm.

## Database Initialisation

Below is simply a repeat of the previous tutorial.

In [2]:
# Recreate the database from tutorial 2.

# 'sqlite://' carries no file path; SQLAlchemy treats that as an in-memory database.
engine = create_engine('sqlite://', echo=False)
connection = engine.connect()
# Create the tables registered on the fornax model base.
fornax.model.Base.metadata.create_all(connection)

# Target nodes: one TargetNode per row of nodes.csv, built from its uid and type columns.
nodes_df = pd.read_csv('./nodes.csv')
nodes = [
    fornax.model.TargetNode(id=node_uid, type=node_type)
    for node_uid, node_type in zip(nodes_df['uid'], nodes_df['type'])
]

session = Session(connection)
session.add_all(nodes)
session.commit()

# Target edges: one TargetEdge per row of edges.csv, built from its start and end columns.
edges_df = pd.read_csv('./edges.csv')
edges = [
    fornax.model.TargetEdge(start=edge_start, end=edge_end)
    for edge_start, edge_end in zip(edges_df['start'], edges_df['end'])
]

session.add_all(edges)
session.commit()
# This session is finished with; later cells open their own.
session.close()

## Preliminaries

Since the database only contains the ids of nodes, let's build a lookup table to convert node ids into labels.

In [3]:
# Reload the node table; nodes.csv pairs each uid with a label and a type.
nodes_df = pd.read_csv('./nodes.csv')
# Preview the first five rows.
nodes_df.head(5)

Unnamed: 0,label,type,uid
0,Selene,0,2861295873
1,Doctor Doom,0,2169370700
2,Viper,0,421230664
3,Rhino,0,1747963121
4,Sin,0,1390977948


In [4]:
# Map every node uid to its label so results can be shown by name.
look_up = dict(zip(nodes_df['uid'], nodes_df['label']))
# Spot check a single uid.
look_up[2169370700]

'Doctor Doom'

## Building a Query

The first step is to create a query graph.

We need three nodes and two edges. 
Add them to the database just like target nodes.

In [5]:
# Query graph nodes as (id, type) pairs:
#   id 0 -> hulk, id 1 -> ladies, id 2 -> storm
query_node_specs = [(0, 0), (1, 1), (2, 0)]
nodes = [
    fornax.model.QueryNode(id=node_id, type=node_type)
    for node_id, node_type in query_node_specs
]
# Open a new session on the existing connection and store the query nodes.
session = Session(connection)
session.add_all(nodes)
session.commit()

In [6]:
# Each link between query nodes is stored as a pair of directed edges,
# one per direction: 0 <-> 1 and 1 <-> 2.
undirected_links = [(0, 1), (1, 2)]
edges = [
    fornax.model.QueryEdge(start=src, end=dst)
    for a, b in undirected_links
    for src, dst in ((a, b), (b, a))
]
session.add_all(edges)
session.commit()

Now we add matching edges between:
* query node 0 and all target nodes containing the substring hulk
* query node 1 and all target nodes containing the substring lady
* query node 2 and all target nodes containing the substring storm

The weight of each match is 1. Had we used a non-binary matching function, the weight could have been anywhere in the range $0 < weight \leq 1$.

In [7]:
# Pair each query node id with the case-insensitive pattern that the labels of
# its candidate target nodes must contain.
label_patterns = [
    (0, "(?i)hulk"),
    (1, '(?i)lady'),
    (2, '(?i)storm'),
]

matches = []
for query_id, pattern in label_patterns:
    # na=False makes rows with a missing label count as non-matches rather than
    # putting NaN into the boolean mask, which pandas refuses to index with.
    is_match = nodes_df['label'].str.contains(pattern, na=False)
    for uid in nodes_df.loc[is_match, 'uid']:
        # The substring test is binary, so every match gets a weight of 1.
        matches.append(fornax.model.Match(start=query_id, end=uid, weight=1))
session.add_all(matches)
session.commit()

The following three lines execute the query

In [8]:
# Hopping distance used to build the SQL query.
hopping_distance = 2
query = fornax.select.join(hopping_distance)
# Fetch the table of results from the database through the open session.
records = query.with_session(session).all()
# Hand the records to the optimiser, which returns the best solutions and their scores.
solutions, scores = fornax.opt.solve(records)


This code may break in numpy 1.15 because this will return a view instead of a copy -- see release notes for details.
  return obj.view(dtype=(self.dtype.type, obj.dtype))


In [17]:
# Replace each target node id with its label, keeping the query node id as is.
[
    [(q_id, look_up[t_id]) for q_id, t_id in candidate]
    for candidate in solutions
]

[[(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, ' Susan Storm-Richards')],
 [(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, 'Sue Storm')],
 [(0, 'She-Hulk'), (1, 'Lady Liberators'), (2, ' Susan Storm')]]

In [10]:
# Show the scores returned by the optimiser alongside the solutions.
scores

(0.42505555555555546, 0.42505555555555546, 0.42505555555555546)

The top three results are all matches to different aliases of the Invisible Woman.

In [None]:
# Close the session opened for the query before closing the connection it is
# bound to, mirroring how the initialisation session was closed.
session.close()
connection.close()