diff --git a/README.md b/README.md index e5f286e..733571b 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,24 @@ [![CircleCI](https://circleci.com/gh/CDECatapult/fornax.svg?style=svg&circle-token=2110b6bc1d713698d241fd08ae60cd925e60062f)](https://circleci.com/gh/CDECatapult/fornax) [![Coverage Status](https://coveralls.io/repos/github/CDECatapult/fornax/badge.svg?branch=master)](https://coveralls.io/github/CDECatapult/fornax?branch=master) +[![Known Vulnerabilities](https://snyk.io/test/github/CDECatapult/fornax/badge.svg)](https://snyk.io/test/github/CDECatapult/fornax) + # Fornax An implementation of [NeMa: Fast Graph Search with Label Similarity](http://www.vldb.org/pvldb/vol6/p181-khan.pdf) using python3 and sqlite or postgres. -![FORNAX](./fornax.png) +![FORNAX](./docs/img/fornax.png) ## Install (Dev) From the root directory: ```bash +# install dev dependencies pip install -r requirements/dev.txt + +# install fornax +pip install -e . ``` ## Test @@ -30,146 +36,40 @@ The available options for installing SciPy packages are listed [here](https://sc See the tutorials for a full working example. -### Tutorial Dependencies +* [Part 1](docs/tutorial/tutorial1.ipynb) - Download a small graph dataset +* [Part 2](docs/tutorial/tutorial2.ipynb) - Search the dataset using fornax + +### Install Tutorial Dependencies (using conda) The following tutorials use jupyter notebooks to create a worked example. -We reccomend you use the anaconda python distribution to run the notebooks. +We recommend you use the anaconda python distribution to run the notebooks. 
```bash conda env create -f environment.yml -pip install -r requirements.txt ``` -* [Part 1](https://github.com/CDECatapult/fornax/blob/master/notebooks/tutorial/Tutorial%201%20-%20Creating%20a%20Dataset.ipynb) -* [Part 2](https://github.com/CDECatapult/fornax/blob/master/notebooks/tutorial/Tutorial%202%20-%20Making%20a%20Query.ipynb) - -## Database Setup - -By default fornax will use an in memory SQlite database. - -Alternative databases can be used by setting the environment variable `FORNAX_DB_URL` using the [sqlalchemy database url format](https://docs.sqlalchemy.org/en/latest/core/engines.html). -SQLite and Postgresql are supported although other databases are untested. - -All tables and indicies are initialised at import time if they do not exist already. - -## Quick start - -```python -# create a query graph -query_graph_handle = fornax.GraphHandle.create() -query_graph_handle.add_nodes(id_src=[0, 1, 2], label=['Hulk', 'Lady', 'Storm']) -query_graph_handle.add_edges([0, 1], [1, 2]) +### Run the Tutorials +```bash +source activate fornax_tutorial +cd docs/tutorial +jupyter-notebook +``` -# create a target graph -target_graph_handle = fornax.GraphHandle.create() -target_graph_handle.add_nodes(id_src=comic_book_nodes['id], label=comic_book_nodes['name']) -target_graph_handle.add_edges(comic_book_edges['start'], comic_book_edges['end']) +## Documentation -matches = [ - (query_node_id, target_node_id, weight) - for query_node_id, target_node_id, weight - in string_similarities -] +### Build the Docs (requires dev dependencies) -match_starts, match_ends, weights = zip(*matches) +```bash +cd docs +make html +``` -# stage a query -query = fornax.QueryHandle.create(query_graph_handle, target_graph_handle) -query.add_matches(match_starts, match_ends, weights) +### View the Docs Locally -# go! 
-query.execute() +```bash +cd _build/html +python3 -m http.server ``` -```json -{ - "graphs": [ - { - "cost": 0.024416640711327393, - "nodes": [ - { - "id": 9437002, - "type": "query", - "id_src": 0, - "label": "hulk" - }, - { - "id": 13982314, - "type": "query", - "id_src": 1, - "label": "lady" - }, - { - "id": 76350203, - "type": "query", - "id_src": 2, - "label": "storm" - }, - { - "id": 75367743, - "type": "target", - "id_src": 37644418, - "label": " Susan Storm", - "type_": 2 - }, - { - "id": 5878004, - "type": "target", - "id_src": 995920086, - "label": "Lady Liberators", - "type_": 1 - }, - { - "id": 71379958, - "type": "target", - "id_src": 2142361735, - "label": "She-Hulk", - "type_": 0 - } - ], - "links": [ - { - "start": 9437002, - "end": 71379958, - "type": "match", - "weight": 0.9869624795392156 - }, - { - "start": 13982314, - "end": 5878004, - "type": "match", - "weight": 0.9746778514236212 - }, - { - "start": 76350203, - "end": 75367743, - "type": "match", - "weight": 0.9651097469031811 - }, - { - "start": 9437002, - "end": 13982314, - "type": "query", - "weight": 1.0 - }, - { - "start": 13982314, - "end": 76350203, - "type": "query", - "weight": 1.0 - }, - { - "start": 5878004, - "end": 71379958, - "type": "target", - "weight": 1.0 - } - ] - } - ], - "iters": 2, - "hopping_distance": 2, - "max_iters": 10 -} -``` +navigate to `0.0.0.0:8000` in your browser. diff --git a/docs/Makefile b/docs/Makefile index a0e42a0..298ea9e 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -3,10 +3,9 @@ # You can set these variables from the command line. SPHINXOPTS = -SPHINXBUILD = python -msphinx -SPHINXPROJ = fornax -SOURCEDIR = source -BUILDDIR = build +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build # Put it first so that "make" without argument is like "make help". 
help: diff --git a/docs/_static/.gitignore b/docs/_static/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..17ad2aa --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,80 @@ +.. module:: fornax.api + +API +=== + +.. _fornax-api-introduction: + +Introduction +------------ + +This part of the documentation covers the interface for creating and searching graphs using the fornax package. +For the full documentation of the module api see :ref:`fornax-api-module`. + + +All of the functionality in :mod:`fornax` can be accessed via the following three classes. + +* :class:`Connection` +* :class:`GraphHandle` +* :class:`QueryHandle` + +:class:`Connection` is used to manage a connection to a SQL database. +:class:`GraphHandle` and :class:`QueryHandle` are used to create, insert, +update and delete graphs and queries. + +Connection API +-------------------- + + +Fornax stores and queries graphs using a database via a database connection. +:class:`Connection` manages the lifecycle of this database connection, +the creation of database schema (if required) +and any cleanup once the connection is closed. + + +.. autoclass:: Connection + :members: + :noindex: + +Graph API +-------------------------------- + +Since Graphs are persisted in a database they are not represented +directly by any object. +Rather, graphs are accessed via a graph handle which permits the user +to manipulate graphs via a :class:`Connection` instance. + +.. autoclass:: GraphHandle + :members: + :noindex: + +Query API +------------------------------ + +Like Graphs, queries exist in a database and are accessed via a handle. +Queries are executed using the :func:`QueryHandle.execute` method. + +A query brings together three important concepts. + +A **target graph** is the graph which is going to be searched. + +A **query graph** is the subgraph that is being searched for in the target graph. 
+ +**matches** are label similarities between nodes in the query graph and target graph +with a weight where :math:`0 < weight \le 1`. +Users are free to calculate label similarity scores however they like. +Fornax only needs to know about non-zero weights between matches. + +Once a query has been created and executed it will return the *n* subgraphs in the +target graph which are most similar to the query graph based on the similarity score +between nodes and their surrounding neighbourhoods. + +.. note:: + Nodes in the target graph will only be returned from a query if they have a + non-zero similarity score to at least one node in the query graph. + + +.. autoclass:: QueryHandle + :members: + :noindex: + diff --git a/docs/source/conf.py b/docs/conf.py similarity index 67% rename from docs/source/conf.py rename to docs/conf.py index 871d921..6cdcbef 100644 --- a/docs/source/conf.py +++ b/docs/conf.py @@ -1,28 +1,35 @@ -#!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# fornax documentation build configuration file, created by -# sphinx-quickstart on Fri Nov 16 10:25:34 2018. +# Configuration file for the Sphinx documentation builder. # -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -#import os -#import sys -#sys.path.insert(0, os.path.abspath('.')) +import os +import sys +sys.path.insert(0, os.path.abspath('../fornax')) + + +# -- Project information ----------------------------------------------------- + +project = 'fornax' +copyright = '2018, Digital Catapult (https://www.digicatapult.org.uk)' +author = 'Daniel Staff' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.0.1' -# -- General configuration ------------------------------------------------ +# -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # @@ -31,9 +38,13 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', +extensions = [ + 'sphinx.ext.autodoc', 'sphinx.ext.doctest', - 'sphinx.ext.viewcode'] + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'nbsphinx' +] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -47,20 +58,6 @@ # The master toctree document. master_doc = 'index' -# General information about the project. -project = 'fornax' -copyright = '2018, Daniel Staff' -author = 'Daniel Staff' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.0.1' -# The full version, including alpha/beta/rc tags. -release = '0.0.1' - # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # @@ -70,17 +67,17 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
-# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [ + '_build', 'Thumbs.db', '.DS_Store', '_build', + '**.ipynb_checkpoints' +] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = None -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - -# -- Options for HTML output ---------------------------------------------- +# -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. @@ -97,30 +94,26 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_logo = 'img/logo.png' # Custom sidebar templates, must be a dictionary that maps document names # to template names. # -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', - 'donate.html', - ] -} +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} -# -- Options for HTMLHelp output ------------------------------------------ +# -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. 
htmlhelp_basename = 'fornaxdoc' -# -- Options for LaTeX output --------------------------------------------- +# -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). @@ -149,7 +142,7 @@ ] -# -- Options for manual page output --------------------------------------- +# -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). @@ -159,7 +152,7 @@ ] -# -- Options for Texinfo output ------------------------------------------- +# -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, @@ -171,4 +164,22 @@ ] +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# -- Extension configuration ------------------------------------------------- diff --git a/docs/source/fornax.rst b/docs/fornax.rst similarity index 75% rename from docs/source/fornax.rst rename to docs/fornax.rst index 80a04de..9d24613 100644 --- a/docs/source/fornax.rst +++ b/docs/fornax.rst @@ -1,35 +1,34 @@ -fornax package +Submodules ============== -Submodules ----------- +.. _fornax-api-module: -fornax\.api module ------------------- +fornax.api module +----------------- .. automodule:: fornax.api :members: :undoc-members: :show-inheritance: -fornax\.model module --------------------- +fornax.model module +------------------- .. 
automodule:: fornax.model :members: :undoc-members: :show-inheritance: -fornax\.opt module ------------------- +fornax.opt module +----------------- .. automodule:: fornax.opt :members: :undoc-members: :show-inheritance: -fornax\.select module ---------------------- +fornax.select module +-------------------- .. automodule:: fornax.select :members: diff --git a/docs/guide.rst b/docs/guide.rst new file mode 100644 index 0000000..900e34a --- /dev/null +++ b/docs/guide.rst @@ -0,0 +1,66 @@ +======== +Guide +======== + +Fornax is an open source library to perform fuzzy subgraph matching +between labelled undirected graphs based on +`NeMa: Fast Graph Search with Label Similarity <http://www.vldb.org/pvldb/vol6/p181-khan.pdf>`_. + + +Subgraph Matching +----------------- + +A subgraph is any collection of nodes and edges that form some subset of a graph. +For example in the image below the graph on the left is isomorphic to the green nodes +in the graph on the right, hence they form a subgraph. + +.. image:: /img/subgraph.png + +If we refer to the graph on the left as the *query graph* +and the graph on the right as the *target graph* +subgraph matching is the process of finding the *query graph* +in the *target graph* such that the node labels and edges are strictly the same. + +Fornax will find the *n* most similar subgraphs in a *target graph* based on a user +specified *query graph* using a user specified *label similarity function*. + +Fornax will not only find exact subgraph isomorphisms +but the *n* most similar subgraphs even if they are not exact isomorphisms of the query graph. +Hence, fornax can be used for **fuzzy** subgraph matching. + +For example, Fornax can be used to find subgraphs where labels are similar, yet different, +based on a user specified definition. +Additionally neighbours in the query graph may be absent, or are neighbours of neighbours in the +target graph. 
+ +Example Problems +---------------- + +Common fuzzy subgraph matching problems include: + +* searching knowledge graphs +* mining social networks +* searching geospatial data as a graph +* searching text as a graph + +Goals +----- + +*fornax* was written with three primary goals in mind + +* to demonstrate process and provide ease of use over performance +* to be flexible and accommodate the user's notions of similarity +* to scale to large target graphs of millions of nodes and edges + +Architecture +------------ + +In order to support large graphs and persist them +between python interpreter sessions fornax stores all data +in a database. + +To facilitate ease of use fornax can use *sqlite* or *postgresql* +as a back end. +For more details see the API :ref:`fornax-api-introduction`. + +.. image:: /img/fornax.png \ No newline at end of file diff --git a/fornax.png b/docs/img/fornax.png similarity index 100% rename from fornax.png rename to docs/img/fornax.png diff --git a/docs/img/iron_man.svg b/docs/img/iron_man.svg new file mode 100644 index 0000000..016d14a --- /dev/null +++ b/docs/img/iron_man.svg @@ -0,0 +1 @@ +
Iron
Man
[Not supported by viewer]
The
Avengers
The<br>Avengers<br>
Tony
Tony<br>
Earth's Mightiest
Heros
Earth's Mightiest<br>Heros<br>
Hero
Alias
[Not supported by viewer]
Hero
Alias
[Not supported by viewer]
Hero
Hero
Team
Team
Team
Alias
[Not supported by viewer]
\ No newline at end of file diff --git a/docs/img/logo.png b/docs/img/logo.png new file mode 100644 index 0000000..5e69ca2 Binary files /dev/null and b/docs/img/logo.png differ diff --git a/docs/img/subgraph.png b/docs/img/subgraph.png new file mode 100644 index 0000000..14c8553 Binary files /dev/null and b/docs/img/subgraph.png differ diff --git a/docs/source/index.rst b/docs/index.rst similarity index 54% rename from docs/source/index.rst rename to docs/index.rst index d33f853..72dfd3f 100644 --- a/docs/source/index.rst +++ b/docs/index.rst @@ -1,16 +1,20 @@ .. fornax documentation master file, created by - sphinx-quickstart on Fri Nov 16 10:25:34 2018. + sphinx-quickstart on Wed Nov 28 10:27:45 2018. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -.. automodule:: fornax.api - :members: __doc__ +Welcome to fornax's documentation! +================================== -.. autoclass:: fornax.GraphHandle - :members: +.. toctree:: + :maxdepth: 2 + + guide + tutorial/tutorial1 + tutorial/tutorial2 + api + modules -.. autoclass:: fornax.QueryHandle - :members: Indices and tables ================== diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..27f573b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/source/modules.rst b/docs/modules.rst similarity index 71% rename from docs/source/modules.rst rename to docs/modules.rst index 9322be9..075975f 100644 --- a/docs/source/modules.rst +++ b/docs/modules.rst @@ -1,5 +1,5 @@ -fornax -====== +Modules +======= .. toctree:: :maxdepth: 4 diff --git a/notebooks/tutorial/Tutorial 1 - Creating a Dataset.ipynb b/docs/tutorial/tutorial1.ipynb similarity index 79% rename from notebooks/tutorial/Tutorial 1 - Creating a Dataset.ipynb rename to docs/tutorial/tutorial1.ipynb index 19369ae..86b23da 100644 --- a/notebooks/tutorial/Tutorial 1 - Creating a Dataset.ipynb +++ b/docs/tutorial/tutorial1.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Tutorial 1 - Creating a Dataset From WikiData" + "# Creating a Dataset" ] }, { @@ -22,19 +22,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Introduction\n", + "To install the dependencies for this notebook:\n", "\n", - "Welcome to this three part fornax fuzzy graph tutorial. 
\n", - "* In part 1 we will create a sample graph dataset from WikiData.\n", - "* In part 2 we will demonstrate how to create a database backend for fornax\n", - "* In part 3 we will demonstrate some queries and results" + "```bash\n", + "conda env create -f environment.yml\n", + "source activate fornax_tutorial\n", + "```\n", + "\n", + "To run this notebook from the project root:\n", + "\n", + "```bash\n", + "cd docs/tutorial\n", + "jupyter-notebook\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Dowloading A Dataset\n", + "## Download\n", "\n", "For the duration of this tutorial we will be using the social network of Marvel Comicbook characters.\n", "\n", @@ -114,7 +121,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Loading into Pandas\n", + "## Loading with Pandas\n", "\n", "We'll be using pandas to do some data manipulation so lets put the result inside a pandas dataframe." ] @@ -180,48 +187,48 @@ " \n", " \n", " 0\n", - " http://www.wikidata.org/entity/Q59665\n", - " http://www.wikidata.org/entity/Q1922133\n", - " Jubilee\n", - " Generation X\n", - " Jubilation Lee\n", - " Jubilation Lee\n", + " http://www.wikidata.org/entity/Q60002\n", + " http://www.wikidata.org/entity/Q2603976\n", + " Colossus\n", + " Excalibur\n", + " Пётр Николаевич Распутин\n", + " Peter Rasputin, Piotr Nikolayevich Rasputin, P...\n", " \n", " \n", " 1\n", - " http://www.wikidata.org/entity/Q95153\n", - " http://www.wikidata.org/entity/Q1319624\n", - " Silver Surfer\n", - " The Order\n", - " None\n", - " Norrin Radd\n", + " http://www.wikidata.org/entity/Q258015\n", + " http://www.wikidata.org/entity/Q2603976\n", + " Rachel Summers\n", + " Excalibur\n", + " Rachel Anne Summers\n", + " Phoenix, Prestige, Marvel Girl, Mother Askani,...\n", " \n", " \n", " 2\n", - " http://www.wikidata.org/entity/Q302186\n", - " http://www.wikidata.org/entity/Q2211423\n", - " Beast\n", - " X-Factor\n", - " Henry Philip McCoy\n", - " Hank McCoy, Henry McCoy\n", 
+ " http://www.wikidata.org/entity/Q369197\n", + " http://www.wikidata.org/entity/Q2527918\n", + " Black Widow\n", + " Thunderbolts\n", + " Наталья Алиановна Романова\n", + " Natalie Rushman, Natasha Romanoff, asesina rus...\n", " \n", " \n", " 3\n", - " http://www.wikidata.org/entity/Q369805\n", - " http://www.wikidata.org/entity/Q2109149\n", - " Selene\n", - " Hellfire Club\n", + " http://www.wikidata.org/entity/Q388316\n", + " http://www.wikidata.org/entity/Q2527918\n", + " Bullseye\n", + " Thunderbolts\n", " None\n", - " Black Queen, Selene Gallio\n", + " Lester, Hawkeye, Benjamin Poindexter\n", " \n", " \n", " 4\n", - " http://www.wikidata.org/entity/Q431862\n", - " http://www.wikidata.org/entity/Q2211423\n", - " Sabretooth\n", - " X-Factor\n", - " Victor Creed\n", - " Victor Creed\n", + " http://www.wikidata.org/entity/Q432272\n", + " http://www.wikidata.org/entity/Q2457162\n", + " Medusa\n", + " Frightful Four\n", + " None\n", + " None\n", " \n", " \n", "\n", @@ -229,25 +236,32 @@ ], "text/plain": [ " character_id \\\n", - "0 http://www.wikidata.org/entity/Q59665 \n", - "1 http://www.wikidata.org/entity/Q95153 \n", - "2 http://www.wikidata.org/entity/Q302186 \n", - "3 http://www.wikidata.org/entity/Q369805 \n", - "4 http://www.wikidata.org/entity/Q431862 \n", + "0 http://www.wikidata.org/entity/Q60002 \n", + "1 http://www.wikidata.org/entity/Q258015 \n", + "2 http://www.wikidata.org/entity/Q369197 \n", + "3 http://www.wikidata.org/entity/Q388316 \n", + "4 http://www.wikidata.org/entity/Q432272 \n", "\n", - " group_id name group \\\n", - "0 http://www.wikidata.org/entity/Q1922133 Jubilee Generation X \n", - "1 http://www.wikidata.org/entity/Q1319624 Silver Surfer The Order \n", - "2 http://www.wikidata.org/entity/Q2211423 Beast X-Factor \n", - "3 http://www.wikidata.org/entity/Q2109149 Selene Hellfire Club \n", - "4 http://www.wikidata.org/entity/Q2211423 Sabretooth X-Factor \n", + " group_id name group \\\n", + "0 http://www.wikidata.org/entity/Q2603976 
Colossus Excalibur \n", + "1 http://www.wikidata.org/entity/Q2603976 Rachel Summers Excalibur \n", + "2 http://www.wikidata.org/entity/Q2527918 Black Widow Thunderbolts \n", + "3 http://www.wikidata.org/entity/Q2527918 Bullseye Thunderbolts \n", + "4 http://www.wikidata.org/entity/Q2457162 Medusa Frightful Four \n", "\n", - " birth_name alt_names \n", - "0 Jubilation Lee Jubilation Lee \n", - "1 None Norrin Radd \n", - "2 Henry Philip McCoy Hank McCoy, Henry McCoy \n", - "3 None Black Queen, Selene Gallio \n", - "4 Victor Creed Victor Creed " + " birth_name \\\n", + "0 Пётр Николаевич Распутин \n", + "1 Rachel Anne Summers \n", + "2 Наталья Алиановна Романова \n", + "3 None \n", + "4 None \n", + "\n", + " alt_names \n", + "0 Peter Rasputin, Piotr Nikolayevich Rasputin, P... \n", + "1 Phoenix, Prestige, Marvel Girl, Mother Askani,... \n", + "2 Natalie Rushman, Natasha Romanoff, asesina rus... \n", + "3 Lester, Hawkeye, Benjamin Poindexter \n", + "4 None " ] }, "execution_count": 4, @@ -307,7 +321,7 @@ "\n", "Lets quickly examine the data to check it still makes sense.\n", "\n", - "There are 388 unique characters in the dataset" + "There are 399 unique characters in the dataset" ] }, { @@ -318,7 +332,7 @@ { "data": { "text/plain": [ - "399" + "400" ] }, "execution_count": 7, @@ -397,18 +411,14 @@ " \n", " \n", " \n", - " \n", - " 103\n", - " http://www.wikidata.org/entity/Q186422\n", - " Wolverine\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " character_id name\n", - "103 http://www.wikidata.org/entity/Q186422 Wolverine" + "Empty DataFrame\n", + "Columns: [character_id, name]\n", + "Index: []" ] }, "execution_count": 9, @@ -493,42 +503,42 @@ " \n", " \n", " \n", - " 167\n", + " 187\n", " Logan\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 168\n", + " 188\n", " Weapon X\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 169\n", + " 189\n", " Jim Logan\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 170\n", + 
" 190\n", " Patch\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 171\n", + " 191\n", " James Howlett\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 172\n", + " 192\n", " Agent Ten\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 173\n", + " 193\n", " Experiment X\n", " http://www.wikidata.org/entity/Q186422\n", " \n", " \n", - " 174\n", + " 194\n", " Weapon Ten\n", " http://www.wikidata.org/entity/Q186422\n", " \n", @@ -538,14 +548,14 @@ ], "text/plain": [ " alt_name character_id\n", - "167 Logan http://www.wikidata.org/entity/Q186422\n", - "168 Weapon X http://www.wikidata.org/entity/Q186422\n", - "169 Jim Logan http://www.wikidata.org/entity/Q186422\n", - "170 Patch http://www.wikidata.org/entity/Q186422\n", - "171 James Howlett http://www.wikidata.org/entity/Q186422\n", - "172 Agent Ten http://www.wikidata.org/entity/Q186422\n", - "173 Experiment X http://www.wikidata.org/entity/Q186422\n", - "174 Weapon Ten http://www.wikidata.org/entity/Q186422" + "187 Logan http://www.wikidata.org/entity/Q186422\n", + "188 Weapon X http://www.wikidata.org/entity/Q186422\n", + "189 Jim Logan http://www.wikidata.org/entity/Q186422\n", + "190 Patch http://www.wikidata.org/entity/Q186422\n", + "191 James Howlett http://www.wikidata.org/entity/Q186422\n", + "192 Agent Ten http://www.wikidata.org/entity/Q186422\n", + "193 Experiment X http://www.wikidata.org/entity/Q186422\n", + "194 Weapon Ten http://www.wikidata.org/entity/Q186422" ] }, "execution_count": 11, @@ -634,7 +644,7 @@ "data": { "text/plain": [ "(116, group_id group\n", - " 101 http://www.wikidata.org/entity/Q128452 X-Men)" + " 125 http://www.wikidata.org/entity/Q128452 X-Men)" ] }, "execution_count": 13, @@ -669,7 +679,7 @@ { "data": { "text/plain": [ - "(8, 114 Cannonball\n", + "(8, 102 Cannonball\n", " Name: name, dtype: object)" ] }, @@ -694,7 +704,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Exporting a csv dataset\n", + "## Export to 
CSV\n", "\n", "Let's write each node to a csv file, we need to record\n", "\n", @@ -768,45 +778,45 @@ " \n", " \n", " 0\n", - " Jubilee\n", + " Colossus\n", " 0\n", - " 1126900601\n", + " 2105314676\n", " \n", " \n", " 1\n", - " Silver Surfer\n", + " Rachel Summers\n", " 0\n", - " 440245546\n", + " 298635603\n", " \n", " \n", " 2\n", - " Beast\n", + " Black Widow\n", " 0\n", - " 1370118169\n", + " 1897346471\n", " \n", " \n", " 3\n", - " Selene\n", + " Bullseye\n", " 0\n", - " 87770955\n", + " 2027281781\n", " \n", " \n", " 4\n", - " Sabretooth\n", + " Medusa\n", " 0\n", - " 1299099267\n", + " 347320780\n", " \n", " \n", "\n", "" ], "text/plain": [ - " label type uid\n", - "0 Jubilee 0 1126900601\n", - "1 Silver Surfer 0 440245546\n", - "2 Beast 0 1370118169\n", - "3 Selene 0 87770955\n", - "4 Sabretooth 0 1299099267" + " label type uid\n", + "0 Colossus 0 2105314676\n", + "1 Rachel Summers 0 298635603\n", + "2 Black Widow 0 1897346471\n", + "3 Bullseye 0 2027281781\n", + "4 Medusa 0 347320780" ] }, "execution_count": 17, @@ -851,7 +861,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/tutorial/tutorial2.ipynb b/docs/tutorial/tutorial2.ipynb new file mode 100644 index 0000000..9f91497 --- /dev/null +++ b/docs/tutorial/tutorial2.ipynb @@ -0,0 +1,1098 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import pandas as pd\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "import networkx as nx\n", + "import fornax\n", + "\n", + "%matplotlib inline\n", + "from IPython.core.display import SVG\n", + "\n", + "# Add project root dir\n", + "ROOT_DIR = os.path.abspath(\"../../\")\n", + "sys.path.append(ROOT_DIR)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "To install the use the dependencies for this notebook:\n", + "\n", + "```bash\n", + "conda env create -f environment.yml\n", + "source activate fornax_tutorial\n", + "```\n", + "\n", + "To run this notebook from the project root:\n", + "\n", + "```bash\n", + "cd docs/tutorial\n", + "jupyter-notebook\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will:\n", + "\n", + "* Load a graph of superheros and their teams from csv files\n", + "\n", + "* Search for nodes in the graph using a string similarity function\n", + "\n", + "* Use fornax to search for nodes using string similarity and fuzzy graph matching\n", + "\n", + "The data in this tutorial we be generated using the preceding notebook: `Tutorial1.ipynb`.\n", + "\n", + "## Introduction\n", + "\n", + "`nodes.csv` and `edges.csv` contain a graph of superheros and their teams along with alternative names for those heros and groups (or aliases).\n", + "\n", + "The image below uses the example of Iron Man, who is known as \"Tony\" to his friends.\n", + "Iron man is a member of the Avengers, a.k.a. Earth's Mightiest Superheros.\n", + "Other heros are also members of The Avengers, and they will also have aliases.\n", + "Other heros will also be members of other teams and so and so forth.\n", + "\n", + "\n", + "All of these heros, teams and aliases together make our target graph, a graph which we will search using fornax." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "
Iron
Man
[Not supported by viewer]
The
Avengers
The<br>Avengers<br>
Tony
Tony<br>
Earth's Mightiest
Heros
Earth's Mightiest<br>Heros<br>
Hero
Alias
[Not supported by viewer]
Hero
Alias
[Not supported by viewer]
Hero
Hero
Team
Team
Team
Alias
[Not supported by viewer]
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SVG('../img/iron_man.svg')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load the data into the notebook using pandas." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# used for converting csv values in nodes.csv\n", + "mapping = {\n", + " '0': 'hero',\n", + " '1': 'team', \n", + " '2': 'hero_alias', \n", + " '3': 'team_alias'\n", + "}\n", + "\n", + "nodes_df = pd.read_csv(\n", + " './nodes.csv', \n", + " # rename the columns as targets as this will form the target graph\n", + " # (the graph which we will be searching)\n", + " names=['target_label', 'target_type', 'target_id'],\n", + " # ignore the header\n", + " header=0,\n", + " converters = {\n", + " # convert target_type from numeric values to\n", + " # literal string representations for ease of reading\n", + " 'target_type': lambda key: mapping.get(key)\n", + " }\n", + ")\n", + "\n", + "# contains pairs of target node ids\n", + "edges_df = pd.read_csv('./edges.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the target nodes have a label (the hero's primary name).\n", + "The target_type column will be one of `hero`, `team`, `hero alias`, `team alias`, the four types of nodes in the graph.\n", + "\n", + "(Note that by hero we mean a person in a comic book who has superpowers regardless of them being good or bad)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Colossus\n", + "1 Rachel Summers\n", + "2 Black Widow\n", + "3 Bullseye\n", + "4 Medusa\n", + "Name: target_label, dtype: object" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes_df['target_label'].head()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Edges are pairs of `target_id` values.\n", + "Note that fornax deals with undirected graphs so there is no need to add the edge in the reverse direction.\n", + "Doing so will cause an exception as the edge will be considered a duplicate." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
endstart
013676981422105314676
11367698142298635603
29609147721897346471
39609147722027281781
4849109791347320780
\n", + "
" + ], + "text/plain": [ + " end start\n", + "0 1367698142 2105314676\n", + "1 1367698142 298635603\n", + "2 960914772 1897346471\n", + "3 960914772 2027281781\n", + "4 849109791 347320780" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edges_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Label similarity\n", + "\n", + "For some motivation, before using fornax, let us search for nodes just using their labels.\n", + "Let's search for nodes similar to `guardians`, `star` and `groot`.\n", + "\n", + "We will create a function that given a pair of labels, it will return a score where:\n", + "\n", + "$$0 <= score <= 1$$\n", + "\n", + "Secondly we'll create a search function that returns rows from our table of target nodes that have a non zero similarity score." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def node_scoring_function(first: str, second: str):\n", + " \"\"\" node scoring function takes two strings and returns a \n", + " score in the range 0 <= score <= 1\n", + " \"\"\"\n", + " first_, second_ = sorted((first.lower(), second.lower()), key=len)\n", + " # if first is not a substring of second: score = 0\n", + " if not first_ in second_:\n", + " return 0\n", + " # otherwise use the relative difference between\n", + " # the two lengths\n", + " score = len(second_) - len(first_)\n", + " score /= max(len(first_), len(second_))\n", + " score = 1. 
- score\n", + " return score" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def search(query_id: int, query_label: str):\n", + " # compute all of the scores\n", + " scores = nodes_df['target_label'].apply(\n", + " node_scoring_function, \n", + " args=(query_label,)\n", + " )\n", + " # create a boolean mask\n", + " mask = scores > 0\n", + " # grab the non zero scoring nodes\n", + " matches = nodes_df[mask].copy()\n", + " # add extra columns\n", + " matches['score'] = scores[mask]\n", + " matches['query_label'] = query_label\n", + " matches['query_id'] = query_id\n", + " return matches" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aside:\n", + "Note that these string search functions are not terribly efficient.\n", + "They involve repeated full scans of the target nodes table.\n", + "If we were searching a larger graph we could use a search tree as an index, an external string matching service or database. However, since this is a tutorial, the above functions are simpler and more reproducible.\n", + "This is important as we will be using these search results with fornax." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "query_labels = ['guardians', 'star', 'groot']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Examining the table below we can see that we have a conundrum.\n", + "There are 22 nodes with varying similarity to `star` and 4 nodes similar to `guardians`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_labeltarget_typetarget_idscorequery_labelquery_id
262Guardianhero10816750.888889guardians0
405Guardians of the Galaxyteam8708072710.391304guardians0
483Guardians of the Galaxy (1969 team)team12954003890.257143guardians0
994Guardianhero_alias20627913260.888889guardians0
10Firestarhero2748217420.500000star1
15Danielle Moonstarhero20838509190.235294star1
69Star-Lordhero10618676050.444444star1
87Northstarhero12608802840.444444star1
174Darkstarhero12767533090.500000star1
269Starfoxhero15942942590.571429star1
302Ultimate Firestarhero17180267720.235294star1
352Shatterstarhero12419255060.363636star1
413Starjammersteam8951174950.363636star1
414Upstartsteam8398510790.500000star1
503Starforceteam16059411170.444444star1
520James Proudstarhero_alias2681493750.266667star1
556John Proudstarhero_alias8801970810.285714star1
602Anthony \"Tony\" Edward Carbonell Starkhero_alias20078060130.108108star1
661Star-Lordhero_alias925714790.400000star1
678Moonstarhero_alias2943734730.444444star1
764Starlordhero_alias17883144070.444444star1
765Star Lordhero_alias9254346460.400000star1
985Anthony Edward \"Tony\" Starkhero_alias21389963950.142857star1
986Tony Starkhero_alias1822991330.363636star1
989The Star Spangled Man With A Planhero_alias19155735630.117647star1
1051Firestarhero_alias15800653670.444444star1
68Groothero746714341.000000groot2
\n", + "
" + ], + "text/plain": [ + " target_label target_type target_id score \\\n", + "262 Guardian hero 1081675 0.888889 \n", + "405 Guardians of the Galaxy team 870807271 0.391304 \n", + "483 Guardians of the Galaxy (1969 team) team 1295400389 0.257143 \n", + "994 Guardian hero_alias 2062791326 0.888889 \n", + "10 Firestar hero 274821742 0.500000 \n", + "15 Danielle Moonstar hero 2083850919 0.235294 \n", + "69 Star-Lord hero 1061867605 0.444444 \n", + "87 Northstar hero 1260880284 0.444444 \n", + "174 Darkstar hero 1276753309 0.500000 \n", + "269 Starfox hero 1594294259 0.571429 \n", + "302 Ultimate Firestar hero 1718026772 0.235294 \n", + "352 Shatterstar hero 1241925506 0.363636 \n", + "413 Starjammers team 895117495 0.363636 \n", + "414 Upstarts team 839851079 0.500000 \n", + "503 Starforce team 1605941117 0.444444 \n", + "520 James Proudstar hero_alias 268149375 0.266667 \n", + "556 John Proudstar hero_alias 880197081 0.285714 \n", + "602 Anthony \"Tony\" Edward Carbonell Stark hero_alias 2007806013 0.108108 \n", + "661 Star-Lord hero_alias 92571479 0.400000 \n", + "678 Moonstar hero_alias 294373473 0.444444 \n", + "764 Starlord hero_alias 1788314407 0.444444 \n", + "765 Star Lord hero_alias 925434646 0.400000 \n", + "985 Anthony Edward \"Tony\" Stark hero_alias 2138996395 0.142857 \n", + "986 Tony Stark hero_alias 182299133 0.363636 \n", + "989 The Star Spangled Man With A Plan hero_alias 1915573563 0.117647 \n", + "1051 Firestar hero_alias 1580065367 0.444444 \n", + "68 Groot hero 74671434 1.000000 \n", + "\n", + " query_label query_id \n", + "262 guardians 0 \n", + "405 guardians 0 \n", + "483 guardians 0 \n", + "994 guardians 0 \n", + "10 star 1 \n", + "15 star 1 \n", + "69 star 1 \n", + "87 star 1 \n", + "174 star 1 \n", + "269 star 1 \n", + "302 star 1 \n", + "352 star 1 \n", + "413 star 1 \n", + "414 star 1 \n", + "503 star 1 \n", + "520 star 1 \n", + "556 star 1 \n", + "602 star 1 \n", + "661 star 1 \n", + "678 star 1 \n", + "764 star 1 \n", + "765 star 1 
\n", + "985 star 1 \n", + "986 star 1 \n", + "989 star 1 \n", + "1051 star 1 \n", + "68 groot 2 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find the nodes similar to 'guardians', 'star' and 'groot'\n", + "matches = pd.concat(search(id_, label) for id_, label in enumerate(query_labels))\n", + "matches" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fornax enables a more powerful type of search. \n", + "By specifying 'guardians', 'star', 'groot' as nodes in a graph, \n", + "and by specifying the relationships between them, \n", + "we can search for nodes in our target graph with the same relationships." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a target graph\n", + "\n", + "Fornax behaves much like a database. In fact it uses SQLite or Postgresql to store graph data and index it.\n", + "To insert a new graph into fornax we can use the following three steps:\n", + "1. create a new graph\n", + "2. add nodes and node meta data\n", + "3. add edges and edge meta data\n", + "\n", + "The object `fornax.GraphHandle` is much like a file handle. It does not represent the graph but it is an accessor to it.\n", + "If the `GraphHandle` goes out of scope the graph will still persist until it is explicitly deleted, much like a file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "conn = fornax.Connection()\n", + "conn.open()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "target_graph = fornax.GraphHandle.create(conn)\n", + "target_graph.add_nodes(\n", + " # use id_src to set a custom id on each node \n", + " id_src=nodes_df['target_id'],\n", + " # use other keyword arguments to attach arbitrary metadata to each node\n", + " label=nodes_df['target_label'],\n", + " # the type keyword is reserved so we use target_type\n", + " target_type=nodes_df['target_type']\n", + " # meta data must be json serialisable\n", + ")\n", + "target_graph.add_edges(edges_df['start'], edges_df['end'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the `graph_id` to access our graph in the future." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_graph.graph_id\n", + "another_target_graph_handle = fornax.GraphHandle.read(conn, target_graph.graph_id)\n", + "another_target_graph_handle == target_graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a query graph\n", + "\n", + "Let's imagine that we suspect `groot` is directly related to `guardians` and `star` is also directly related to `guardians`.\n", + "For example `groot` and `star` could both be members of a team called `guardians`.\n", + "Let's create another small graph that represents this situation:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new graph\n", + "query_graph = fornax.GraphHandle.create(conn)\n", + "\n", + "# insert the three nodes: \n", + "# 'guardians'
(id=0), 'star' (id=1), 'groot' (id=2)\n", + "query_graph.add_nodes(label=query_labels)\n", + "\n", + "# alternatively:\n", + "# query_graph.add_nodes(id_src=query_labels)\n", + "# since id_src can use any unique hashable items\n", + "\n", + "edges = [\n", + " (0, 1), # edge between star and guardians\n", + " (0, 2) # edge between groot and guardians\n", + "]\n", + "\n", + "sources, targets = zip(*edges)\n", + "query_graph.add_edges(sources, targets)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Search\n", + "\n", + "We can create a query in an analogous way to creating graphs using a `QueryHandle`,\n", + "a handle to a query stored in the fornax database.\n", + "To create a useful query we need to insert the string similarity scores we computed in the label similarity section above.\n", + "Fornax will use these scores and the graph edges to execute the query." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "query = fornax.QueryHandle.create(conn, query_graph, target_graph)\n", + "query.add_matches(matches['query_id'], matches['target_id'], matches['score'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally we can execute the query using a variety of options.\n", + "We specify we want the top 5 best matches between the query graph and the target graph." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 79.2 ms, sys: 4.3 ms, total: 83.5 ms\n", + "Wall time: 80.8 ms\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dstaff/anaconda3/envs/fornax/lib/python3.6/site-packages/numpy/core/records.py:513: FutureWarning: Numpy has detected that you may be viewing or writing to an array returned by selecting multiple fields in a structured array.
\n", + "\n", + "This code may break in numpy 1.15 because this will return a view instead of a copy -- see release notes for details.\n", + " return obj.view(dtype=(self.dtype.type, obj.dtype))\n" + ] + } + ], + "source": [ + "%time results = query.execute(n=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualise\n", + "\n", + "`query.execute` returns an object describing the search result.\n", + "Of primary interest is the `graphs` field which contains a list of graphs in `node_link_graph` format.\n", + "We can use networkx to draw these graphs and visualise the results." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def draw(graph):\n", + " \"\"\" function for drawing a graph using matplotlib and networkx\"\"\"\n", + " \n", + " # each graph is already in node_link_graph format \n", + " G = nx.json_graph.node_link_graph(graph)\n", + " \n", + " labels = {node['id']: node['label'] for node in graph['nodes']}\n", + " node_colour = ['r' if node['type'] == 'query' else 'b' for node in graph['nodes']]\n", + " pos = nx.spring_layout(G)\n", + " nx.draw_networkx_nodes(G, pos, node_size=600, node_color=node_colour, alpha=.3)\n", + " edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] != 'match']\n", + " nx.draw_networkx_edges(G, pos, width=3, edgelist=edgelist, edge_color='grey', alpha=.3)\n", + " edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] == 'match']\n", + " nx.draw_networkx_edges(G, pos, width=3, edgelist=edgelist, style='dashed', edge_color='pink')\n", + " nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif', labels=labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Result 1 contains the best match. The three query nodes (in red) best match the three target nodes (in blue). The dashed lines show which pairs of query and target nodes matched each other.
The blue nodes are a subgraph of the target graph. Note that the result does not describe the whole target graph because in principle it can be very large.\n", + "\n", + "Here we can see that the blue subgraph has exactly the same shape as the red query graph. However, the labels are not exactly the same (e.g. `guardians != Guardians of the Galaxy`) so the result scores less than the maximum score of 1.\n", + "However, we can see that our query graph is really similar to Groot and Star-Lord from Guardians of the Galaxy.\n", + "Since this is the best match we know that no other part of the target graph matches our query more closely." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dstaff/anaconda3/envs/fornax/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: isinstance(..., numbers.Number)\n", + " if cb.is_numlike(alpha):\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "for i, graph in enumerate(results['graphs'][:1]):\n", + " plt.title('Result {0}, score: {1:.2f}'.format(1, 1. - graph['cost']))\n", + " draw(graph)\n", + " plt.xlim(-1.2,1.2)\n", + " plt.ylim(-1.2,1.2)\n", + " plt.axis('off')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Results 2-4 have a lower score because `star` matches to a different node not adjacent to Guardians of the Galaxy. Further inspection would show that `star` has matched aliases of Star-Lord which are near Guardians of the Galaxy but not adjacent to it." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dstaff/anaconda3/envs/fornax/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: isinstance(..., numbers.Number)\n", + " if cb.is_numlike(alpha):\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "for i, graph in enumerate(results['graphs'][1:4]):\n", + " plt.title('Result {0}, score: {1:.2f}'.format(i+2, 1. - graph['cost']))\n", + " draw(graph)\n", + " plt.xlim(-1.2,1.2)\n", + " plt.ylim(-1.2,1.2)\n", + " plt.axis('off')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final match pairs `guardians` and `star` to two nodes that do not have similar edges to the target graph. `groot` is not found in the target graph. The result gets a much lower score than the preceding results and we can be sure that any additional results will also be poor because the results are ordered." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dstaff/anaconda3/envs/fornax/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: isinstance(..., numbers.Number)\n", + " if cb.is_numlike(alpha):\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "for i, graph in enumerate(results['graphs'][4:]):\n", + " plt.title('Result {0}, score: {1:.2f}'.format(i+5, 1. - graph['cost']))\n", + " draw(graph)\n", + " plt.xlim(-1.2,1.2)\n", + " plt.ylim(-1.2,1.2)\n", + " plt.axis('off')\n", + " plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorial/tutorial2.py b/docs/tutorial/tutorial2.py new file mode 100644 index 0000000..c793fdc --- /dev/null +++ b/docs/tutorial/tutorial2.py @@ -0,0 +1,363 @@ + +# coding: utf-8 + +# # Tutorial + +# In[1]: + + +import os +import sys +import pandas as pd +import json +import matplotlib.pyplot as plt +import networkx as nx +import fornax + +get_ipython().run_line_magic('matplotlib', 'inline') +from IPython.core.display import SVG + +# Add project root dir +ROOT_DIR = os.path.abspath("../../") +sys.path.append(ROOT_DIR) + + +# To install the dependencies for this notebook: +# +# ```bash +# conda env create -f environment.yml +# source activate fornax_tutorial +# ``` +# +# To run this notebook from the project root: +# +# ```bash +# cd docs/tutorial +# jupyter-notebook +# ``` + +# In this tutorial we will: +# +# * Load a graph of superheroes and their teams from csv files +# +# * Search for nodes in the graph using a string similarity function +# +# * Use fornax to search for nodes using string similarity and fuzzy graph matching +# +# The data in this tutorial was generated using the preceding notebook: `tutorial1.ipynb`.
+# +# ## Introduction +# +# `nodes.csv` and `edges.csv` contain a graph of superheroes and their teams along with alternative names for those heroes and groups (or aliases). +# +# The image below uses the example of Iron Man, who is known as "Tony" to his friends. +# Iron Man is a member of the Avengers, a.k.a. Earth's Mightiest Heroes. +# Other heroes are also members of The Avengers, and they will also have aliases. +# Other heroes will also be members of other teams, and so on and so forth. +# +# +# All of these heroes, teams and aliases together make our target graph, a graph which we will search using fornax. + +# In[2]: + + +SVG('../img/iron_man.svg') + + +# Let's load the data into the notebook using pandas. + +# In[3]: + + +# used for converting csv values in nodes.csv +mapping = { + '0': 'hero', + '1': 'team', + '2': 'hero_alias', + '3': 'team_alias' +} + +nodes_df = pd.read_csv( + './nodes.csv', + # rename the columns as targets as this will form the target graph + # (the graph which we will be searching) + names=['target_label', 'target_type', 'target_id'], + # ignore the header + header=0, + converters = { + # convert target_type from numeric values to + # literal string representations for ease of reading + 'target_type': lambda key: mapping.get(key) + } +) + +# contains pairs of target node ids +edges_df = pd.read_csv('./edges.csv') + + +# We can see that the target nodes have a label (the hero's primary name). +# The target_type column will be one of `hero`, `team`, `hero_alias`, `team_alias`, the four types of nodes in the graph. +# +# (Note that by hero we mean a person in a comic book who has superpowers regardless of them being good or bad) + +# In[4]: + + +nodes_df['target_label'].head() + + +# Edges are pairs of `target_id` values. +# Note that fornax deals with undirected graphs so there is no need to add the edge in the reverse direction. +# Doing so will cause an exception as the edge will be considered a duplicate.
+ +# In[5]: + + +edges_df.head() + + +# ## Label similarity +# +# For some motivation, before using fornax, let us search for nodes just using their labels. +# Let's search for nodes similar to `guardians`, `star` and `groot`. +# +# We will create a function that given a pair of labels, it will return a score where: +# +# $$0 <= score <= 1$$ +# +# Secondly we'll create a search function that returns rows from our table of target nodes that have a non zero similarity score. + +# In[6]: + + +def node_scoring_function(first: str, second: str): + """ node scoring function takes two strings and returns a + score in the range 0 <= score <= 1 + """ + first_, second_ = sorted((first.lower(), second.lower()), key=len) + # if first is not a substring of second: score = 0 + if not first_ in second_: + return 0 + # otherwise use the relative difference between + # the two lengths + score = len(second_) - len(first_) + score /= max(len(first_), len(second_)) + score = 1. - score + return score + + +# In[7]: + + +def search(query_id: int, query_label: str): + # compute all of the scores + scores = nodes_df['target_label'].apply( + node_scoring_function, + args=(query_label,) + ) + # create a boolean mask + mask = scores > 0 + # grab the non zero scoring nodes + matches = nodes_df[mask].copy() + # add extra columns + matches['score'] = scores[mask] + matches['query_label'] = query_label + matches['query_id'] = query_id + return matches + + +# ### Aside: +# Note that these string search functions are not terribly efficient. +# They involve repeated full scans of the target nodes table. +# If we were searching a larger graph we could use a search tree as an index, an external string matching service or database. However, since this is a tutorial, the above functions are simpler and more reproducible. +# This is important as we will be using these search results with fornax.
+ +# In[8]: + + +query_labels = ['guardians', 'star', 'groot'] + + +# Examining the table below we can see that we have a conundrum. +# There are 22 nodes with varying similarity to `star` and 4 nodes similar to `guardians`. + +# In[9]: + + +# find the nodes similar to 'guardians', 'star' and 'groot' +matches = pd.concat(search(id_, label) for id_, label in enumerate(query_labels)) +matches + + +# Fornax enables a more powerful type of search. +# By specifying 'guardians', 'star', 'groot' as nodes in a graph, +# and by specifying the relationships between them, +# we can search for nodes in our target graph with the same relationships. + +# ## Creating a target graph +# +# Fornax behaves much like a database. In fact it uses SQLite or Postgresql to store graph data and index it. +# To insert a new graph into fornax we can use the following three steps: +# 1. create a new graph +# 2. add nodes and node meta data +# 3. add edges and edge meta data +# +# The object `fornax.GraphHandle` is much like a file handle. It does not represent the graph but it is an accessor to it. +# If the `GraphHandle` goes out of scope the graph will still persist until it is explicitly deleted, much like a file. + +# In[10]: + + +conn = fornax.Connection() +conn.open() + + +# In[11]: + + +target_graph = fornax.GraphHandle.create(conn) +target_graph.add_nodes( + # use id_src to set a custom id on each node + id_src=nodes_df['target_id'], + # use other keyword arguments to attach arbitrary metadata to each node + label=nodes_df['target_label'], + # the type keyword is reserved so we use target_type + target_type=nodes_df['target_type'] + # meta data must be json serialisable +) +target_graph.add_edges(edges_df['start'], edges_df['end']) + + +# We can use the `graph_id` to access our graph in the future.
+ +# In[12]: + + +target_graph.graph_id +another_target_graph_handle = fornax.GraphHandle.read(conn, target_graph.graph_id) +another_target_graph_handle == target_graph + + +# ## Creating a query graph +# +# Let's imagine that we suspect `groot` is directly related to `guardians` and `star` is also directly related to `guardians`. +# For example `groot` and `star` could both be members of a team called `guardians`. +# Let's create another small graph that represents this situation: + +# In[13]: + + +# create a new graph +query_graph = fornax.GraphHandle.create(conn) + +# insert the three nodes: +# 'guardians' (id=0), 'star' (id=1), 'groot' (id=2) +query_graph.add_nodes(label=query_labels) + +# alternatively: +# query_graph.add_nodes(id_src=query_labels) +# since id_src can use any unique hashable items + +edges = [ + (0, 1), # edge between star and guardians + (0, 2) # edge between groot and guardians +] + +sources, targets = zip(*edges) +query_graph.add_edges(sources, targets) + + +# ## Search +# +# We can create a query in an analogous way to creating graphs using a `QueryHandle`, +# a handle to a query stored in the fornax database. +# To create a useful query we need to insert the string similarity scores we computed in the label similarity section above. +# Fornax will use these scores and the graph edges to execute the query. + +# In[14]: + + +query = fornax.QueryHandle.create(conn, query_graph, target_graph) +query.add_matches(matches['query_id'], matches['target_id'], matches['score']) + + +# Finally we can execute the query using a variety of options. +# We specify we want the top 5 best matches between the query graph and the target graph. + +# In[15]: + + +get_ipython().run_line_magic('time', 'results = query.execute(n=5)') + + +# ## Visualise +# +# `query.execute` returns an object describing the search result. +# Of primary interest is the `graphs` field which contains a list of graphs in `node_link_graph` format.
+# We can use networkx to draw these graphs and visualise the results. + +# In[16]: + + +def draw(graph): + """ function for drawing a graph using matplotlib and networkx""" + + # each graph is already in node_link_graph format + G = nx.json_graph.node_link_graph(graph) + + labels = {node['id']: node['label'] for node in graph['nodes']} + node_colour = ['r' if node['type'] == 'query' else 'b' for node in graph['nodes']] + pos = nx.spring_layout(G) + nx.draw_networkx_nodes(G, pos, node_size=600, node_color=node_colour, alpha=.3) + edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] != 'match'] + nx.draw_networkx_edges(G, pos, width=3, edgelist=edgelist, edge_color='grey', alpha=.3) + edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] == 'match'] + nx.draw_networkx_edges(G, pos, width=3, edgelist=edgelist, style='dashed', edge_color='pink') + nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif', labels=labels) + + +# Result 1 contains the best match. The three query nodes (in red) best match the three target nodes (in blue). The dashed lines show which pairs of query and target nodes matched each other. The blue nodes are a subgraph of the target graph. Note that the result does not describe the whole target graph because in principle it can be very large. +# +# Here we can see that the blue subgraph has exactly the same shape as the red query graph. However, the labels are not exactly the same (e.g. `guardians != Guardians of the Galaxy`) so the result scores less than the maximum score of 1. +# However, we can see that our query graph is really similar to Groot and Star-Lord from Guardians of the Galaxy. +# Since this is the best match we know that + +# In[17]: + + +for i, graph in enumerate(results['graphs'][:1]): + plt.title('Result {0}, score: {1:.2f}'.format(1, 1. 
- graph['cost'])) + draw(graph) + plt.xlim(-1.2,1.2) + plt.ylim(-1.2,1.2) + plt.axis('off') + plt.show() + + +# Results 2-4 have a lower score because `star` matches to a different node not adjacent to Guardians of the Galaxy. Further inspection would show that `star` has matched aliases of Star-Lord which are near Guardians of the Galaxy but not adjacent to it. + +# In[18]: + + +for i, graph in enumerate(results['graphs'][1:4]): + plt.title('Result {0}, score: {1:.2f}'.format(i+2, 1. - graph['cost'])) + draw(graph) + plt.xlim(-1.2,1.2) + plt.ylim(-1.2,1.2) + plt.axis('off') + plt.show() + + +# The final match pairs `guardians` and `star` to two nodes that do not have similar edges to the target graph. `groot` is not found in the target graph. The result gets a much lower score than the preceding results and we can be sure that any additional results will also be poor because the results are ordered. + +# In[19]: + + +for i, graph in enumerate(results['graphs'][4:]): + plt.title('Result {0}, score: {1:.2f}'.format(i+5, 1.
- graph['cost'])) + draw(graph) + plt.xlim(-1.2,1.2) + plt.ylim(-1.2,1.2) + plt.axis('off') + plt.show() + diff --git a/environment.yml b/environment.yml index 9ff3a0c..05ce63c 100644 --- a/environment.yml +++ b/environment.yml @@ -3,33 +3,36 @@ channels: - conda-forge - defaults dependencies: -- keepalive=0.5=py_1 - appnope=0.1.0=py36hf537a9a_0 - backcall=0.1.0=py36_0 - blas=1.0=mkl - bleach=3.0.2=py36_0 - ca-certificates=2018.03.07=0 - certifi=2018.10.15=py36_0 +- cycler=0.10.0=py36hfc81398_0 - dbus=1.13.2=h760590f_1 - decorator=4.3.0=py36_0 - entrypoints=0.2.3=py36_2 - expat=2.2.6=h0a44026_0 +- freetype=2.9.1=hb4e5f40_0 - gettext=0.19.8.1=h15daf44_3 - glib=2.56.2=hd9629dc_0 - icu=58.2=h4b95b61_1 - intel-openmp=2019.0=118 - ipykernel=5.1.0=py36h39e3cac_0 -- ipython=7.1.1=py36h39e3cac_0 - ipython_genutils=0.2.0=py36h241746c_0 +- ipython=7.1.1=py36h39e3cac_0 - ipywidgets=7.4.2=py36_0 - jedi=0.13.1=py36_0 - jinja2=2.10=py36_0 - jpeg=9b=he5867d9_2 - jsonschema=2.6.0=py36hb385e00_0 -- jupyter=1.0.0=py36_7 - jupyter_client=5.2.3=py36_0 - jupyter_console=6.0.0=py36_0 - jupyter_core=4.4.0=py36_0 +- jupyter=1.0.0=py36_7 +- keepalive=0.5=py_1 +- kiwisolver=1.0.1=py36h0a44026_0 - libcxx=4.0.1=hcfea43d_1 - libcxxabi=4.0.1=hcfea43d_1 - libedit=3.1.20170329=hb402a30_2 @@ -39,17 +42,19 @@ dependencies: - libpng=1.6.35=ha441bb4_0 - libsodium=1.0.16=h3efe00b_0 - markupsafe=1.1.0=py36h1de35cc_0 +- matplotlib=3.0.1=py36h54f8f79_0 - mistune=0.8.4=py36h1de35cc_0 -- mkl=2019.0=118 - mkl_fft=1.0.6=py36hb8a8100_0 - mkl_random=1.0.1=py36h5d10147_1 +- mkl=2018.0.3=1 - nbconvert=5.3.1=py36_0 - nbformat=4.4.0=py36h827af21_0 - ncurses=6.1=h0a44026_0 +- networkx=2.2=py36_1 - notebook=5.7.1=py36_0 -- numpy=1.15.4=py36h6a91979_0 - numpy-base=1.15.4=py36h8a80b8c_0 -- openssl=1.1.1=h1de35cc_0 +- numpy=1.15.4=py36h6a91979_0 +- openssl=1.1.1a=h1de35cc_0 - pandas=0.23.4=py36h6440ff4_0 - pandoc=2.2.3.2=0 - pandocfilters=1.4.2=py36_1 @@ -62,9 +67,10 @@ dependencies: - prompt_toolkit=2.0.7=py36_0 - 
ptyprocess=0.6.0=py36_0 - pygments=2.2.0=py36h240cd3f_0 +- pyparsing=2.3.0=py36_0 - pyqt=5.9.2=py36h655552a_2 -- python=3.6.7=haf84260_0 - python-dateutil=2.7.5=py36_0 +- python=3.6.7=haf84260_0 - pytz=2018.7=py36_0 - pyzmq=17.1.2=py36h1de35cc_0 - qt=5.9.6=h45cd832_2 @@ -87,3 +93,7 @@ dependencies: - xz=5.2.4=h1de35cc_4 - zeromq=4.2.5=h0a44026_1 - zlib=1.2.11=hf3cbc9b_2 +- pip: + - sparqlwrapper + - psycopg2 + - SQLAlchemy diff --git a/fornax/__init__.py b/fornax/__init__.py index dca354d..5d65c4e 100644 --- a/fornax/__init__.py +++ b/fornax/__init__.py @@ -1,2 +1,2 @@ -from fornax.api import GraphHandle, QueryHandle +from fornax.api import Connection, GraphHandle, QueryHandle __version__ = '0.0.1' diff --git a/fornax/api.py b/fornax/api.py index 3de86e8..e101adb 100644 --- a/fornax/api.py +++ b/fornax/api.py @@ -1,29 +1,3 @@ -""" -Fornax API documentation -======================== - -Introduction ------------- - -Fornax performs fuzzy subgraph matching between graphs with labelled nodes. -Given a small graph (the query graph) and a large graph (the target graph) -fornax will approximate the top `n` subgraphs in the target graph that are most -similar to the query graph even if the node labels and graph relationships are -not exactly the same. - -Use this query API to specify query and target graphs and to seach for fuzzy -subgraph matches of the query graph to the target graph. - -fornax is designed to handle very large graphs of millions of nodes. -As such graphs are persisted in a database. -Rather than interacting directly with a graph, the API implements GraphHandles. -These are similar to file handles or file pointers for a file system. -They allow the user to Create, Read, Update and Delete graphs but much like a -file the graphs will still persist even if the handle goes out of scope. - -Similarly query objects, which define a search operation, can be created using -a QueryHandle. 
-""" import fornax.select import fornax.opt import sqlalchemy @@ -42,65 +16,111 @@ # TODO: sqlalchemy database integrity exceptions are not caught by the API -"""URL for a supported SQL database backend""" -FORNAX_DB_URL = os.environ.get('FORNAX_DB_URL') -if FORNAX_DB_URL is None: - FORNAX_DB_URL = 'sqlite://' -MAX_SIZE = sys.maxsize -SQLITE_MAX_SIZE = 2147483647 -if FORNAX_DB_URL == 'sqlite://': - MAX_SIZE = min(MAX_SIZE, SQLITE_MAX_SIZE) - -ECHO = False -ENGINE = sqlalchemy.create_engine(FORNAX_DB_URL, echo=ECHO) -CONNECTION = ENGINE.connect() -Session = sqlalchemy.orm.sessionmaker(bind=ENGINE) -fornax.model.Base.metadata.create_all(CONNECTION) +# enforce foreign key constrains in SQLite +@event.listens_for(Engine, "connect") +def _set_sqlite_pragma(dbapi_connection, connection_record): + cursor = dbapi_connection.cursor() + cursor.execute("PRAGMA foreign_keys=ON") + cursor.close() -def _hash(item: str) -> int: - """An unsalted hash function with a range between 0 and MAX_SIZE +def _hash(item: str, maxsize=sys.maxsize) -> int: + """An unsalted hash function with a range between 0 and maxsize :param item: string or string like object that is accepted by builtin function `str` :type item: str - :return: hash between 0 and MAX_SIZE + param maxsize: maximum value of returned integer + :type maxsize: int + :return: hash between 0 and maxsize :rtype: int """ - if isinstance(item, int): - return item % MAX_SIZE + return item % maxsize else: return int( hashlib.sha256(str(item).encode('utf-8')).hexdigest(), 16 - ) % MAX_SIZE + ) % maxsize -# enforce foreign key constrains in SQLite -@event.listens_for(Engine, "connect") -def _set_sqlite_pragma(dbapi_connection, connection_record): - cursor = dbapi_connection.cursor() - cursor.execute("PRAGMA foreign_keys=ON") - cursor.close() +class Connection: + """ + Create a new database connection. + If the database is empty :class:`Connection` will create + any missing schema. 
+ Currently sqlite and postgresql are actively supported + as backend databases. -@contextlib.contextmanager -def session_scope(): - """ - Provide a transactional scope around a series of db operations. - Transactions will be rolled back in the case of an exception. + In addition to the open, close syntax + Connection supports the context manager syntax:: + + with Connection("postgresql://user:password@0.0.0.0/mydb") as conn: + graph = fornax.GraphHandle.create(conn) + + :param url: dialect[+driver]://user:password@host/dbname[?key=value..] + :type url: str, optional """ - session = Session() - try: - yield session - session.commit() - except BaseException: - session.rollback() - raise - finally: - session.close() + SQLITE_MAX_SIZE = 2147483647 + + def __init__(self, url='sqlite://', **kwargs): + + self.url = url + self.engine = sqlalchemy.create_engine(self.url, **kwargs) + self.make_session = sqlalchemy.orm.sessionmaker(bind=self.engine) + self.maxsize = sys.maxsize + if self.url.startswith('sqlite'): + self.maxsize = self.SQLITE_MAX_SIZE + + def open(self): + """ Open the fornax database connection + and create any absent tables and indices + """ + self.connection = self.engine.connect() + fornax.model.Base.metadata.create_all(self.connection) + + def close(self): + """ Close the fornax database connection + and free any connections in the connection pool + """ + + self.connection.close() + + def __enter__(self): + self.open() + return self + + def __exit__(self, *args): + self.close() + + @contextlib.contextmanager + def _get_session(self): + """ + Provide a transactional scope around a series of db operations. + Transactions will be rolled back in the case of an exception.
+ """ + session = self.make_session() + try: + yield session + session.commit() + except Exception: + session.rollback() + raise + finally: + session.close() + + def _hash(self, item: str) -> int: + """An unsalted hash function with a range between 0 and self.maxsize + + :param item: string or string like object that is accepted by builtin + function `str` + :type item: str + :return: hash between 0 and self.maxsize + :rtype: int + """ + return _hash(item, self.maxsize) class InvalidNodeError(Exception): @@ -180,23 +200,6 @@ def __repr__(self): def __lt__(self, other): return (self.type, self.id) < (other.type, other.id) - def to_dict(self) -> dict: - """Return self as a json serialisable dictionary - - :return: dictionary with keys `id`, `type` and `meta` - :rtype: dict - """ - - return { - # hash id with type so that the node id is unique to a given - # submatch result - **{ - 'id': _hash((self.id, self.type)), - 'type': self.type - }, - **self.meta - } - class Edge: """Representation of an Edge used internally be QueryHandle @@ -241,52 +244,22 @@ def __repr__(self): self.start, self.end, self.type, self.meta ) - def to_dict(self): - """Return self as a json serialisable dictionary - - Returns: - dict -- dictionart with keys start, end, type, metadata and weight - """ - if self.type == 'query' or self.type == 'target': - # hash start and end with the edge type - # to make id unique within a subgraph match - start = _hash((self.start, self.type)) - end = _hash((self.end, self.type)) - - elif self.type == 'match': - # hash start and end with the edge type - # to make id unique within a subgraph match - start = _hash((self.start, 'query')) - end = _hash((self.end, 'target')) - return { - **{ - 'source': start, - 'target': end, - 'type': self.type, - 'weight': self.weight - }, - **self.meta - } - class GraphHandle: - """Accessor for a graph - - Because fornax is designed to operate on very large graphs node and edges - are not stored in memory. 
- Rather, they are persisted using a database back end. - Currently sqlite and postgres are supported. + """ - GraphHandle is an interface to this persistent layer. - One can access an existing graph by - specifying it using the `graph_id` itentifier. + Create a handle to an existing graph with id *graph_id* + accessed via *connection*. + :param connection: fornax database connection + :type connection: Connection :param graph_id: unique id for an existing graph :type graph_id: int """ - def __init__(self, graph_id: int): + def __init__(self, connection: Connection, graph_id: int): self._graph_id = graph_id + self.conn = connection self._check_exists() def __len__(self): @@ -309,18 +282,23 @@ def __eq__(self, other): @property def graph_id(self): - """Unique identifier for a graph""" + """Get the unique id for this graph + + Graph ids are automatically assigned at creation time. + """ return self._graph_id @classmethod - def create(cls): - """Create a new empy graph and return a GraphHandle to it + def create(cls, connection: Connection): + """Create a new empty graph via *connection* and return a GraphHandle to it + :param connection: a fornax database connection + :type connection: Connection :return: GraphHandle to a new graph :rtype: GraphHandle """ - with session_scope() as session: + with connection._get_session() as session: query = session.query( sqlalchemy.func.max(model.Graph.graph_id) @@ -332,24 +310,25 @@ def create(cls): else: graph_id += 1 session.add(model.Graph(graph_id=graph_id)) - session.commit() - return GraphHandle(graph_id) + return GraphHandle(connection, graph_id) @classmethod - def read(cls, graph_id: int): + def read(cls, connection: Connection, graph_id: int): """Create a new GraphHandle to an existing graph with unique identifier `graph_id` + :param connection: a fornax database connection + :type connection: Connection :param graph_id: unique identifier for an existing graph :type graph_id: int :return: A new graph handle to an existing
graph :rtype: GraphHandle """ - return GraphHandle(graph_id) + return GraphHandle(connection, graph_id) def delete(self): - """Delete a graph. + """Delete this graph. Delete the graph accessed through graph handle and all of the associated nodes and edges. @@ -357,7 +336,7 @@ def delete(self): """ self._check_exists() - with session_scope() as session: + with self.conn._get_session() as session: session.query( model.Graph ).filter(model.Graph.graph_id == self._graph_id).delete() @@ -369,7 +348,7 @@ def delete(self): ).filter(model.Node.graph_id == self._graph_id).delete() def _check_exists(self): - with session_scope() as session: + with self.conn._get_session() as session: exists = session.query(sqlalchemy.exists().where( model.Graph.graph_id == self._graph_id )).scalar() @@ -381,24 +360,27 @@ def _check_exists(self): def add_nodes(self, **kwargs): """Append nodes to a graph - :param id_src: An iterable if Unique hashable identifiers - for each node, defaults to None - :raises ValueError: Raised if `id` is used as a keyword argument - :raises ValueError: Raised if no keyword arguments are provided + :param id_src: An iterable of unique hashable identifiers, default None + :type id_src: Iterable - If `id_src` is not provided, - each node will be indentifed by order of insertion - using a continuous range index starting at zero. + Keyword arguments can be used to attach arbitrary JSON serialised + metadata to each node:: - Metadata can be attached to each node - by specifying extra keyword arguments - (not that id is reserved). - For example, to attach a name to each node: + # create 3 nodes with ids: 0, 1, 2 + # and names 'Anne', 'Ben', 'Charles' + graph_handle.add_nodes(names=['Anne', 'Ben', 'Charles']) - :Example: + By default, each node will be assigned a sequential integer id + starting from 0.
A custom id can be assigned using the *id_src* + keyword provided that all of the ids are hashable:: - graph_handle.add_node(id_src=[1,2,3], name=['a', 'b', 'c']) + # create 3 nodes with ids: 'Anne', 'Ben', 'Charles' + # and no explicit name field + graph_handle.add_nodes(id_src=['Anne', 'Ben', 'Charles']) + .. note:: + + *id* is a reserved keyword argument which will raise an exception """ keys = kwargs.keys() @@ -426,14 +408,14 @@ def add_nodes(self, **kwargs): nodes = ( model.Node( - node_id=_hash(node_id), + node_id=self.conn._hash(node_id), graph_id=self.graph_id, meta=json.dumps({key: val for key, val in zip(keys, values)}) ) for node_id, values in zipped ) nodes = self._check_nodes(nodes) - with session_scope() as session: + with self.conn._get_session() as session: session.add_all(nodes) session.commit() @@ -442,16 +424,32 @@ def add_edges( ): """Append edges to a graph representing relationships between nodes - :param sources: node `id_src` + :param sources: node id_src :type sources: typing.Iterable - :param targets: node `id_src` + :param targets: node id_src :type targets: typing.Iterable - keyword arguments can be used to attach metadata to the edges. + Keyword arguments can be used to attach metadata to the edges. + For example to add three edges with a relationship attribute friend or + foe:: - :Example: + graph_handle.add_edges( + sources=[0, 1, 2], + targets=[1, 2, 0], + relationship=['friend', 'friend', 'foe'] + ) + Keyword arguments can be used to attach any arbitrary JSON + serialisable data to edges. + + .. 
note:: + + The following keywords are reserved and will raise + an exception - graph_handle.add_edges([0, 0], [1, 1], relation=['friend', 'foe']) + * *start* + * *end* + * *type* + * *weight* """ @@ -469,8 +467,8 @@ def add_edges( if 'weight' in keys: raise(ValueError('weight is a reserved node attribute \ which cannot be assigned using kwargs')) - hashed_sources = map(_hash, sources) - hashed_targets = map(_hash, targets) + hashed_sources = map(self.conn._hash, sources) + hashed_targets = map(self.conn._hash, targets) zipped = itertools.zip_longest( hashed_sources, hashed_targets, *kwargs.values(), fillvalue=NullValue() @@ -491,12 +489,11 @@ def add_edges( for start, end, *values in zipped ) edges = self._check_edges(edges) - with session_scope() as session: + with self.conn._get_session() as session: session.add_all(edges) session.commit() - @staticmethod - def _check_nodes(nodes) -> typing.Generator: + def _check_nodes(self, nodes) -> typing.Generator: """Guard against invalid nodes by raising an InvalidNodeError for forbidden node parameters @@ -516,7 +513,7 @@ def _check_nodes(nodes) -> typing.Generator: raise InvalidNodeError( '{}, node_id must be an integer'.format(node) ) - if node_id > SQLITE_MAX_SIZE and FORNAX_DB_URL == 'sqlite://': + if node_id > self.conn.maxsize and self.conn.url.startswith('sqlite'): raise InvalidNodeError('node id {} is too large'.format(node)) yield node @@ -552,14 +549,18 @@ def _check_edges(edges: typing.Iterable[model.Edge]) -> typing.Generator: class QueryHandle: - """Accessor for a fuzzy subgraph matching query + """Create a handle to an existing query via *connection* with unique id + *query_id*.
+ :param connection: a fornax database connection + :type connection: Connection :param query_id: unique id for an existing query :type query_id: int """ - def __init__(self, query_id: int): + def __init__(self, connection: Connection, query_id: int): self.query_id = query_id + self.conn = connection self._check_exists() def __eq__(self, other): @@ -571,9 +572,8 @@ def __len__(self): Returns: {int} -- Count of matching edges """ - self._check_exists() - with session_scope() as session: + with self.conn._get_session() as session: count = session.query(model.Match).filter( model.Match.query_id == self.query_id).count() return count @@ -584,8 +584,7 @@ def _check_exists(self): Raises: ValueError -- Raised if the query had been deleted """ - - with session_scope() as session: + with self.conn._get_session() as session: exists = session.query(model.Query).filter( model.Query.query_id == self.query_id ).scalar() @@ -595,18 +594,22 @@ def _check_exists(self): ) @classmethod - def create(cls, query_graph: GraphHandle, target_graph: GraphHandle): + def create( + cls, connection: Connection, + query_graph: GraphHandle, target_graph: GraphHandle + ): """Create a new query and return a QueryHandle for it - :param query_graph: Subgraph to be search for in the target graph + :param connection: a fornax database connection + :type connection: Connection + :param query_graph: subgraph to find target graph :type query_graph: GraphHandle :param target_graph: Graph to be searched :type target_graph: GraphHandle :return: new QueryHandle :rtype: QueryHandle """ - - with session_scope() as session: + with connection._get_session() as session: query_id = session.query( sqlalchemy.func.max(model.Query.query_id) ).first()[0] @@ -620,26 +623,27 @@ def create(cls, query_graph: GraphHandle, target_graph: GraphHandle): end_graph_id=target_graph.graph_id ) session.add(new_query) - return QueryHandle(query_id) + return QueryHandle(connection, query_id) @classmethod - def read(cls, query_id: 
int): - """Create a new QueryHandle to an existing query with unique id `query_id` + def read(cls, connection: Connection, query_id: int): + """Create a new QueryHandle to an existing query with unique id *query_id* + via *connection*. + :param connection: a fornax database connection + :type connection: Connection :param query_id: unique identifier for a query :type query_id: int :return: new QueryHandle :rtype: QueryHandle """ - - return QueryHandle(query_id) + return QueryHandle(connection, query_id) def delete(self): """Delete this query and any associated matches """ - self._check_exists() - with session_scope() as session: + with self.conn._get_session() as session: session.query(model.Query).filter( model.Query.query_id == self.query_id ).delete() @@ -655,14 +659,14 @@ def query_graph(self) -> GraphHandle: """ self._check_exists() - with session_scope() as session: + with self.conn._get_session() as session: start_graph = session.query( model.Graph ).join( model.Query, model.Graph.graph_id == model.Query.start_graph_id ).filter(model.Query.query_id == self.query_id).first() graph_id = start_graph.graph_id - return GraphHandle(graph_id) + return GraphHandle(self.conn, graph_id) def target_graph(self) -> GraphHandle: """Get a QueryHandle for the target graph @@ -672,14 +676,14 @@ def target_graph(self) -> GraphHandle: """ self._check_exists() - with session_scope() as session: + with self.conn._get_session() as session: end_graph = session.query( model.Graph ).join( model.Query, model.Graph.graph_id == model.Query.end_graph_id ).filter(model.Query.query_id == self.query_id).first() graph_id = end_graph.graph_id - return GraphHandle(graph_id) + return GraphHandle(self.conn, graph_id) def add_matches( self, @@ -688,13 +692,7 @@ def add_matches( weights: typing.Iterable[float], **kwargs ): - """Add candidate matches between the query graph and the target graph - - Matches represent a pairwise node similarity - between all nodes in the query graph - and all nodes 
in the target graph. - Only similarities with non zero score need to be stated explicitly. - Matches with zero score are implicit. + """Add matches between the query graph and the target graph :param sources: Iterable of `src_id` in the query graph :type sources: typing.Iterable[int] @@ -703,6 +701,22 @@ def add_matches( :param weights: Iterable of weights between 0 and 1 :type weights: typing.Iterable[float] + For example, to add matches between + + * node *0* in the query graph and node *0* in the target graph \ + with weight *.9* + + * node *0* in the query graph and node *1* in the target graph \ + with weight *.1* + + then:: + + query.add_matches([0, 0], [0, 1], [.9, .1]) + + .. note:: + + Adding weights that compare equal to zero will raise an exception. + """ self._check_exists() @@ -719,8 +733,8 @@ def add_matches( if 'weight' in keys: raise(ValueError('weight is a reserved node attribute \ which cannot be assigned using kwargs')) - hashed_sources = map(_hash, sources) - hashed_targetes = map(_hash, targets) + hashed_sources = map(self.conn._hash, sources) + hashed_targetes = map(self.conn._hash, targets) zipped = itertools.zip_longest( hashed_sources, hashed_targetes, weights, *kwargs.values(), fillvalue=NullValue() @@ -740,7 +754,7 @@ def add_matches( for start, end, weight, *values in zipped ) matches = self._check_matches(matches) - with session_scope() as session: + with self.conn._get_session() as session: session.add_all(matches) session.commit() @@ -792,7 +806,7 @@ def _check_matches( yield match def _query_nodes(self): - with session_scope() as session: + with self.conn._get_session() as session: nodes = session.query(model.Node).join( model.Query, model.Node.graph_id == model.Query.start_graph_id ).filter(model.Query.query_id == self.query_id).all() @@ -802,7 +816,7 @@ def _query_nodes(self): return nodes def _query_edges(self): - with session_scope() as session: + with self.conn._get_session() as session: edges = session.query(model.Edge).join( 
model.Query, model.Edge.graph_id == model.Query.start_graph_id ).filter( @@ -817,7 +831,7 @@ def _query_edges(self): return edges def _target_nodes(self): - with session_scope() as session: + with self.conn._get_session() as session: nodes = session.query(model.Node).join( model.Query, model.Node.graph_id == model.Query.end_graph_id ).filter( @@ -836,7 +850,7 @@ def is_between(target_ids, edge): def _target_edges(self, target_nodes, target_edges_arr): # only include target edges that are between the target nodes above - with session_scope() as session: + with self.conn._get_session() as session: EndMatch = sqlalchemy.alias(model.Match, "end_match") EndNode = sqlalchemy.alias(model.Node, "end_node") StartNode = sqlalchemy.alias(model.Node, "start_node") @@ -859,7 +873,7 @@ def _target_edges(self, target_nodes, target_edges_arr): return edges def _optimise(self, hopping_distance, max_iters, offsets): - with session_scope() as session: + with self.conn._get_session() as session: sql_query = fornax.select.join( self.query_id, h=hopping_distance, offsets=offsets ) @@ -883,18 +897,60 @@ def _get_scores(cls, inference_costs, query_nodes, subgraphs, sz): scores.append(score) return scores + def _node_to_dict(self, node: Node) -> dict: + """Return self as a json serialisable dictionary + + :return: dictionary with keys `id`, `type` and `meta` + :rtype: dict + """ + + return { + # hash id with type so that the node id is unique to a given + # submatch result + **{ + 'id': self.conn._hash((node.id, node.type)), + 'type': node.type + }, + **node.meta + } + + def _edge_to_dict(self, edge: Edge): + """Return self as a json serialisable dictionary + + Returns: + dict -- dictionart with keys start, end, type, metadata and weight + """ + if edge.type == 'query' or edge.type == 'target': + # hash start and end with the edge type + # to make id unique within a subgraph match + start = self.conn._hash((edge.start, edge.type)) + end = self.conn._hash((edge.end, edge.type)) + + elif 
edge.type == 'match': + # hash start and end with the edge type + # to make id unique within a subgraph match + start = self.conn._hash((edge.start, 'query')) + end = self.conn._hash((edge.end, 'target')) + return { + **{ + 'source': start, + 'target': end, + 'type': edge.type, + 'weight': edge.weight + }, + **edge.meta + } + def execute(self, n=5, hopping_distance=2, max_iters=10): - """Execute a fuzzy subgraph matching query + """Execute a fuzzy subgraph matching query finding the top *n* subgraph + matches between the query graph and the target graph. - :param n: number of subgraph matches to return, defaults to 5 - :param n: int, optional + :param n: number of subgraph matches to return + :type n: int, optional :param hopping_distance: lengthscale hyperparameter, defaults to 2 - :param hopping_distance: int, optional - :param max_iters: maximum number of optimisation iterations, - defaults to 10 - :param max_iters: int, optional - :raises ValueError: Raised if there are no matches - between the query and target graph + :type hopping_distance: int, optional + :param max_iters: maximum number of optimisation iterations + :type max_iters: int, optional :return: query result :rtype: dict """ @@ -921,37 +977,64 @@ def execute(self, n=5, hopping_distance=2, max_iters=10): # sort graphs by score then deturministicly by hashing idxs = sorted( enumerate(scores), - key=lambda x: (x[1], _hash(tuple(subgraphs[x[0]]))) + key=lambda x: (x[1], self.conn._hash(tuple(subgraphs[x[0]]))) ) - query_nodes_payload = [node.to_dict() for node in query_nodes] - query_edges_payload = [edge.to_dict() for edge in query_edges] - target_nodes_payload = [node.to_dict() for node in target_nodes] - target_edges_payload = [edge.to_dict() for edge in target_edges] + query_nodes_payload = [ + self._node_to_dict(node) + for node in query_nodes + ] + + query_edges_payload = [ + self._edge_to_dict(edge) + for edge in query_edges + ] + + target_nodes_payload = [ + self._node_to_dict(node) + for node 
in target_nodes + ] + + target_edges_payload = [ + self._edge_to_dict(edge) + for edge in target_edges + ] for i, score in idxs[:min(n, len(idxs))]: + _, match_ends = zip(*subgraphs[i]) + matches = [ - Edge(s, e, 'match', {}, 1. - inference_costs[s, e]).to_dict() + self._edge_to_dict( + Edge(s, e, 'match', {}, 1. - inference_costs[s, e]) + ) for s, e in sorted(subgraphs[i]) ] - match_ends = set(_hash((i, 'target')) for i in match_ends) + + match_ends = set( + self.conn._hash((i, 'target')) + for i in match_ends + ) + nxt_graph = { 'is_multigraph': False, 'cost': score, 'nodes': list(query_nodes_payload), # make a copy 'links': matches + list(query_edges_payload) # make a copy } + nxt_graph['nodes'].extend([ n for n in target_nodes_payload if n['id'] in match_ends ]) + nxt_graph['links'].extend( [ e for e in target_edges_payload if e['source'] in match_ends and e['target'] in match_ends ] ) + graphs.append(nxt_graph) return { diff --git a/notebooks/tutorial/Tutorial 2 - Making a Query.ipynb b/notebooks/tutorial/Tutorial 2 - Making a Query.ipynb deleted file mode 100644 index b86a49f..0000000 --- a/notebooks/tutorial/Tutorial 2 - Making a Query.ipynb +++ /dev/null @@ -1,561 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import json\n", - "import fornax\n", - "import matplotlib\n", - "import networkx as nx\n", - "\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Performing a Fuzzy Graph Query using Fornax\n", - "\n", - "Im this tutorial we will perform a fuzzy search on the dataset curated during the first tutorial.\n", - "\n", - "You need to run tutorial 1 before doing this tutorial otherwise you will not be able to load any of the data produced.\n", - "\n", - "## Step 1: Load the data from the first tutorial" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - 
"source": [ - "nodes_df = pd.read_csv('./nodes.csv')\n", - "edges_df = pd.read_csv('./edges.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
labeltypeuid
0Jubilee01126900601
1Silver Surfer0440245546
2Beast01370118169
3Selene087770955
4Sabretooth01299099267
\n", - "
" - ], - "text/plain": [ - " label type uid\n", - "0 Jubilee 0 1126900601\n", - "1 Silver Surfer 0 440245546\n", - "2 Beast 0 1370118169\n", - "3 Selene 0 87770955\n", - "4 Sabretooth 0 1299099267" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nodes_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
endstart
02729691141126900601
1741356524440245546
214496268731370118169
360991860287770955
414496268731299099267
\n", - "
" - ], - "text/plain": [ - " end start\n", - "0 272969114 1126900601\n", - "1 741356524 440245546\n", - "2 1449626873 1370118169\n", - "3 609918602 87770955\n", - "4 1449626873 1299099267" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edges_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Create a target graph\n", - "\n", - "We create a target graph by creating a graph handle and loading the nodes and edges into the graph.\n", - "\n", - "We can attach metadata to the nodes using keyword arguments of `GraphHandle.add_nodes`.\n", - "`id_src` is a special field which specifies a unique id for each node. By default `id_src` is a continuous range of integers starting at zero." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "target_graph = fornax.GraphHandle.create()\n", - "target_graph.add_nodes(id_src=nodes_df['uid'], label=nodes_df['label'], type_=nodes_df['type'])\n", - "target_graph.add_edges(edges_df['start'], edges_df['end'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Create a query graph\n", - "\n", - "Creating a query graph is exactly the same as creating a target graph. Because this query graph is small we don't load any values from file." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "query_graph = fornax.GraphHandle.create()\n", - "query_graph.add_nodes(id_src=[0, 1, 2], label=['hulk', 'lady', 'storm'])\n", - "query_graph.add_edges([0, 1], [1, 2])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Create a query\n", - "\n", - "A query needs a set of correspondances between the query and target graph.\n", - "They can be anything provided that they are scored between 0 and 1.\n", - "Below we create a correspondance between a query node and a target node. If the query node label is a substring of a target node label the correspondance scores 1, otherwise it is zero we don't specify it explicitly.\n", - "\n", - "Just like graph edges, matches are specified using the `src_id` of source and target nodes respectivly." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "matches = []\n", - "for i in nodes_df[nodes_df['label'].str.contains(\"(?i)hulk\")]['uid']:\n", - " matches.append((0, i, 1))\n", - "for i in nodes_df[nodes_df['label'].str.contains('(?i)lady')]['uid']:\n", - " matches.append((1, i, 1))\n", - "for i in nodes_df[nodes_df['label'].str.contains('(?i)storm')]['uid']:\n", - " matches.append((2, i, 1))\n", - "sources, targets, weights = zip(*matches)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(0, 2020667501, 1),\n", - " (0, 2087196931, 1),\n", - " (0, 2142361735, 1),\n", - " (0, 1056123890, 1),\n", - " (0, 970313367, 1),\n", - " (0, 329519748, 1),\n", - " (0, 959673558, 1),\n", - " (0, 560425637, 1),\n", - " (1, 1639515098, 1),\n", - " (1, 995920086, 1)]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "matches[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"A query needs a two graph handles.\n", - "Once created we can insert the matches." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "query = fornax.QueryHandle.create(query_graph, target_graph)\n", - "query.add_matches(sources, targets, weights)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exacute the query\n", - "\n", - "The query produces a python dictionary that can be serialised to json.\n", - "Each `graph` in the result uses the [node link fromat](https://networkx.github.io/documentation/stable/reference/readwrite/generated/networkx.readwrite.json_graph.node_link_data.html#networkx.readwrite.json_graph.node_link_data) for graphs.\n", - "\n", - "The resulting graphs contain\n", - "* The query nodes\n", - "* The optimal target nodes matching the query nodes\n", - "* Edges between the query nodes\n", - "* Edges between the relavent target nodes\n", - "* The matches between the query graph and optimum target nodes" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/dstaff/anaconda3/envs/fornax/lib/python3.6/site-packages/numpy/core/records.py:513: FutureWarning: Numpy has detected that you may be viewing or writing to an array returned by selecting multiple fields in a structured array. 
\n", - "\n", - "This code may break in numpy 1.15 because this will return a view instead of a copy -- see release notes for details.\n", - " return obj.view(dtype=(self.dtype.type, obj.dtype))\n" - ] - } - ], - "source": [ - "# get the n best subgraph matches\n", - "results = query.execute(n=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"graphs\": [\n", - " {\n", - " \"is_multigraph\": false,\n", - " \"cost\": 0.024416640711327393,\n", - " \"nodes\": [\n", - " {\n", - " \"id\": 1934515491,\n", - " \"type\": \"query\",\n", - " \"id_src\": 0,\n", - " \"label\": \"hulk\"\n", - " },\n", - " {\n", - " \"id\": 1325512080,\n", - " \"type\": \"query\",\n", - " \"id_src\": 1,\n", - " \"label\": \"lady\"\n", - " },\n", - " {\n", - " \"id\": 1876249886,\n", - " \"type\": \"query\",\n", - " \"id_src\": 2,\n", - " \"label\": \"storm\"\n", - " },\n", - " {\n", - " \"id\": 715503065,\n", - " \"type\": \"target\",\n", - " \"id_src\": 37644418,\n", - " \"label\": \" Susan Storm\",\n", - " \"type_\": 2\n", - " },\n", - " {\n", - " \"id\": 1936330398,\n", - " \"type\": \"target\",\n", - " \"id_src\": 995920086,\n", - " \"label\": \"Lady Liberators\",\n", - " \"type_\": 1\n", - " },\n", - " {\n", - " \"id\": 168902329,\n", - " \"type\": \"target\",\n", - " \"id_src\": 2142361735,\n", - " \"label\": \"She-Hulk\",\n", - " \"type_\": 0\n", - " }\n", - " ],\n", - " \"links\": [\n", - " {\n", - " \"source\": 1934515491,\n", - " \"target\": 168902329,\n", - " \"type\": \"match\",\n", - " \"weight\": 0.9869624795392156\n", - " },\n", - " {\n", - " \"source\": 1325512080,\n", - " \"target\": 1936330398,\n", - " \"type\": \"match\",\n", - " \"weight\": 0.9746778514236212\n", - " },\n", - " {\n", - " \"source\": 1876249886,\n", - " \"target\": 715503065,\n", - " \"type\": \"match\",\n", - " \"weight\": 0.9651097469031811\n", - " },\n", - " {\n", - " \"source\": 
1934515491,\n", - " \"target\": 1325512080,\n", - " \"type\": \"query\",\n", - " \"weight\": 1.0\n", - " },\n", - " {\n", - " \"source\": 1325512080,\n", - " \"target\": 1876249886,\n", - " \"type\": \"query\",\n", - " \"weight\": 1.0\n", - " },\n", - " {\n", - " \"source\": 1936330398,\n", - " \"target\": 168902329,\n", - " \"type\": \"target\",\n", - " \"weight\": 1.0\n", - " }\n", - " ]\n", - " }\n", - " ],\n", - " \"iters\": 2,\n", - " \"hopping_distance\": 2,\n", - " \"max_iters\": 10\n", - "}\n" - ] - } - ], - "source": [ - "print(json.dumps(results, indent=4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Draw the result using networkx" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "graph = results['graphs'][0]\n", - "G = nx.json_graph.node_link_graph(graph)\n", - "nodes = G.nodes()\n", - "cols = []\n", - "for k in G.nodes():\n", - " if nodes[k]['type'] == 'query':\n", - " cols.append('r')\n", - " elif nodes[k]['type'] == 'target':\n", - " cols.append('b')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/dstaff/anaconda3/envs/fornax/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: isinstance(..., numbers.Number)\n", - " if cb.is_numlike(alpha):\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "nx.draw(\n", - " G, \n", - " with_labels=True, \n", - " labels={n['id']:n['label'] for n in graph['nodes']}, \n", - " node_color=cols\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Write the resulting supgraph match using another graph format " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "nx.write_gexf(G, 'my_graph.gexf')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/requirements/dev.txt b/requirements/dev.txt index 046cae0..8013904 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,3 +1,5 @@ -r common.txt xmlrunner coveralls +sphinx>=1.4 +nbsphinx \ No newline at end of file diff --git a/test/test_api.py b/test/test_api.py index c3f7431..188f7b9 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -5,84 +5,84 @@ from test_base import TestCaseDB from sqlalchemy.orm.session import Session from unittest import TestCase -from fornax.api import _hash class TestGraph(TestCaseDB): - @classmethod - def setUp(self): - """trick fornax into using the test database setup - """ - super().setUp(self) - fornax.api.Session = lambda: Session(self._connection) + def run(self, result=None): + with fornax.Connection() as conn: + self.conn = conn + self.conn.make_session = lambda: Session(self._connection) + super().run(result) def test_init_raises(self): """ raise an ValueError if a hadle to a graph is constructed that does not exist """ - self.assertRaises(ValueError, 
fornax.GraphHandle, 0) - self.assertRaises(ValueError, fornax.GraphHandle.read, 0) + self.assertRaises(ValueError, fornax.GraphHandle, self.conn, 0) + self.assertRaises(ValueError, fornax.GraphHandle.read, self.conn, 0) def test_create(self): """first graph has id zero """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) self.assertEqual(graph.graph_id, 0) def test_create_two(self): """auto increment graph id """ - _ = fornax.GraphHandle.create() - second = fornax.GraphHandle.create() + _ = fornax.GraphHandle.create(self.conn) + second = fornax.GraphHandle.create(self.conn) self.assertEqual(second.graph_id, 1) def test_read(self): """get a graph handle using graph id """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) graph_id = graph.graph_id - same_graph = fornax.GraphHandle.read(graph_id) + same_graph = fornax.GraphHandle.read(self.conn, graph_id) self.assertEqual(same_graph.graph_id, graph_id) def test_delete(self): """getting a deleted graph should raise a value error """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) graph.delete() - self.assertRaises(ValueError, fornax.GraphHandle.read, 0) + self.assertRaises(ValueError, fornax.GraphHandle.read, self.conn, 0) def test_add_nodes(self): """meta data is stored on a node """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) names = ['adam', 'ben', 'chris'] graph.add_nodes(name=names) - nodes = self.session.query(fornax.model.Node).filter( - fornax.model.Node.graph_id == 0).all() - nodes = sorted(nodes, key=lambda node: node.node_id) - self.assertListEqual( - names, [json.loads(node.meta)['name'] for node in nodes]) + with self.conn._get_session() as session: + nodes = self.session.query(fornax.model.Node).filter( + fornax.model.Node.graph_id == 0).all() + nodes = sorted(nodes, key=lambda node: node.node_id) + self.assertListEqual( + names, 
[json.loads(node.meta)['name'] for node in nodes]) def test_add_nodes_more_meta(self): """multiple metadata is stored on a node """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) names = ['adam', 'ben', 'chris'] ages = [9, 10, 11] graph.add_nodes(name=names, age=ages) - nodes = self.session.query(fornax.model.Node).filter( - fornax.model.Node.graph_id == 0).all() - nodes = sorted(nodes, key=lambda node: node.node_id) - self.assertListEqual( - names, [json.loads(node.meta)['name'] for node in nodes]) - self.assertListEqual( - ages, [json.loads(node.meta)['age'] for node in nodes]) + with self.conn._get_session() as session: + nodes = session.query(fornax.model.Node).filter( + fornax.model.Node.graph_id == 0).all() + nodes = sorted(nodes, key=lambda node: node.node_id) + self.assertListEqual( + names, [json.loads(node.meta)['name'] for node in nodes]) + self.assertListEqual( + ages, [json.loads(node.meta)['age'] for node in nodes]) def test_missing_attribute(self): """Null values for metadata must be explicit """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) names = ['adam', 'ben', 'chris'] ages = [9, 10] self.assertRaises(TypeError, graph.add_nodes, name=names, age=ages) @@ -90,40 +90,41 @@ def test_missing_attribute(self): def test_assign_id(self): """assigning node id is forbidden """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) ids = range(3) self.assertRaises(ValueError, graph.add_nodes, id=ids) def test_no_metadata(self): """Nodes must have some metadata associated with them """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) self.assertRaises(ValueError, graph.add_nodes) def test_add_edges(self): """store metadata on edges """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) names = ['adam', 'ben', 'chris'] ages = [9, 10, 11] graph.add_nodes(name=names, age=ages) 
relationships = ['is_friend', 'is_foe'] graph.add_edges([0, 0], [1, 2], relationship=relationships) - edges = self.session.query( - fornax.model.Edge - ).filter( - fornax.model.Edge.graph_id == graph.graph_id - ).filter( - fornax.model.Edge.start < fornax.model.Edge.end - ).all() - edges = sorted(edges, key=lambda edge: (edge.start, edge.end)) - self.assertListEqual(relationships, [json.loads( - edge.meta)['relationship'] for edge in edges]) + with self.conn._get_session() as session: + edges = session.query( + fornax.model.Edge + ).filter( + fornax.model.Edge.graph_id == graph.graph_id + ).filter( + fornax.model.Edge.start < fornax.model.Edge.end + ).all() + edges = sorted(edges, key=lambda edge: (edge.start, edge.end)) + self.assertListEqual(relationships, [json.loads( + edge.meta)['relationship'] for edge in edges]) def test_add_edges_more_meta(self): """store multiple metadata on edges """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) names = ['adam', 'ben', 'chris'] ages = [9, 10, 11] graph.add_nodes(name=names, age=ages) @@ -131,25 +132,26 @@ def test_add_edges_more_meta(self): types = [0, 1] graph.add_edges( [0, 0], [1, 2], relationship=relationships, type_=types) - edges = self.session.query( - fornax.model.Edge - ).filter( - fornax.model.Edge.graph_id == graph.graph_id - ).filter( - fornax.model.Edge.start < fornax.model.Edge.end - ).all() - edges = sorted(edges, key=lambda edge: (edge.start, edge.end)) - self.assertListEqual(relationships, [json.loads( - edge.meta)['relationship'] for edge in edges]) - self.assertListEqual( - types, [json.loads(edge.meta)['type_'] for edge in edges]) + with self.conn._get_session() as session: + edges = session.query( + fornax.model.Edge + ).filter( + fornax.model.Edge.graph_id == graph.graph_id + ).filter( + fornax.model.Edge.start < fornax.model.Edge.end + ).all() + edges = sorted(edges, key=lambda edge: (edge.start, edge.end)) + self.assertListEqual(relationships, [json.loads( + 
edge.meta)['relationship'] for edge in edges]) + self.assertListEqual( + types, [json.loads(edge.meta)['type_'] for edge in edges]) def test_simple_graph(self): """Test for a simple graph. A simple graph is a graph with no loops. A loop is an edge that connects a vertex to itself """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) names = ['adam', 'ben', 'chris'] ages = [9, 10, 11] graph.add_nodes(name=names, age=ages) @@ -157,110 +159,116 @@ def test_simple_graph(self): 1, 0], [1, 2], relationship=['is_friend', 'is_foe']) def test_add_nodes_id_src(self): - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) graph.add_nodes(id_src=['a', 'b', 'c', 'd']) graph.add_edges(['a', 'b'], ['b', 'c']) - nodes = self.session.query(fornax.model.Node).all() - self.assertEqual( - [n.node_id for n in nodes], - [_hash(item) for item in ('a', 'b', 'c', 'd')] - ) + with self.conn._get_session() as session: + nodes = session.query(fornax.model.Node).all() + self.assertEqual( + [n.node_id for n in nodes], + [self.conn._hash(item) for item in ('a', 'b', 'c', 'd')] + ) def test_add_nodes_id_src_meta(self): - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) graph.add_nodes(id_src=['a', 'b', 'c', 'd']) graph.add_edges(['a', 'b'], ['b', 'c']) - nodes = self.session.query(fornax.model.Node).all() - self.assertEqual( - [json.loads(n.meta)['id_src'] for n in nodes], - ['a', 'b', 'c', 'd'] - ) + with self.conn._get_session() as session: + nodes = session.query(fornax.model.Node).all() + self.assertEqual( + [json.loads(n.meta)['id_src'] for n in nodes], + ['a', 'b', 'c', 'd'] + ) def test_add_edges_id_src(self): - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) graph.add_nodes(id_src=['a', 'b', 'c', 'd']) graph.add_edges(['a', 'b'], ['b', 'c']) - edges = self.session.query( - fornax.model.Edge - ).filter( - fornax.model.Edge.start < fornax.model.Edge.end - 
).all() - self.assertEqual( - sorted([e.start, e.end] for e in edges), - sorted( - sorted([_hash(start), _hash(end)]) - for start, end in [('a', 'b'), ('b', 'c')] + with self.conn._get_session() as session: + edges = session.query( + fornax.model.Edge + ).filter( + fornax.model.Edge.start < fornax.model.Edge.end + ).all() + self.assertEqual( + sorted([e.start, e.end] for e in edges), + sorted( + sorted([self.conn._hash(start), self.conn._hash(end)]) + for start, end in [('a', 'b'), ('b', 'c')] + ) ) - ) class TestQuery(TestCaseDB): - @classmethod - def setUp(self): - """trick fornax into using the test database setup - """ - super().setUp(self) - fornax.api.Session = lambda: Session(self._connection) + def run(self, result=None): + with fornax.Connection() as conn: + self.conn = conn + self.conn.make_session = lambda: Session(self._connection) + super().run(result) def test_init_query_raises(self): - self.assertRaises(ValueError, fornax.QueryHandle, 0) + self.assertRaises(ValueError, fornax.QueryHandle, self.conn, 0) def test_init_read_raises(self): - self.assertRaises(ValueError, fornax.QueryHandle.read, 0) + self.assertRaises(ValueError, fornax.QueryHandle.read, self.conn, 0) def test_create(self): - query_graphs = [fornax.GraphHandle.create() for _ in range(3)] - target_graphs = [fornax.GraphHandle.create() for _ in range(3)] + query_graphs = [fornax.GraphHandle.create(self.conn) for _ in range(3)] + target_graphs = [ + fornax.GraphHandle.create(self.conn) for _ in range(3) + ] queries = [fornax.QueryHandle.create( - q, t) for q, t in zip(query_graphs, target_graphs)] + self.conn, q, t) for q, t in zip(query_graphs, target_graphs)] self.assertEqual([q.query_id for q in queries], [0, 1, 2]) def test_create_query_target(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) - query_db = self.session.query(fornax.model.Query).filter( - fornax.model.Query.query_id 
== query.query_id).first() - self.assertEqual(query_db.start_graph_id, query_graph.graph_id) - self.assertEqual(query_db.end_graph_id, target_graph.graph_id) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) + with self.conn._get_session() as session: + query_db = session.query(fornax.model.Query).filter( + fornax.model.Query.query_id == query.query_id).first() + self.assertEqual(query_db.start_graph_id, query_graph.graph_id) + self.assertEqual(query_db.end_graph_id, target_graph.graph_id) def test_read(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - q1 = fornax.QueryHandle.create(query_graph, target_graph) - q2 = fornax.QueryHandle.read(q1.query_id) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + q1 = fornax.QueryHandle.create(self.conn, query_graph, target_graph) + q2 = fornax.QueryHandle.read(self.conn, q1.query_id) self.assertEqual(q1.query_id, q2.query_id) def test_delete(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) query_id = query.query_id query.delete() - query_exists = self.session.query(fornax.model.Query).filter( - fornax.model.Query.query_id == query_id).scalar() - matches_exists = self.session.query(fornax.model.Match).filter( - fornax.model.Match.query_id == query_id).scalar() + with self.conn._get_session() as session: + query_exists = session.query(fornax.model.Query).filter( + fornax.model.Query.query_id == query_id).scalar() + matches_exists = session.query(fornax.model.Match).filter( + fornax.model.Match.query_id == 
query_id).scalar() self.assertFalse(query_exists) self.assertFalse(matches_exists) def test_get_query_graph(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) self.assertEqual(query.query_graph(), query_graph) def test_get_target_graph(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) self.assertEqual(query.target_graph(), target_graph) def test_query_nodes(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) q_uids = [0, 1, 2] t_uids = [3, 4, 5] query_graph.add_nodes(uid=q_uids) @@ -273,9 +281,9 @@ def test_query_nodes(self): ) def test_query_edges(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) q_uids = [0, 1, 2] t_uids = [3, 4, 5] query_graph.add_nodes(uid=q_uids) @@ -289,9 +297,9 @@ def test_query_edges(self): ) def test_target_nodes(self): - query_graph = fornax.GraphHandle.create() - target_graph = 
fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) q_uids = [0, 1, 2] t_uids = [3, 4, 5] query_graph.add_nodes(id_src=q_uids) @@ -310,17 +318,20 @@ def test_target_nodes(self): def test_undirected_edges(self): """Each edge needs to be stored in both directions """ - graph = fornax.GraphHandle.create() + graph = fornax.GraphHandle.create(self.conn) graph.add_nodes(myid=[1, 2, 3]) graph.add_edges([0], [1]) - src = [(e.start, e.end) - for e in self.session.query(fornax.model.Edge).all()] + with self.conn._get_session() as session: + src = [ + (e.start, e.end) + for e in session.query(fornax.model.Edge).all() + ] self.assertListEqual(sorted(src), [(0, 1), (1, 0)]) def test_target_edges(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) uids = [0, 1] query_graph.add_nodes(uid=range(3)) target_graph.add_nodes(uid=range(3)) @@ -342,9 +353,9 @@ def test_target_edges(self): ) def test_add_matches(self): - query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) uids = [0, 1] query_graph.add_nodes(uid=range(3)) target_graph.add_nodes(uid=range(3)) @@ -361,9 +372,9 @@ def test_add_matches(self): self.assertEqual(uids, [json.loads(m.meta)['my_id'] for m in matches]) def test_execute_raises(self): - 
query_graph = fornax.GraphHandle.create() - target_graph = fornax.GraphHandle.create() - query = fornax.QueryHandle.create(query_graph, target_graph) + query_graph = fornax.GraphHandle.create(self.conn) + target_graph = fornax.GraphHandle.create(self.conn) + query = fornax.QueryHandle.create(self.conn, query_graph, target_graph) self.assertRaises(ValueError, query.execute) @@ -385,12 +396,6 @@ def test_eq(self): self.assertNotEqual(self.node, fornax.api.Node(1, 'query', {'a': 0})) self.assertNotEqual(self.node, fornax.api.Node(0, 'target', {'a': 1})) - def test_to_dict(self): - self.assertDictEqual( - self.node.to_dict(), {'id': _hash( - (0, 'query')), 'type': 'query', 'a': 1} - ) - def test_node_raises(self): self.assertRaises(ValueError, fornax.api.Node, 0, 'a', {}) @@ -431,49 +436,51 @@ def setUp(self): """trick fornax into using the test database setup """ super().setUp(self) - fornax.api.Session = lambda: Session(self._connection) - - query_graph = fornax.GraphHandle.create() - query_graph.add_nodes(my_id=range(1, 6)) - starts, ends = zip(*[(1, 3), (1, 2), (2, 4), (4, 5)]) - query_graph.add_edges( - [i - 1 for i in starts], - [i - 1 for i in ends] - ) + self.maxsize = fornax.Connection.SQLITE_MAX_SIZE + with fornax.Connection() as conn: + conn.make_session = lambda: Session(self._connection) + query_graph = fornax.GraphHandle.create(conn) + query_graph.add_nodes(my_id=range(1, 6)) + starts, ends = zip(*[(1, 3), (1, 2), (2, 4), (4, 5)]) + query_graph.add_edges( + [i - 1 for i in starts], + [i - 1 for i in ends] + ) - target_graph = fornax.GraphHandle.create() - target_graph.add_nodes(my_id=range(1, 14)) - starts, ends = zip(*[ - (1, 2), (1, 3), (1, 4), - (3, 7), (4, 5), (4, 6), - (5, 7), (6, 8), (7, 10), - (8, 9), (8, 12), (9, 10), - (10, 11), (11, 12), (11, 13), - ]) - target_graph.add_edges( - [s - 1 for s in starts], - [e - 1 for e in ends] - ) + target_graph = fornax.GraphHandle.create(conn) + target_graph.add_nodes(my_id=range(1, 14)) + starts, ends = 
zip(*[ + (1, 2), (1, 3), (1, 4), + (3, 7), (4, 5), (4, 6), + (5, 7), (6, 8), (7, 10), + (8, 9), (8, 12), (9, 10), + (10, 11), (11, 12), (11, 13), + ]) + target_graph.add_edges( + [s - 1 for s in starts], + [e - 1 for e in ends] + ) - query = fornax.QueryHandle.create( - query_graph, - target_graph - ) - starts, ends, weights = zip(*[ - (1, 1, 1), (1, 4, 1), (1, 8, 1), - (2, 2, 1), (2, 5, 1), (2, 9, 1), - (3, 3, 1), (3, 6, 1), (3, 12, 1), (3, 13, 1), - (4, 7, 1), (4, 10, 1), - (5, 11, 1) - ]) - - query.add_matches( - [s - 1 for s in starts], - [e - 1 for e in ends], - weights - ) + query = fornax.QueryHandle.create( + conn, + query_graph, + target_graph + ) + starts, ends, weights = zip(*[ + (1, 1, 1), (1, 4, 1), (1, 8, 1), + (2, 2, 1), (2, 5, 1), (2, 9, 1), + (3, 3, 1), (3, 6, 1), (3, 12, 1), (3, 13, 1), + (4, 7, 1), (4, 10, 1), + (5, 11, 1) + ]) + + query.add_matches( + [s - 1 for s in starts], + [e - 1 for e in ends], + weights + ) - self.payload = query.execute(n=2) + self.payload = query.execute(n=2) def test_iters(self): self.assertEqual(self.payload['max_iters'], 10) @@ -500,14 +507,19 @@ def test_first_graph_nodes(self): {"id": 11, "type": "target", "my_id": 12} ] for node in nodes: - node['id'] = _hash((node['id'], node['type'])) + node['id'] = fornax.api._hash( + (node['id'], node['type']), + self.maxsize + ) self.assertListEqual( graph['nodes'], nodes ) def test_first_graph_links(self): + graph = self.payload['graphs'][0] + matches = [ {"source": 0, "target": 7, "type": "match", "weight": 1.0}, {"source": 1, "target": 8, "type": "match", "weight": 1.0}, @@ -524,13 +536,33 @@ def test_first_graph_links(self): {"source": 9, "target": 10, "type": "target", "weight": 1.0}, {"source": 10, "target": 11, "type": "target", "weight": 1.0}, ] + for match in matches: + if match['type'] == 'query' or match['type'] == 'target': - match['source'] = _hash((match['source'], match['type'])) - match['target'] = _hash((match['target'], match['type'])) + + match['source'] = 
fornax.api._hash( + (match['source'], match['type']), + self.maxsize + ) + + match['target'] = fornax.api._hash( + (match['target'], match['type']), + self.maxsize + ) + else: - match['source'] = _hash((match['source'], 'query')) - match['target'] = _hash((match['target'], 'target')) + + match['source'] = fornax.api._hash( + (match['source'], 'query'), + self.maxsize + ) + + match['target'] = fornax.api._hash( + (match['target'], 'target'), + self.maxsize + ) + self.assertListEqual(graph['links'], matches) def test_second_graph_cost(self): @@ -552,7 +584,11 @@ def test_second_graph_nodes(self): {"id": 10, "type": "target", "my_id": 11}, ] for node in nodes: - node['id'] = _hash((node['id'], node['type'])) + node['id'] = fornax.api._hash( + (node['id'], node['type']), + self.maxsize + ) + self.assertListEqual( graph['nodes'], nodes @@ -575,11 +611,31 @@ def test_second_graph_links(self): {"source": 8, "target": 9, "type": "target", "weight": 1.0}, {"source": 9, "target": 10, "type": "target", "weight": 1.0}, ] + for match in matches: + if match['type'] == 'query' or match['type'] == 'target': - match['source'] = _hash((match['source'], match['type'])) - match['target'] = _hash((match['target'], match['type'])) + + match['source'] = fornax.api._hash( + (match['source'], match['type']), + self.maxsize + ) + + match['target'] = fornax.api._hash( + (match['target'], match['type']), + self.maxsize + ) + elif match['type'] == 'match': - match['source'] = _hash((match['source'], 'query')) - match['target'] = _hash((match['target'], 'target')) + + match['source'] = fornax.api._hash( + (match['source'], 'query'), + self.maxsize + ) + + match['target'] = fornax.api._hash( + (match['target'], 'target'), + self.maxsize + ) + self.assertListEqual(graph['links'], matches) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..347011d --- /dev/null +++ b/tox.ini @@ -0,0 +1,4 @@ +[tox] +envlist = py36 +[testenv] +commands=python3 -m unittest discover -v -s ./test -p 
"test_*.py" \ No newline at end of file