In [17]:
import os

from pyArango.connection import Connection
from pyArango.collection import Collection, Edges, Field
from pyArango.graph import Graph, EdgeDefinition

# For a local database, view at http://127.0.0.1:8529
# Credentials come from the environment so that real secrets never need to be
# saved inside the notebook. When ARANGO_USERNAME / ARANGO_PASSWORD are unset,
# the local-development defaults (user "root", empty password) are used.
conn = Connection(
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD", ""),
)

##### Establish a connection to the database, creating it if it doesn't exist

In [18]:
# Make sure the "colabfit" database exists, then grab a handle to it.
database_exists = conn.hasDatabase("colabfit")
if not database_exists:
    conn.createDatabase("colabfit")
db = conn["colabfit"]

##### Collection classes

In [19]:

class Dataset(Collection):
    """Vertex collection with one document per dataset.

    Declares the fields ``name``, ``authors``, ``links`` and ``description``,
    each as a ``Field()`` created without arguments.
    """

    _fields = dict(
        name=Field(),
        authors=Field(),
        links=Field(),
        description=Field(),
    )

class Configuration(Collection):
    """Vertex collection with one document per configuration.

    Declares the fields ``_key``, ``positions``, ``cell`` and ``atoms``,
    each as a ``Field()`` created without arguments.
    """

    # About "_key": it is required to be unique and is used in this notebook
    # to hold the configuration 'name' values for indexing purposes. It does
    # not actually have to be listed here, since it is always created.
    _fields = {
        field_name: Field()
        for field_name in ("_key", "positions", "cell", "atoms")
    }

class Property(Collection):
    """Vertex collection with one document per property (a name/value pair)."""

    _fields = dict(name=Field(), value=Field())

##### Edge classes

In [20]:
class DatasetConfig(Edges):
    """Edge collection linking a Dataset vertex to a Configuration vertex.

    The edges carry no declared fields of their own.
    """

    _fields = dict()

class DatasetProperty(Edges):
    """Edge collection linking a Dataset vertex to a Property vertex.

    The edges carry no declared fields of their own.
    """

    _fields = dict()

class ConfigProperty(Edges):
    """Edge collection linking a Configuration vertex to a Property vertex.

    The edges carry no declared fields of their own.
    """

    _fields = dict()
    

##### Create graph object

In [21]:
class ColabfitGraph(Graph):
    """Graph connecting the Dataset, Configuration and Property collections."""

    # One definition per edge collection: its name, then the vertex
    # collections its edges start from and point to.
    _edgeDefinitions = [
        EdgeDefinition("DatasetConfig", fromCollections=["Dataset"], toCollections=["Configuration"]),
        EdgeDefinition("DatasetProperty", fromCollections=["Dataset"], toCollections=["Property"]),
        EdgeDefinition("ConfigProperty", fromCollections=["Configuration"], toCollections=["Property"]),
    ]

    # No orphaned collections are declared for this graph.
    _orphanedCollections = []


In [22]:
# Reuse the graph when the database already has it; otherwise create it.
graph = (
    db.graphs['ColabfitGraph']
    if db.hasGraph('ColabfitGraph')
    else db.createGraph("ColabfitGraph")
)
    

##### Set up mock dataset

In [23]:
"""
There will be three vertex collections (Datasets, Configurations, and Properties) 
and three edge collections denoting relationships 
(Dataset<->Configurations, 
 Dataset<->Properties, 
 Configurations<->Properties)
"""

# Number of configurations w/ properties to insert into database
n_configs = 1000

# Contains dictionaries of configurational key-value pairs to insert into database 
config_l = []

# Contains lists of dictionaries of property key-value pairs to insert into database-Two properties for each configuration
prop_l = []

# Simulated Data
for i in range(n_configs):
    config_l.append({'name':'name_%s' %i,'atoms':'atoms_%s' %i, 'positions':'positions_%s' %i})
    prop_l.append([{'name':'first_property','value': 'fp_%s' %(i%3)},{'name':'second_property','value':'sp_%s' %(i%5)}])
# Two Datasets-One contains a subset of data from the other
dataset_1 = [*range(int(n_configs/2))] # Let's say this dataset only had "first_property" computed for its n_configs/2 configurations
dataset_2 = [*range(n_configs)] # This dataset expanded upon the first and both properties were computed 


In [25]:
# Insert the configurations of dataset 1, each together with its first property.
# Every entry of dataset_1 is used directly as the index into config_l / prop_l,
# so the right records are picked even if the dataset is not a 0-based range.
for idx in dataset_1:
    config_attrs = config_l[idx]
    first_prop = prop_l[idx][0]

    config = graph.createVertex(
        collectionName="Configuration",
        docAttributes={
            # "_key" must be unique; the configuration name is used as the key
            "_key": config_attrs['name'],
            "atoms": config_attrs['atoms'],
            "positions": config_attrs['positions'],
        },
    )
    prop = graph.createVertex(
        collectionName="Property",
        docAttributes={
            "name": first_prop['name'],
            "value": first_prop['value'],
        },
    )
    # Link the new configuration to the property stored for it
    graph.createEdge("ConfigProperty", config['_id'], prop['_id'], {})
    


##### Create the first dataset object in the graph

In [26]:
# Attributes of the first dataset vertex
ds1_attributes = {
    "name": "Dataset_1",
    "authors": ["Eric Fuemmeler"],
    "description": "A dataset of 500 configurations.",
}
ds1 = graph.createVertex(collectionName="Dataset", docAttributes=ds1_attributes)

##### Query the stored configurations and link each one to dataset 1 via its `_id`

In [27]:
# Only the `_id` of each configuration is needed to build an edge, so the
# query returns just that field instead of every full Configuration document.
aql = """
    FOR config in Configuration
    RETURN config._id
"""

configs = db.AQLQuery(aql, batchSize=1000, rawResults=True)

# With rawResults=True each result is the plain `_id` value returned by the query
for config_id in configs:
    graph.createEdge("DatasetConfig", ds1['_id'], config_id, edgeAttributes={})

##### Connect dataset 1 to property 1

In [28]:
# Fetch the `_id` of every document in the Property collection ...
aql = """
    FOR prop in Property
    RETURN prop._id
"""
ds1_props = db.AQLQuery(aql, batchSize=1000, rawResults=True)

# ... and attach each of them to dataset 1 with a DatasetProperty edge.
for prop_id in ds1_props:
    graph.createEdge("DatasetProperty", ds1['_id'], prop_id, edgeAttributes={})

In [29]:
# Attributes of the second dataset vertex
ds2_attributes = {
    "name": "Dataset_2",
    "authors": ["Eric Fuemmeler"],
    "description": "A dataset of 1000 configurations.",
}
ds2 = graph.createVertex(collectionName="Dataset", docAttributes=ds2_attributes)


##### A cleaner addition of edges and vertices
##### Adds dataset-property edges without having to find the appropriate vertices again

In [30]:
# Insert every configuration of dataset 2 together with all of its properties,
# creating the edges immediately while the new vertices are still at hand.
# Every entry of dataset_2 is used directly as the index into config_l / prop_l.
for idx in dataset_2:
    config_attrs = config_l[idx]

    config = graph.createVertex(
        collectionName="Configuration",
        docAttributes={
            # The "_2" suffix presumably keeps the unique "_key" from clashing
            # with the dataset 1 vertex created from the same configuration name.
            "_key": config_attrs['name'] + "_2",
            "atoms": config_attrs['atoms'],
            "positions": config_attrs['positions'],
        },
    )
    # Register the configuration as a member of dataset 2, mirroring the
    # DatasetConfig edges that dataset 1 receives for its configurations.
    graph.createEdge("DatasetConfig", ds2['_id'], config['_id'], {})

    # Store each property of this configuration and connect it both to the
    # configuration and to the dataset.
    for prop_attrs in prop_l[idx]:
        prop = graph.createVertex(
            collectionName="Property",
            docAttributes={
                "name": prop_attrs['name'],
                "value": prop_attrs['value'],
            },
        )
        graph.createEdge("ConfigProperty", config['_id'], prop['_id'], {})
        graph.createEdge("DatasetProperty", ds2['_id'], prop['_id'], {})


##### To clear the database

In [16]:
# WARNING: destructive reset. This clears the database, so under "Run All" it
# executes last and removes the data inserted by the cells above. Run it only
# when a clean slate is actually wanted.
# NOTE(review): confirm whether the graph definition itself is removed as well.
db.dropAllCollections()