In [1]:
from typing import Dict
from collections import defaultdict
import json
from pprint import pprint

from py2neo import Graph, Node, Relationship

In [2]:
G = Graph("bolt://localhost:7687")
# username = "neo4j"
# password = "KrRsKLC26qhHSdj4PG_qRso98GdyDbZOTQrdvvZPr8Q"
# G = Graph(f"neo4j+s://{username}:{password}@f2c1d80b.databases.neo4j.io")

In [17]:
# G.delete_all()

In [18]:
with open("data/scraped/classes.json") as f:
    classes = json.load(f)
    
with open("data/scraped/programs.json") as f:
    programs = json.load(f)
    
with open("data/scraped/specialisations.json") as f:
    special = json.load(f)

In [19]:
print(len(classes), len(programs), len(special))

3142 294 449


In [20]:
CLASSES = defaultdict(lambda : Node("class"))
PROGRAMS = defaultdict(lambda : Node("program"))
SPECIAL = defaultdict(lambda : Node("specialisation"))

class Prerequisite(Relationship):
    name = 'Prerequisite'

class Incompatible(Relationship):
    name = 'Incompatible'

class Enrolled(Relationship):
    name = 'Enrolled'

class Unknown(Relationship):
    name = 'Unknown'

class Requirement(Relationship):
    name = 'Requirement'

CONDITION_MAPPER = {
    'completed': Prerequisite,
    'incompatible': Incompatible,
    'studying': Enrolled,
    'enrolled': Enrolled,
    'Unknown': Unknown,
    'permission': Unknown,
    'obtained': Unknown
}

In [22]:
def update_node(node: Node, doc: Dict) -> Node:
    for key in doc.keys():
        if doc[key]:
            if type(doc[key]) != list and type(doc[key]) != dict:
                node[key] = doc[key]
    return node

def create_node(node: Node, doc: Dict, G: Graph) -> Node:
    if 'id' in doc: node.identity = node['id'] = doc['id']
    node = update_node(node, doc)
    G.create(node)
    return node

def create_node_if_not_exists(cache: defaultdict, key: str, doc: Dict, G: Graph) -> Node:
    if key not in cache:
        dest_node = cache[key]
        dest_node = create_node(dest_node, doc, G)
        cache[key] = dest_node
    else:
        dest_node = cache[key]
        dest_node = update_node(dest_node, doc)
    return dest_node

def create_edge(edge: Relationship, doc: Dict, G: Graph, label: str=None, key: str=None) -> Relationship:
    """
    program -- Req -> req
    req -- Req -> req
    req -- Req -> spec
    spec -- Req -> req
    req -- Req -> class
    class -> class
    """
    if 'condition' in doc: edge['condition'] = doc['condition']
    if 'description' in doc: edge['description'] = doc['description']
    if 'negation' in doc: edge['negation'] = doc['negation']
    if not label:
        labels = str(edge.start_node.labels).split(':')
        labels = [item for item in labels if item]
        label = labels[0]
    if not key:
        key = edge.start_node.identity
    G.merge(edge, label, key)

    if label == 'requirement':
        labels = str(edge.end_node.labels).split(':')
        labels = [item for item in labels if item]
        label = labels[0]
        key = edge.end_node.identity
        G.merge(edge, label, key)
    return edge

def create_nodes_and_edges(doc: Dict, parent_node: Node, G: Graph, op: str='and'):
    if not doc:
        return
    
    # iterate over class requisites and create edges
    create_nodes_and_edges_if_class_requirement(doc, parent_node, G, op)

    # create edges if document is a program or specialisation / major / minor
    create_nodes_and_edges_if_program(doc, parent_node, G, op)

    create_nodes_and_edges_if_specialisation(doc, parent_node, G, op)
    
    # smallest documents in programs / specialisations.json
    if 'id' in doc:
        if doc['id'] in PROGRAMS:
            dest_node = create_node_if_not_exists(PROGRAMS, doc['id'], doc, G)
        elif doc['id'] in SPECIAL:
            dest_node = create_node_if_not_exists(SPECIAL, doc['id'], doc, G)
        else:
            dest_node = create_node_if_not_exists(CLASSES, doc['id'], doc, G)
        create_edge(Requirement(parent_node, dest_node), doc, G)
    elif 'description' in doc:
        # create new requirement node
        create_edge(Requirement(parent_node, create_node(Node("requirement"), doc, G)), doc, G)

def create_nodes_and_edges_if_class_requirement(doc: Dict, parent_node: Node, G: Graph, op: str):
    if not doc:
        return
        
    if 'operator' in doc and type(doc['operator']) == dict:
        for operator in doc['operator'].keys():
            for requirement in doc['operator'][operator]:
                create_nodes_and_edges_if_class_requirement(requirement, parent_node, G, op)
    elif 'condition' in doc and doc['condition'] and doc['condition'] in CONDITION_MAPPER:
        EDGE_FACTORY = CONDITION_MAPPER[doc['condition']]

        if 'programs' in doc:
            for program_name in doc['programs']:
                dest_node = create_node_if_not_exists(PROGRAMS, program_name, doc, G)
                create_edge(EDGE_FACTORY(parent_node, dest_node), doc, G)
        if 'classes' in doc:
            for class_name in doc['classes']:
                dest_node = create_node_if_not_exists(CLASSES, class_name, doc, G)
                create_edge(EDGE_FACTORY(parent_node, dest_node), doc, G)
        if not doc['programs'] and not doc['classes'] and doc['description']:
            create_node(parent_node, doc, G)

def create_nodes_and_edges_if_program(doc: Dict, parent_node: Node, G: Graph, op: str):
    """create edges if document is a program or specialisation / major / minor"""
    if 'items' in doc:
        # create new requirement node and connect to parent
        req_node = Node("requirement")
        req_node = create_node(req_node, doc, G)
        create_edge(Requirement(parent_node, req_node), doc, G)

        # create each child node and edges
        for child in doc['items']:
            create_nodes_and_edges(child, req_node, G, op)

def create_nodes_and_edges_if_specialisation(doc: Dict, parent_node: Node, G: Graph, op: str):
    if 'type' in doc:
        dest_node = create_node_if_not_exists(SPECIAL, doc['name'], doc, G)
        create_edge(Requirement(parent_node, dest_node), doc, G)

## Create Nodes

In [23]:
G.run("""
MATCH (n)
RETURN count(*)
""")

count(*)
13593


In [24]:
for doc in classes:
    node = create_node_if_not_exists(CLASSES, doc['id'], doc, G)
    CLASSES[node['id']] = node
    
len(CLASSES)

3142

In [25]:
for doc in programs:
    node = create_node_if_not_exists(PROGRAMS, doc['id'], doc, G)
    PROGRAMS[node['id']] = node
    PROGRAMS[node['name']] = node
    
len(PROGRAMS)

587

In [26]:
for doc in special:
    node = create_node_if_not_exists(SPECIAL, doc['id'], doc, G)
    SPECIAL[node['id']] = node
    SPECIAL[node['name']] = node # TODO: TMP - there may be specialisations with duplicate names
    
len(SPECIAL)

798

In [28]:
G.run("""
MATCH (n)
RETURN count(*)
""")

count(*)
17478


## Create Edges

In [29]:
%%time
## create edges for class requisites
for doc in classes:
    if 'requisites' in doc:
        create_nodes_and_edges(doc['requisites'], CLASSES[doc['id']], G)

CPU times: user 30.1 s, sys: 3.75 s, total: 33.9 s
Wall time: 6min 16s


In [30]:
%%time
for doc in programs:
    src_node = PROGRAMS[doc['id']]
    for requirement in doc['requirements']:
        create_nodes_and_edges(requirement, src_node, G)

CPU times: user 35.6 s, sys: 4.46 s, total: 40.1 s
Wall time: 7min 1s


In [31]:
%%time
for doc in special:
    src_node = SPECIAL[doc['id']]
    for requirement in doc['requirements']:
        create_nodes_and_edges(requirement, src_node, G)

CPU times: user 29.4 s, sys: 3.63 s, total: 33 s
Wall time: 5min 33s


In [32]:
G.run("""
MATCH (n)-[]-()
RETURN count(*)
""")

count(*)
117504


In [46]:
programs[0]['requirements'][0]

{'description': 'This Honours plan requires the completion of 48 units, which must consist of:',
 'n_units': 48,
 'items': [{'description': '48 units from completion of the Accounting Honours specialisation.',
   'n_units': 48,
   'items': []}]}

In [33]:
PROGRAMS.keys()

dict_keys(['HACCT', 'Bachelor of Accounting (Honours)', 'BADAN', 'Bachelor of Applied Data Analytics', 'HACTS', 'Bachelor of Actuarial Studies (Honours)', 'AACOM', 'Bachelor of Advanced Computing (Honours)', 'BACCT', 'Bachelor of Accounting', 'HAHCR', 'Bachelor of Art History and Curatorship (Honours)', 'BACTS', 'Bachelor of Actuarial Studies', 'BAHCR', 'Bachelor of Art History and Curatorship', 'HADAN', 'Bachelor of Applied Data Analytics (Honours)', 'AACRD', 'Bachelor of Advanced Computing (Research and Development) (Honours)', 'BAPRC', 'Bachelor of Archaeological Practice', 'HAPRC', 'Bachelor of Archaeological Practice (Honours)', 'HART2', 'Bachelor of Arts (Honours)', 'BAPAF', 'Bachelor of Asia-Pacific Affairs', 'HASIA', 'Bachelor of Asian Studies (Honours)', 'BARTS', 'Bachelor of Arts', 'HARTS', 'BARTY', 'Bachelor of Arts (with Year in Asia)', 'BAPAR', 'Bachelor of Asia-Pacific Affairs (Ritsumeikan)', 'BASIA', 'Bachelor of Asian Studies', 'BASY', 'Bachelor of Asian Studies (with Y

In [18]:
SPECIAL.keys()

dict_keys(['ADIS-MAJ', 'Advanced Intelligent Systems', 'ASSY-MAJ', 'Aerospace Systems Engineering', 'APST-MAJ', 'Applied Statistics', 'AGRK-MAJ', 'Ancient Greek', 'AGIN-MAJ', 'Agricultural Innovation', 'ANTH-MAJ', 'Anthropology', 'ANCH-MAJ', 'Ancient History', 'ACCT-MAJ', 'Accounting', 'SECU-MAJ', 'Asia-Pacific Security Studies', 'ARCH-MAJ', 'Archaeology', 'ARAB-MAJ', 'Arabic', 'ACMK-MAJ', 'Asian Capital Markets', 'AHIT-MAJ', 'Art History and Theory', 'APIR-MAJ', 'Asia-Pacific International Relations', 'ASPP-MAJ', 'Asia-Pacific Politics', 'AHIST-MAJ', 'Asian History', 'ASIA-MAJ', 'Asian Studies', 'ACMG-MAJ', 'Asian and Pacific Culture, Media and Gender', 'ASTR-MAJ', 'Astronomy and Astrophysics', 'AUIS-MAJ', 'Australian Indigenous Studies', 'BCHM-MAJ', 'Biochemistry', 'BIOD-MAJ', 'Biodiversity Conservation', 'BIAN-MAJ', 'Biological Anthropology', 'CPMK-MAJ', 'Capital Markets', 'CMBI-MAJ', 'Cell & Molecular Biology', 'BUSN-MAJ', 'Business Information Systems', 'COMP-MAJ', 'Composition', 