In [1]:
import pandas as pd
import polars as pl
import numpy as np
import time
import os
import sys
# put the relative framework directory at position 1 of the module search path so the import below can resolve
sys.path.insert(1, '../../framework')
from racetrack import *
# NOTE(review): RACETrack is presumably supplied by the star-import above -- confirm
rt = RACETrack()

In [2]:
import json
from jsonpath_ng import jsonpath, parse

In [3]:
# scanForward() - index of the first unescaped occurrence of character c in x at or after i
# ... a backslash escapes exactly the one character that follows it
# ... returns None when no unescaped occurrence exists
def scanForward(x, i, c):
    escaped = False
    for pos in range(i, len(x)):
        if escaped:
            # this character is consumed by the preceding backslash
            escaped = False
        elif x[pos] == '\\':
            escaped = True
        elif x[pos] == c:
            return pos
    return None

# literalize() - swap every single or double quoted string for a unique placeholder name
# ... returns (rewritten_string, lookup) where lookup maps placeholder -> text between the quotes
# ... placeholders look like "____lit{num}____", so inputs that already contain such names will collide
# ... raises Exception when a quote is opened but never closed
def literalize(x):
    pieces, lookup = [], {}
    pos, total = 0, len(x)
    while pos < total:
        ch = x[pos]
        if ch not in ("'", '"'):
            # ordinary character ... copy through untouched
            pieces.append(ch)
            pos += 1
            continue
        # quoted run ... the closing quote must be the same kind as the opening one
        closing = scanForward(x, pos+1, ch)
        if closing is None: raise Exception(f'RTOntology.literalize() - unterminated string literal "{x}"')
        placeholder = f'____lit{len(lookup)}____'
        lookup[placeholder] = x[pos+1:closing]
        pieces.append(placeholder)
        pos = closing + 1
    return ''.join(pieces), lookup

# fillLiterals() - put the literal values back in place of their placeholder names
# ... inverse of literalize() except that the surrounding quotes are not restored
def fillLiterals(x, lu):
    restored = x
    for placeholder in lu:
        restored = restored.replace(placeholder, lu[placeholder])
    return restored

# findClosingParen() - index of the ')' that closes the paren opened just before position i
# ... nested '(' ... ')' pairs met along the way are skipped
# ... requires that literals were taken care of... [see literalize function]
# ... raises Exception when the string ends before the matching ')' is seen
def findClosingParen(s, i):
    depth = 0
    for pos in range(i, len(s)):
        ch = s[pos]
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                return pos
            depth -= 1
    raise Exception(f'RTOntology.findClosingParen() - no closing paren found for "{s}"')

# tokenizeParameters() - split a function's parameter text into top-level tokens
# ... requires that literals were taken care of... [see literalize function]
# ... x is the text between a call's parentheses; returns a list of stripped tokens,
#     where a nested call such as "f(a, b)" is kept together as one token
# ... any text between a nested call's closing paren and the next comma is discarded
# ... raises Exception (via findClosingParen) when a nested call is never closed
def tokenizeParameters(x):
    tokens = []
    while ',' in x or '(' in x:
        comma_i = x.find(',')
        paren_i = x.find('(')
        if paren_i == -1 or (comma_i != -1 and comma_i < paren_i):
            # a plain token comes first ... always stripped so every branch yields the same shape
            tokens.append(x[:comma_i].strip())
            x = x[comma_i+1:]
        else:
            # a function call comes first ... keep it whole, including its own commas
            close_i = findClosingParen(x, paren_i+1)
            tokens.append(x[:close_i+1].strip())
            x = x[close_i+1:]
            if ',' in x: x = x[x.index(',')+1:] # if there's another comma, consume it
    x = x.strip()
    if len(x) > 0:
        tokens.append(x)
    return tokens

# parseTree() - build a parse tree for an ontology node description
# ... returns (node_value, node_children), both keyed by dotted node names ('root', 'root.0', 'root.0.1', ...)
# ... node_value[name]    is the function name or the literal / constant text
# ... node_children[name] is a list of child names for a function call and None for a leaf
# ... when node_value is None a fresh tree is started at 'root' (node_children / node_name are ignored)
# ... when lit_lu is None the text is run through literalize() first
def parseTree(x, node_value=None, node_children=None, node_name=None, lit_lu=None):
    if node_value is None:
        node_value, node_children, node_name = {}, {}, 'root'
    if lit_lu is None:
        x, lit_lu = literalize(x)

    # leaf ... constant or literal placeholder
    if '(' not in x:
        leaf = x.strip()
        node_value   [node_name] = lit_lu.get(leaf, leaf)
        node_children[node_name] = None # literals have no children
        return node_value, node_children

    # function call ... anything after the matching close paren is ignored
    open_i  = x.index('(')
    close_i = findClosingParen(x, open_i+1)
    fname   = x[:open_i].strip()
    parms   = tokenizeParameters(x[open_i+1:close_i])
    node_value   [node_name] = lit_lu.get(fname, fname)
    node_children[node_name] = []    # functions have children... even if it's an empty list of children
    for child_i, parm in enumerate(parms):
        child_name = f'{node_name}.{child_i}'
        node_children[node_name].append(child_name)
        parseTree(parm, node_value, node_children, child_name, lit_lu)
    return node_value, node_children

# solveParseTree() - evaluate the parse tree rooted at node for row i
# ... values / children are the dicts produced by parseTree()
# ... filled maps each leaf value to its per-row list; jsonpath leaves resolve to filled[path][i]
# ... function nodes are resolved by name with eval() and called with their solved children
# ... SECURITY NOTE: the function name is whatever text was parsed before '(' -- only use trusted templates
def solveParseTree(values, children, filled, i, node=None):
    if node is None: node = 'root'
    kids = children[node]
    if kids is not None:
        # function call ... 'parms' is referenced by name inside the eval'ed expression
        parms = [solveParseTree(values, children, filled, i, kid) for kid in kids]
        return eval(f'{values[node]}(*parms)')
    if isJsonPath(values[node]):
        return filled[values[node]][i]  # jsonpath filled in value from the json
    return values[node]                 # constant / literal

# upToStar() - prefix of x up to and including the cth '[*]'
# ... c <= 0 yields the empty string; str.index raises ValueError when x has fewer than c stars
def upToStar(x, c):
    end = 0
    for _ in range(c):
        end = x.index('[*]', end) + 3
    return x[:end]

# fillStars() - replace the leading '[*]' wildcards of x with concrete indices, in order
# ... i always fills the first star; j and k (when not None) fill the next stars still present
# ... strings without any '[*]' are returned untouched, for example ... "$.id"
def fillStars(x, i, j=None, k=None):
    if '[*]' not in x: return x
    x = x.replace('[*]', f'[{i}]', 1)
    for extra in (j, k):
        if extra is not None and '[*]' in x:
            x = x.replace('[*]', f'[{extra}]', 1)
    return x

# isJsonPath() - true when the string begins like a rooted jsonpath ('$.' or '$[')
def isJsonPath(_str_):
    return _str_.startswith(('$.', '$['))

#
# fillJSONPathElementsByJSONPath() - unoptimized version using jsonpath-ng
# - to_fill should only have values with a [*] in them
# - returns a dict mapping every entry of to_fill to a list of values (one per produced row)
# - when every entry has the same number of stars each path is resolved in one jsonpath query;
#   otherwise the entry with the most stars drives nested index loops (at most three levels)
#   and shorter paths are repeated for every inner row
# - raises Exception when equally-starred paths return different numbers of values, or
#   when more than three stars are needed
#
def fillJSONPathElementsByJSONPath(to_fill, myjson):
    driver_path, min_stars, max_stars = None, None, None
    filled = {}
    for x in to_fill:
        star_count = x.count('[*]')
        if min_stars is None or star_count < min_stars: min_stars = star_count
        if max_stars is None or star_count > max_stars: max_stars = star_count
        # the driver must be the path with the MOST stars (string length only breaks ties);
        # choosing by string length alone lets a long one-star path drive a two-star fill
        if driver_path is None or (star_count, len(x)) > (driver_path.count('[*]'), len(driver_path)): driver_path = x
        filled[x] = []

    # fill in the json values into the filled dict
    if min_stars == max_stars: # shortcut if we only do the same number of stars
        _length_to_fill_ = None
        for v in filled.keys():
            if isJsonPath(v) and '[*]' in v:
                # find_or_create() yields {} for entries it had to create ... report those as None
                filled[v] = [match.value if match.value != {} else None for match in parse(v).find_or_create(myjson)]
                _this_length_ = len(filled[v])
                if   _length_to_fill_ is None:          _length_to_fill_ = _this_length_
                elif _length_to_fill_ != _this_length_: raise Exception(f'RTOntology.__applyTemplate__() - unequal number of values for {v} ({_length_to_fill_=} vs {_this_length_=})')
        # no starred path at all ... a single row of constants / static paths
        if _length_to_fill_ is None: _length_to_fill_ = 1
        for v in filled.keys():
            if   isJsonPath(v) and '[*]' not in v:
                _match_ = parse(v).find(myjson)[0].value
                filled[v] = [_match_] * _length_to_fill_
            elif isJsonPath(v) == False:
                filled[v] = [v] * _length_to_fill_
    else:
        # number of elements under the next unfilled star of the driver path
        def _levelSize(*indices):
            _path_ = fillStars(driver_path, *indices) if len(indices) > 0 else driver_path
            return len(parse(upToStar(_path_, 1)).find(myjson))

        # append one row: every path resolved for these indices (None unless exactly one match)
        def _appendRow(*indices):
            for v in filled.keys():
                if isJsonPath(v):
                    _matches_ = parse(fillStars(v, *indices)).find(myjson)
                    if len(_matches_) == 1: filled[v].append(_matches_[0].value)
                    else:                   filled[v].append(None)
                else:
                    filled[v].append(v)

        star_count = driver_path.count('[*]')
        if star_count   == 1:
            for i in range(_levelSize()):
                _appendRow(i)
        elif star_count == 2:
            for i in range(_levelSize()):
                for j in range(_levelSize(i)):
                    _appendRow(i, j)
        elif star_count == 3:
            for i in range(_levelSize()):
                for j in range(_levelSize(i)):
                    for k in range(_levelSize(i, j)):
                        _appendRow(i, j, k)
        else:
            raise Exception(f'RTOntology.__applyTemplate__() - max of three stars supported -- {star_count} found')
    return filled

#
# isLiteral() - true if it's a proper literal for json key string
# ... non-empty, ASCII letters / digits / underscore only, and not starting with a digit
#
def isLiteral(v):
    if v == '': return False
    if '0' <= v[0] <= '9': return False # can't start with a number
    return all(('a' <= ch <= 'z') or ('A' <= ch <= 'Z') or ('0' <= ch <= '9') or ch == '_' for ch in v)

#
# endsWithAny() - does a string end with any of these?
# ... _set_ is any iterable of candidate suffixes; an empty iterable gives False
#
def endsWithAny(_str_, _set_):
    return _str_.endswith(tuple(_set_))

#
# fillJSONPathElements() - uses self modifying code to optimize the filling of the structures based on jsonpath specifications.
# - to_fill is an iterable of jsonpath strings (expected to share the star prefix of the most-starred one)
# - a small python script of nested for-loops / key checks is generated from the path with the most
#   '[*]' wildcards and run with exec(); the script appends one value per row to every filled[path]
# - returns the dict path -> list of values (an empty dict for an empty to_fill)
# - raises Exception when the path cannot be walked (the message carries the character position)
# - SECURITY NOTE: key names from the paths are pasted into the generated source unescaped -- only
#   use with trusted transform specifications
# - NOTE: only four loop variables exist, so paths with more than four stars are not supported
#
def fillJSONPathElements(to_fill, myjson):
    filled = {}
    for x in to_fill: filled[x] = []
    filled_list = list(filled.keys())
    if len(filled_list) == 0: return filled # nothing requested ... nothing to generate
    longest_by_star_path  = filled_list[0]
    for i in range(1, len(filled_list)):
        if longest_by_star_path.count('[*]') < filled_list[i].count('[*]'): longest_by_star_path = filled_list[i]
    # to_eval      - lines of the generated script
    # _path_       - python subscript expression built so far, e.g. '["people"][i]'
    # _star_path_  - the jsonpath prefix consumed so far, e.g. '$.people[*]'
    # vars_set     - how many of the requested paths have had their _varN_ assignment emitted
    to_eval, indent, _index_, _loop_vars_, _loop_i_, _path_, _star_path_, vars_set = [], 0, 1, ['i','j','k','l'], 0, '', '$', 0
    while _index_ < len(longest_by_star_path):
        _rest_ = longest_by_star_path[_index_:]
        if   _rest_.startswith('[*]'):
            # wildcard ... open a loop over the current container
            to_eval.append(' '*indent+'for '+_loop_vars_[_loop_i_]+f' in range(len(myjson{_path_})):')
            _path_      += f'[{_loop_vars_[_loop_i_]}]'
            _star_path_ += f'[*]'
            _index_, _loop_i_, indent = _index_+3, _loop_i_+1, indent+4
            if _rest_.endswith('[*]'):
                # paths that end exactly at this star take the loop element itself
                for i in range(len(filled_list)):
                    if filled_list[i] == _star_path_:
                        to_eval.append(' '*indent+f'_var{i}_ = myjson{_path_}')
                        vars_set += 1
            for i in range(len(filled_list)):
                # star- and dot-free remainders (e.g. '[0]') can be subscripted directly
                _filled_rest_ = filled_list[i][_index_:]
                if _filled_rest_.count('[*]') == 0 and '.' not in _filled_rest_ and len(_filled_rest_) > 0:
                    to_eval.append(' '*indent+f'_var{i}_ = myjson{_path_}' + _filled_rest_)
                    vars_set += 1

        elif _rest_[0] == '.':
            _star_path_ += '.'
            for i in range(len(filled_list)):
                # paths whose remainder is one plain key are read here (None when the key is absent)
                lit_maybe = filled_list[i][len(_star_path_):]
                if isLiteral(lit_maybe):
                    to_eval.append(' '*indent+f'if "{lit_maybe}" in myjson{_path_}:')
                    to_eval.append(' '*(indent+4)+f'_var{i}_ = myjson{_path_}["{lit_maybe}"]')
                    to_eval.append(' '*indent+f'else: _var{i}_ = None')
                    vars_set += 1
            _index_ += 1
        elif _rest_[0].isalpha() or _rest_[0] == '_':
            # key name ... descend into it, guarded by a membership test
            l = len(_rest_)
            if '.' in _rest_:                           l = _rest_.index('.')
            if '[' in _rest_ and _rest_.index('[') < l: l = _rest_.index('[')
            lit = _rest_[:l]
            to_eval.append(' '*indent+f'if "{lit}" in myjson{_path_}:')
            _path_      += f'["{lit}"]'
            _star_path_ += f'{lit}'
            _index_, indent = _index_+l, indent+4
        else:
            print('Exception for the following script:\n')
            print('\n'.join(to_eval))
            # report the character position in the path (the loop variable i may not even be bound here)
            raise Exception(f'RTOntology.fillJSONPathElements() - parse error at {_index_}')

        if vars_set >= len(filled_list):
            # every path has a value in scope ... emit the appends at the innermost level and stop
            for i in range(len(filled_list)):
                to_eval.append(' '*indent+f'filled["{filled_list[i]}"].append(_var{i}_)')
            break

    # a trailing block opener needs a body to be valid python
    if len(to_eval) > 0 and to_eval[-1].endswith(':'): to_eval.append(' '*indent+'pass')
    # print('\n'.join(to_eval))
    exec('\n'.join(to_eval))
    return filled

#
# RTOntology - turns json documents into subject / verb / object triples using a line-based transform specification
# ... each specification line has the form
#         <subject> --- <verb> --- <object> [@@@ <group>] [^^^ <source>]
#     where subject / object / group may carry trailing "| <type>" and "| <disposition>" fields
# ... lines of the form "<name> = <text>" define substitutions applied to the lines that follow
# ... triples accumulate in self.df_triples (polars dataframe); node ids are tracked in self.uid_lu
#
class RTOntology(object):
    # disposition codes recognized as the final "| <disp>" field of a node description
    _DISPOSITIONS_ = ('uniq', 'ambi', 'anon', 'yyyy', 'dura', 'cata', 'valu', 'cont')

    # __init__() - prepare transform spec for use and initial instance variables
    # ... xform_spec is the (optional) multi-line transform specification text
    def __init__(self, xform_spec=None):
        if xform_spec is not None: self.xform_spec_lines = self.__substituteDefines__(xform_spec)
        else:                      self.xform_spec_lines = []
        self.df_triples = None   # accumulated triples (None until the first non-empty template)
        self.uid_lu     = {}     # uid -> (id-from-input, type-from-input, disposition-from-input)
        self.rev_uid_lu = {}     # "id|type" -> uid ... only kept for 'uniq' dispositions
        self.time_lu    = {'fill.trace_json_paths': 0, 'fill.collapse': 0, 'fill.parse': 0} # seconds per phase

    # to_files() - write state to several files
    # ... <base>.triples.parquet, <base>.uids.parquet and (when a spec exists) <base>.xform_spec
    def to_files(self, _base_name_):
        self.df_triples.write_parquet(f'{_base_name_}.triples.parquet')
        _lu_ = {'uid':[], 't0':[], 't1':[], 't2':[]}
        for _uid_ in self.uid_lu:
            _t0_, _t1_, _t2_ = self.uid_lu[_uid_][0], self.uid_lu[_uid_][1], self.uid_lu[_uid_][2]
            _lu_['uid'].append(_uid_)
            _lu_['t0'].append(_t0_)
            _lu_['t1'].append(_t1_)
            _lu_['t2'].append(_t2_)
        pd.DataFrame(_lu_).to_parquet(f'{_base_name_}.uids.parquet')
        if len(self.xform_spec_lines) > 0:
            with open(f'{_base_name_}.xform_spec', 'wt') as f: f.write('\n'.join(self.xform_spec_lines))

    # fm_files() - read state from several files
    # ... restores the triples and the uid lookups written by to_files() (the spec file is not read back)
    def fm_files(self, _base_name_):
        # triples are kept as a polars dataframe everywhere else (write_parquet / pl.concat) ... load them the same way
        self.df_triples = pl.read_parquet(f'{_base_name_}.triples.parquet')
        _lu_ = pd.read_parquet(f'{_base_name_}.uids.parquet')
        uid_v, t0_v, t1_v, t2_v = _lu_['uid'].values, _lu_['t0'].values, _lu_['t1'].values, _lu_['t2'].values
        for i in range(len(uid_v)):
            self.uid_lu[uid_v[i]] = (t0_v[i], t1_v[i], t2_v[i])
            if t2_v[i] == 'uniq':
                # same key format as resolveUniqIdAndUpdateLookups()
                _key_ = str(t0_v[i]) + '|' + str(t1_v[i])
                self.rev_uid_lu[_key_] = uid_v[i]

    # __substituteDefines__() - substitute defines
    # ... a line "<name> = <text>" records a define and is dropped from the output
    # ... every other non-empty line has all known define names replaced (plain substring replace)
    # ... returns the list of resulting lines
    def __substituteDefines__(self, _txt_):
        subs      = {}
        completes = []
        for _line_ in _txt_.split('\n'):
            tokens = _line_.split()
            if len(tokens) >= 3 and tokens[1] == '=':
                subs[tokens[0]] = ' '.join(tokens[2:])
                continue
            for r in subs:
                if r in _line_:
                    _line_ = _line_.replace(r, subs[r])
            if len(_line_) > 0:
                completes.append(_line_)
        return completes

    # __splitTypeAndDisp__() - peel the optional "| <type>" and "| <disposition>" fields off a node description
    # ... returns (node_text, type_or_None, disposition); the disposition defaults to 'ambi'
    # ... the last field only counts as a disposition when it is exactly one of _DISPOSITIONS_
    def __splitTypeAndDisp__(self, _node_):
        _node_ = _node_.strip() # surrounding whitespace must not hide the trailing fields
        _disp_ = 'ambi'
        if '|' in _node_ and _node_[_node_.rindex('|')+1:].strip() in self._DISPOSITIONS_:
            _disp_ = _node_[_node_.rindex('|')+1:].strip()
            _node_ = _node_[:_node_.rindex('|')]
        _type_ = None
        if '|' in _node_:
            _type_ = _node_[_node_.rindex('|')+1:].strip()
            _node_ = _node_[:_node_.rindex('|')]
        return _node_.strip(), _type_, _disp_

    # __unpackNode__() - a solved node may be a tuple (value [, type [, disposition]]) that overrides the template
    # ... returns (value, type, disposition) with the template's type / disposition as fallbacks
    def __unpackNode__(self, _value_, _type_, _disp_):
        if isinstance(_value_, tuple):
            if len(_value_) > 1: _type_ = _value_[1]
            if len(_value_) > 2: _disp_ = _value_[2]
            _value_ = _value_[0]
        return _value_, _type_, _disp_

    # __applyTemplate__() - apply templated line in the transform to the json representation
    # ... resolves every jsonpath used by the line, evaluates the parse trees once per row and
    #     adds the resulting triples to self.df_triples
    # ... raises Exception when starred jsonpaths do not share a common prefix or fill unevenly
    def __applyTemplate__(self,
                          myjson,        # json representation
                          s_values,      s_children,    s_type,     s_disp, # subject params
                          v_values,      v_children,                        # verb params   (it's only a string, no typing, unique to the schema)
                          o_values,      o_children,    o_type,     o_disp, # object params
                          g_values,      g_children,    g_type,     g_disp, # group params
                          src_values,    src_children,                      # source params (it's only a string, no typing, unique to this ontological instance)
                          ):
        # resolve the jsonpath values
        all_values  = set(s_values.values()) | set(v_values.values()) | set(o_values.values())
        if g_values   is not None: all_values |= set(g_values.values())
        if src_values is not None: all_values |= set(src_values.values())

        t0 = time.time()
        starred_path_values, longest_by_star_path = [], None
        for x in all_values:
            if isJsonPath(x) and '[*]' in x:
                starred_path_values.append(x)
                if   longest_by_star_path is None:                          longest_by_star_path = x
                elif longest_by_star_path.rindex('[*]') < x.rindex('[*]'):  longest_by_star_path = x

        # ensure that all jsonpath values are substrings of the longest star path
        for x in starred_path_values:
            x_until_last_star = x[:x.rindex('[*]')+3] # get the close bracket too
            if longest_by_star_path[:len(x_until_last_star)] != x_until_last_star:
                raise Exception(f'OntologyForViz.__applyTemplate__() - jsonpath are not subsets "{x}" vs "{longest_by_star_path}"')

        # fill in the json values into the filled dict
        if len(starred_path_values) > 0: filled = fillJSONPathElements(starred_path_values, myjson)
        else:                            filled = {}

        # ... double check that they are the same length
        fill_len = None
        for x in filled.keys():
            if fill_len is None: fill_len = len(filled[x])
            if len(filled[x]) != fill_len: raise Exception(f'OntologyForViz.__applyTemplate__() - unequal number of values for {x}')
        if fill_len is None: fill_len = 1 # if there are no starred paths, then we need at least one filler (it's a constant path)

        # Fix up the filled with either constants or with static json paths
        for v in all_values:
            if isJsonPath(v) and '[*]' in v: continue
            if    isJsonPath(v): to_fill = [parse(v).find(myjson)[0].value] # module-level jsonpath parse(), not self.parse()
            else:                to_fill = [v]
            filled[v] = to_fill * fill_len
        t1 = time.time()
        self.time_lu['fill.trace_json_paths'] += (t1-t0)

        # collapse the parse trees based on the filled values
        # ... double check that they are the same length
        t0 = time.time()
        l = None
        for v in filled.keys():
            if l is None: l = len(filled[v])
            if len(filled[v]) != l: raise Exception(f'RTOntology.__applyTemplate__() - unequal number of values for {v}')
        pre_df = {}
        pre_df['sbj']    = [solveParseTree(s_values,   s_children,   filled, i) for i in range(l)]
        pre_df['vrb']    = [solveParseTree(v_values,   v_children,   filled, i) for i in range(l)]
        pre_df['obj']    = [solveParseTree(o_values,   o_children,   filled, i) for i in range(l)]
        if g_values   is not None: pre_df['grp'] = [solveParseTree(g_values,   g_children,   filled, i) for i in range(l)]
        if src_values is not None: pre_df['src'] = [solveParseTree(src_values, src_children, filled, i) for i in range(l)]
        t1 = time.time()
        self.time_lu['fill.collapse'] += (t1-t0)

        t0 = time.time()
        for_df = {'sbj': [], 'stype': [], 'sdisp': [], 'vrb': [], 'obj': [], 'otype': [], 'odisp': [], 'grp':[], 'gdisp':[], 'src':[]}
        for i in range(l):
            #
            # Subject (Required)
            #
            _sbj_, _sbj_type_, _sbj_disp_ = self.__unpackNode__(pre_df['sbj'][i], s_type, s_disp)
            _sbj_uid_ = self.resolveUniqIdAndUpdateLookups(_sbj_, _sbj_type_, _sbj_disp_, 'sbj')
            for_df['sbj'].append(_sbj_uid_)
            for_df['stype'].append(_sbj_type_)
            for_df['sdisp'].append(_sbj_disp_)

            #
            # Verb (Required)
            #
            for_df['vrb'].append(pre_df['vrb'][i])

            #
            # Object (Required)
            #
            _obj_, _obj_type_, _obj_disp_ = self.__unpackNode__(pre_df['obj'][i], o_type, o_disp)
            _obj_uid_ = self.resolveUniqIdAndUpdateLookups(_obj_, _obj_type_, _obj_disp_, 'obj')
            for_df['obj'].append(_obj_uid_)
            for_df['otype'].append(_obj_type_)
            for_df['odisp'].append(_obj_disp_)

            #
            # Grouping (Optional)
            #
            if g_values is not None:
                _grp_, _grp_type_, _grp_disp_ = self.__unpackNode__(pre_df['grp'][i], g_type, g_disp)
                _grp_uid_ = self.resolveUniqIdAndUpdateLookups(_grp_, _grp_type_, _grp_disp_, 'grp')
                for_df['grp'].append(_grp_uid_)
                for_df['gdisp'].append(_grp_disp_)
            else:
                for_df['grp'].append(None)
                for_df['gdisp'].append(None)

            #
            # Sourcing (Optional)
            #
            if src_values is not None: for_df['src'].append(str(pre_df['src'][i]))
            else:                      for_df['src'].append(None)

        t1 = time.time()
        self.time_lu['fill.parse'] += (t1-t0)

        # newest rows go first, ahead of whatever was accumulated before
        _df_            = pl.DataFrame(for_df)
        if len(_df_) > 0: self.df_triples = _df_ if self.df_triples is None else pl.concat([_df_, self.df_triples])

    # resolveUniqIdAndUpdateLookups() - resolve id and update lookups
    # self.uid_lu[<integer>] = (id-from-input, type-from-input, disposition-from-input)
    # ... 'uniq' nodes with the same id and type share one uid; every other disposition gets a fresh uid per call
    # ... _occurs_in_ ('sbj' / 'obj' / 'grp') is accepted but not used here
    #
    def resolveUniqIdAndUpdateLookups(self, _id_, _type_, _disp_, _occurs_in_):
        _uniq_key_ = str(_id_)+'|'+str(_type_)
        if _disp_ == 'uniq' and _uniq_key_ in self.rev_uid_lu: return self.rev_uid_lu[_uniq_key_]
        my_uid = 100_000 + len(self.uid_lu)
        self.uid_lu[my_uid] = (_id_, _type_, _disp_)
        if _disp_ == 'uniq':  self.rev_uid_lu[_uniq_key_] = my_uid
        return my_uid

    # parse() - parse json into ontology via specification
    # ... j is the loaded json document; every specification line is applied to it in order
    # ... raises Exception when a line does not split into exactly three '---' separated parts
    def parse(self, j):
        for l in self.xform_spec_lines:
            l, lu = literalize(l) # get rid of any literal values so it doesn't mess up the delimiters
            if '#' in l: l = l[:l.index('#')] # comments... hope the hash symbol doesn't occur anywhere in the template that isn't a comment
            l = l.strip()
            if len(l) == 0: continue # blank, whitespace-only or comment-only line

            # Sourcing Information
            src_values = src_children = None
            if '^^^' in l:
                src = l[l.index('^^^')+3:]
                l   = l[:l.index('^^^')].strip()
                src_values, src_children = parseTree(fillLiterals(src, lu))

            # Grouping Information
            g_values = g_children = g_type = g_disp = None
            if '@@@' in l:
                grp = l[l.index('@@@')+3:]
                l   = l[:l.index('@@@')].strip()
                g_node, g_type, g_disp = self.__splitTypeAndDisp__(grp)
                g_values, g_children = parseTree(fillLiterals(g_node, lu))

            svo = [x.strip() for x in l.split('---')]
            if len(svo) != 3:
                raise Exception(f'RTOntology.parse() - line "{l}" does not have three parts')

            # Subject
            s_node, s_type, s_disp = self.__splitTypeAndDisp__(svo[0])
            s_values, s_children = parseTree(fillLiterals(s_node, lu))

            # Verb
            v_values, v_children = parseTree(fillLiterals(svo[1], lu))

            # Object ... the type is recomputed for every line, so an untyped object is None
            # rather than unbound or left over from the previous line
            o_node, o_type, o_disp = self.__splitTypeAndDisp__(svo[2])
            o_values, o_children = parseTree(fillLiterals(o_node, lu))

            self.__applyTemplate__(j, s_values, s_children, s_type, s_disp,
                                      v_values, v_children,
                                      o_values, o_children, o_type, o_disp,
                                      g_values, g_children, g_type, g_disp,
                                      src_values, src_children)


In [4]:
# small sample document: a "people" list of objects (some entries omit "city" or "citescore"),
# a "knowsFrom" list of [id, id, label] rows, an "education" list, and two scalar fields
_json_txt_ = '''
{"id":1,
 "people":[{"first":"John", "last":"Smith", "id":10, "citescore":2.3, "age":30, "city":"nyc",          "state":"ny", "country":"us"},
           {"first":"Joe",  "last":"Smith", "id":20, "citescore":1.8, "age":35,                        "state":"ny", "country":"us"},
           {"first":"Mary", "last":"Jones", "id":30, "age":32, "city":"philadelphia", "state":"pa", "country":"us"}],
 "knowsFrom":[[10, 20, "Conference A"], 
              [20, 30, "Conference B"]],
 "education":[{"id":10, "degreeReceived":"Ph.D. in Computer Science",   "university":"Stanford University"},
              {"id":10, "degreeReceived":"Masters in Computer Science", "university":"University of Pennsylvania"}],
 "total_people":3
}'''
# parsed form of the sample text above
_json_simple_  = json.loads(_json_txt_)
# concatNames() - "<last> <first>" ... referenced by name from the transform template below
def concatNames(_last_,_first_):
    return ' '.join((_last_, _first_))
# combineAddress() - join the parts that are not None with ', ' ... 'Not Supplied' when the result is empty
# ... referenced by name from the transform template below
def combineAddress(_city_,_state_,_country_):
    address = ''
    for piece in (_city_, _state_, _country_):
        if piece is None:
            continue
        # the separator only goes in once something has already been written
        if address:
            address += ', '
        address += piece
    return address if address else 'Not Supplied'
# transform specification for the sample document:
#   "<name> = <text>"                                  defines a substitution used by later lines
#   "<subject> --- <verb> --- <object>"                one triple template per line
#   trailing "| <type>" / "| <disposition>" fields     annotate subject / object / group nodes
#   "@@@ <group>" and "^^^ <source>"                   optional grouping and sourcing parts
_xform_simple_ = '''
_id_ = '$.people[*].id' | PersonID | uniq
'$.id'                                --- "hasEntryCount"    --- '$.total_people' | xsd:integer                                                                           ^^^ "IN_TEMPLATE"
_id_                                  --- "hasName"          --- concatNames('$.people[*].last', '$.people[*].first') | xsd:string                                        ^^^ "IN_TEMPLATE"
_id_                                  --- "hasCitationScore" --- '$.people[*].citescore' | xsd:float   | valu                                                             ^^^ '$.id'    
_id_                                  --- "hasAge"           --- '$.people[*].age'       | xsd:integer | valu                                                             ^^^ '$.id'
_id_                                  --- "isFrom"           --- combineAddress('$.people[*].city', '$.people[*].state', '$.people[*].country') | CityStateCountry | uniq ^^^ '$.id'
_id_                                  --- "isFromCity"       --- '$.people[*].city'      | City                                                                           ^^^ '$.id'
'$.knowsFrom[*][0]' | PersonID | uniq --- "knows"            --- '$.knowsFrom[*][1]'     | PersonID    | uniq                 @@@ '$.knowsFrom[*][2]' | xsd:string | uniq ^^^ '$.id'
'''
# build the ontology from the specification and apply it to the sample json
ofv_simple = RTOntology(_xform_simple_)
ofv_simple.parse(_json_simple_)

In [5]:
# show the uid -> (value, type, disposition) lookup, then display the accumulated triples dataframe
print(f'{ofv_simple.uid_lu=}')
ofv_simple.df_triples

ofv_simple.uid_lu={100000: (1, None, 'ambi'), 100001: (3, 'xsd:integer', 'ambi'), 100002: (10, 'PersonID', 'uniq'), 100003: ('Smith John', 'xsd:string', 'ambi'), 100004: (20, 'PersonID', 'uniq'), 100005: ('Smith Joe', 'xsd:string', 'ambi'), 100006: (30, 'PersonID', 'uniq'), 100007: ('Jones Mary', 'xsd:string', 'ambi'), 100008: (2.3, 'xsd:float', 'valu'), 100009: (1.8, 'xsd:float', 'valu'), 100010: (None, 'xsd:float', 'valu'), 100011: (30, 'xsd:integer', 'valu'), 100012: (35, 'xsd:integer', 'valu'), 100013: (32, 'xsd:integer', 'valu'), 100014: ('nyc, ny, us', 'CityStateCountry', 'uniq'), 100015: ('ny, us', 'CityStateCountry', 'uniq'), 100016: ('philadelphia, pa, us', 'CityStateCountry', 'uniq'), 100017: ('nyc', 'City', 'ambi'), 100018: (None, 'City', 'ambi'), 100019: ('philadelphia', 'City', 'ambi'), 100020: ('Conference A', 'xsd:string', 'uniq'), 100021: ('Conference B', 'xsd:string', 'uniq')}


sbj,stype,sdisp,vrb,obj,otype,odisp,grp,gdisp,src
i64,str,str,str,i64,str,str,i64,str,str
100002,"""PersonID""","""uniq""","""knows""",100004,"""PersonID""","""uniq""",100020,"""uniq""","""1"""
100004,"""PersonID""","""uniq""","""knows""",100006,"""PersonID""","""uniq""",100021,"""uniq""","""1"""
100002,"""PersonID""","""uniq""","""isFromCity""",100017,"""City""","""ambi""",,,"""1"""
100004,"""PersonID""","""uniq""","""isFromCity""",100018,"""City""","""ambi""",,,"""1"""
100006,"""PersonID""","""uniq""","""isFromCity""",100019,"""City""","""ambi""",,,"""1"""
…,…,…,…,…,…,…,…,…,…
100006,"""PersonID""","""uniq""","""hasCitationSco…",100010,"""xsd:float""","""valu""",,,"""1"""
100002,"""PersonID""","""uniq""","""hasName""",100003,"""xsd:string""","""ambi""",,,"""IN_TEMPLATE"""
100004,"""PersonID""","""uniq""","""hasName""",100005,"""xsd:string""","""ambi""",,,"""IN_TEMPLATE"""
100006,"""PersonID""","""uniq""","""hasName""",100007,"""xsd:string""","""ambi""",,,"""IN_TEMPLATE"""


In [6]:
# Length
#print(len(parse('$[*]').find(_json_)))

# Examples
#jsp_expr = parse('$[*].name')
#jsp_expr = parse('$[*].cast.[*].name') # but note that it doesn't distinguish the movie id
#jsp_expr = parse('$..director.name')
#jsp_expr = parse('$..name')
#jsp_expr = parse('$..genre[*]')
#[match.value for match in jsp_expr.find(_json_)][:3]

# IMDB 600K Transform Map
# ... each "name = 'jsonpath' | Type | disposition" line declares a reusable node
# ... each "subject --- "verb" --- object" line declares a triple; "^^^" attaches the source label
# ... maybe add "@@@" for grouping the triples together ... and then "^^^" for sourcing?
# ... the "hasGenre" datatype is written "xsd:string" to match every other xsd-prefixed type in the map
_xform_map_ = '''
__id__              = '$[*]._id'              | MovieID      | uniq
__director__        = '$[*].director.name_id' | DirectorID   | uniq
__castmember__      = '$[*].cast.[*].name_id' | CastMemberID | uniq
__id__              --- "hasTitle"       --- '$[*].name'          | xsd:string          ^^^ "imdb_600k_international_movies"
__id__              --- "yearReleased"   --- '$[*].year'          | xsd:date     | yyyy ^^^ "imdb_600k_international_movies"
__id__              --- "runTime"        --- '$[*].runtime'       | xsd:duration | dura ^^^ "imdb_600k_international_movies"
__id__              --- "hasGenre"       --- '$[*].genre[*]'      | xsd:string   | cata ^^^ "imdb_600k_international_movies"
__id__              --- "ratingValue"    --- '$[*].ratingValue'   | xsd:float    | valu ^^^ "imdb_600k_international_movies"
__id__              --- "summary"        --- '$[*].summary_text'  | xsd:string   | cont ^^^ "imdb_600k_international_movies"
__director__        --- "directedMovie"  --- __id__                                     ^^^ "imdb_600k_international_movies"
__director__        --- "hasName"        --- '$[*].director.name' | xsd:string          ^^^ "imdb_600k_international_movies"
__castmember__      --- "castMemberOf"   --- __id__                                     ^^^ "imdb_600k_international_movies"
__castmember__      --- "hasName"        --- '$[*].cast.[*].name' | xsd:string          ^^^ "imdb_600k_international_movies"
'''

#
# Updated Transform Map (kept for reference; the cell below builds the ontology from _xform_map_)
# ... the PersonID "hasName" edge is simplified here to use the recursive-descent ("$..") jsonpath syntax
# ... which isn't supported by the self modifying code example...
# ... the "hasGenre" datatype is written "xsd:string" to match the other xsd-prefixed types
#
_xform_map_CONTAINS_REVERSE_ISSUE = '''
__id__              = '$[*]._id'              | MovieID  | uniq
__director__        = '$[*].director.name_id' | PersonID | uniq
__castmember__      = '$[*].cast.[*].name_id' | PersonID | uniq
__id__                       --- "hasTitle"       --- '$[*].name'          | xsd:string
__id__                       --- "yearReleased"   --- '$[*].year'          | xsd:date
__id__                       --- "runTime"        --- '$[*].runtime'       | xsd:duration
__id__                       --- "hasGenre"       --- '$[*].genre[*]'      | xsd:string
__id__                       --- "ratingValue"    --- '$[*].ratingValue'   | xsd:float
__id__                       --- "summary"        --- '$[*].summary_text'  | xsd:string
__director__                 --- "directedMovie"  --- __id__
__castmember__               --- "castMemberOf"   --- __id__
$..name_id | PersonID | uniq --- "hasName"        --- '$..name'            | xsd:string
'''

# Build the ontology from the IMDB transform map, then feed it the JSON files one at a time,
# accumulating the time spent in json.loads() and in ofv.parse() separately.
ofv = RTOntology(_xform_map_)
_base_ = '../../../data/kaggle_imdb_600k/international-movies-json/'
_files_ = os.listdir(_base_)
print(f'{len(_files_)} files...')
jsonparse_time_sum = ontology_time_sum = files_processed = 0
for i, _file_ in enumerate(_files_):
    # Progress line every 50 files; files_processed equals i at this point, so i > 0 keeps the division safe
    if (i > 0) and ((i % 50) == 0): print(f'{i:4} | json {jsonparse_time_sum/files_processed:0.3f}s | ontology {ontology_time_sum/files_processed:0.3f}s ...')
    # Read through a context manager so the handle is closed right away; the read itself is
    # intentionally outside the timed region.  JSON text is decoded as UTF-8 explicitly instead
    # of relying on the platform default encoding.
    with open(_base_ + _file_, encoding='utf-8') as _fp_:
        _txt_ = _fp_.read()
    # perf_counter() is a monotonic, high-resolution clock meant for measuring short intervals
    ts0 = time.perf_counter()
    _json_ = json.loads(_txt_)
    ts1 = time.perf_counter()
    ofv.parse(_json_)
    ts2 = time.perf_counter()
    jsonparse_time_sum += (ts1 - ts0)
    ontology_time_sum  += (ts2 - ts1)
    files_processed    += 1
    # Stops once 101 files have been handled (the check runs after the increment)
    if files_processed > 100: break

print()
print(f'{files_processed} files processed')
# An empty directory would otherwise raise ZeroDivisionError in the per-file averages
if files_processed > 0:
    print(f'json parse (per file):     {jsonparse_time_sum/files_processed:0.3f}s | total: {jsonparse_time_sum:0.3f}s')
    print(f'ontology parse (per file): {ontology_time_sum/files_processed:0.3f}s | total: {ontology_time_sum:0.3f}s')

# just the first 10 files...
# ... for all 10 template rows, it's 14.5s per file...  triples extracted is 36,547
# ... cut down to 2 files... it's 4.8s per file after implementing the "equal stars" stub
# ... ... however, to get the nulls to show up, you have to use find_or_create() from jsonpath-ng ...
# ... ... and that creation sticks in an empty dictionary "{}" instead of a None...
# ... ... with the self modifying code modification... now down to 0.015s per file
# ... ... but this increases as more and more files are parsed... at 500 files, the average is 0.24s / file

2535 files...
  50 | json 0.001s | ontology 0.018s ...
 100 | json 0.001s | ontology 0.042s ...

101 files processed
json parse (per file):     0.001s | total: 0.068s
ontology parse (per file): 0.043s | total: 4.309s


In [7]:
# Report how many triples were extracted and how large the two UID lookup tables are (one line each)
print(f'{len(ofv.df_triples)=}', f'{len(ofv.uid_lu)=}', f'{len(ofv.rev_uid_lu)=}', sep='\n')
# Final expression of the cell: show a 3-row sample of the triples table
ofv.df_triples.sample(3)

len(ofv.df_triples)=361207
len(ofv.uid_lu)=361841
len(ofv.rev_uid_lu)=102207


sbj,stype,sdisp,vrb,obj,otype,odisp,grp,gdisp,src
i64,str,str,str,i64,str,str,null,null,str
214112,"""DirectorID""","""uniq""","""directedMovie""",212205,"""MovieID""","""uniq""",,,"""imdb_600k_inte…"
272357,"""MovieID""","""uniq""","""summary""",273854,"""xsd:string""","""cont""",,,"""imdb_600k_inte…"
114400,"""MovieID""","""uniq""","""yearReleased""",114661,"""xsd:date""","""yyyy""",,,"""imdb_600k_inte…"


In [8]:
#_uid_ = 284569
#print(f'{ofv.uid_lu[_uid_]=}')
#_tuple_ =ofv.uid_lu[_uid_]
#_key_   = str(_tuple_[0])+'|'+str(_tuple_[1])
#print(f'{ofv.rev_uid_lu[_key_]=}')

In [12]:
# Presumably persists the ontology state under the 'imdb_600k_movies' name — confirm against to_files().
# NOTE(review): the recorded run of this cell failed with
#     AttributeError: 'DataFrame' object has no attribute 'to_parquet'
# The triples table printed earlier looks like a polars DataFrame, and polars exposes
# write_parquet() rather than the pandas-style to_parquet(), so the failing call is
# presumably inside to_files() in the framework — confirm and fix there.
ofv.to_files('imdb_600k_movies')

AttributeError: 'DataFrame' object has no attribute 'to_parquet'

In [9]:
# Compare the fast fillJSONPathElements() against the jsonpath-based reference implementation
# for one set of paths, using the last JSON document loaded above.
# Star sets tried previously (only the final assignment was ever in effect):
#   {'$[*]._id', '$[*].cast.[*].name'}
#   ['$[*]._id', '$[*].name']
my_star_set = ['$[*]._id', '$[*].genre[*]']

golden = fillJSONPathElementsByJSONPath(my_star_set, _json_)
filled = fillJSONPathElements          (my_star_set, _json_)
for v in filled: print(f'  {v:38}: {len(filled[v]):8} {len(golden[v]):8}')
for v in golden:
    _filled_values_, _golden_values_ = filled[v], golden[v]
    # A length mismatch is reported explicitly instead of surfacing as an IndexError (or going
    # unnoticed when the fast implementation returns extra elements)
    if len(_filled_values_) != len(_golden_values_):
        print(f'WRONG:  {v}: lengths differ {len(_filled_values_)} != {len(_golden_values_)}')
    # Element-wise comparison over the common prefix
    for _filled_value_, _golden_value_ in zip(_filled_values_, _golden_values_):
        if _filled_value_ != _golden_value_: print(f'WRONG:  {v}: {_filled_value_} != {_golden_value_}')

  $[*]._id                              :      300      300
  $[*].genre[*]                         :      300      300


In [10]:
# Path pairs mirroring the subject/object jsonpaths used by the IMDB transform map.  For each
# pair, time the jsonpath-based reference ("golden") against the fast implementation ("filled")
# and report the first discrepancy per path, if any.
_examples_ = [['$[*]._id',                       '$[*].name'],
              ['$[*]._id',                       '$[*].year'],
              ['$[*]._id',                       '$[*].runtime'],
              ['$[*]._id',                       '$[*].genre[*]'],
              ['$[*]._id',                       '$[*].ratingValue'],
              ['$[*]._id',                       '$[*].summary_text'],
              ['$[*].director.name_id',          '$[*]._id'],
              ['$[*].cast.[*].name_id',          '$[*]._id'],
              ['$[*].director.name_id',          '$[*].director.name'],
              ['$[*].cast.[*].name_id',          '$[*].cast.[*].name']]
for _example_ in _examples_:
    print(_example_)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring short intervals
    ts0 = time.perf_counter()
    golden = fillJSONPathElementsByJSONPath(_example_, _json_)
    ts1 = time.perf_counter()
    filled = fillJSONPathElements          (_example_, _json_)
    ts2 = time.perf_counter()
    # Timings are per example (not per path), so every path of an example shows the same two durations
    for v in filled: print(f'  {v:38}: {len(filled[v]):8} | {(ts2-ts1):0.2f}s \t\t {len(golden[v]):8} | {(ts1-ts0):0.2f}s')
    for v in golden:
        # The verdict is reset for every path, so an empty golden list can neither raise a
        # NameError nor inherit the verdict of the previous path
        reason = None
        for i, (_filled_value_, _golden_value_) in enumerate(zip(filled[v], golden[v])):
            if _filled_value_ != _golden_value_:
                reason = 'values differ @ '+str(i) + ' : ' + str(_filled_value_) + ' != ' + str(_golden_value_)
                break
        # No value mismatch in the common prefix: still flag lists of different length (in either direction)
        if reason is None and len(filled[v]) != len(golden[v]): reason = 'lengths differ'
        if reason is not None: print(f'  Incorrect: {reason}')

['$[*]._id', '$[*].name']
  $[*]._id                              :      250 | 0.00s 		      250 | 0.00s
  $[*].name                             :      250 | 0.00s 		      250 | 0.00s
['$[*]._id', '$[*].year']
  $[*]._id                              :      250 | 0.00s 		      250 | 0.00s
  $[*].year                             :      250 | 0.00s 		      250 | 0.00s
['$[*]._id', '$[*].runtime']
  $[*]._id                              :      250 | 0.00s 		      250 | 0.00s
  $[*].runtime                          :      250 | 0.00s 		      250 | 0.00s
['$[*]._id', '$[*].genre[*]']
  $[*]._id                              :      300 | 0.00s 		      300 | 1.59s
  $[*].genre[*]                         :      300 | 0.00s 		      300 | 1.59s
['$[*]._id', '$[*].ratingValue']
  $[*]._id                              :      250 | 0.00s 		      250 | 0.00s
  $[*].ratingValue                      :      250 | 0.00s 		      250 | 0.00s
['$[*]._id', '$[*].summary_text']
  $[*]._id                      

In [11]:
# Small hand-written JSON document used as the input for the fillJSONPathElements() examples below.
# ... "people" is a list of objects; the second person has no "city" key, so paths through
# ... "city" have nothing to match for that entry
# ... "knowsFrom" is a list of [id, id, label] arrays, addressed positionally ([0], [1], [2])
# NOTE(review): both "education" records carry "id":10 — confirm that is intended
_json_txt_ = '''
{"id":1,
 "people":[{"first":"John", "last":"Smith", "id":10, "age":30, "city":"nyc",          "state":"ny", "country":"us"},
           {"first":"Joe",  "last":"Smith", "id":20, "age":35,                        "state":"ny", "country":"us"},
           {"first":"Mary", "last":"Jones", "id":30, "age":32, "city":"philadelphia", "state":"pa", "country":"us"}],
 "knowsFrom":[[10, 20, "Conference A"], 
              [20, 30, "Conference B"]],
 "education":[{"id":10, "degreeReceived":"Ph.D. in Computer Science",   "university":"Stanford University"},
              {"id":10, "degreeReceived":"Masters in Computer Science", "university":"University of Pennsylvania"}],
 "total_people":3
}'''
# Decode once; the parsed structure is reused by every example
_json_simple_  = json.loads(_json_txt_)
# Groups of JSONPath expressions; every group is filled in a single fillJSONPathElements() call
# against the small hand-written document, and the result is shown as a pandas table.
_examples_ = [
    ['$.people[*].id', '$.people[*].last', '$.people[*].first'],
    ['$.people[*].id', '$.people[*].age'],
    ['$.people[*].id', '$.people[*].city', '$.people[*].state', '$.people[*].country'],
    ['$.people[*].id', '$.people[*].city'],
    ['$.knowsFrom[*][0]', '$.knowsFrom[*][1]', '$.knowsFrom[*][2]'],
]
for _example_ in _examples_:
    print(_example_)
    filled = fillJSONPathElements(_example_, _json_simple_)
    # Number of values returned per path, printed one path per line
    _lengths_ = {v: len(filled[v]) for v in filled}
    for v, _count_ in _lengths_.items():
        print(f'  {v:38}: {_count_:8}')
    # Tabular view: one column per path
    _table_ = pd.DataFrame(filled)
    print(_table_)

['$.people[*].id', '$.people[*].last', '$.people[*].first']
  $.people[*].id                        :        3
  $.people[*].last                      :        3
  $.people[*].first                     :        3
   $.people[*].id $.people[*].last $.people[*].first
0              10            Smith              John
1              20            Smith               Joe
2              30            Jones              Mary
['$.people[*].id', '$.people[*].age']
  $.people[*].id                        :        3
  $.people[*].age                       :        3
   $.people[*].id  $.people[*].age
0              10               30
1              20               35
2              30               32
['$.people[*].id', '$.people[*].city', '$.people[*].state', '$.people[*].country']
  $.people[*].id                        :        3
  $.people[*].city                      :        3
  $.people[*].state                     :        3
  $.people[*].country                   :        3
   $.peo