In [None]:
import pandas as pd
import polars as pl
import numpy as np
import networkx as nx
from datetime import datetime, timedelta
import os
import sys
sys.path.insert(1, '../../framework')
from racetrack import *
rt = RACETrack()

In [None]:
import json
from jsonpath_ng import jsonpath, parse

In [None]:
# scanForward() - locate the next occurrence of c in x, at or after index i, that is not backslash-escaped
# ... a backslash escapes exactly the one character that follows it (including another backslash)
# ... returns the index of the match, or None when no unescaped c exists before the end of x
def scanForward(x, i, c):
    escaped = False                      # True when the previous character was an unescaped backslash
    for pos in range(i, len(x)):
        ch = x[pos]
        if ch == '\\' and not escaped:
            escaped = True               # the next character is protected
            continue
        if ch == c and not escaped:
            return pos
        escaped = False                  # the escape (if any) covered only this character
    return None

# literalize() - swap every single- or double-quoted string in x for a placeholder name (__lit0__, __lit1__, ...)
# ... returns (rewritten text, lookup of placeholder name -> the text that was between the quotes)
# ... the surrounding quotes are dropped; backslash escapes inside a literal are kept exactly as written
# ... raises an Exception when a quote is opened but never closed
def literalize(x):
    pieces, lookup = [], {}
    pos = 0
    while pos < len(x):
        ch = x[pos]
        if ch != "'" and ch != '"':
            # ordinary character -- copy it through untouched
            pieces.append(ch)
            pos += 1
            continue
        # quoted string: the closing quote must be the same kind as the opening one
        closing = scanForward(x, pos+1, ch)
        if closing is None: raise Exception(f'OntologyForViz.literalize() - unterminated string literal "{x}"')
        placeholder = f'__lit{len(lookup)}__'
        lookup[placeholder] = x[pos+1:closing]
        pieces.append(placeholder)
        pos = closing + 1
    return ''.join(pieces), lookup

# findClosingParen() - starting at index i (just past an opening paren), return the index of the ')' that closes it
# ... nested '(' ... ')' pairs encountered along the way are skipped over
# ... requires that literals were taken care of... [see literalize function]
# ... raises an Exception when the end of s is reached without finding the closing paren
def findClosingParen(s, i):
    depth = 0                            # number of nested parens currently open
    for pos in range(i, len(s)):
        ch = s[pos]
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0: return pos    # not closing a nested pair, so this is the one we want
            depth -= 1
    raise Exception(f'OntologyForViz.findClosingParen() - no closing paren found for "{s}"')

# tokenizeParameters() - split the text between a function's parens into its top-level parameters
# ... a parameter is either a plain token (ended by a comma) or a whole function call "name(...)"
# ... commas inside a nested call do not split; the call is kept as one token
# ... every token is returned stripped of surrounding whitespace
# ... requires that literals were taken care of... [see literalize function]
# ... raises an Exception (from findClosingParen) when a nested call is never closed
def tokenizeParameters(x):
    r = []
    while ',' in x or '(' in x:
        i = x.find(',')                  # -1 when there is no comma left
        j = x.find('(')                  # -1 when there is no paren left
        if i != -1 and (j == -1 or i < j):
            # a comma comes first (or there are no parens left): plain token up to the comma
            r.append(x[:i].strip())
            x = x[i+1:]
        else:
            # a paren comes first (or there are no commas left): take the whole call as one token
            k = findClosingParen(x, j+1)
            r.append(x[:k+1].strip())
            x = x[k+1:]
            if ',' in x: x = x[x.index(',')+1:] # if there's another comma, consume it
    x = x.strip()
    if len(x) > 0:
        r.append(x)                      # trailing plain token after the last comma
    return r

# parseTree() - build a parse tree for an ontology node description such as  f('a', g('$.x'))
# ... returns (node_value, node_children), both keyed by node name: 'root', 'root.0', 'root.0.1', ...
# ...   node_value[name]    - the function name, or the literal / bare token for a leaf
# ...   node_children[name] - list of child node names for a function call, None for a leaf
# ... callers pass only x; the remaining parameters carry state through the recursion
def parseTree(x, node_value=None, node_children=None, node_name=None, lit_lu=None):
    if node_value is None:
        # top-level call: start a fresh tree
        node_value, node_children, node_name = {}, {}, 'root'

    if lit_lu is None:
        # quoted strings are swapped for placeholders once, before any paren/comma scanning
        x, lit_lu = literalize(x)

    if '(' not in x:
        leaf = x.strip()
        node_value   [node_name] = lit_lu.get(leaf, leaf)
        node_children[node_name] = None # literals have no children
        return node_value, node_children

    open_at  = x.index('(')
    close_at = findClosingParen(x, open_at+1)
    fname    = x[:open_at].strip()
    parms    = tokenizeParameters(x[open_at+1:close_at])
    node_value   [node_name] = lit_lu.get(fname, fname)
    node_children[node_name] = []       # functions have children... even if it's an empty list of children
    for child_i, parm in enumerate(parms):
        child_name = f'{node_name}.{child_i}'
        node_children[node_name].append(child_name)
        parseTree(parm, node_value, node_children, child_name, lit_lu)
    return node_value, node_children

# solveParseTree() - evaluate a parse tree (as built by parseTree) for row i
# ... values / children - the two dicts returned by parseTree
# ... filled            - jsonpath string -> list of resolved values, indexed by row
# ... node              - name of the node to evaluate; defaults to 'root'
# ... leaves resolve to filled[jsonpath][i] or to the literal itself; function nodes are called on their solved children
def solveParseTree(values, children, filled, i, node=None):
    node = 'root' if node is None else node
    if children[node] is not None:
        # function node: solve the children first, then call the function by name
        # NOTE(review): eval() runs whatever name the transform spec supplies -- only use trusted specs.
        #               The evaluated text refers to the local name `parms`, so that name must not change.
        parms = [solveParseTree(values, children, filled, i, child) for child in children[node]]
        return eval(f'{values[node]}(*parms)')
    if isJsonPath(values[node]):
        return filled[values[node]][i]  # jsonpath filled in value from the json
    return values[node]                 # constant / literal

# upToStar() - return the prefix of x that ends with (and includes) the c-th '[*]'
# ... c == 0 gives the empty string; asking for more '[*]' than x contains raises ValueError
def upToStar(x, c):
    end = 0                              # index just past the most recently found '[*]'
    for _ in range(c):
        end = x.index('[*]', end) + 3    # 3 == len('[*]')
    return x[:end]

# fillStars() - replace the '[*]' wildcards in x, left to right, with the concrete indices i, j, k
# ... the first '[*]' is required (ValueError when x has none) and always receives i
# ... j and k are optional; each one that is given fills the next '[*]' still present, if any
def fillStars(x, i, j=None, k=None):
    first = x.index('[*]')
    x = f'{x[:first]}[{i}]{x[first+3:]}'
    for extra in (j, k):
        if extra is not None and '[*]' in x:
            x = x.replace('[*]', f'[{extra}]', 1)   # only the left-most remaining star
    return x

# isJsonPath() - True when the string looks like a jsonpath, i.e. it begins with '$.' or '$['
def isJsonPath(_str_):
    return _str_.startswith(('$.', '$['))

#
#
#
class OntologyForViz(object):
    # OntologyForViz - turns json into subject / verb / object triples using a line-oriented transform spec
    # ... template lines look like:  <subject> | <type> [| uniq] --- <verb> --- <object> | <type> [| uniq]
    # ... "NAME = text" lines define substitutions that are applied to the template lines that follow them
    # ... triples accumulate in self.df_triples (columns 'subject', 'verb', 'object')

    # __init__() - prepare transform spec for use
    # ... xform_spec - the transform specification as one multi-line string
    def __init__(self, xform_spec):
        self.xform_spec_lines = self.__substituteDefines__(xform_spec)
        self.df_triples = pd.DataFrame()

    # __substituteDefines__() - substitute defines
    # ... a line whose second whitespace-separated token is '=' defines  NAME -> rest of the line
    # ... every other line has all defines seen so far replaced (plain substring replacement)
    # ... returns the template lines only; blank and whitespace-only lines are dropped
    def __substituteDefines__(self, _txt_):
        subs      = {}
        completes = []
        for _line_ in _txt_.split('\n'):
            tokens = _line_.split()
            if len(tokens) >= 3 and tokens[1] == '=':
                subs[tokens[0]] = ' '.join(tokens[2:])
                continue
            for _name_, _replacement_ in subs.items():
                _line_ = _line_.replace(_name_, _replacement_)
            # whitespace-only lines carry no template; keeping them would only fail later in parse()
            if _line_.strip():
                completes.append(_line_)
        return completes

    # __starIndexTuples__() - yield every concrete index tuple for the wildcards of star_path
    # ... walks the stars left to right: the number of matches at each star is looked up in myjson
    # ... after the stars before it have been filled in with the indices chosen so far
    # ... tuples come out in the same order nested for-loops over i, j, k would produce
    def __starIndexTuples__(self, myjson, star_path, star_count, _prefix_=()):
        if len(_prefix_) == star_count:
            yield _prefix_
            return
        _partial_ = fillStars(star_path, *_prefix_) if _prefix_ else star_path
        _count_   = len(parse(upToStar(_partial_, 1)).find(myjson))
        for _index_ in range(_count_):
            yield from self.__starIndexTuples__(myjson, star_path, star_count, _prefix_ + (_index_,))

    # __applyTemplate__() - apply templated line in the transform to the json representation
    # ... resolves every jsonpath used by the subject / verb / object trees once per wildcard combination,
    # ... evaluates the three trees for each combination, and adds the resulting rows to self.df_triples
    # ... the *_type and *_uniq parameters are accepted but not used by this method
    # ... raises an Exception when no starred jsonpath is present, when the starred paths do not share
    # ... a common prefix, or when the longest path does not have one to three '[*]' wildcards
    def __applyTemplate__(self, 
                          myjson,        # json representation
                          s_values,      # subject parse tree values
                          s_children,    # subject parse tree structure
                          s_type,        # subject variable type
                          s_uniq,        # is subject a unique entity?
                          v_values,      # verb parse tree values
                          v_children,    # verb parse tree structure
                          o_values,      # object parse tree values
                          o_children,    # object parse tree structure
                          o_type,        # object type
                          o_uniq):       # is object a unique entity?
        # resolve the jsonpath values
        all_values  = set(s_values.values()) | set(v_values.values()) | set(o_values.values())

        path_values, longest_by_star_path, filled = [], None, {}
        for x in all_values:
            filled[x] = []
            if isJsonPath(x):
                path_values.append(x)
                # "longest" == the path whose last star sits furthest to the right
                if '*' in x and (longest_by_star_path is None or
                                 longest_by_star_path.rindex('*') < x.rindex('*')):
                    longest_by_star_path = x

        # ensure that all jsonpath values are substrings of the longest star path
        for x in path_values:
            if '*' in x:
                x_until_last_star = x[:x.rindex('*')+2] # get the close bracket too
                if not longest_by_star_path.startswith(x_until_last_star):
                    raise Exception(f'OntologyForViz.__applyTemplate__() - jsonpath are not subsets "{x}" vs "{longest_by_star_path}"')

        # fill in the json values into the filled dict
        if longest_by_star_path is None:
            raise Exception('OntologyForViz.__applyTemplate__() - no meaningful jsonpath(s) found')
        star_count = longest_by_star_path.count('[*]')
        if not 1 <= star_count <= 3:     # fillStars() takes at most three indices
            raise Exception(f'OntologyForViz.__applyTemplate__() - max of three stars supported -- {star_count} found')

        for _indices_ in self.__starIndexTuples__(myjson, longest_by_star_path, star_count):
            for v in filled:
                if not isJsonPath(v):
                    filled[v].append(v)  # constants repeat on every row
                    continue
                # a jsonpath without wildcards is looked up as-is (fillStars requires at least one '[*]')
                _path_    = fillStars(v, *_indices_) if '[*]' in v else v
                _matches_ = parse(_path_).find(myjson)
                filled[v].append(_matches_[0].value if len(_matches_) == 1 else None)

        # collapse the parse trees based on the filled values
        # ... double check that they are the same length
        _rows_ = None
        for v in filled:
            if _rows_ is None: _rows_ = len(filled[v])
            if len(filled[v]) != _rows_: raise Exception(f'OntologyForViz.__applyTemplate__() - unequal number of values for {v}')
        subjects, verbs, objects = [], [], []
        for i in range(_rows_):
            subjects.append(solveParseTree(s_values, s_children, filled, i))
            verbs   .append(solveParseTree(v_values, v_children, filled, i))
            objects .append(solveParseTree(o_values, o_children, filled, i))
        _df_            = pd.DataFrame({'subject': subjects, 'verb': verbs, 'object': objects})
        self.df_triples = pd.concat([_df_, self.df_triples], ignore_index=True)

    # __splitNodeTypeUniq__() - split "<node> | <type> [| uniq]" into (node text, type, is-unique flag)
    # ... raises ValueError when the "| <type>" suffix is missing
    def __splitNodeTypeUniq__(self, _part_):
        _original_ = _part_
        _uniq_     = _part_.endswith('uniq')
        try:
            if _uniq_:
                _part_ = _part_[:_part_.rindex('|')]    # drop the trailing "| uniq"
            _bar_ = _part_.rindex('|')
        except ValueError:
            raise ValueError(f'OntologyForViz.parse() - "{_original_}" is missing its "| type" suffix') from None
        return _part_[:_bar_], _part_[_bar_+1:].strip(), _uniq_

    # parse() - parse json into ontology via specification
    # ... j - the json (already loaded into python objects) to run every template line against
    # ... results are added to self.df_triples
    # ... raises an Exception for a template line that does not have exactly three '---' separated parts
    def parse(self, j):
        for l in self.xform_spec_lines:
            svo = [x.strip() for x in l.split('---')]
            if len(svo) != 3:
                raise Exception(f'OntologyForViz.parse() - line "{l}" does not have three parts')
            s, v, o = svo

            # Subject
            s_node, s_type, s_uniq = self.__splitNodeTypeUniq__(s)
            s_values, s_children   = parseTree(s_node)

            # Verb
            v_values, v_children   = parseTree(v)

            # Object
            o_node, o_type, o_uniq = self.__splitNodeTypeUniq__(o)
            o_values, o_children   = parseTree(o_node)

            self.__applyTemplate__(j, s_values, s_children, s_type, s_uniq, 
                                      v_values, v_children, 
                                      o_values, o_children, o_type, o_uniq)


In [None]:
# Small worked example: three people under "people", so the single-star paths below yield three rows.
# The second person has no "city" key.
_json_txt_ = '''
{"id":1,
 "people":[{"first":"John", "last":"Smith", "age":30, "city":"nyc",          "state":"ny", "country":"us"},
           {"first":"Joe",  "last":"Smith", "age":35,                        "state":"ny", "country":"us"},
           {"first":"Mary", "last":"Jones", "age":32, "city":"philadelphia", "state":"pa", "country":"us"}],
 "total_people":3
}'''
_json_simple_  = json.loads(_json_txt_)
# One template line: subject = each person's last name, verb = the constant "hasAge", object = that person's age
_xform_simple_ = '''
'$.people[*].last' | xsd:string --- "hasAge" --- '$.people[*].age' | xsd:integer
'''
ofv = OntologyForViz(_xform_simple_)
ofv.parse(_json_simple_)
# sample(3) draws three rows at random -- here that is every triple, in shuffled order
ofv.df_triples.sample(3)

In [None]:
_base_ = '../../../data/kaggle_imdb_600k/international-movies-json/'
_files_ = os.listdir(_base_)
# NOTE(review): every file is read and parsed, but _txt_ / _json_ are rebound on each pass, so only
#               the last file listed is used below -- confirm that this is intended
for _file_ in _files_:
    # the with-block closes each file handle; JSON interchange text is UTF-8, so decode it explicitly
    with open(os.path.join(_base_, _file_), encoding='utf-8') as _fh_:
        _txt_ = _fh_.read()
    _json_ = json.loads(_txt_)

# Length
print(len(parse('$[*]').find(_json_)))
# Examples
# ... each assignment replaces the previous one; only the last expression is evaluated below
jsp_expr = parse('$[*].name')
jsp_expr = parse('$[*].cast.[*].name') # but note that it doesn't distinguish the movie id
jsp_expr = parse('$..director.name')
jsp_expr = parse('$..name')
jsp_expr = parse('$..genre[*]')
[match.value for match in jsp_expr.find(_json_)][:3]

# Example within transform processing
def returnsSame(x):
    """Return x unchanged (identity function used as the example function call in the transform map)."""
    return x

# Example Transform Map
# ... the first three lines are defines ("NAME = text"); they are substituted into the template lines below them
# ... maybe add "@@@" for grouping the triples together ... and then "^^^" for sourcing?
_xform_map_ = '''
__id__              = returnsSame('$[*]._id') | xsd:string | uniq
__director__        = '$[*].director.name_id' | xsd:string | uniq
__castmember__      = '$[*].cast.[*].name_id' | xsd:string | uniq
__id__              --- "hasTitle"       --- '$[*].name'          | xsd:string
__id__              --- "yearReleased"   --- '$[*].year'          | xsd:date
__id__              --- "runTime"        --- '$[*].runtime'       | xsd:duration
__id__              --- "hasGenre"       --- '$[*].genre[*]'      | xsd:string
__id__              --- "ratingValue"    --- '$[*].ratingValue'   | xsd:float
__id__              --- "summary"        --- '$[*].summary_text'  | xsd:string
__director__        --- "directedMovie"  --- __id__
__director__        --- "hasName"        --- '$[*].director.name' | xsd:string
__castmember__      --- "castMemberOf"   --- __id__
__castmember__      --- "hasName"        --- '$[*].cast.[*].name' | xsd:string
'''

ofv = OntologyForViz(_xform_map_)
ofv.parse(_json_)
ofv.df_triples.sample(3)