In [None]:
import pandas as pd
import polars as pl
import numpy as np
import networkx as nx
from datetime import datetime, timedelta
import time
import os
import sys
# Add the relative framework directory to the module search path (at position 1)
# so that the import on the next line can be resolved from there.
sys.path.insert(1, '../../framework')
from racetrack import *
# RACETrack is presumably supplied by the star import above -- verify against the framework
rt = RACETrack()

In [None]:
import json
from jsonpath_ng import jsonpath, parse

In [None]:
# scanForward() - index of the first unescaped occurrence of c in x at or after i
# ... a backslash escapes exactly the one character that follows it
# ... returns None when no unescaped c is found
def scanForward(x, i, c):
    escaped = False
    for pos in range(i, len(x)):
        ch = x[pos]
        if escaped:
            escaped = False        # this character was escaped; it can never match
        elif ch == '\\':
            escaped = True         # the next character is escaped
        elif ch == c:
            return pos
    return None

# literalize() - swap every single- or double-quoted string for a unique placeholder name
# ... returns (rewritten string, {placeholder: text found between the quotes})
# ... a backslash-escaped quote does not close a string (see scanForward); escapes are kept verbatim
# ... raises Exception when an opening quote is never closed
def literalize(x):
    pieces, lookup = [], {}
    pos = 0
    while pos < len(x):
        ch = x[pos]
        if ch != "'" and ch != '"':
            pieces.append(ch)      # ordinary character, copied through unchanged
            pos += 1
            continue
        closing = scanForward(x, pos+1, ch)   # the matching quote is the same character that opened it
        if closing is None: raise Exception(f'OntologyForViz.literalize() - unterminated string literal "{x}"')
        placeholder = f'____lit{len(lookup)}____'   # numbered in order of appearance
        lookup[placeholder] = x[pos+1:closing]
        pieces.append(placeholder)
        pos = closing + 1
    return ''.join(pieces), lookup

# fillLiterals() - put the literal values back in place of their placeholder names
# ... opposite of literalize, except that the surrounding quotes are not restored
# ... (and spacing is not guaranteed to be kept)
def fillLiterals(x, lu):
    filled = x
    for placeholder in lu:
        filled = filled.replace(placeholder, lu[placeholder])
    return filled

# findClosingParen() - index of the ')' that closes the paren level in effect at position i
# ... any '(' met on the way opens a nested level whose ')' is skipped
# ... requires that literals were taken care of... [see literalize function]
# ... raises Exception when the string ends before that ')' is found
def findClosingParen(s, i):
    depth = 0
    for pos in range(i, len(s)):
        ch = s[pos]
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                return pos
            depth -= 1
    raise Exception(f'OntologyForViz.findClosingParen() - no closing paren found for "{s}"')

# tokenizeParameters() - create a token list of function parameters
# ... requires that literals were taken care of... [see literalize function]
# ... x is the text between a function's parentheses; returns the list of parameter strings,
# ... each stripped of surrounding whitespace (a nested call such as "g(a, b)" stays one token)
# ... an empty parameter between two commas is kept as ''; a trailing empty remainder is dropped
# ... raises Exception (from findClosingParen) when a nested call is never closed
def tokenizeParameters(x):
    tokens = []
    while ',' in x or '(' in x:
        comma = x.find(',')
        paren = x.find('(')
        if comma != -1 and (paren == -1 or comma < paren):
            # a plain parameter ends at the next comma; it is stripped like every
            # other token (the original left this one branch unstripped)
            tokens.append(x[:comma].strip())
            x = x[comma+1:]
        else:
            # a function call comes first: take everything through its matching ')'
            close = findClosingParen(x, paren+1)
            tokens.append(x[:close+1].strip())
            x = x[close+1:]
            if ',' in x: x = x[x.index(',')+1:] # if there's another comma, consume it
    x = x.strip()
    if len(x) > 0:
        tokens.append(x)
    return tokens

# parseTree() - build a parse tree for one ontology node description
# ... returns (node_value, node_children), two dicts keyed by node name ('root', 'root.0', 'root.0.1', ...)
# ... node_value[name]    : function name for a call, otherwise the literal / path text
# ... node_children[name] : list of child node names for a call, None for a leaf
# ... node_value, node_children, node_name and lit_lu are only supplied by the recursive calls
def parseTree(x, node_value=None, node_children=None, node_name=None, lit_lu=None):
    if node_value is None:
        node_value, node_children, node_name = {}, {}, 'root'
    if lit_lu is None:
        x, lit_lu = literalize(x)

    # leaf: no call syntax, so the whole text is the value
    if '(' not in x:
        leaf = x.strip()
        node_value   [node_name] = lit_lu.get(leaf, leaf)
        node_children[node_name] = None # literals have no children
        return node_value, node_children

    # function call: the name sits before '(' and the parameters between the matching parens
    open_at   = x.index('(')
    close_at  = findClosingParen(x, open_at+1)
    func_name = x[:open_at].strip()
    arguments = tokenizeParameters(x[open_at+1:close_at])
    node_value   [node_name] = lit_lu.get(func_name, func_name)
    node_children[node_name] = []    # functions have children... even if it's an empty list of children
    for position, argument in enumerate(arguments):
        child_name = f'{node_name}.{position}'
        node_children[node_name].append(child_name)
        parseTree(argument, node_value, node_children, child_name, lit_lu)
    return node_value, node_children

# solveParseTree() - evaluate a parse tree (as built by parseTree) for row i
# ... leaf holding a jsonpath : the i-th value recorded for that path in filled
# ... any other leaf          : the leaf text itself (constant / literal)
# ... function node           : children are solved first, then the function is called with them
# NOTE(review): the function name comes straight from the transform spec and is run through
# eval(), so a spec must be trusted input; the name resolves in this module's scope.
def solveParseTree(values, children, filled, i, node=None):
    if node is None:
        node = 'root'
    if children[node] is not None:
        # the eval below looks the list up by the name 'parms'
        parms = [solveParseTree(values, children, filled, i, child) for child in children[node]]
        return eval(f'{values[node]}(*parms)')
    if isJsonPath(values[node]):
        return filled[values[node]][i]  # jsonpath filled in value from the json
    return values[node]                 # constant / literal

# upToStar() - the prefix of x that ends with its c-th '[*]'
# ... c of zero (or less) gives the empty string
# ... raises ValueError (from str.index) when x has fewer than c stars
def upToStar(x, c):
    end = 0
    for _ in range(c):
        end = x.index('[*]', end) + 3   # step just past this '[*]'
    return x[:end]

# fillStars() - replace the '[*]' wildcards of x, left to right, with concrete indices
# ... the first star gets i; j and k (when given) go to the next stars still present
# ... a path without any star is returned as-is, for example ... "$.id"
def fillStars(x, i, j=None, k=None):
    filled = x.replace('[*]', f'[{i}]', 1)
    for index in (j, k):
        if index is not None:
            filled = filled.replace('[*]', f'[{index}]', 1)
    return filled

# isJsonPath() - True when the string begins like a jsonpath, i.e. with '$.' or '$['
def isJsonPath(_str_):
    return _str_.startswith(('$.', '$['))

#
#
#
class RTOntology(object):
    # RTOntology - turns json into subject / verb / object triples using a line-based transform spec
    # ... xform_spec_lines : the spec lines with their defines already substituted
    # ... df_triples       : polars DataFrame with columns sbj, vrb, obj, grp, src (None until rows exist)
    # ... uid_lu           : uid -> (id, type, uniq)
    # ... rev_uid_lu       : "<id>|<type>" -> uid, kept only for entities flagged uniq

    # __init__() - prepare transform spec for use and initial instance variables
    def __init__(self, xform_spec):
        self.xform_spec_lines = self.__substituteDefines__(xform_spec)
        self.df_triples = None
        self.uid_lu     = {}
        self.rev_uid_lu = {}

    # __substituteDefines__() - substitute defines
    # ... a line of the form "<name> = <replacement ...>" records a define and is dropped from the output
    # ... every other line has all defines seen so far replaced in it (plain substring replacement)
    # ... returns the list of remaining non-empty lines
    def __substituteDefines__(self, _txt_):
        subs, completes = {}, []
        for _line_ in _txt_.split('\n'):
            tokens = _line_.split()
            if len(tokens) >= 3 and tokens[1] == '=':
                subs[tokens[0]] = ' '.join(tokens[2:])
                continue
            for _name_, _replacement_ in subs.items():
                _line_ = _line_.replace(_name_, _replacement_)
            if len(_line_) > 0:
                completes.append(_line_)
        return completes

    # __starIndexTuples__() - yield one tuple of indices per concrete instance of a starred path
    # ... _path_ must still contain at least _depth_ '[*]' wildcards
    # ... the number of elements under the first remaining star is counted, then that star
    # ... is pinned to each index in turn before descending to the next star
    def __starIndexTuples__(self, myjson, _path_, _depth_):
        if _depth_ == 0:
            yield ()
            return
        for _index_ in range(len(parse(upToStar(_path_, 1)).find(myjson))):
            for _rest_ in self.__starIndexTuples__(myjson, fillStars(_path_, _index_), _depth_ - 1):
                yield (_index_,) + _rest_

    # __unpackEntity__() - split a solved subject / object / group value into (id, type, uniq)
    # ... a template function may return a tuple (id[, type[, uniq]]) to override the type
    # ... and uniq declared in the spec; any other value is the id itself
    def __unpackEntity__(self, _entity_, _type_, _uniq_):
        if isinstance(_entity_, tuple):
            if len(_entity_) > 1: _type_ = _entity_[1]
            if len(_entity_) > 2: _uniq_ = _entity_[2]
            _entity_ = _entity_[0]
        return _entity_, _type_, _uniq_

    # __splitTypeAndUniq__() - peel the optional "| <type>" and "| uniq" suffixes off a node description
    # ... returns (node text, type or None, True or None)
    def __splitTypeAndUniq__(self, _txt_):
        _txt_  = _txt_.strip()
        _type_ = _uniq_ = None
        # only a whole trailing segment equal to 'uniq' counts -- a type name that merely
        # ends in those letters is still a type
        if '|' in _txt_ and _txt_[_txt_.rindex('|')+1:].strip() == 'uniq':
            _txt_  = _txt_[:_txt_.rindex('|')]
            _uniq_ = True
        if '|' in _txt_:
            _type_ = _txt_[_txt_.rindex('|')+1:].strip()
            _txt_  = _txt_[:_txt_.rindex('|')]
        return _txt_, _type_, _uniq_

    # __applyTemplate__() - apply templated line in the transform to the json representation
    # ... appends the resulting rows to self.df_triples and registers entities in the uid lookups
    # ... raises Exception when the line has no starred jsonpath, when its starred jsonpaths
    # ... do not share a prefix with the longest one, or when that path does not have 1-3 '[*]'
    def __applyTemplate__(self, 
                          myjson,        # json representation
                          s_values,      s_children,    s_type,     s_uniq, # subject params
                          v_values,      v_children,                        # verb params   (it's only a string, no typing, unique to the schema)
                          o_values,      o_children,    o_type,     o_uniq, # object params
                          g_values,      g_children,    g_type,     g_uniq, # group params
                          src_values,    src_children,                      # source params (it's only a string, no typing, unique to this ontological instance)
                          ):
        # resolve the jsonpath values
        all_values  = set(s_values.values()) | set(v_values.values()) | set(o_values.values())
        if g_values   is not None: all_values |= set(g_values.values())
        if src_values is not None: all_values |= set(src_values.values())

        # the path whose last star sits furthest right drives the row enumeration
        path_values, longest_by_star_path, filled = [], None, {}
        for x in all_values:
            filled[x] = []
            if isJsonPath(x):
                path_values.append(x)
                if '*' in x and (longest_by_star_path is None or longest_by_star_path.rindex('*') < x.rindex('*')):
                    longest_by_star_path = x

        # ensure that all jsonpath values are substrings of the longest star path
        for x in path_values:
            if '*' in x:
                x_until_last_star = x[:x.rindex('*')+2] # get the close bracket too
                if longest_by_star_path[:len(x_until_last_star)] != x_until_last_star:
                    raise Exception(f'OntologyForViz.__applyTemplate__() - jsonpath are not subsets "{x}" vs "{longest_by_star_path}"')

        # fill in the json values into the filled dict
        if longest_by_star_path is None:
            # must raise an Exception instance -- raising a bare string is itself a TypeError
            raise Exception('OntologyForViz.__applyTemplate__() - no meaningful jsonpath(s) found')
        star_count = longest_by_star_path.count('[*]')
        if star_count < 1 or star_count > 3: # fillStars takes at most three indices
            raise Exception(f'OntologyForViz.__applyTemplate__() - max of three stars supported -- {star_count} found')
        for _indices_ in self.__starIndexTuples__(myjson, longest_by_star_path, star_count):
            for v in filled:
                if isJsonPath(v):
                    _matches_ = parse(fillStars(v, *_indices_)).find(myjson)
                    # anything but exactly one match (e.g. a missing key) is recorded as None
                    filled[v].append(_matches_[0].value if len(_matches_) == 1 else None)
                else:
                    filled[v].append(v)

        # collapse the parse trees based on the filled values
        # ... double check that they are the same length
        _rows_ = None
        for v in filled:
            if _rows_ is None: _rows_ = len(filled[v])
            if len(filled[v]) != _rows_: raise Exception(f'OntologyForViz.__applyTemplate__() - unequal number of values for {v}')
        pre_df = {}
        pre_df['sbj']    = [solveParseTree(s_values,   s_children,   filled, i) for i in range(_rows_)]
        pre_df['vrb']    = [solveParseTree(v_values,   v_children,   filled, i) for i in range(_rows_)]
        pre_df['obj']    = [solveParseTree(o_values,   o_children,   filled, i) for i in range(_rows_)]
        if g_values   is not None: pre_df['grp'] = [solveParseTree(g_values,   g_children,   filled, i) for i in range(_rows_)]
        if src_values is not None: pre_df['src'] = [solveParseTree(src_values, src_children, filled, i) for i in range(_rows_)]

        for_df = {'sbj': [], 'vrb': [], 'obj': [], 'grp':[], 'src':[]}
        for i in range(_rows_):
            # Subject (Required)
            _sbj_, _sbj_type_, _sbj_uniq_ = self.__unpackEntity__(pre_df['sbj'][i], s_type, s_uniq)
            for_df['sbj'].append(self.resolveUniqIdAndUpdateLookups(_sbj_, _sbj_type_, _sbj_uniq_, 'sbj'))

            # Verb (Required)
            for_df['vrb'].append(pre_df['vrb'][i])

            # Object (Required)
            _obj_, _obj_type_, _obj_uniq_ = self.__unpackEntity__(pre_df['obj'][i], o_type, o_uniq)
            for_df['obj'].append(self.resolveUniqIdAndUpdateLookups(_obj_, _obj_type_, _obj_uniq_, 'obj'))

            # Grouping (Optional)
            if g_values is not None:
                _grp_, _grp_type_, _grp_uniq_ = self.__unpackEntity__(pre_df['grp'][i], g_type, g_uniq)
                for_df['grp'].append(self.resolveUniqIdAndUpdateLookups(_grp_, _grp_type_, _grp_uniq_, 'grp'))
            else:
                for_df['grp'].append(None)

            # Sourcing (Optional)
            if src_values is not None:
                for_df['src'].append(str(pre_df['src'][i]))
            else:
                for_df['src'].append(None)

        _df_            = pl.DataFrame(for_df)
        self.df_triples = _df_ if self.df_triples is None else pl.concat([_df_, self.df_triples])

    # resolveUniqIdAndUpdateLookups() - resolve id and update lookups
    # self.uid_lu[<integer>] = (id-from-input, type-from-input, uniq-from-input)
    # ... a uniq entity seen before (same id and type) gets its existing uid back
    # ... anything else is given the next uid, counting up from 100_000
    # ... _occurs_in_ ('sbj' / 'obj' / 'grp') is accepted but currently unused
    def resolveUniqIdAndUpdateLookups(self, _id_, _type_, _uniq_, _occurs_in_):
        _uniq_key_ = str(_id_)+'|'+str(_type_)
        if _uniq_ and _uniq_key_ in self.rev_uid_lu:
            return self.rev_uid_lu[_uniq_key_]
        my_uid = 100_000 + len(self.uid_lu)
        self.uid_lu[my_uid] = (_id_, _type_, _uniq_)
        if _uniq_:  self.rev_uid_lu[_uniq_key_] = my_uid
        return my_uid

    # parse() - parse json into ontology via specification
    # ... each spec line reads:  subject --- verb --- object [@@@ group] [^^^ source]
    # ... subject, object and group may end in "| <type>" and "| uniq"
    # ... raises Exception for a line that does not split into three '---' separated parts
    def parse(self, j):
        for l in self.xform_spec_lines:
            l, lu = literalize(l) # get rid of any literal values so it doesn't mess up the delimiters
            if '#' in l: l = l[:l.index('#')] # comments... hope the hash doesn't occur anywhere in the template that isn't a comment
            l = l.strip() # always strip, so blank lines are skipped and trailing blanks cannot hide a "| uniq"
            if len(l) == 0: continue

            # Sourcing Information
            src_values = src_children = None
            if '^^^' in l:
                src = l[l.index('^^^')+3:]
                l   = l[:l.index('^^^')].strip()
                src_values, src_children = parseTree(fillLiterals(src, lu))

            # Grouping Information
            g_values = g_children = g_type = g_uniq = None
            if '@@@' in l:
                grp = l[l.index('@@@')+3:]
                l   = l[:l.index('@@@')].strip()
                g_node, g_type, g_uniq = self.__splitTypeAndUniq__(grp)
                g_values, g_children = parseTree(fillLiterals(g_node, lu))

            svo = [x.strip() for x in l.split('---')]
            if len(svo) != 3:
                raise Exception(f'OntologyForViz.parse() - line "{l}" does not have three parts')
            s, v, o = svo

            # Subject
            s_node, s_type, s_uniq = self.__splitTypeAndUniq__(s)
            s_values, s_children = parseTree(fillLiterals(s_node, lu))

            # Verb
            v_values, v_children = parseTree(fillLiterals(v, lu))

            # Object (type and uniq start from None on every line, so an untyped object
            # neither fails nor inherits the previous line's type)
            o_node, o_type, o_uniq = self.__splitTypeAndUniq__(o)
            o_values, o_children = parseTree(fillLiterals(o_node, lu))

            self.__applyTemplate__(j, s_values, s_children, s_type, s_uniq, 
                                      v_values, v_children, 
                                      o_values, o_children, o_type, o_uniq,
                                      g_values, g_children, g_type, g_uniq,
                                      src_values, src_children)


In [None]:
# Small hand-written json document used to exercise the transform spec further down:
# ... a top-level id, a list of people (the second one has no "city"),
# ... "knowsFrom" rows of [id, id, label], and education records
_json_txt_ = '''
{"id":1,
 "people":[{"first":"John", "last":"Smith", "id":10, "age":30, "city":"nyc",          "state":"ny", "country":"us"},
           {"first":"Joe",  "last":"Smith", "id":20, "age":35,                        "state":"ny", "country":"us"},
           {"first":"Mary", "last":"Jones", "id":30, "age":32, "city":"philadelphia", "state":"pa", "country":"us"}],
 "knowsFrom":[[10, 20, "Conference A"], 
              [20, 30, "Conference B"]],
 "education":[{"id":10, "degreeReceived":"Ph.D. in Computer Science",   "university":"Stanford University"},
              {"id":10, "degreeReceived":"Masters in Computer Science", "university":"University of Pennsylvania"}],
 "total_people":3
}'''
# parsed form of the text above (nested dicts / lists)
_json_simple_  = json.loads(_json_txt_)
# concatNames() - last name and first name joined by a single space (last name first)
def concatNames(_last_,_first_):
    return ' '.join((_last_, _first_))
# combineAddress() - "city, state, country" built from whichever parts are not None
# ... a ', ' separator goes in front of a part only when some text precedes it
# ... returns 'Not Supplied' when the result would otherwise be empty
def combineAddress(_city_,_state_,_country_):
    address = ''
    for part in (_city_, _state_, _country_):
        if part is None:
            continue
        if address:
            address += ', '
        address += part
    return address or 'Not Supplied'
# Transform spec for the simple example, one triple template per line.
# As read by RTOntology.parse(): "---" separates subject / verb / object,
# "| <type>" and "| uniq" annotate an entity, "@@@" introduces the grouping
# and "^^^" the source. concatNames / combineAddress are the functions defined above.
_xform_simple_ = '''
'$.people[*].id'    | PersonID | uniq --- "hasName" --- concatNames('$.people[*].last', '$.people[*].first') | xsd:string                                                                             ^^^ "IN_TEMPLATE"
'$.people[*].id'    | PersonID | uniq --- "hasAge"  --- '$.people[*].age' | xsd:integer                                                                                                               ^^^ '$.id'
'$.people[*].id'    | PersonID | uniq --- "isFrom"  --- combineAddress('$.people[*].city', '$.people[*].state', '$.people[*].country') | CityStateCountry                                             ^^^ '$.id'
'$.knowsFrom[*][0]' | PersonID | uniq --- "knows"   --- '$.knowsFrom[*][1]' | PersonID | uniq                                                             @@@ '$.knowsFrom[*][2]' | xsd:string | uniq ^^^ '$.id'
'''
# build the ontology from the spec, apply it to the example json,
# then show the uid lookup table and the resulting triples
ofv_simple = RTOntology(_xform_simple_)
ofv_simple.parse(_json_simple_)
print(ofv_simple.uid_lu)
print(ofv_simple.df_triples)

In [None]:
#
# Length
#
#print(len(parse('$[*]').find(_json_)))
#
# Examples
#
#jsp_expr = parse('$[*].name')
#jsp_expr = parse('$[*].cast.[*].name') # but note that it doesn't distinguish the movie id
#jsp_expr = parse('$..director.name')
#jsp_expr = parse('$..name')
#jsp_expr = parse('$..genre[*]')
#[match.value for match in jsp_expr.find(_json_)][:3]

# IMDB 600K Transform Map
# ... maybe add "@@@" for grouping the triples together ... and then "^^^" for sourcing?
# The first three "name = value" lines are defines; RTOntology substitutes them into
# the template lines that follow. ("hasGenre" previously carried the type "xsd.string",
# a typo for the "xsd:string" used on every other line.)
_xform_map_ = '''
__id__              = '$[*]._id'              | MovieID      | uniq
__director__        = '$[*].director.name_id' | DirectorID   | uniq
__castmember__      = '$[*].cast.[*].name_id' | CastMemberID | uniq
__id__              --- "hasTitle"       --- '$[*].name'          | xsd:string
__id__              --- "yearReleased"   --- '$[*].year'          | xsd:date
__id__              --- "runTime"        --- '$[*].runtime'       | xsd:duration
__id__              --- "hasGenre"       --- '$[*].genre[*]'      | xsd:string
__id__              --- "ratingValue"    --- '$[*].ratingValue'   | xsd:float
__id__              --- "summary"        --- '$[*].summary_text'  | xsd:string
__director__        --- "directedMovie"  --- __id__
__director__        --- "hasName"        --- '$[*].director.name' | xsd:string
__castmember__      --- "castMemberOf"   --- __id__
__castmember__      --- "hasName"        --- '$[*].cast.[*].name' | xsd:string
'''

ofv = RTOntology(_xform_map_)
_base_ = '../../../data/kaggle_imdb_600k/international-movies-json/'
_files_ = os.listdir(_base_)
print(f'{len(_files_)} files...')
jsonparse_time_sum = ontology_time_sum = files_processed = 0
for i, _file_ in enumerate(_files_):
    # progress line every 500 files (files_processed equals i here, so it is never zero)
    if (i > 0) and ((i % 500) == 0): print(f'{i} / json {jsonparse_time_sum/files_processed:0.1f}s / ontology {ontology_time_sum/files_processed:0.1f}s ...')
    # the context manager closes each file handle instead of leaving it to the garbage collector
    with open(_base_ + _file_, encoding='utf-8') as _fh_:
        _txt_ = _fh_.read()
    ts0 = time.time()
    _json_ = json.loads(_txt_)
    ts1 = time.time()
    ofv.parse(_json_)
    ts2 = time.time()
    jsonparse_time_sum += (ts1 - ts0)
    ontology_time_sum  += (ts2 - ts1)
    files_processed    += 1
    if files_processed >= 10: break

print()
print(f'{files_processed} files processed')
if files_processed > 0: # an empty directory would otherwise divide by zero
    print(f'json parse (per file):     {jsonparse_time_sum/files_processed:0.1f}s')
    print(f'ontology parse (per file): {ontology_time_sum/files_processed:0.1f}s')

# just the first 10 files...
# ... for all 10 template rows, it's 14.5s per file...  triples extracted is 36,547

print(len(ofv.df_triples))
ofv.df_triples.sample(3)