In [None]:
import polars as pl
import numpy as np
import networkx as nx
from datetime import datetime, timedelta
import os
import sys
sys.path.insert(1, '../../framework')
from racetrack import *
rt = RACETrack()

In [None]:
import json
from jsonpath_ng import jsonpath, parse
# Load the movie JSON documents. Every file is parsed (so a malformed file fails
# loudly), but each iteration rebinds _json_, so only the last file -- in sorted
# name order -- is what the rest of the notebook sees.
_base_ = '../../../data/kaggle_imdb_600k/international-movies-json/'
_files_ = sorted(os.listdir(_base_))  # os.listdir() order is arbitrary; sort for reproducible runs
if not _files_:
    # without this guard the cell dies later with a NameError on _json_
    raise FileNotFoundError(f'no input files found in "{_base_}"')
for _file_ in _files_:
    # context manager closes the handle instead of leaking one per file
    with open(_base_ + _file_) as _fh_:
        _txt_ = _fh_.read()
    _json_ = json.loads(_txt_)

# Length
print(len(parse('$[*]').find(_json_)))
# Examples (each assignment replaces the previous one; only the last expression is evaluated below)
jsp_expr = parse('$[*].name')
jsp_expr = parse('$[*].cast.[*].name') # but note that it doesn't distinguish the movie id
jsp_expr = parse('$..director.name')
jsp_expr = parse('$..name')
[match.value for match in jsp_expr.find(_json_)][:3]

In [None]:
# Transform specification text handed to OntologyForViz (defined later in this cell).
#   - A line whose second whitespace-separated token is '=' is a define: the first
#     token is a name and the rest of the line is its replacement text.  Defines are
#     textually substituted into the non-define lines that follow them.
#   - Every other non-empty line is expected to hold exactly three parts separated
#     by '---'; OntologyForViz.parse() raises if it does not.
# NOTE(review): the '| xsd:...' / 'uniq' annotations and the quoted JSONPath
# expressions are not interpreted by any code visible in this cell yet.
_xform_map_ = '''
__id__              = returnsSame('$[*]._id') | xsd:string | uniq
__director__        = '$[*].director.name_id' | xsd:string | uniq
__castmember__      = '$[*].cast.[*].name_id' | xsd:string | uniq
__id__              --- "hasTitle"       --- '$[*].name'          | xsd:string
__id__              --- "yearReleased"   --- '$[*].year'          | xsd:date
__id__              --- "runTime"        --- '$[*].runtime'       | xsd:duration
__id__              --- "genre"          --- '$[*].genre'         | array(xsd:string)
__id__              --- "ratingValue"    --- '$[*].ratingValue'   | xsd:float
__id__              --- "summary"        --- '$[*].summary_text'  | xsd:string
__director__        --- "directedMovie"  --- __id__
__director__        --- "hasName"        --- '$[*].director.name' | xsd:string
__castmember__      --- "castMemberOf"   --- __id__
__castmember__      --- "hasName"        --- '$[*].cast.[*].name' | xsd:string
'''
# Example within transform processing
def returnsSame(x):
    """Identity helper: give back ``x`` exactly as received.

    The transform spec text refers to this function by name as an example of a
    function call inside a spec line.
    """
    return x
class OntologyForViz(object):
    """Prepares a textual transform specification and walks its lines.

    The spec is plain text.  Lines of the form ``name = value`` are defines whose
    value is substituted into the lines after them; all remaining non-blank lines
    are kept (after substitution) in ``self.xform_spec_lines``.
    """

    # __init__() - prepare transform spec for use
    def __init__(self, xform_spec):
        """Store the define-substituted, non-blank lines of ``xform_spec``."""
        self.xform_spec_lines = self.__substituteDefines__(xform_spec)

    # __substituteDefines__() - substitute defines
    def __substituteDefines__(self, _txt_):
        """Expand defines in the spec text.

        Parameters:
            _txt_: the complete spec as one string, lines separated by a newline.

        Returns:
            list of str: every non-define, non-blank line with each define name
            seen so far replaced by its value.  A define is a line with at least
            three whitespace-separated tokens whose second token is ``=``; its
            value is the remaining tokens re-joined with single spaces.
        """
        subs      = {}
        completes = []
        for _line_ in _txt_.split('\n'):
            # Skip blank AND whitespace-only lines; a line of spaces used to be
            # kept and later made parse() fail with "does not have three parts".
            if not _line_.strip():
                continue
            tokens = _line_.split()
            if len(tokens) >= 3 and tokens[1] == '=':
                subs[tokens[0]] = ' '.join(tokens[2:])
                continue
            # plain textual replacement, applied in the order the defines appeared
            for _name_, _value_ in subs.items():
                _line_ = _line_.replace(_name_, _value_)
            completes.append(_line_)
        return completes

    # parse() - parse json into ontology via specification
    def parse(self, j):
        """Split every prepared spec line into its three ``---`` separated parts.

        Parameters:
            j: the JSON document to transform.  It is accepted but not read by
               the current implementation.

        Raises:
            Exception: if a line does not consist of exactly three parts.

        Each split line is printed; nothing is returned.
        """
        for l in self.xform_spec_lines:
            svo = [x.strip() for x in l.split('---')]
            if len(svo) != 3:
                raise Exception(f'OntologyForViz.parse() - line "{l}" does not have three parts')
            print(svo)

# Build the transform from the spec text; the constructor expands the defines.
ofv = OntologyForViz(_xform_map_)
# parse() currently prints each spec line split on '---' and raises on a line
# without exactly three parts; the JSON argument is accepted but not read yet.
ofv.parse(_json_)

In [None]:
def scanForward(x, i, c):
    """Locate the next unescaped occurrence of ``c`` in ``x`` at or after ``i``.

    A backslash escapes the single character that follows it, so an escaped
    ``c`` is skipped and a doubled backslash does not escape anything after it.

    Returns the index of the match, or None when the end of ``x`` is reached.
    """
    escaped = False
    for pos in range(i, len(x)):
        ch = x[pos]
        if escaped:
            # this character was protected by the preceding backslash
            escaped = False
            continue
        if ch == '\\':
            escaped = True
        elif ch == c:
            return pos
    return None

def literalize(x):
    """Swap every quoted string in ``x`` for a ``__litN__`` placeholder.

    Both single- and double-quoted strings are recognised; the closing quote is
    found with scanForward(), so backslash-escaped quotes do not end a literal.
    The text between the quotes is stored as-is (escapes are not decoded).

    Returns:
        (str, dict): the rewritten text and a mapping placeholder -> literal text.

    Raises:
        Exception: when an opening quote has no matching closing quote.
    """
    pieces   = []
    literals = {}
    pos      = 0
    while pos < len(x):
        ch = x[pos]
        if ch not in ("'", '"'):
            pieces.append(ch)
            pos += 1
            continue
        # ch is the opening quote; the literal must close with the same quote
        close = scanForward(x, pos + 1, ch)
        if close is None:
            raise Exception(f'OntologyForViz.literalize() - unterminated string literal "{x}"')
        placeholder = f'__lit{len(literals)}__'
        literals[placeholder] = x[pos + 1:close]
        pieces.append(placeholder)
        pos = close + 1
    return ''.join(pieces), literals

def findClosingParen(s, i):
    """Return the index of the ')' that closes a '(' opened before position ``i``.

    Scanning starts at ``i`` (the caller passes the position just after the
    opening paren).  Nested '(' ... ')' pairs met on the way are skipped.

    Raises:
        Exception: if the end of ``s`` is reached without finding the closer.
    """
    depth = 0
    for pos in range(i, len(s)):
        ch = s[pos]
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                return pos
            depth -= 1
    raise Exception(f'OntologyForViz.findClosingParen() - no closing paren found for "{s}"')

def tokenizeParameters(x):
    """Split a parameter list into its top-level parameters.

    ``x`` is the text between a call's parentheses.  Parameters are separated by
    commas, except that a nested call ``name(...)`` is kept whole, commas and all.
    Quotes are not understood here; parseTree() passes text that has already been
    through literalize(), so no quoted commas or parens remain.

    Every returned token is stripped of surrounding whitespace.  (Previously a
    plain parameter that preceded a nested call kept its whitespace while all
    other tokens were stripped.)

    Returns:
        list of str: the parameters in order; a trailing empty parameter is dropped.

    Raises:
        Exception: propagated from findClosingParen() on unbalanced parentheses.
    """
    r = []
    while ',' in x or '(' in x:
        i = x.find(',')
        j = x.find('(')
        if j == -1 or (i != -1 and i < j):
            # a comma comes first (or there is no call left): plain parameter
            r.append(x[:i].strip())
            x = x[i+1:]
        else:
            # a call comes first: take everything through its matching ')'
            k = findClosingParen(x, j+1)
            r.append(x[:k+1].strip())
            x = x[k+1:]
            if ',' in x: x = x[x.index(',')+1:] # if there's another comma, consume it
    x = x.strip()
    if len(x) > 0:
        r.append(x)
    return r

def parseTree(x, node_value=None, node_children=None, node_name="root", lit_lu=None):
    """Parse a nested call expression into two flat dictionaries keyed by node name.

    Parameters:
        x:             expression text, e.g. ``f('a', g("b"))``.
        node_value:    dict to fill with node name -> function name or literal
                       text; a fresh dict is created when omitted.
        node_children: dict to fill with node name -> list of child node names
                       for calls (possibly empty) or None for leaves; a fresh
                       dict is created when omitted.
        node_name:     name of the node for ``x``; children are named
                       ``<node_name>.<index>``.
        lit_lu:        placeholder -> literal lookup.  When None, ``x`` is first
                       run through literalize() to build it; recursive calls
                       pass the lookup along.

    Returns:
        (dict, dict): ``node_value`` and ``node_children``.

    The defaults used to be ``{}`` literals, which Python creates once and shares
    between calls, so nodes from one parse showed up in the results of the next.
    """
    if node_value    is None: node_value    = {}
    if node_children is None: node_children = {}
    if lit_lu is None:
        x, lit_lu = literalize(x)
    if '(' in x:
        i          = x.index('(')
        j          = findClosingParen(x, i+1)
        fname      = x[0:i].strip()
        parms      = tokenizeParameters(x[i+1:j])
        node_value   [node_name] = lit_lu[fname] if fname in lit_lu else fname
        node_children[node_name] = []    # functions have children... even if it's an empty list of children
        for child_i, parm in enumerate(parms):
            child_name = f'{node_name}.{child_i}'
            node_children[node_name].append(child_name)
            parseTree(parm, node_value, node_children, child_name, lit_lu)
    else:
        x                        = x.strip()
        node_value   [node_name] = lit_lu[x] if x in lit_lu else x
        node_children[node_name] = None # literals have no children
    return node_value, node_children

# Demonstration: parse a nested call expression and list one row per tree node
# (node name, its function name or literal text, and its children).
_example_ = """myFunc123('$[*].genre', self.__another_Func__('$[*].genre', "my special string", yetAnother("test", 'two')), myFunc123('$[*].name'))"""
# _example_ = "myFunc('$[*].genre')"
_values_, _children_ = parseTree(_example_)
for n, _value_ in _values_.items():
    print(f'{n:16} {_value_:24} {_children_[n]}')