In [1]:
import ctypes
import os
import sys
import subprocess
import time

def shell(cstr):
    """Run `cstr` through the system shell and echo what it printed.

    Parameters
    ----------
    cstr : str
        Command line handed to the shell (``shell=True``).

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status. Its captured stdout
        (when non-empty) and stderr are printed before the error propagates.
    """
    def _text(raw):
        # Tool output is not guaranteed to be valid UTF-8. Escaping the
        # offending bytes keeps a stray byte from raising UnicodeDecodeError
        # and hiding the command's real result.
        return raw.decode("utf-8", errors="backslashreplace")

    try:
        res = subprocess.run(cstr, check=True, shell=True,
                             capture_output=True)
    except subprocess.CalledProcessError as err:
        if err.output:
            print(_text(err.output))
        print(_text(err.stderr))
        raise  # bare raise: same exception, original traceback
    if res.stdout:
        print(_text(res.stdout))

To start, we'll repeat the main ideas from the last notebook.

In [2]:
# Root of the local Halide distribution
HALIDE_PATH   = os.path.expanduser('~/install/halide')

# Locations inside the distribution that the build step consumes
HALIDE_INC    = os.path.join(HALIDE_PATH, 'include')
HALIDE_LIB    = os.path.join(HALIDE_PATH, 'lib')
HALIDE_STATIC = os.path.join(HALIDE_LIB, 'libHalide.a')

# Fail early when the static library is missing
if not os.path.isfile(HALIDE_STATIC):
    raise IOError(f"Halide library not found at {HALIDE_STATIC}")

# Generated sources and shared objects are cached in a hidden directory
# under the current working directory; create it on first use.
HERE_DIR = os.path.abspath('')
C_DIR    = os.path.join(HERE_DIR, '._halide_c_wrap_cache')
os.makedirs(C_DIR, exist_ok=True)

# Make sure we have a convenient way to blow away the cache
def clean_cache():
    """Delete the cached files inside `C_DIR`, keeping the directory itself.

    Matches the same entries as ``rm C_DIR/*`` (non-hidden names only), but
    without going through the shell: an empty cache is not an error, and a
    cache path containing spaces or glob characters is handled correctly.
    Real sub-directories are left untouched.
    """
    import glob  # local import: only this helper needs it

    for path in glob.glob(os.path.join(glob.escape(C_DIR), "*")):
        # skip real directories, but still remove symlinks that point at one
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)


In [3]:
# Basic Type Wrapping
# ctypes metaclasses, used to classify the type objects handed to wrapT
_CTYPES_PTR_TYPE    = type(ctypes.POINTER(ctypes.c_int))
_CTYPES_SIMPLE_TYPE = type(ctypes.c_int)

# Short type code -> "type pair": the ctypes type ('ct') used on the Python
# side and the C spelling ('s') used in the generated source.
_str_to_pair = {
    "b":    {'ct':ctypes.c_bool,'s':'_Bool'},
    "i8":   {'ct':ctypes.c_byte,'s':'int8_t'},
    "i16":  {'ct':ctypes.c_short,'s':'int16_t'},
    "i32":  {'ct':ctypes.c_int,'s':'int32_t'},
    "i64":  {'ct':ctypes.c_longlong,'s':'int64_t'},
    "u8":   {'ct':ctypes.c_ubyte,'s':'uint8_t'},
    "u16":  {'ct':ctypes.c_ushort,'s':'uint16_t'},
    "u32":  {'ct':ctypes.c_uint,'s':'uint32_t'},
    "u64":  {'ct':ctypes.c_ulonglong,'s':'uint64_t'},
    "f32":  {'ct':ctypes.c_float,'s':'float'},
    "f64":  {'ct':ctypes.c_double,'s':'double'},
    "v":    {'ct':None,'s':'void'},
    "vp":   {'ct':ctypes.c_void_p,'s':'void *'},
    "vpp":  {'ct':ctypes.POINTER(ctypes.c_void_p),'s':'void **'},
    "s":    {'ct':ctypes.c_char_p,'s':'char *'},
}
# Reverse lookup: ctypes type -> type pair.  Built with a comprehension so
# that no loop variables (in particular `_`, which IPython uses for the last
# output) are left behind in the notebook namespace.
_typ_to_pair = {entry['ct']: entry for entry in _str_to_pair.values()}

def wrapT(sig):
    """Resolve a type signature into a type pair ``{'ct': ..., 's': ...}``.

    Parameters
    ----------
    sig : str or ctypes type
        Either one of the short codes in `_str_to_pair` (e.g. ``"i32"``), a
        `ctypes.Structure` subclass carrying a `_c_type_str` attribute, a
        simple ctypes type listed in the table, or a ctypes pointer type
        whose target is itself resolvable.

    Returns
    -------
    dict
        ``'ct'`` is the ctypes type, ``'s'`` the C type spelling.

    Raises
    ------
    TypeError
        If `sig` is an unknown code, an unsupported ctypes type, or not a
        type at all.
    """
    if isinstance(sig, str):
        if sig in _str_to_pair:
            return _str_to_pair[sig]
        raise TypeError(f"unrecognized C type string: {sig}")

    # issubclass() only accepts classes; without this guard a stray instance
    # (an int, None, ...) fails with an unrelated "arg 1 must be a class"
    # message instead of the intended one.
    if not isinstance(sig, type):
        raise TypeError(f"unrecognized argument type: {type(sig)}")

    # convert our defined object types into type pairs
    if issubclass(sig, ctypes.Structure):
        return {'ct':sig,'s':sig._c_type_str}

    kind = type(sig)

    # lift simple types up to type pairs
    if kind == _CTYPES_SIMPLE_TYPE:
        if sig in _typ_to_pair:
            return _typ_to_pair[sig]
        raise TypeError(f"unsupported C type: {sig}")

    # handle pointer types by resolving the pointee and appending " *"
    if kind == _CTYPES_PTR_TYPE:
        sub = wrapT(sig._type_)
        return {'ct':sig,'s':sub['s']+" *"}

    raise TypeError(f"unrecognized argument type: {type(sig)}")


In [4]:
# Preamble placed at the very top of the generated C++ source
H_v01_inc_str = (
    '\n'
    '#include "Halide.h"\n'
    '#include <stdio.h>\n'
    '#include <stdlib.h>\n'
    '#include <stdint.h>\n'
)

# Accumulators filled in by the cells below and consumed by getHv01():
#   declarations, definitions, and callbacks run on the module once loaded
H_v01_decl_strs, H_v01_defn_strs, H_v01_ctype_wraps = [], [], []
# The loaded library; stays None until getHv01() has loaded it
H_v01_module = None

def H_v01_function(name,args,ret,body):
    """Register a C function for inclusion in the generated wrapper.

    Parameters
    ----------
    name : str
        Name of the C function.
    args : sequence
        ``(argument_name, type_signature)`` entries; each signature is
        resolved through `wrapT`.
    ret : type signature
        Return type, resolved through `wrapT`.
    body : str
        Source text placed between the braces of the definition.

    The prototype is appended to `H_v01_decl_strs`, the full definition to
    `H_v01_defn_strs`, and a callback that sets `argtypes`/`restype` on the
    loaded module's function is appended to `H_v01_ctype_wraps`.
    """
    params, arg_ctypes = [], []
    for a in args:
        pair = wrapT(a[1])
        params.append(pair['s'] + ' ' + a[0])
        arg_ctypes.append(pair['ct'])

    ret_pair = wrapT(ret)
    ret_ctype = ret_pair['ct']
    signature = ret_pair['s'] + ' ' + name + '(' + ', '.join(params) + ')'

    H_v01_decl_strs.append(signature + ";")
    H_v01_defn_strs.append(signature + " {\n" + body + "\n}")

    def _bind(mod):
        # runs once the shared library is loaded: attach the ctypes signature
        fn = getattr(mod, name)
        fn.argtypes = arg_ctypes
        fn.restype  = ret_ctype
    H_v01_ctype_wraps.append(_bind)

def getHv01():
    """Generate, (re)build if stale, load, and return the wrapper library.

    The C++ source is assembled from `H_v01_inc_str`, `H_v01_decl_strs`
    (wrapped in ``extern "C"``) and `H_v01_defn_strs`, and cached in `C_DIR`.
    The shared object is rebuilt when the source text changed, when it is
    missing, or when it is older than the source or than the Halide static
    library.  After loading, every callback in `H_v01_ctype_wraps` is run on
    the module.

    Returns
    -------
    ctypes.CDLL
        The loaded module (also cached in the global `H_v01_module`).

    Raises
    ------
    IOError
        If a rebuild was required although the library had already been
        loaded in this process.
    subprocess.CalledProcessError
        If compilation fails (raised by `shell`).
    """
    import shlex  # local import: only needed to quote paths for the shell

    global H_v01_module
    all_decls = '\n'.join(H_v01_decl_strs)
    all_defns = '\n\n'.join(H_v01_defn_strs)
    src_str   =  (f'{H_v01_inc_str}\n\n'
                  f'extern "C" {{\n'
                  f'{all_decls}\n'
                  f'}}\n\n'
                  f'{all_defns}\n')

    Hv01_CPP  = os.path.join(C_DIR, "Hwrap_v01.cpp")
    Hv01_SO   = os.path.join(C_DIR, "libHwrap_v01.so")

    def get_time(s):
        """Modification time of `s`, or None when it does not exist."""
        return os.path.getmtime(s) if os.path.exists(s) else None

    # timestamps drive the conditional rebuild below
    cpp_time  = get_time(Hv01_CPP)
    so_time   = get_time(Hv01_SO)
    h_time    = get_time(HALIDE_STATIC)

    # Rewrite the source only when its text actually differs, so an unchanged
    # notebook does not trigger a rebuild.
    write_cpp = cpp_time is None
    if not write_cpp:
        with open(Hv01_CPP,'r',encoding = 'utf-8') as f:
            write_cpp = (src_str != f.read())
    if write_cpp:
        with open(Hv01_CPP,'w',encoding = 'utf-8') as f:
            f.write(src_str)

    # A freshly written source always forces a rebuild; otherwise compare
    # timestamps.  (When write_cpp is False, cpp_time is known to be set.)
    stale = (write_cpp
             or so_time is None
             or so_time < cpp_time
             or (h_time is not None and so_time < h_time))
    if stale:
        # Paths are quoted so directories containing spaces survive the
        # shell.  The source file precedes the static archive because
        # order-sensitive linkers only pull archive members for symbols that
        # are already undefined when the archive is reached.
        cmd = (f"clang++ -Wall -Werror -fPIC -O3 -shared -std=c++11 "
               f"-I {shlex.quote(HALIDE_INC)} "
               f"-o {shlex.quote(Hv01_SO)} {shlex.quote(Hv01_CPP)} "
               f"{shlex.quote(HALIDE_STATIC)} -lz")
        print(cmd)
        shell(cmd)
        # NOTE(review): presumably the already-loaded image cannot be
        # swapped for the rebuilt one in-process, hence the error.
        if H_v01_module is not None:
            raise IOError("library Hwrap_v01 already loaded")

    # Load the module if needed.  The global is assigned only after every
    # callback succeeded, so a failing callback cannot leave a
    # half-initialised module cached.
    if H_v01_module is None:
        mod = ctypes.CDLL(Hv01_SO)
        for wrap in H_v01_ctype_wraps:
            wrap(mod)
        H_v01_module = mod

    return H_v01_module


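The following block mirrors, as `ctypes` structures, types that Halide itself defines.  Because `Halide.h` already provides them, they don't need to be declared again in the generated C++ source.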

In [5]:
class halide_type_t(ctypes.Structure):
    """ctypes mirror of Halide's `struct halide_type_t`."""
    # field order and widths define the binary layout shared with C
    _fields_ = [
        ('code',  ctypes.c_uint8),
        ('bits',  ctypes.c_uint8),
        ('lanes', ctypes.c_uint16),
    ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct halide_type_t"

class halide_dimension_t(ctypes.Structure):
    """ctypes mirror of Halide's `struct halide_dimension_t`."""
    # field order and widths define the binary layout shared with C
    _fields_ = [
        ('min',    ctypes.c_int32),
        ('extent', ctypes.c_int32),
        ('stride', ctypes.c_int32),
        ('flags',  ctypes.c_uint32),
    ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct halide_dimension_t"

class halide_buffer_t(ctypes.Structure):
    """ctypes mirror of Halide's `struct halide_buffer_t`."""
    # field order and widths define the binary layout shared with C
    _fields_ = [
        ('device',            ctypes.c_uint64),
        ('device_interface',  ctypes.c_void_p),
        ('host',              ctypes.POINTER(ctypes.c_uint8)),
        ('flags',             ctypes.c_uint64),
        ('type',              halide_type_t),
        ('dimensions',        ctypes.c_int32),
        ('dim',               ctypes.POINTER(halide_dimension_t)),
        ('padding',           ctypes.c_void_p),
    ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct halide_buffer_t"


The following block defines custom types that need pre-declaration in the `extern "C"` block.

In [6]:

# For every Halide C++ class exposed through the wrapper, emit
#   * an opaque one-field handle struct (declared inside the extern "C" block)
#   * a `_to_<C>` / `_from_<C>` pair converting between a pointer to the C++
#     object and its handle
for C,ctyp,htyp in (
    ('E', 'struct hw_expr_t',  'Halide::Expr'),
    ('V', 'struct hw_var_t',   'Halide::Var'),
    ('R', 'struct hw_rdom_t',  'Halide::RDom'),
    ('F', 'struct hw_func_t',  'Halide::Func'),
    ('I', 'struct hw_img_t',   'Halide::ImageParam'),
    ('P', 'struct hw_param_t', 'Halide::Param<>'),
):
    H_v01_decl_strs.append(f"{ctyp} {{    uint64_t id;}};")
    H_v01_defn_strs.extend([
        f"{ctyp} _to_{C}({htyp} * x) {{    return ({ctyp}){{uint64_t(x)}};}}",
        f"{htyp} * _from_{C}({ctyp} x) {{    return ({htyp} *)(x.id);}}",
    ])


# EXPR
class hw_expr_t(ctypes.Structure):
    """Opaque handle holding the address of a C++ `Halide::Expr`."""
    _fields_ = [ ('id', ctypes.c_uint64) ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct hw_expr_t"

# VAR
class hw_var_t(ctypes.Structure):
    """Opaque handle holding the address of a C++ `Halide::Var`."""
    _fields_ = [ ('id', ctypes.c_uint64) ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct hw_var_t"

# RDOM
class hw_rdom_t(ctypes.Structure):
    """Opaque handle holding the address of a C++ `Halide::RDom`."""
    _fields_ = [ ('id', ctypes.c_uint64) ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct hw_rdom_t"

# FUNC
class hw_func_t(ctypes.Structure):
    """Opaque handle holding the address of a C++ `Halide::Func`."""
    _fields_ = [ ('id', ctypes.c_uint64) ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct hw_func_t"

# IMG
class hw_img_t(ctypes.Structure):
    """Opaque handle holding the address of a C++ `Halide::ImageParam`."""
    _fields_ = [ ('id', ctypes.c_uint64) ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct hw_img_t"

# PARAM
class hw_param_t(ctypes.Structure):
    """Opaque handle holding the address of a C++ `Halide::Param<>`."""
    _fields_ = [ ('id', ctypes.c_uint64) ]
    # C spelling picked up by wrapT when this type appears in a signature
    _c_type_str = "struct hw_param_t"


It'll also be beneficial to wrap up some of the constants programmatically...

In [7]:
# One entry per exported constant:
#   (type code for wrapT, attribute name on the module, C++ expression)
_H_v01_constlist = [
    # device constants
    ('u64', 'DeviceAPI_None',        'Halide::DeviceAPI::None'),
    ('u64', 'DeviceAPI_Host',        'Halide::DeviceAPI::Host'),
    ('u64', 'DeviceAPI_Default_GPU', 'Halide::DeviceAPI::Default_GPU'),
    # type constants
    ('u8',  'type_int',              'halide_type_int'),
    ('u8',  'type_uint',             'halide_type_uint'),
    ('u8',  'type_float',            'halide_type_float'),
    ('u8',  'type_handle',           'halide_type_handle'),
]

def _add_const(typ, nm, c_nm):
    """Register a zero-argument getter `hwrap_get_<nm>` returning `c_nm` cast to `typ`."""
    c_spelling = wrapT(typ)['s']
    getter     = "hwrap_get_" + nm
    H_v01_function(getter, [], typ, "return (" + c_spelling + ")(" + c_nm + ");")

# register a getter for every constant in the table
for typ,nm,c_nm in _H_v01_constlist:
    _add_const(typ,nm,c_nm)

def _unpack_consts(mod):
    """Call each constant getter once and store the result on `mod` under the constant's name."""
    for entry in _H_v01_constlist:
        nm     = entry[1]
        getter = "hwrap_get_" + nm
        value  = getattr(mod, getter)()
        setattr(mod, nm, value)
        # the getter is single-use: blank it out once the value is stored
        setattr(mod, getter, None)

# constants can only be read from a loaded module, so defer the unpacking
H_v01_ctype_wraps.append(_unpack_consts)


In this version, we'll also tidy up how cleanup itself is done.  All of the _"objects"_ that we have declared get destructors, installed so that they hook into the Python garbage collector (via `__del__`).  Together with the `_to_*` and `_from_*` functions, this creates a more safely typed and memory-managed interface than the previous version of the wrapper.

In [8]:
# Destructors: one `hwrap_delete_<kind>` C function per handle type, each
# deleting the C++ object behind the handle it receives.
for _kind, _conv, _handle_cls in (
    ('func',  'F', hw_func_t),
    ('var',   'V', hw_var_t),
    ('rdom',  'R', hw_rdom_t),
    ('expr',  'E', hw_expr_t),
    ('img',   'I', hw_img_t),
    ('param', 'P', hw_param_t),
):
    # argument is named after the kind's first letter, e.g. `f_handle`
    _arg = _kind[0] + "_handle"
    H_v01_function("hwrap_delete_" + _kind, [(_arg, _handle_cls)], 'v',
                   "\n    delete _from_" + _conv + "(" + _arg + ");")

# DESTRUCTORS for Func, Expr, Var, RDom...
def _install_destructors(mod):
    """Attach a `__del__` finalizer to each handle class.

    Parameters
    ----------
    mod : loaded wrapper module
        Must expose the `hwrap_delete_*` functions; they are looked up when
        a finalizer actually runs.

    Each finalizer releases the C++ object behind a handle once and then
    zeroes `id`, so a second invocation on the same object is a no-op.
    """
    def _make_finalizer(delete_name):
        def _finalize(self):
            # id == 0 marks an empty or already-released handle.
            # `_b_needsfree_` is False for instances that only view memory
            # owned by another ctypes object (for example an element read
            # back out of a `(hw_var_t * n)` array).  Such a view aliases a
            # handle owned elsewhere, so releasing it here would delete the
            # same C++ object twice.
            if self.id != 0 and self._b_needsfree_:
                getattr(mod, delete_name)(self)
                self.id = 0
        return _finalize

    for handle_cls, delete_name in (
        (hw_func_t,  "hwrap_delete_func"),
        (hw_expr_t,  "hwrap_delete_expr"),
        (hw_var_t,   "hwrap_delete_var"),
        (hw_rdom_t,  "hwrap_delete_rdom"),
        (hw_img_t,   "hwrap_delete_img"),
        (hw_param_t, "hwrap_delete_param"),
    ):
        handle_cls.__del__ = _make_finalizer(delete_name)

# install destructors onto objects
H_v01_ctype_wraps.append(_install_destructors)


# FUNC
# hwrap_new_func: allocate a new Halide::Func called `name`, return its handle
H_v01_function("hwrap_new_func",[('name','s')],hw_func_t,"""
    return _to_F(new Halide::Func(name));""")
# hwrap_set_func_bound_estimate: forward (var, min, extent) to Func::estimate
H_v01_function("hwrap_set_func_bound_estimate",
    [('f',hw_func_t),('v',hw_var_t),('min',hw_expr_t),('extent',hw_expr_t)],'v',"""
    _from_F(f)->estimate(*_from_V(v),*_from_E(min),*_from_E(extent));""")

# VAR
# hwrap_new_var: allocate a new Halide::Var called `name`, return its handle
H_v01_function("hwrap_new_var",[('name','s')],hw_var_t,"""
    return _to_V(new Halide::Var(name));""")

# RDOM
# hwrap_new_rdom: build a Halide::RDom from `n_dim` pairs of expressions.
# `ranges` is read as 2*n_dim consecutive handles; entries 2k and 2k+1 form
# the pair for dimension k (presumably min and extent -- confirm against
# the Halide::RDom constructor).
H_v01_function("hwrap_new_rdom",
    [('name','s'),
     ('n_dim','i32'),
     ('ranges',ctypes.POINTER(hw_expr_t)),],
    hw_rdom_t,"""
    std::vector< std::pair< Halide::Expr, Halide::Expr > > r;
    for(int k=0; k<n_dim; k++)
        r.push_back(std::make_pair( *_from_E(ranges[2*k]),
                                    *_from_E(ranges[2*k+1]) ));
    return _to_R(new Halide::RDom(r,name));""")

# PARAM
# hwrap_new_param: allocate an untyped-template Halide::Param<> of runtime
# type `typ` called `name`, return its handle
H_v01_function("hwrap_new_param",
    [('name','s'),
     ('typ',halide_type_t),], hw_param_t,"""
     return _to_P(new Halide::Param<>(Halide::Type(typ), name));""")
# hwrap_set_param_range: forward (lo, hi) to Param::set_range
H_v01_function("hwrap_set_param_range",
    [('param',hw_param_t),('lo',hw_expr_t),('hi',hw_expr_t)],'v',"""
    _from_P(param)->set_range(*_from_E(lo),*_from_E(hi));""")
# hwrap_set_param_estimate: forward `e` to the underlying parameter's set_estimate
H_v01_function("hwrap_set_param_estimate",
    [('param',hw_param_t),('e',hw_expr_t)],'v',"""
    _from_P(param)->parameter().set_estimate(*_from_E(e));""")
# hwrap_set_param: set the scalar value from the memory `val` points at.
# NOTE(review): `*v` copies a whole halide_scalar_value_t out of `val`;
# confirm that callers pass storage at least that large.
H_v01_function("hwrap_set_param",
    [('param',hw_param_t),('val','vp')],'v',"""
    Halide::Type typ = _from_P(param)->type();
    auto v = (halide_scalar_value_t*)(val);
    _from_P(param)->parameter().set_scalar(typ,*v);""")

# IMG
# hwrap_new_img: allocate a Halide::ImageParam with element type `typ` and
# `n_dim` dimensions called `name`, return its handle
H_v01_function("hwrap_new_img",
    [('name','s'),
     ('n_dim','i32'),
     ('typ',halide_type_t),], hw_img_t,"""
    return _to_I(new Halide::ImageParam(Halide::Type(typ), n_dim, name));""")
# hwrap_set_img_bound_estimate: forward (min, extent) to dimension `d` of the image
H_v01_function("hwrap_set_img_bound_estimate",
    [('img',hw_img_t),('d','i32'),('min',hw_expr_t),('extent',hw_expr_t)],'v',"""
    _from_I(img)->dim(d).set_bounds_estimate(*_from_E(min),*_from_E(extent));""")
# hwrap_set_img: bind the image parameter to a Halide::Buffer<> constructed
# from the caller's halide_buffer_t
H_v01_function("hwrap_set_img",
    [('img',hw_img_t),('input',ctypes.POINTER(halide_buffer_t))],'v',"""
    _from_I(img)->set(Halide::Buffer<>(*input));""")
# hwrap_img_to_func: allocate a Halide::Func from the image's `in()` and
# return its handle
H_v01_function("hwrap_img_to_func",[('i_handle',hw_img_t)],hw_func_t,"""
    return _to_F(new Halide::Func( _from_I(i_handle)->in() ));""")

# EXPR
# Every function in this section returns a handle to a newly allocated
# Halide::Expr.
# convert constant values to Expr (one `hwrap_<type>_to_expr` per numeric type code)
for typstr in ['i8','i16','i32','i64','u8','u16','u32','u64','f32','f64']:
    H_v01_function(f"hwrap_{typstr}_to_expr",[('c',typstr)],hw_expr_t,"""
        return _to_E(new Halide::Expr( c ));""")
# converts a Var to an Int32-type Expr
H_v01_function("hwrap_var_to_expr",[('v_handle',hw_var_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr( *(_from_V(v_handle)) ));""")
# converts an RDom to an Int32-type Expr
H_v01_function("hwrap_rdom_to_expr",[('r_handle',hw_rdom_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr( *(_from_R(r_handle)) ));""")
# converts a Param to an Expr
H_v01_function("hwrap_param_to_expr",[('p_handle',hw_param_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr( *(_from_P(p_handle)) ));""")
# binary operations: each combines the two operand expressions with the
# corresponding C++ operator
H_v01_function("hwrap_add",[('lhs',hw_expr_t),('rhs',hw_expr_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr(  *_from_E(lhs)  +  *_from_E(rhs) ));""")
H_v01_function("hwrap_sub",[('lhs',hw_expr_t),('rhs',hw_expr_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr(  *_from_E(lhs)  -  *_from_E(rhs) ));""")
H_v01_function("hwrap_mul",[('lhs',hw_expr_t),('rhs',hw_expr_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr( (*_from_E(lhs)) * (*_from_E(rhs)) ));""")
H_v01_function("hwrap_div",[('lhs',hw_expr_t),('rhs',hw_expr_t)],hw_expr_t,"""
    return _to_E(new Halide::Expr( (*_from_E(lhs)) / (*_from_E(rhs)) ));""")
# func access: call Func `f` at the `n_idx` index expressions in `idx` and
# wrap the resulting FuncRef as an Expr
H_v01_function("hwrap_access_func",
    [('f',hw_func_t),
     ('n_idx','i32'),
     ('idx',ctypes.POINTER(hw_expr_t))], hw_expr_t,"""
    std::vector<Halide::Expr> args;
    for(int k=0; k<n_idx; k++)
        args.push_back( *_from_E(idx[k]) );
    Halide::FuncRef fr = (*_from_F(f))(args);
    return _to_E(new Halide::Expr( fr ));""")

# Statements
# hwrap_pure_def: assign `rhs` to Func `fh` indexed by the `n_idx` Vars in `idx`
H_v01_function("hwrap_pure_def",
    [('fh',hw_func_t),
     ('n_idx','i32'),
     ('idx',ctypes.POINTER(hw_var_t)),
     ('rhs',hw_expr_t)], "v","""
    std::vector<Halide::Var> args;
    for(int k=0; k<n_idx; k++)
        args.push_back( *_from_V(idx[k]) );
    (*_from_F(fh))(args) = *_from_E(rhs);""")
# hwrap_update: same assignment, but indexed by the `n_idx` Exprs in `idx`
# instead of Vars
H_v01_function("hwrap_update",
    [('fh',hw_func_t),
     ('n_idx','i32'),
     ('idx',ctypes.POINTER(hw_expr_t)),
     ('rhs',hw_expr_t)], "v","""
    std::vector<Halide::Expr> args;
    for(int k=0; k<n_idx; k++)
        args.push_back( *_from_E(idx[k]) );
    (*_from_F(fh))(args) = *_from_E(rhs);""")


# DEVICE INTERFACE
# hwrap_get_jit_device: cast `_d` to a Halide::DeviceAPI and return the
# matching device interface as an untyped pointer
H_v01_function("hwrap_get_jit_device", [('_d','u64')],'vp', """
    Halide::DeviceAPI d = (Halide::DeviceAPI)(_d);
    return (void *)(Halide::get_device_interface_for_device_api(d));""")

# Realizing a result (with JiT compilation)
# hwrap_realize_func: wrap the caller's halide_buffer_t in a Halide::Buffer<>
# and realize the Func into it
H_v01_function("hwrap_realize_func",
    [('self',hw_func_t),('output',ctypes.POINTER(halide_buffer_t))],'v',"""
    Halide::Buffer<> buf(*output);
    _from_F(self)->realize(Halide::Realization(buf));""")

# hwrap_autoschedule_func: call Halide::Internal::generate_schedules on the
# Func's internal function with a default Target and generic MachineParams;
# the returned value `s` is not used further
H_v01_function("hwrap_autoschedule_func",[('f',hw_func_t)],'v',"""
    std::vector<Halide::Internal::Function> fs;
    fs.push_back(_from_F(f)->function());
    auto s = Halide::Internal::generate_schedules(
                fs, Halide::Target(), Halide::MachineParams::generic());""")



In [9]:
# Build the wrapper library if it is out of date, then load it; the compile
# command is printed whenever a rebuild happens.
Hv01 = getHv01()

clang++ -Wall -Werror -fPIC -O3 -shared -std=c++11 -I /Users/gilbo/install/halide/include /Users/gilbo/install/halide/lib/libHalide.a -lz -o /Users/gilbo/code/iver/notebooks/._halide_c_wrap_cache/libHwrap_v01.so /Users/gilbo/code/iver/notebooks/._halide_c_wrap_cache/Hwrap_v01.cpp


In [10]:
def new_buffer(x,y,w,h):
    """Allocate an h-by-w array of C ints and a `halide_buffer_t` describing it.

    Parameters
    ----------
    x, y : int
        `min` of dimension 0 and dimension 1 respectively.
    w, h : int
        `extent` of dimension 0 and dimension 1 respectively.

    Returns
    -------
    (halide_buffer_t, ctypes array)
        The buffer descriptor and the backing array, indexable as
        ``arr[row][col]``.
    """
    pixels = ((ctypes.c_int * w) * h)()

    # dimension 0 walks along a row (stride 1), dimension 1 steps between
    # rows (stride w)
    extents = (halide_dimension_t * 2)(
        halide_dimension_t(x, w, 1, 0),
        halide_dimension_t(y, h, w, 0),
    )

    descriptor = halide_buffer_t(
        device           = 0,
        device_interface = None,
        host             = ctypes.cast(pixels, ctypes.POINTER(ctypes.c_ubyte)),
        flags            = 0,
        type             = halide_type_t(Hv01.type_int, 32, 1),
        dimensions       = 2,
        dim              = extents,
        padding          = None,
    )
    return descriptor, pixels


In [None]:
def run_tut1():
    """Build, time, and verify ``gradient(x, y) = c * (x + y)`` with ``c = 2``.

    Prints the wall-clock time of the first realization, the second one, and
    100 further ones, then checks every pixel of the W-by-H result.
    "Success!" is printed only when all pixels match; otherwise the first few
    mismatches and a failure count are reported.
    """
    i32E     = Hv01.hwrap_i32_to_expr

    gradient = Hv01.hwrap_new_func(b"gradient")
    x        = Hv01.hwrap_new_var(b"x")
    y        = Hv01.hwrap_new_var(b"y")
    c        = Hv01.hwrap_new_param(b"c",halide_type_t(Hv01.type_int,32,1))
    Hv01.hwrap_set_param(c,ctypes.byref(ctypes.c_int(2)))

    # e = c * (x + y)
    e_x      = Hv01.hwrap_var_to_expr(x)
    e_y      = Hv01.hwrap_var_to_expr(y)
    e        = Hv01.hwrap_add(e_x,e_y)
    e_c      = Hv01.hwrap_param_to_expr(c)
    e        = Hv01.hwrap_mul(e_c,e)

    # gradient(x,y) = e
    idx      = (hw_var_t * 2)(x,y)
    Hv01.hwrap_pure_def(gradient,2,idx,e)

    # set up the buffer; estimates use signed 32-bit expressions, matching
    # the int32 min/extent fields of halide_dimension_t and blur_test_0
    W,H      = 1000,1000
    Hv01.hwrap_set_func_bound_estimate(gradient,x,i32E(0),i32E(W))
    Hv01.hwrap_set_func_bound_estimate(gradient,y,i32E(0),i32E(H))
    buf, arr = new_buffer(0,0,W,H)

    # run tests
    t0       = time.perf_counter()
    Hv01.hwrap_realize_func(gradient,buf)
    t1       = time.perf_counter()
    Hv01.hwrap_realize_func(gradient,buf)
    t2       = time.perf_counter()
    for k in range(0,100):
        Hv01.hwrap_realize_func(gradient,buf)
    t3       = time.perf_counter()
    print('times', t1-t0,t2-t1,t3-t2)

    # test the result over the whole image (arr is indexed [row][column])
    max_reports = 10
    n_bad       = 0
    for j in range(H):
        # slicing a ctypes array yields a plain list, which is cheap to scan
        for i, got in enumerate(arr[j][:]):
            want = 2*(i + j)
            if got != want:
                n_bad += 1
                if n_bad <= max_reports:
                    print(f"Something went wrong!\n"
                          f"Pixel {i}, {j} was supposed to be {want}, "
                          f"but instead it's {got}")

    if n_bad:
        print(f"FAILED: {n_bad} of {W*H} pixels are wrong")
    else:
        print("Success!")

run_tut1()



In [19]:

def build_blur(orig,x,y):
    """Define a separable 3-tap blur of `orig` and return ``(blur_x, blur_y)``.

        blur_x(x,y) = (orig(x-1,y)   + 2*orig(x,y)   + orig(x+1,y))  / 4
        blur_y(x,y) = (blur_x(x,y-1) + 2*blur_x(x,y) + blur_x(x,y+1)) / 4

    Parameters
    ----------
    orig : hw_func_t
        Two-dimensional source function.
    x, y : hw_var_t
        Pure variables used to index both stages.
    """
    i32E     = Hv01.hwrap_i32_to_expr
    varE     = Hv01.hwrap_var_to_expr

    # new func defs
    blur_x   = Hv01.hwrap_new_func(b"blur_x")
    blur_y   = Hv01.hwrap_new_func(b"blur_y")

    e_x, e_y = varE(x), varE(y)

    def sample(src, ex, ey):
        """Expression for src(ex, ey)."""
        return Hv01.hwrap_access_func(src,2,(hw_expr_t*2)(ex,ey))

    def smooth(src, before, centre, after):
        """(src(before) + 2*src(centre) + src(after)) / 4; each point is an (ex, ey) pair."""
        total = Hv01.hwrap_add(
                    Hv01.hwrap_add(sample(src,*before),
                                   Hv01.hwrap_mul(i32E(2),sample(src,*centre))),
                    sample(src,*after))
        return Hv01.hwrap_div(total,i32E(4))

    # horizontal pass over the source
    x_m1     = Hv01.hwrap_sub(e_x,i32E(1))
    x_p1     = Hv01.hwrap_add(e_x,i32E(1))
    bx_avg   = smooth(orig, (x_m1,e_y), (e_x,e_y), (x_p1,e_y))
    Hv01.hwrap_pure_def(blur_x,2,(hw_var_t*2)(x,y),bx_avg)

    # vertical pass over the horizontally blurred result
    y_m1     = Hv01.hwrap_sub(e_y,i32E(1))
    y_p1     = Hv01.hwrap_add(e_y,i32E(1))
    by_avg   = smooth(blur_x, (e_x,y_m1), (e_x,e_y), (e_x,y_p1))
    Hv01.hwrap_pure_def(blur_y,2,(hw_var_t*2)(x,y),by_avg)

    return blur_x, blur_y
    

def blur_test_0():
    """Time the two-stage blur from `build_blur` on a 1000x1000 input image.

    The output buffer covers the interior of the input (min 1, extent size-2
    in each dimension), since the blur reads one pixel to either side.
    Prints the wall-clock time of the first realization, the second one, and
    100 further ones.  The pixel values themselves are not checked.
    """
    i32E     = Hv01.hwrap_i32_to_expr
    fEst     = Hv01.hwrap_set_func_bound_estimate
    iEst     = Hv01.hwrap_set_img_bound_estimate

    # set up the buffers; arrI/arrO are the backing arrays returned next to
    # each descriptor
    W,H          = 1000,1000
    bufI, arrI   = new_buffer(0,0,W,H)
    bufO, arrO   = new_buffer(1,1,W-2,H-2)

    # set up the input image parameter
    inImg    = Hv01.hwrap_new_img(b"inImg",2,halide_type_t(Hv01.type_int,32,1))
    Hv01.hwrap_set_img(inImg,ctypes.byref(bufI))
    iEst(inImg,0,i32E(0),i32E(W))
    iEst(inImg,1,i32E(0),i32E(H))

    # defs
    orig     = Hv01.hwrap_img_to_func(inImg)
    x        = Hv01.hwrap_new_var(b"x")
    y        = Hv01.hwrap_new_var(b"y")

    blur_x, blur_y = build_blur(orig,x,y)
    fEst(blur_y,x,i32E(1),i32E(W-2))
    fEst(blur_y,y,i32E(1),i32E(H-2))

    # run tests
    t0       = time.perf_counter()
    Hv01.hwrap_realize_func(blur_y,bufO)
    t1       = time.perf_counter()
    Hv01.hwrap_realize_func(blur_y,bufO)
    t2       = time.perf_counter()
    for k in range(0,100):
        Hv01.hwrap_realize_func(blur_y,bufO)
    t3       = time.perf_counter()
    print('blur times', t1-t0,t2-t1,t3-t2)

blur_test_0()
    
    

blur times 0.12162160299999414 0.001206800999995039 0.09216969999999947


As we do that, we'll need to expand the API bindings in a couple ways and try to consolidate those as well.  Specifically, we want to
* X broaden the set of operators beyond addition
* X include parameters (scalar inputs)
* X include input buffers
* _ include other ways to compile
* X wrap objects so that Python ensures deletion functions get called.
* _ figure out how to have multiple output buffers
* figure out how to invoke a reasonable auto-scheduler
