In [1]:
import ctypes
import os
import shlex
import subprocess
import sys

def shell(cstr):
    """Run `cstr` through the system shell and echo what it printed.

    On success, the command's stdout is printed if it is non-empty.
    On a non-zero exit status, the captured stdout (if any) and stderr
    are printed and the `subprocess.CalledProcessError` is re-raised.

    Output is decoded as UTF-8 with undecodable bytes replaced, so that
    odd bytes in a tool's output can never hide the real failure behind
    a `UnicodeDecodeError`.

    Args:
        cstr: the command line, as a single string handed to the shell.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    try:
        res = subprocess.run(cstr, check=True, shell=True,
                             capture_output=True)
    except subprocess.CalledProcessError as err:
        if err.output:
            print(err.output.decode("utf-8", errors="replace"))
        print(err.stderr.decode("utf-8", errors="replace"))
        # bare raise keeps the original traceback intact
        raise

    if res.stdout:
        print(res.stdout.decode("utf-8", errors="replace"))

Of course, external dependencies will be in different places on different systems, which is the bane of any build system.  I'm going to skip that problem here and just point to the Halide binary.  Dear programmer, you'll need to edit this on whatever system you're working on.

In [2]:
# Root of the local Halide install; edit this for the machine at hand.
HALIDE_PATH   = os.path.expanduser('~/install/halide')

# Locations derived from the install root.
HALIDE_SO     = None
HALIDE_BIN    = os.path.join(HALIDE_PATH,'bin')
HALIDE_LIB    = os.path.join(HALIDE_PATH,'lib')
HALIDE_INC    = os.path.join(HALIDE_PATH,'include')
HALIDE_STATIC = os.path.join(HALIDE_LIB,'libHalide.a')

# Only macOS is wired up so far; every other platform fails loudly here
# rather than reaching the file check below with HALIDE_SO still None.
if sys.platform == 'darwin':
    HALIDE_SO = os.path.join(HALIDE_BIN,'libHalide.dylib')
elif sys.platform == 'linux':
    raise NotImplementedError("Support for Linux should be easy, "
                              "but not checked yet")
elif sys.platform == 'win32' or sys.platform == 'cygwin':
    raise NotImplementedError("I don't know what is necessary "
                              "to support Windows. Ce la vie!")
else:
    raise NotImplementedError(f"Unsupported platform: {sys.platform}")

if not os.path.isfile(HALIDE_SO):
    raise IOError(f"Halide shared library not found at {HALIDE_SO}")


# Simplest Case: Wrapping an `int(void)` Function

It would be nice if we could just directly link and use the Halide library, but we can't.  The problem is that it was written with a C++ API, which is not easily bound into Python (or other languages' FFIs for that matter).  Instead, we'll try to build a wrapper library around it that uses a _pure C_ interface.  To accomplish this, we'll start by trying to just wrap one of the tutorials in a minimal way.  This will let us iron out some boilerplate related to bootstrapping our build system strategy.

Here is the first non-trivial Halide tutorial, stripped of comments, and with main renamed to be used as a library function instead.  (Note also the additional `extern "C" { ... }` block, which is critical for exposing the `C++` function to pure `C` callers correctly.)

In [3]:
# C++ source of the first Halide tutorial, reshaped into a library entry
# point: `run_tut1` returns 0 when every pixel of the realized gradient
# equals x + y and -1 on the first mismatch.  The `extern "C"` declaration
# gives the symbol an unmangled name.  Backslashes are doubled so that the
# C compiler, not Python, interprets the `\n` escapes.
h_tut_1_src = """
#include "Halide.h"
#include <stdio.h>
#include <stdlib.h>

extern "C" {
int run_tut1();
}

int run_tut1() {

    Halide::Func gradient;
    Halide::Var  x, y;

    Halide::Expr e = x + y;
    gradient(x, y) = e;
    
    Halide::Buffer<int32_t> output = gradient.realize(800, 600);

    for (int j = 0; j < output.height(); j++) {
        for (int i = 0; i < output.width(); i++) {
            if (output(i, j) != i + j) {
                printf("Something went wrong!\\n"
                       "Pixel %d, %d was supposed to be %d, "
                       "but instead it's %d\\n",
                       i, j, i+j, output(i, j));
                return -1;
            }
        }
    }

    printf("Success!\\n");
    return 0;
}
"""

We are going to have to compile this by escaping into the OS ultimately.  No way around it.  There are some really cool build systems that automatically determine dependencies by snooping on system commands using `strace` and the like.  `fabricate.py` is one of these.  However, it looks like those aren't super well maintained or portable across at least mac and linux.  So, I'm going to do this by starting with even more manual primitives in Python.  Hopefully I can still bootstrap up some clever resiliency as a kind of one-off build system.

We'll start by ensuring that there's a hidden directory we can dump all our temporaries and stuff into.

In [4]:
HERE_DIR = os.path.abspath('')
C_DIR    = os.path.join(HERE_DIR,'._c_wrapper_cache')

# exist_ok keeps this cell idempotent and avoids the check-then-create race
os.makedirs(C_DIR, exist_ok=True)

# make sure we have a convenient way to blow away the cache
def clean_cache():
    """Delete the files stored directly inside `C_DIR`.

    Mirrors what a shell `rm C_DIR/*` matches (entries whose names start
    with a dot are left alone) but is done in Python, so it succeeds on an
    already-empty cache and on paths containing spaces.  Sub-directories
    are skipped rather than removed.
    """
    for entry in os.listdir(C_DIR):
        if entry.startswith('.'):
            continue
        path = os.path.join(C_DIR, entry)
        # a symlink to a directory is still just a link; remove the link
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)


We can now dump the cpp file into this cache.

In [5]:
# Materialise the tutorial source inside the cache so the compiler can read it.
TUT_1_FILENAME = os.path.join(C_DIR, "tut_1.cpp")
with open(TUT_1_FILENAME, mode='w', encoding='utf-8') as tut_1_cpp:
    tut_1_cpp.write(h_tut_1_src)

We now need to run the command to compile the file into a library.


In [6]:
TUT1_SO = os.path.join(C_DIR, "libtut_1.so")

# Every path is shell-quoted: the notebook directory or the Halide install
# may contain spaces, and the command is executed through the shell.
# (An rpath flag, -Wl,-rpath,<HALIDE_LIB>, was tried earlier and is left
# out; presumably it is unnecessary because libHalide.a is linked in.)
cmd = (f"clang++ -Wall -Werror -fPIC -O3 -shared -std=c++11 "
       f"-I {shlex.quote(HALIDE_INC)} {shlex.quote(HALIDE_STATIC)} -lz "
       f"-o {shlex.quote(TUT1_SO)} {shlex.quote(TUT_1_FILENAME)}")
print(cmd)
shell(cmd)

clang++ -Wall -Werror -fPIC -O3 -shared -std=c++11 -I /Users/gilbo/install/halide/include /Users/gilbo/install/halide/lib/libHalide.a -lz -o /Users/gilbo/code/iver/notebooks/._c_wrapper_cache/libtut_1.so /Users/gilbo/code/iver/notebooks/._c_wrapper_cache/tut_1.cpp


In [7]:
# Check whether the shared object was created or not
# (the directory is quoted because its path may contain spaces)
shell("ls " + shlex.quote(C_DIR))

Hwrap_v00.cpp
libHwrap_v00.so
libtut_1.so
tut_1.cpp



Now we want to look at binding this library in using `ctypes`

In [8]:
# Load the freshly built shared object; exported C symbols such as
# `run_tut1` become attributes of the returned handle.
m_tut_1 = ctypes.CDLL(TUT1_SO)

In [9]:
# The displayed value is the C function's int status: `run_tut1` returns 0
# when every pixel matched and -1 on the first mismatch.
m_tut_1.run_tut1()

0

Wait a second.  Where did the "Success!" go?  I thought the C function we wrote prints that out using `printf`.

Well, the standard file descriptors like `stdout` and `stderr` are bound to the console that launched this Jupyter notebook's server.  If you go look there, you'll see `"Success!"` printed out. _**This is a very important observation!**_ If by using C-code we manage to crash the process, all our notebook will tell us is "the kernel had to restart".  If the C-code tried to dump some kind of useful information about the panic to stdout or stderr, that got dumped to the console running the jupyter server.  You need to know to go look there or you'll be driving blind.

## ! An annoying detail about dynamic loading !
Once we execute `ctypes.CDLL(...soname...)` once for a given filename, this process will assume we loaded the one and only static version of that shared library.  Once we're done developing the shared library, this is just great.  But if we're changing the source for that shared library in this same process that's loading it, this behavior may prevent updates.  In general, you will need to restart the notebook kernel in order to see changes to a linked shared library propagate correctly.

--------

# What is the Halide API Anyway?

The tutorial we just looked at used a variety of objects/classes:
* `Halide::Func`
* `Halide::Var`
* `Halide::Expr`
* `Halide::Buffer<T>`

The first three of these can be managed via pointers, wrapped by appropriate `create` and `destroy` functions.  The last one will be trickier because we have to instantiate every version of it that we might want, i.e., one for each type of data we may want to store in the `Buffer`.

Looking through the documentation we can find different constructors for each of these first three objects.

For `Halide::Func`, we see `Func(std::string)`, `Func(Expr)`, and `Func(Buffer)` notably.  Let's see if we can get away with just `Func(std::string)` for right now.

For `Halide::Var`, there is basically just `Var(std::string)`.

For `Halide::Expr`, one can make an expression out of `Expr()` and `Expr(const_val)` for all basic value types.

Of course, we'll have to sort out a lot of other stuff too!

## The 'plain' C interface to Halide "pipelines"/ahead-of-time compiled code

While talking with Alex Reinking, I learned that `halide_buffer_t` is the `C` interface to buffers.  The motivating idea to keep in mind here is that when Halide generates code ahead of time to be compiled into some project, it wants to have a `C`-compatible interface to that code.

A _pipeline_ is a compiled piece of Halide code with multiple input buffers and multiple output buffers.  In a sense it is a bit like a sub-routine/function.  However, it is not composable in the sense that a pipeline cannot be used inside of another pipeline.

### `halide_buffer_t`

Unlike the enormous `Halide::Buffer` interface, `halide_buffer_t` is a struct.  We can just list out its members.
```
struct halide_buffer_t {
    // these two relate to when the data is say GPU resident
    uint64_t                device;
    const halide_device_interface_t *
                            device_interface;
    // the data pointer
    uint8_t *               host;
    
    // no idea what these mean
    uint64_t                flags;
    
    // what data is stored at each coordinate
    halide_type_t           type;

    // the layout pattern and indexing coefficients
    int32_t                 dimensions;
    halide_dimension_t *    dim;
    
    // how does this work?
    void *                  padding;
}
```
This has a number of sub-structures, which we'll need to investigate next.

### `halide_dimension_t`

Looking at `dimensions` and `dim`, we see that the first is the length of the array pointed to by the latter.  It indicates how many indices are necessary to address an element of the tensor.  Each entry in `dim` then has the following structure.
```
struct halide_dimension_t {
  int32_t   min;    // not quite sure
  int32_t   extent; // ditto, but n_elem in this dimension?
  int32_t   stride; // this is clear; what to multiply indices by
  uint32_t  flags;  // unused; reserved
};
```
Taking a vector `idx` of size `dimensions` indexing the buffer, we may compute the memory coordinates of a buffer entry as
```
  addr = host
  for i=0,dimensions:
    addr += dim[i].stride * (idx[i] - dim[i].min) * sizeof(type)
```
Meanwhile, `extent` is the number of entries in the dimension, so that `min+extent-1` is the maximum addressable coordinate in the given dimension.

This layout description is sufficient to handle padded buffers, windowing, and different storage orders.

### `halide_device_interface_t`

We can get a handle to a device using the following call, which returns `NULL` on failure.
```
const Halide::halide_device_interface_t*
Halide::get_device_interface_for_device_api (
  DeviceAPI d,
  const Target & t = get_jit_target_from_environment()
)
```

The `DeviceAPI d` argument is drawn from the enum
```
enum class DeviceAPI {
     None,
     Host,
     Default_GPU,
     CUDA,
     OpenCL,
     GLSL,
     OpenGLCompute,
     Metal,
     Hexagon,
     HexagonDma,
     D3D12Compute,
};
```
and presumably these values are assigned to `uint64_t device` in the `halide_buffer_t` structure too.

In this way, we can thankfully treat the device interface as a black box.

### `halide_type_t`

This is a simple enumeration on signed vs. unsigned integers, floats and handles.  For instance a `u32` would be `{1,32,1}` while a `vec4f` would be `{2,32,4}`.
```
struct halide_type_t {
  uint8_t  code; // halide_type_code_t
  uint8_t  bits;
  uint16_t lanes;
};
typedef enum halide_type_code_t {
  halide_type_int = 0,   //!< signed integers
  halide_type_uint = 1,  //!< unsigned integers
  halide_type_float = 2, //!< floating point numbers
  halide_type_handle = 3 //!< opaque pointer type (void *)
};
```

## Compiling Code --- JiT style

Let's look at how to expose the JiT compilation and execution to start with.  Looking through the `Func` documentation (i.e. every single public member function on the class), we see `realize` in a number of guises, including
```
void Func::realize ( Pipeline::Realization outputs,
                     const Target &target=Target(),
                     const ParamMap &param_map=ParamMap::empty_map() )
```
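where `Target` is a description of the _compile target_ that gets passed through to LLVM.  The `ParamMap` is about setting scalar parameters on the pipeline.  Finally, the `Pipeline::Realization` expresses the output buffer.  It is constructed as `Realization(halide_buffer_t *buf)`.  (In the wrapper code later in this notebook, the `halide_buffer_t` is first wrapped in a `Halide::Buffer<>`, and that buffer is what gets passed to `Realization`.)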

## Ways to Combine Expressions

Consider the following excerpt from the tutorial, which contains the substantive definition of the pipeline.  How do we translate these lines into a C-API?
```
    Halide::Expr e = x + y;
    gradient(x, y) = e;
```
First, note that there is a type-conversion operator defined from `Var` to `Expr`.  This will lift `x` and `y`.  From there, we need some way to add two expressions.  As it turns out, we will be able to do that using the overloaded operator.

The tricky part will be indexing `gradient`.  What class/type of object does this result in?  And what are the functions to use?  Will it be ok to pass that object around without resolving it into an r-value immediately?  Perhaps the whole assignment must be a single invocation with those indices supplied.  This final answer is the simplest, since it will give us a general strategy without having to poke into the implementation in greater detail.


-----

# Halide Wrapper v0.0

We will proceed by constructing the wrapper string bit by bit.  As we go, we will also anticipate _wrapping_ the resulting function from the `ctypes.cdll` load with the correct types.

To begin, note that we must build our own Python classes to reflect C structures in the interface.

In [10]:
class halide_type_t(ctypes.Structure):
    """ctypes mirror of the C `struct halide_type_t` (code, bits, lanes)."""

    # spelling of this type inside generated C signatures (read by wrapT)
    _c_type_str = "struct halide_type_t"

    # fixed-width aliases mirror the uint8_t / uint16_t members of the C struct
    _fields_ = [
        ("code",  ctypes.c_uint8),    # a halide_type_code_t value
        ("bits",  ctypes.c_uint8),    # width of one lane, in bits
        ("lanes", ctypes.c_uint16),   # number of vector lanes
    ]

class halide_dimension_t(ctypes.Structure):
    """ctypes mirror of the C `struct halide_dimension_t`.

    One instance describes a single axis of a buffer: its first valid
    coordinate, how many coordinates it spans, and the element stride.
    """

    # spelling of this type inside generated C signatures (read by wrapT)
    _c_type_str = "struct halide_dimension_t"

    # fixed-width aliases mirror the int32_t / uint32_t members of the C struct
    _fields_ = [
        ("min",    ctypes.c_int32),
        ("extent", ctypes.c_int32),
        ("stride", ctypes.c_int32),
        ("flags",  ctypes.c_uint32),
    ]

class halide_buffer_t(ctypes.Structure):
    """ctypes mirror of the C `struct halide_buffer_t`.

    Field order and widths follow the struct listing given earlier in this
    notebook; the device interface and padding members are carried as
    opaque `void *` values.
    """

    # spelling of this type inside generated C signatures (read by wrapT)
    _c_type_str = "struct halide_buffer_t"

    _fields_ = [
        ("device",           ctypes.c_uint64),
        ("device_interface", ctypes.c_void_p),
        ("host",             ctypes.POINTER(ctypes.c_uint8)),   # raw data pointer
        ("flags",            ctypes.c_uint64),
        ("type",             halide_type_t),
        ("dimensions",       ctypes.c_int32),                   # length of `dim`
        ("dim",              ctypes.POINTER(halide_dimension_t)),
        ("padding",          ctypes.c_void_p),
    ]


Our next concern is to make sure that we can quickly and easily consolidate all the information we need associated with a type, whether it is a simple built in type, one of the above types we defined, or a pointer to any of the preceding.

Not only do we need the correct `ctypes` object to represent the type; we also need a string that can be written into C-code that correctly identifies the type for C-type signatures.  The following function allows us to access this information using the ctype objects or via a convenient string-encoded shorthand.

In [11]:
# Metaclasses ctypes uses for pointer types and for simple scalar types;
# wrapT dispatches on them.
_CTYPES_PTR_TYPE    = type(ctypes.POINTER(ctypes.c_int))
_CTYPES_SIMPLE_TYPE = type(ctypes.c_int)

# shorthand -> {'ct': ctypes type (None for void), 's': C spelling}
_str_to_pair = {
    "b":    {'ct':ctypes.c_bool,'s':'_Bool'},
    "i8":   {'ct':ctypes.c_byte,'s':'int8_t'},
    "i16":  {'ct':ctypes.c_short,'s':'int16_t'},
    "i32":  {'ct':ctypes.c_int,'s':'int32_t'},
    "i64":  {'ct':ctypes.c_longlong,'s':'int64_t'},
    "u8":   {'ct':ctypes.c_ubyte,'s':'uint8_t'},
    "u16":  {'ct':ctypes.c_ushort,'s':'uint16_t'},
    "u32":  {'ct':ctypes.c_uint,'s':'uint32_t'},
    "u64":  {'ct':ctypes.c_ulonglong,'s':'uint64_t'},
    "f32":  {'ct':ctypes.c_float,'s':'float'},
    "f64":  {'ct':ctypes.c_double,'s':'double'},
    "v":    {'ct':None,'s':'void'},
    "vp":   {'ct':ctypes.c_void_p,'s':'void *'},
    "vpp":  {'ct':ctypes.POINTER(ctypes.c_void_p),'s':'void **'},
    "s":    {'ct':ctypes.c_char_p,'s':'char *'},
}
# Reverse index: ctypes type -> pair.  Built with a comprehension so that no
# loop variable (in particular `_`, IPython's last-output name) is clobbered.
_typ_to_pair = {pair['ct']: pair for pair in _str_to_pair.values()}

def wrapT(sig):
    """Resolve a type description into a {'ct': ..., 's': ...} pair.

    Args:
        sig: one of
            * a shorthand string that is a key of `_str_to_pair` ("i32", "vp", ...),
            * None, meaning `void` (the ctypes convention for "no value"),
            * a `ctypes.Structure` subclass carrying a `_c_type_str` attribute,
            * a simple ctypes type listed in `_str_to_pair`,
            * a ctypes pointer type to any of the above.

    Returns:
        dict with 'ct' (the ctypes type, or None for void) and 's' (the
        spelling of the type for generated C signatures).

    Raises:
        TypeError: for anything that cannot be resolved.
    """
    if sig is None:
        return _str_to_pair['v']

    if isinstance(sig, str):
        if sig in _str_to_pair:
            return _str_to_pair[sig]
        raise TypeError(f"unrecognized C type string: {sig}")

    # convert our defined object types into type pairs
    # (the isinstance guard keeps issubclass from choking on non-classes)
    if isinstance(sig, type) and issubclass(sig, ctypes.Structure):
        c_name = getattr(sig, '_c_type_str', None)
        if c_name is None:
            raise TypeError(f"unsupported C type: {sig}")
        return {'ct':sig,'s':c_name}

    # lift simple types up to type pairs
    if type(sig) == _CTYPES_SIMPLE_TYPE:
        if sig in _typ_to_pair:
            return _typ_to_pair[sig]
        raise TypeError(f"unsupported C type: {sig}")

    # handle pointer types by resolving the pointee recursively
    if type(sig) == _CTYPES_PTR_TYPE:
        sub = wrapT(sig._type_)
        return {'ct':sig,'s':sub['s']+" *"}

    raise TypeError(f"unrecognized argument type: {type(sig)}")


Finally, the following chunk of code will allow us to declare the functions we want to include in the wrapper in a non-redundant way that keeps multiple occurrences of the function signature in sync with each other.

The `getHv00` function packages up our earlier insights along with a clever use of `os.path.getmtime` to read the _last-modified_ time for files involved in our build.  Doing this obviates one reason for a tool like `make` or `cmake`: that redundant builds are prevented.  However, by having the Python wrapper itself maintain its own freshness, we can avoid wrapper-library users having to manage another build invocation.

In [12]:
# Preamble placed at the top of the generated wrapper source.
H_v00_inc_str = """
#include "Halide.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
"""

# Registries that H_v00_function appends to, one entry per wrapped function:
H_v00_decl_strs   = list()   # C declarations (go inside extern "C")
H_v00_defn_strs   = list()   # full C++ definitions
H_v00_ctype_wraps = list()   # callbacks that set argtypes/restype on the module
# Handle of the loaded wrapper library; stays None until getHv00 loads it.
H_v00_module      = None

def H_v00_function(name,args,ret,body):
    """Register one C-callable wrapper function for the v0.0 library.

    From a single description this produces the three things that must stay
    in sync: the C declaration, the C++ definition, and a callback that sets
    the ctypes signature on the loaded module.

    Registering a `name` that is already present replaces the earlier entry
    in place, so re-running a registration cell does not emit duplicate
    definitions into the generated source.

    Args:
        name: C symbol name of the function.
        args: list of (parameter_name, type) pairs; each type is anything
              `wrapT` accepts.
        ret:  return type, anything `wrapT` accepts.
        body: C++ statements forming the function body.
    """
    arg_pairs = [ (a[0], wrapT(a[1])) for a in args ]
    ret_pair  = wrapT(ret)

    astr  = ', '.join([ pair['s'] +' '+ aname for aname, pair in arg_pairs ])
    rstr  = ret_pair['s']
    atyps = [ pair['ct'] for _aname, pair in arg_pairs ]
    rtyp  = ret_pair['ct']

    declstr = f"{rstr} {name}({astr});"
    defnstr = f"{rstr} {name}({astr}) {{\n{body}\n}}"
    def wrap(mod):
        f = getattr(mod,name)
        f.argtypes = atyps
        f.restype  = rtyp
    # tag the callback so a later registration of the same symbol can find it
    wrap.c_name = name

    # the three registries are parallel lists, so one index addresses all
    for slot, prior in enumerate(H_v00_ctype_wraps):
        if getattr(prior, 'c_name', None) == name:
            H_v00_decl_strs[slot]   = declstr
            H_v00_defn_strs[slot]   = defnstr
            H_v00_ctype_wraps[slot] = wrap
            break
    else:
        H_v00_decl_strs.append(declstr)
        H_v00_defn_strs.append(defnstr)
        H_v00_ctype_wraps.append(wrap)

def getHv00():
    """Build (when stale) and load the v0.0 wrapper library.

    The C++ source is assembled from the registered declarations and
    definitions.  The source file is rewritten only when its text changed,
    and the shared object is recompiled only when it is missing or older
    than the source or than the Halide static library.

    A rebuild is refused *before* any file is touched if the library is
    already loaded in this process: overwriting a loaded shared object is
    unsafe, and the loaded copy would not pick up the change anyway.

    Returns:
        the loaded `ctypes.CDLL` module, with argtypes/restype applied to
        every registered function.

    Raises:
        IOError: if a rebuild is needed but the library is already loaded
                 (restart the kernel to rebuild).
        subprocess.CalledProcessError: if compilation fails (via `shell`).
    """
    global H_v00_module
    all_decls = '\n'.join(H_v00_decl_strs)
    all_defns = '\n\n'.join(H_v00_defn_strs)
    src_str   =  (f'{H_v00_inc_str}\n\n'
                  f'extern "C" {{\n'
                  f'{all_decls}\n'
                  f'}}\n\n'
                  f'{all_defns}\n')

    Hv00_CPP  = os.path.join(C_DIR, "Hwrap_v00.cpp")
    Hv00_SO   = os.path.join(C_DIR, "libHwrap_v00.so")

    # timestamps drive the conditional rebuild; None means "file missing"
    def get_time(s):
        return os.path.getmtime(s) if os.path.exists(s) else None

    cpp_time  = get_time(Hv00_CPP)
    so_time   = get_time(Hv00_SO)
    h_time    = get_time(HALIDE_STATIC)

    # Check whether the CPP file needs to be (re-)written
    write_cpp = cpp_time is None
    if not write_cpp:
        with open(Hv00_CPP,'r',encoding = 'utf-8') as f:
            write_cpp = (src_str != f.read())

    # Check whether the SO needs to be re-compiled.  A missing Halide static
    # library is simply not compared against (the compiler will complain if
    # a build is actually attempted without it).
    need_build = (write_cpp or
                  so_time is None or
                  so_time < cpp_time or
                  (h_time is not None and so_time < h_time))

    # refuse before modifying anything on disk
    if need_build and H_v00_module is not None:
        raise IOError("library Hwrap_v00 already loaded")

    if write_cpp:
        with open(Hv00_CPP,'w',encoding = 'utf-8') as f:
            f.write(src_str)

    if need_build:
        # paths are shell-quoted because they may contain spaces
        cmd = (f"clang++ -Wall -Werror -fPIC -O3 -shared -std=c++11 "
               f"-I {shlex.quote(HALIDE_INC)} {shlex.quote(HALIDE_STATIC)} -lz "
               f"-o {shlex.quote(Hv00_SO)} {shlex.quote(Hv00_CPP)}")
        print(cmd)
        shell(cmd)

    # Load the module if needed
    if H_v00_module is None:
        H_v00_module = ctypes.CDLL(Hv00_SO)
        for wrap in H_v00_ctype_wraps:
            wrap(H_v00_module)

    return H_v00_module


The preceding machinery wraps up what we learned about building and loading a dynamic library, as well as packaging function declarations in a nice way that keeps them consistent across the three different sites that they need to occur at.

In [13]:
# Each H_v00_function call below registers one wrapper as
# (C symbol name, [(parameter name, type), ...], return type, C++ body).
# Objects are handed across the C boundary as opaque `void *` handles:
# every handle produced here comes from a C++ `new`, so it must be given
# back to the matching hwrap_delete_* function exactly once.

# FUNC
H_v00_function(
    "hwrap_new_func",
    [('name','s')],'vp',
    """
    Halide::Func *f = new Halide::Func(name);
    return (void *)(f);
    """)
H_v00_function(
    "hwrap_delete_func",
    [('f_handle','vp')],'v',
    """
    Halide::Func *f = (Halide::Func *)(f_handle);
    delete f;
    """)

# VAR
H_v00_function(
    "hwrap_new_var",
    [('name','s')],'vp',
    """
    Halide::Var *v = new Halide::Var(name);
    return (void *)(v);
    """)
H_v00_function(
    "hwrap_delete_var",
    [('v_handle','vp')],'v',
    """
    Halide::Var *v = (Halide::Var *)(v_handle);
    delete v;
    """)

# EXPR
# delete an expression we got some other way
H_v00_function(
    "hwrap_delete_expr",
    [('e_handle','vp')],'v',
    """
    Halide::Expr *e = (Halide::Expr *)(e_handle);
    delete e;
    """)
# converts a Var to a Int32-type Expr
# (returns a newly allocated Expr; the Var handle stays valid and owned
#  by the caller)
H_v00_function(
    "hwrap_var_to_expr",
    [('v_handle','vp')],'vp',"""
    Halide::Var  *x = (Halide::Var *)(v_handle);
    Halide::Expr *e = new Halide::Expr(*x);
    return (void *)(e);
    """)
# add 2 expressions together
# (returns a newly allocated Expr; neither operand is freed)
H_v00_function(
    "hwrap_add",
    [('lh','vp'),('rh','vp')],'vp',"""
    Halide::Expr *lhs = (Halide::Expr *)(lh);
    Halide::Expr *rhs = (Halide::Expr *)(rh);
    Halide::Expr *res = new Halide::Expr((*lhs) + (*rhs));
    return (void *)(res);
    """)

# Statements
# pure definition  f(idx[0], ..., idx[n_idx-1]) = rhs,
# where `idx` is an array of n_idx Var handles and `rhs` is an Expr handle
H_v00_function(
    "hwrap_pure_def",
    [('fh','vp'),
     ('n_idx','i32'),
     ('idx','vpp'),
     ('rhs','vp')], "v",
    """
    Halide::Func *f = (Halide::Func *)(fh);
    std::vector<Halide::Var> args;
    for(int k=0; k<n_idx; k++)
        args.push_back( *(Halide::Var *)(idx[k]) );
    (*f)(args) = *(Halide::Expr *)(rhs);
    """)


# DEVICE INTERFACE
# `_d` is a DeviceAPI enum value carried as a u64 (see the getters below)
H_v00_function(
    "hwrap_get_jit_device",
    [('_d','u64')],'vp',
    """
    Halide::DeviceAPI d = (Halide::DeviceAPI)(_d);
    return (void *)(Halide::get_device_interface_for_device_api(d));
    """)
H_v00_function(
    "hwrap_get_DeviceAPI_None",[],'u64',"""
    return (uint64_t)(Halide::DeviceAPI::None);""")
H_v00_function(
    "hwrap_get_DeviceAPI_Host",[],'u64',"""
    return (uint64_t)(Halide::DeviceAPI::Host);""")
H_v00_function(
    "hwrap_get_DeviceAPI_Default_GPU",[],'u64',"""
    return (uint64_t)(Halide::DeviceAPI::Default_GPU);""")
# note: a number of other cases for the enumeration were ignored

# TYPE CODES (extracting different enums)
# asking the compiled library for these values avoids hard-coding the
# enum numbering on the Python side
H_v00_function(
    "hwrap_get_type_code_int",[],'u8',"""
    return (uint8_t)(halide_type_int);""")
H_v00_function(
    "hwrap_get_type_code_uint",[],'u8',"""
    return (uint8_t)(halide_type_uint);""")
H_v00_function(
    "hwrap_get_type_code_float",[],'u8',"""
    return (uint8_t)(halide_type_float);""")
H_v00_function(
    "hwrap_get_type_code_handle",[],'u8',"""
    return (uint8_t)(halide_type_handle);""")

# Realizing a result (with JiT compilation)
# `self` is a Func handle; `output` points at a caller-filled halide_buffer_t
H_v00_function(
    "hwrap_realize_jit",
    [('self','vp'),('output',ctypes.POINTER(halide_buffer_t))],'v',
    """
    Halide::Func *f = (Halide::Func *)(self);
    // note that this type annotation DOES NOT generalize
    Halide::Buffer<> buf(*output);
    f->realize(Halide::Realization(buf));
    """)



# Generate the source from everything registered above, compile it if the
# cached shared object is stale, and load it.
Hv00 = getHv00()

clang++ -Wall -Werror -fPIC -O3 -shared -std=c++11 -I /Users/gilbo/install/halide/include /Users/gilbo/install/halide/lib/libHalide.a -lz -o /Users/gilbo/code/iver/notebooks/._c_wrapper_cache/libHwrap_v00.so /Users/gilbo/code/iver/notebooks/._c_wrapper_cache/Hwrap_v00.cpp


In [14]:
# Query the compiled library once for its enum values and keep them as
# plain attributes on the module handle.
(Hv00.DeviceAPI_None,
 Hv00.DeviceAPI_Host,
 Hv00.DeviceAPI_Default_GPU) = (Hv00.hwrap_get_DeviceAPI_None(),
                                Hv00.hwrap_get_DeviceAPI_Host(),
                                Hv00.hwrap_get_DeviceAPI_Default_GPU())

(Hv00.type_int,
 Hv00.type_uint,
 Hv00.type_float,
 Hv00.type_handle) = (Hv00.hwrap_get_type_code_int(),
                      Hv00.hwrap_get_type_code_uint(),
                      Hv00.hwrap_get_type_code_float(),
                      Hv00.hwrap_get_type_code_handle())

## Replicating the Tutorial Code in Python Now

In [17]:
def new_buffer(w,h):
    """Allocate a w-by-h image of C ints and describe it to Halide.

    Args:
        w: number of elements along dimension 0 (one row).
        h: number of elements along dimension 1 (number of rows).

    Returns:
        (descriptor, pixels): a filled-in `halide_buffer_t` whose `host`
        pointer refers to `pixels`, and `pixels` itself, a ctypes array
        indexed as pixels[row][column] through which results are read.
    """
    pixels = ((ctypes.c_int * w) * h)()

    # dimension 0 walks along a row (stride 1);
    # dimension 1 steps from one row to the next (stride w)
    dims = (halide_dimension_t * 2)(
        halide_dimension_t(0, w, 1, 0),
        halide_dimension_t(0, h, w, 0),
    )

    descriptor = halide_buffer_t(
        device           = 0,
        device_interface = None,
        host             = ctypes.cast(pixels, ctypes.POINTER(ctypes.c_ubyte)),
        flags            = 0,
        type             = halide_type_t(Hv00.type_int, 32, 1),
        dimensions       = 2,
        dim              = dims,
        padding          = None,
    )

    return descriptor, pixels


def run_tut1():
    """Re-create the first Halide tutorial through the C wrapper.

    Defines gradient(x, y) = x + y, JIT-realizes it into an 800x600 int
    buffer and checks every pixel.  Prints "Success!" only when all pixels
    match; on the first mismatch it prints a diagnostic and stops.

    Every handle obtained from the wrapper is released again, even if a
    wrapper call raises part-way through.
    """
    width, height = 800, 600

    # (delete function, handle) pairs, in creation order
    owned = []
    def track(delete_fn, handle):
        owned.append((delete_fn, handle))
        return handle

    try:
        gradient = track(Hv00.hwrap_delete_func,
                         Hv00.hwrap_new_func(b"gradient"))
        x        = track(Hv00.hwrap_delete_var, Hv00.hwrap_new_var(b"x"))
        y        = track(Hv00.hwrap_delete_var, Hv00.hwrap_new_var(b"y"))

        # e = x + y
        e_x      = track(Hv00.hwrap_delete_expr, Hv00.hwrap_var_to_expr(x))
        e_y      = track(Hv00.hwrap_delete_expr, Hv00.hwrap_var_to_expr(y))
        e        = track(Hv00.hwrap_delete_expr, Hv00.hwrap_add(e_x,e_y))

        # gradient(x,y) = e
        idx      = (ctypes.c_void_p * 2)(x,y)
        Hv00.hwrap_pure_def(gradient,2,idx,e)

        # set up the buffer and fill it
        buf, arr = new_buffer(width,height)
        Hv00.hwrap_realize_jit(gradient,buf)
    finally:
        # delete stuff
        for delete_fn, handle in owned:
            delete_fn(handle)

    # test the result; `arr` is Python-owned, so it outlives the handles
    for j in range(height):
        row = arr[j]
        for i in range(width):
            if row[i] != i + j:
                print(f"Something went wrong!\n"
                      f"Pixel {i}, {j} was supposed to be {i+j}, "
                      f"but instead it's {row[i]}")
                # one report is enough, and this run is not a success
                return

    print("Success!")

run_tut1()

Success!


-------

# To be continued...

We got a lot of wrapping ideas explored in this notebook.  However, now is a good point to step back and get a fresh start on a wrapper that consolidates the progress we've already made.

As we do that, we'll need to expand the API bindings in a couple ways and try to consolidate those as well.  Specifically, we want to
* broaden the set of operators beyond addition
* include parameters (scalar inputs)
* include input buffers
* include other ways to compile
* wrap objects so that Python ensures deletion functions get called.
* figure out how to have multiple output buffers
* figure out how to invoke a reasonable auto-scheduler
