*Copyright 2024 Modular, Inc: Licensed under the Apache License v2.0 with LLVM Exceptions.*

# MAX Serve and PyTorch model client example

In [None]:
from python import Python
from tensor import Tensor, TensorShape, TensorSpec
from max.engine import EngineNumpyView

@always_inline
fn numpy_data_pointer[
    type: DType
](numpy_array: PythonObject) raises -> DTypePointer[type]:
    """Builds a typed pointer from the address a NumPy array reports.

    Parameters:
        type: Element type the returned pointer is typed with.

    Args:
        numpy_array: Python object exposing `__array_interface__`.

    Returns:
        A `DTypePointer[type]` constructed from the integer found in the first
        slot of `__array_interface__["data"]`.

    Raises:
        If any of the Python attribute/item lookups fail.
    """
    # The "data" entry of the array interface is indexed at 0 to obtain the
    # buffer address as a Python integer.
    var interface_data = numpy_array.__array_interface__["data"]
    var address = interface_data[0].__index__()
    return DTypePointer[type](address=address)

@always_inline
fn memcpy_to_numpy[
    type: DType
](array: PythonObject, tensor: Tensor[type]) raises:
    """Copies every element of `tensor` into the buffer of a NumPy array.

    Parameters:
        type: Element type shared by the tensor and the destination pointer.

    Args:
        array: Destination NumPy array; its buffer address is obtained via
            `numpy_data_pointer`.
        tensor: Source tensor whose `num_elements()` elements are copied.

    Raises:
        If resolving the NumPy buffer address fails.
    """
    # No size check is performed here: the destination is assumed to hold at
    # least `tensor.num_elements()` elements of `type` — callers must ensure it.
    memcpy(
        numpy_data_pointer[type](array),
        tensor._ptr,
        tensor.num_elements(),
    )


@always_inline
fn shape_to_python_list(shape: TensorShape) raises -> PythonObject:
    """Converts a `TensorShape` into a Python list of its dimensions.

    Args:
        shape: Shape whose dimensions are copied, in order.

    Returns:
        A Python `list` holding `shape[0] .. shape[rank - 1]`.

    Raises:
        If creating the Python list or appending to it fails.
    """
    var dims = Python.evaluate("list()")
    var rank = shape.rank()
    for axis in range(rank):
        # `append` returns a Python object (None); discard it explicitly.
        _ = dims.append(shape[axis])
    return dims^

@always_inline
fn get_np_dtype[type: DType](np: PythonObject) raises -> PythonObject:
    """Maps a Mojo `DType` to the matching NumPy scalar type object.

    The branch is selected at compile time from the `type` parameter.

    Parameters:
        type: Mojo element type to translate.

    Args:
        np: The imported `numpy` module.

    Returns:
        The attribute of `np` that corresponds to `type` (for example
        `np.float32` for `DType.float32`).

    Raises:
        "Unknown datatype" when `type` has no mapping below.
    """
    @parameter
    if type.is_float16():
        return np.float16
    elif type.is_float32():
        return np.float32
    elif type.is_float64():
        return np.float64
    elif type.is_int8():
        return np.int8
    elif type.is_int16():
        return np.int16
    elif type.is_int32():
        return np.int32
    elif type.is_int64():
        return np.int64
    elif type.is_uint8():
        return np.uint8
    elif type.is_uint16():
        return np.uint16
    elif type.is_uint32():
        return np.uint32
    elif type.is_uint64():
        return np.uint64

    raise "Unknown datatype"

@always_inline
fn tensor_to_numpy[
    type: DType
](tensor: Tensor[type], np: PythonObject) raises -> PythonObject:
    """Creates a NumPy array with the same shape and contents as `tensor`.

    Parameters:
        type: Element type of the tensor; also selects the NumPy dtype.

    Args:
        tensor: Source tensor to copy from.
        np: The imported `numpy` module.

    Returns:
        A new NumPy array, allocated with `np.zeros` and then filled with the
        tensor's elements.

    Raises:
        If `type` has no NumPy mapping or any Python call fails.
    """
    var dims = shape_to_python_list(tensor.shape())
    var np_dtype = get_np_dtype[type](np)
    var result_array = np.zeros(dims, np_dtype)
    # Explicitly consume the Python list after `np.zeros` has used it.
    _ = dims^
    memcpy_to_numpy[type](result_array, tensor)
    return result_array^

@always_inline
fn numpy_to_tensor[
    dtype: DType
](inout np_array: PythonObject) raises -> Tensor[dtype]:
    """Copies the contents of a NumPy array into a newly allocated `Tensor`.

    Parameters:
        dtype: Element type of the resulting tensor. It must match the element
            type of `np_array`.

    Args:
        np_array: NumPy array to copy from; it is wrapped in an
            `EngineNumpyView` to read its spec and data pointer.

    Returns:
        A `Tensor[dtype]` built from the view's spec and a freshly allocated
        buffer holding a copy of the array's elements.

    Raises:
        If the array's element type differs from `dtype`, or if creating the
        view fails.
    """
    var view = EngineNumpyView(np_array)
    # The source buffer is reinterpreted as `dtype` below. If the array really
    # holds a different element type, the copy would read the wrong number of
    # bytes, so reject the mismatch before allocating anything.
    if view.spec().dtype() != dtype:
        raise "numpy_to_tensor: NumPy array dtype does not match requested tensor dtype"
    var size = view.spec().num_elements()
    var ptr = DTypePointer[dtype].alloc(size)
    memcpy(ptr, view.unsafe_ptr().bitcast[dtype](), size)
    # The allocated buffer is handed to the Tensor constructor together with
    # the spec (presumably the tensor takes ownership of `ptr` — confirm
    # against the Tensor constructor documentation).
    return Tensor[dtype](view.spec(), ptr)

## Prepare client/inputs

In [None]:
# Load the Hugging Face `transformers` package through Mojo's Python interop.
var transformers = Python.import_module("transformers")

# Example configuration.
# `model_name` matches the model name passed to the inference request later in
# this notebook; `seqlen` is the tokenizer's max_length below.
# NOTE(review): `model_path` and `batch` are not referenced in the cells shown
# here — confirm whether they are still needed.
var model_name = "roberta"
var model_path = "roberta.torchscript"
var batch = 1
var seqlen = 128

# Load the pretrained sequence-classification model from the Hugging Face hub.
# In the cells shown here it is only used for its `config.id2label` mapping.
var HF_MODEL_NAME = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest"
var hf_model = transformers.AutoModelForSequenceClassification.from_pretrained(HF_MODEL_NAME)
hf_model.config.return_dict = False

# Tokenize the input text. The tokenizer is asked for PyTorch tensors
# (`return_tensors="pt"`), padded/truncated to exactly `seqlen` tokens; the
# result is indexed later by "input_ids" and "attention_mask".
var INPUT = "There are many exciting developments in the field of AI Infrastructure!"
var tokenizer = transformers.AutoTokenizer.from_pretrained(HF_MODEL_NAME)
var raw_inputs = tokenizer(INPUT,
    return_tensors="pt", padding='max_length', truncation=True, max_length=seqlen)
print(raw_inputs)

In [None]:
from max.engine import InferenceSession
from max.engine.tensor import EngineNumpyView
from max.serve.kserve.client import GRPCInferenceClient

# Create an inference session and an empty tensor map to hold the request inputs.
var session = InferenceSession()
var inputs = session.new_tensor_map()
# Convert the tokenizer's PyTorch tensors to NumPy arrays. They are bound to
# named variables because `numpy_to_tensor` takes its argument as `inout`.
var a = raw_inputs["input_ids"].detach().numpy()
var b = raw_inputs["attention_mask"].detach().numpy()
# Copy each array into a Mojo tensor, reinterpreting the data as int64.
# assumes the tokenizer produced int64 tensors — TODO confirm
var input_ids = numpy_to_tensor[DType.int64](a)
var attention_mask = numpy_to_tensor[DType.int64](b)
# Register the tensors under the input names used for the request.
# NOTE(review): `borrow` presumably does not copy, so `input_ids` and
# `attention_mask` must stay alive while `inputs` is in use — confirm.
inputs.borrow("input_ids", input_ids)
inputs.borrow("attention_mask", attention_mask)
# Print every entry of the tensor map as a sanity check.
for key in inputs.keys():
    print(key[] + " : " + str(inputs.get[DType.int64](key[])))


## Run an inference

In [None]:
# Names of the output tensors requested from the server.
var req_outputs = List[String]("result0")
# Connect to the serving endpoint over gRPC, reusing the existing session.
var client = GRPCInferenceClient("0.0.0.0:8000", session)
# Send the request using the configured `model_name` instead of repeating the
# literal. "0" is presumably the model version — confirm against the
# `GRPCInferenceClient.infer` documentation.
var response = client.infer(model_name, "0", inputs, req_outputs)
var outputs = response.get_output_tensors()
# Print every returned tensor, read as float32.
for key in outputs.keys():
    print(key[] + " : " + str(outputs.get[DType.float32](key[])))

# Copy the requested output tensor into a NumPy array for post-processing.
var np = Python.import_module("numpy")
var arr = tensor_to_numpy(outputs.get[DType.float32]("result0"), np)

# Extract class prediction from output: take the index of the largest value
# along the last axis for the first row, then map it to its label.
var predicted_class_id = arr.argmax(axis=-1)[0]
var classification = hf_model.config.id2label[predicted_class_id]

print("The sentiment is: " + str(classification))

* TODO: Add batch example