Skip to content

Commit

Permalink
PyTorch NNAPI integration prototype
Browse files Browse the repository at this point in the history
Summary:
This is in prototype status, but pretty functional.  There are two major
parts.

- Model converter.  This is a pure Python component that consumes a
  model in TorchScript format, converts the operations into NNAPI
  semantics, and serializes the model in a custom format.  It then wraps
  the result in a new TorchScript model that can invoke NNAPI under the
  hood.
- Runtime.  This is a TorchBind object that deserializes the model and
  sends the result to NNAPI.  This is fairly simple since the serialized
  format is basically just a list of NNAPI calls to make, so most of the
  code is spent on bounds checking.

A few notes on the design.
- Currently, all tensor sizes need to be fixed, and those fixed sizes
  are burned directly into the serialized model.  This will probably
  need to change.  NNAPI supports variable-sized tensors, but the
  important hardware backends do not.  However, we're seeing use cases
  crop up where the input size is not known until around the time that
  the model is loaded (for example, it might depend on the camera aspect
  ratio).  I think the proper fix here is to remove the code in the
  converter that eagerly calculates the sizes of the intermediate
  tensors and replace it with a code generator that will generate some
  TorchScript code that will perform those calculations at model load
  time.  This way, we will be able to support models that have
  variable-sized inputs while still only showing fixed-sized operands to
  NNAPI.
- The important hardware backends want operands to be in NHWC order, but
  PyTorch natively represents all tensors as NCHW.  The strategy for
  this is to keep NCHW during most of the conversion process, but track
  an additional value per operand representing the "dimension order".
  The dimension order gets propagated through convolutions and pointwise
  ops.  When we're ready to serialize the model, we reorder the
  dimensions for "channels last" operands to NHWC.

Test Plan:
Some local testing with FB prod models.  I'll need to add some examples
and automated tests.

ghstack-source-id: e1fa978af170d4d00c5270c52b9d4cb63843e7d2
Pull Request resolved: pytorch#46780
  • Loading branch information
dreiss committed Oct 23, 2020
1 parent 90a8f21 commit 5aabc30
Show file tree
Hide file tree
Showing 13 changed files with 2,450 additions and 0 deletions.
8 changes: 8 additions & 0 deletions aten/src/ATen/CMakeLists.txt
Expand Up @@ -446,6 +446,14 @@ list(APPEND ATen_MOBILE_BENCHMARK_SRCS
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/stateful_conv1d.cpp)

if(LINUX OR ANDROID)
  # NNAPI is primarily for Android, but also build on Linux
  # to allow easy experimentation when a host build of libneuralnetworks
  # is available. We don't have any build-time dependencies on NNAPI,
  # so this should be safe.
  # (The nnapi subdirectory defines the pytorch_nnapi library target;
  # libneuralnetworks.so itself is only opened at runtime via dlopen.)
  add_subdirectory(nnapi)
endif()

# Pass source, includes, and libs to parent
set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE)
set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE)
Expand Down
24 changes: 24 additions & 0 deletions aten/src/ATen/nnapi/CMakeLists.txt
@@ -0,0 +1,24 @@
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
project(pytorch_nnapi)

# Define this to build the NNAPI binding out of tree.
# In that case we must set the C++ standard ourselves and locate an
# installed Torch, instead of inheriting both from the parent ATen build.
if(PYTORCH_NNAPI_STANDALONE)
  set(CMAKE_CXX_STANDARD 14)
  find_package(Torch REQUIRED)
endif()

# nnapi_wrapper.cpp is produced by codegen.py (dlopen/dlsym shims for
# libneuralnetworks.so); the other two sources are hand-written.
set(NNAPI_SRCS
  nnapi_bind.cpp
  nnapi_wrapper.cpp
  nnapi_model_loader.cpp
)

# Static build on Android so we can just bundle with the benchmarker
# or with PyTorch, but use shared on host so we don't load by accident.
if(ANDROID)
  add_library(pytorch_nnapi STATIC ${NNAPI_SRCS})
else()
  add_library(pytorch_nnapi SHARED ${NNAPI_SRCS})
endif()

# The binding depends on Torch/c10 (e.g. CAFFE_ENFORCE in the generated
# wrapper), not on libneuralnetworks, which is loaded at runtime.
target_link_libraries(pytorch_nnapi torch)
84 changes: 84 additions & 0 deletions aten/src/ATen/nnapi/NeuralNetworks.h
@@ -0,0 +1,84 @@
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
Most of NeuralNetworks.h has been stripped for simplicity.
We don't need any of the function declarations since
we call them all through dlopen/dlsym.
Operation codes are pulled directly from serialized models.
*/

#ifndef MINIMAL_NEURAL_NETWORKS_H
#define MINIMAL_NEURAL_NETWORKS_H

#include <stdint.h>

typedef enum {
ANEURALNETWORKS_NO_ERROR = 0,
ANEURALNETWORKS_OUT_OF_MEMORY = 1,
ANEURALNETWORKS_INCOMPLETE = 2,
ANEURALNETWORKS_UNEXPECTED_NULL = 3,
ANEURALNETWORKS_BAD_DATA = 4,
ANEURALNETWORKS_OP_FAILED = 5,
ANEURALNETWORKS_BAD_STATE = 6,
ANEURALNETWORKS_UNMAPPABLE = 7,
ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE = 8,
ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9,
} ResultCode;

typedef enum {
ANEURALNETWORKS_FLOAT32 = 0,
ANEURALNETWORKS_INT32 = 1,
ANEURALNETWORKS_UINT32 = 2,
ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
ANEURALNETWORKS_TENSOR_INT32 = 4,
ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
ANEURALNETWORKS_BOOL = 6,
ANEURALNETWORKS_TENSOR_QUANT16_SYMM = 7,
ANEURALNETWORKS_TENSOR_FLOAT16 = 8,
ANEURALNETWORKS_TENSOR_BOOL8 = 9,
ANEURALNETWORKS_FLOAT16 = 10,
ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11,
ANEURALNETWORKS_TENSOR_QUANT16_ASYMM = 12,
ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13,
} OperandCode;

typedef enum {
ANEURALNETWORKS_PREFER_LOW_POWER = 0,
ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
} PreferenceCode;

typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
typedef struct ANeuralNetworksModel ANeuralNetworksModel;
typedef struct ANeuralNetworksDevice ANeuralNetworksDevice;
typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;

typedef int32_t ANeuralNetworksOperationType;

typedef struct ANeuralNetworksOperandType {
int32_t type;
uint32_t dimensionCount;
const uint32_t* dimensions;
float scale;
int32_t zeroPoint;
} ANeuralNetworksOperandType;

#endif // MINIMAL_NEURAL_NETWORKS_H
155 changes: 155 additions & 0 deletions aten/src/ATen/nnapi/codegen.py
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Code generator for NNAPI wrapper. We can't link directly against
libneuralnetworks.so because we want PyTorch to work on Android
devices that don't have it available. Instead, we generate a wrapper
that opens libneuralnetworks.so with dlopen and finds the functions
we need with dlsym. We also generate a "check" wrapper that checks
return values and throws C++ exceptions on errors.
"""
import sys
import re
import pathlib
import textwrap


# License/header comment prepended verbatim to both generated files.
PREFIX = """\
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// This file is generated by nnapi/codegen.py
"""


# Every NNAPI entry point we wrap, as (return type, name, C argument list).
# The argument lists must match libneuralnetworks.so's real signatures,
# since we call through function pointers obtained with dlsym.
NNAPI_FUNCTIONS = [
    ("int", "ANeuralNetworks_getDeviceCount", "uint32_t* numDevices"),
    ("int", "ANeuralNetworks_getDevice", "uint32_t devIndex, ANeuralNetworksDevice** device"),
    ("int", "ANeuralNetworksDevice_getName", "const ANeuralNetworksDevice* device, const char** name"),
    ("int", "ANeuralNetworksDevice_getVersion", "const ANeuralNetworksDevice* device, const char** version"),
    ("int", "ANeuralNetworksDevice_getFeatureLevel", "const ANeuralNetworksDevice* device, int64_t* featureLevel"),
    ("int", "ANeuralNetworksModel_getSupportedOperationsForDevices", " const ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, bool* supportedOps"),
    ("int", "ANeuralNetworksCompilation_createForDevices", "ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, ANeuralNetworksCompilation** compilation"),
    ("int", "ANeuralNetworksExecution_compute", "ANeuralNetworksExecution* execution"),
    ("int", "ANeuralNetworksMemory_createFromFd", "size_t size, int protect, int fd, size_t offset, ANeuralNetworksMemory** memory"),
    ("void", "ANeuralNetworksMemory_free", "ANeuralNetworksMemory* memory"),
    ("int", "ANeuralNetworksModel_create", "ANeuralNetworksModel** model"),
    ("void", "ANeuralNetworksModel_free", "ANeuralNetworksModel* model"),
    ("int", "ANeuralNetworksModel_finish", "ANeuralNetworksModel* model"),
    ("int", "ANeuralNetworksModel_addOperand", "ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type"),
    ("int", "ANeuralNetworksModel_setOperandValue", "ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length"),
    ("int", "ANeuralNetworksModel_setOperandValueFromMemory", "ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksMemory* memory, size_t offset, size_t length"),
    ("int", "ANeuralNetworksModel_addOperation", "ANeuralNetworksModel* model, ANeuralNetworksOperationType type, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs"),
    ("int", "ANeuralNetworksModel_identifyInputsAndOutputs", "ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs"),
    ("int", "ANeuralNetworksModel_relaxComputationFloat32toFloat16", "ANeuralNetworksModel* model, bool allow"),
    ("int", "ANeuralNetworksCompilation_create", "ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation"),
    ("void", "ANeuralNetworksCompilation_free", "ANeuralNetworksCompilation* compilation"),
    ("int", "ANeuralNetworksCompilation_setPreference", "ANeuralNetworksCompilation* compilation, int32_t preference"),
    ("int", "ANeuralNetworksCompilation_finish", "ANeuralNetworksCompilation* compilation"),
    ("int", "ANeuralNetworksExecution_create", "ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution"),
    ("void", "ANeuralNetworksExecution_free", "ANeuralNetworksExecution* execution"),
    ("int", "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length"),
    ("int", "ANeuralNetworksExecution_setInputFromMemory", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length"),
    ("int", "ANeuralNetworksExecution_setOutput", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length"),
    ("int", "ANeuralNetworksExecution_setOutputFromMemory", "ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length"),
    ("int", "ANeuralNetworksExecution_startCompute", "ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event"),
    ("int", "ANeuralNetworksEvent_wait", "ANeuralNetworksEvent* event"),
    ("void", "ANeuralNetworksEvent_free", "ANeuralNetworksEvent* event"),
    ("int", "ANeuralNetworksExecution_getOutputOperandRank", "ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank"),
    ("int", "ANeuralNetworksExecution_getOutputOperandDimensions", "ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions"),
]


def main(argv):
    """Generate nnapi_wrapper.h and nnapi_wrapper.cpp next to this script.

    For every entry in NNAPI_FUNCTIONS this emits:
      - a struct member holding the dlsym'd function pointer, and
      - a "check_*" wrapper that enforces the pointer was loaded and, for
        int-returning functions, that the call returned
        ANEURALNETWORKS_NO_ERROR.

    argv is accepted for symmetry with sys.exit(main(sys.argv)) but is
    currently unused.  Returns None (exit status 0).
    """
    struct_members = []
    load_functions = []
    define_checks = []

    for ret, name, args in NNAPI_FUNCTIONS:
        short_name = name.replace("ANeuralNetworks", "", 1)

        struct_members.append(f" {ret}(*{short_name})({args});")

        load_functions.append(f' *(void**)&nnapi_.{short_name} = dlsym(handle, "{name}");')
        load_functions.append(f' check_nnapi_.{short_name} = check_{short_name};')

        # Extract just the parameter names from the C argument list
        # ("int32_t index, void* buffer" -> "index,buffer") so the check
        # wrapper can forward its arguments.
        # NOTE: this must be a raw string; "\w" in a plain literal is an
        # invalid escape sequence (SyntaxWarning as of Python 3.12).
        call_args = "".join(re.findall(r"\w+(?:,|$)", args))
        if ret == "void":
            define_checks.append(textwrap.dedent(f"""\
                {ret} check_{short_name}({args}) {{
                  CAFFE_ENFORCE(nnapi_.{short_name});
                  nnapi_.{short_name}({call_args});
                }}"""))
        elif ret == "int":
            define_checks.append(textwrap.dedent(f"""\
                {ret} check_{short_name}({args}) {{
                  CAFFE_ENFORCE(nnapi_.{short_name});
                  int ret = nnapi_.{short_name}({call_args});
                  // TODO: Maybe add better logging here.
                  CAFFE_ENFORCE(ret == ANEURALNETWORKS_NO_ERROR);
                  return ret;
                }}"""))

    # Write the generated sources alongside this script.
    out_dir = pathlib.Path(__file__).parent

    (out_dir / "nnapi_wrapper.h").write_text(
        PREFIX +
        textwrap.dedent("""\
            #ifndef NNAPI_WRAPPER_H_
            #define NNAPI_WRAPPER_H_
            #include <stddef.h>
            #include <stdint.h>
            #include "NeuralNetworks.h"
            struct nnapi_wrapper {
            __STRUCT_MEMBERS__
            };
            #ifdef __cplusplus
            void nnapi_wrapper_load(struct nnapi_wrapper** nnapi, struct nnapi_wrapper** check_nnapi);
            #endif
            #endif
            """)
        .replace("__STRUCT_MEMBERS__", "\n".join(struct_members))
    )

    (out_dir / "nnapi_wrapper.cpp").write_text(
        PREFIX +
        textwrap.dedent("""\
            #include <dlfcn.h>
            #include "nnapi_wrapper.h"
            #include "c10/util/Logging.h"
            static int loaded = 0;
            static struct nnapi_wrapper nnapi_;
            static struct nnapi_wrapper check_nnapi_;
            __DEFINE_CHECK_FUNCTIONS__
            void nnapi_wrapper_load(struct nnapi_wrapper** nnapi, struct nnapi_wrapper** check_nnapi) {
              if (!loaded) {
                // Clear error flag.
                dlerror();
                void* handle = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
                CAFFE_ENFORCE(handle, "Failed to load libneuralnetworks.so ", dlerror());
            __LOAD_FUNCTIONS__
                loaded = 1;
              }
              *nnapi = &nnapi_;
              *check_nnapi = &check_nnapi_;
            }
            """)
        .replace("__DEFINE_CHECK_FUNCTIONS__", "\n".join(define_checks))
        .replace("__LOAD_FUNCTIONS__", "\n".join(load_functions))
    )


if __name__ == "__main__":
    sys.exit(main(sys.argv))

0 comments on commit 5aabc30

Please sign in to comment.