### 学習

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# データセットの読み込み（ここではMNISTデータセットを使用）
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

# データの前処理
X_train, X_test = X_train / 255.0, X_test / 255.0

# モデルの定義
model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10)
])

# モデルのコンパイル
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# モデルの訓練
model.fit(X_train, y_train, epochs=5)

# モデルの評価
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

2024-01-09 06:01:16.837324: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-09 06:01:16.870188: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-09 06:01:16.871050: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
313/313 - 0s - loss: 0.0712 - accuracy: 0.9785 - 445ms/epoch - 1ms/step

Test accuracy: 0.9785000085830688


### onnx形式にエクスポート

In [2]:
model_name = "model_tensorflow.onnx"

In [3]:
import tf2onnx
import onnx

# TensorFlowモデルをONNX形式に変換
onnx_model, _ = tf2onnx.convert.from_keras(model)

# ONNXモデルをファイルに保存
onnx.save_model(onnx_model, model_name)

Could not search for non-variable resources. Concrete function internal representation may have changed.
2024-01-09 06:01:39.142590: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-01-09 06:01:39.142865: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session
2024-01-09 06:01:39.172468: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-01-09 06:01:39.172594: I tensorflow/core/grappler/clusters/single_machine.cc:357] Starting new session


### onnx形式のモデルの読み込みおよび確認

In [4]:
# 入力ノードの名前と形状情報を確認
print("\n入力ノードの情報:")
for input_node in onnx_model.graph.input:
    print("名前:", input_node.name)
    print("形状:", input_node.type.tensor_type.shape)


入力ノードの情報:
名前: flatten_input
形状: dim {
  dim_param: "unk__8"
}
dim {
  dim_value: 28
}
dim {
  dim_value: 28
}



### so形式にエクスポート

In [5]:
import onnx
import tvm
from tvm import relay

# ONNXモデルの読み込み
onnx_model = onnx.load(model_name)

# モデルをTVMの中間表現に変換
target = "llvm"
input_shape = (1, 1, 28, 28)
shape_dict = {input_node.name: input_shape}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

# コンパイル
with tvm.transform.PassContext(opt_level=1):
    compiled_lib = relay.build(mod, target=target, params=params)

# コンパイルされたモデルの保存
compiled_lib.export_library("keras-tvm.so")

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


### 形式比較

In [6]:
import numpy as np
# テストデータの前処理（適当なデータを生成）
input_data = np.random.rand(1, 28, 28).astype("float32")

In [7]:
import onnxruntime as ort
# ONNXモデルのパス
onnx_model_path = 'model_tensorflow.onnx'

# ONNXモデルを読み込む
onnx_model = onnx.load(onnx_model_path)

# ONNXランタイムをセットアップ
ort_session = ort.InferenceSession(onnx_model_path)

# ONNX形式に変換したモデルでの推論
onnx_output = ort_session.run(None, {input_node.name: input_data})

# keras.Sequential形式のモデルの推論
keras_output = model.predict(input_data)



In [16]:
import tvm
from tvm.contrib import graph_executor
import numpy as np

# コンパイルされたモジュールをロード
lib = tvm.runtime.load_module("keras-tvm.so")
dev = tvm.cpu(0)

# GraphModuleを作成
gmod = graph_executor.GraphModule(lib["default"](dev))

# 入力データを準備（これは例であり、実際のデータに置き換える必要があります）
input_data = np.random.uniform(size=(1, 1, 28, 28)).astype("float32")

# 入力を設定して実行
gmod.set_input(input_node.name, tvm.nd.array(input_data))
gmod.run()

# 出力を取得
tvm_output = gmod.get_output(0).asnumpy()

In [9]:
import pandas as pd


# ONNX出力をNumPy配列に変換して1次元に平坦化
onnx_output = np.array(onnx_output).flatten()

# Keras出力が複数次元の場合は1次元に平坦化
keras_output = keras_output.flatten()

# Keras出力が複数次元の場合は1次元に平坦化
tvm_output = tvm_output.flatten()

# 元は同じモデルだが推論結果が微妙に異なる(最適化などの影響の模様)
df = pd.DataFrame({
    'Keras Output': keras_output,
    'ONNX Output': onnx_output,
    'tvm Output': tvm_output,
    'k-o Difference': np.abs(keras_output - onnx_output),
    'k-t Difference': np.abs(keras_output - tvm_output),
    'o-t Difference': np.abs(onnx_output - tvm_output)
})
df

Unnamed: 0,Keras Output,ONNX Output,tvm Output,k-o Difference,k-t Difference,o-t Difference
0,-13.975501,-13.975495,-14.059313,5.722046e-06,0.083812,0.083817
1,-10.512897,-10.512895,-12.906715,2.861023e-06,2.393818,2.393821
2,6.042183,6.042183,4.898134,4.768372e-07,1.144049,1.144049
3,9.024312,9.024311,10.100356,9.536743e-07,1.076044,1.076045
4,-23.014868,-23.014866,-26.053291,1.907349e-06,3.038424,3.038425
5,6.289399,6.289399,5.962965,4.768372e-07,0.326434,0.326433
6,-4.499303,-4.4993,-3.610198,2.384186e-06,0.889104,0.889102
7,5.99357,5.993567,6.819549,3.33786e-06,0.825979,0.825982
8,-7.16173,-7.161728,-5.239726,1.907349e-06,1.922004,1.922002
9,-17.599163,-17.599163,-17.596352,0.0,0.002811,0.002811


### 実行時間比較

In [10]:
input_data = np.random.rand(1, 28, 28).astype("float32")

In [11]:
%%timeit
# モデルの推論
output = model.predict(input_data, verbose=0)

37.5 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
%%timeit
# ONNXモデルを実行
output = ort_session.run(None, {input_node.name: input_data})

18.9 µs ± 2.42 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [13]:
gmod.set_input(input_node.name, tvm.nd.array(input_data))

In [14]:
%%timeit
# tvmモデルを実行
gmod.run()

41 µs ± 1.6 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
