In [12]:
!python3 --version

Python 3.10.12


### 学習

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# データセットの読み込み（ここではMNISTデータセットを使用）
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

# データの前処理
X_train, X_test = X_train / 255.0, X_test / 255.0

# モデルの定義
model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10)
])

# モデルのコンパイル
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# モデルの訓練
model.fit(X_train, y_train, epochs=5)

# モデルの評価
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

2024-01-15 01:44:36.358375: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 01:44:36.358420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 01:44:36.359189: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-15 01:44:36.364816: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-15 01:44:38.414742: I external/local_xla/xla/

Epoch 1/5


2024-01-15 01:44:40.024675: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fd78e2ee6d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-15 01:44:40.024715: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1660 Ti, Compute Capability 7.5
2024-01-15 01:44:40.029790: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-15 01:44:40.044114: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
I0000 00:00:1705283080.100339     546 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
313/313 - 1s - loss: 0.0730 - accuracy: 0.9772 - 803ms/epoch - 3ms/step

Test accuracy: 0.9771999716758728


### onnx形式にエクスポート

In [2]:
model_name = "model_tensorflow.onnx"

In [3]:
import tf2onnx
import onnx

# TensorFlowモデルをONNX形式に変換
onnx_model, _ = tf2onnx.convert.from_keras(model)

# ONNXモデルをファイルに保存
onnx.save_model(onnx_model, model_name)

Could not search for non-variable resources. Concrete function internal representation may have changed.
2024-01-15 01:45:10.132508: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-15 01:45:10.132551: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2024-01-15 01:45:10.132680: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-01-15 01:45:10.133169: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-15 01:45:10.133228: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node


### onnx形式のモデルの読み込みおよび確認

In [4]:
# 入力ノードの名前と形状情報を確認
print("\n入力ノードの情報:")
for input_node in onnx_model.graph.input:
    print("名前:", input_node.name)
    print("形状:", input_node.type.tensor_type.shape)


入力ノードの情報:
名前: flatten_input
形状: dim {
  dim_param: "unk__8"
}
dim {
  dim_value: 28
}
dim {
  dim_value: 28
}



### so形式にエクスポート

In [5]:
# CPU最適化
import onnx
import tvm
from tvm import relay

# ONNXモデルの読み込み
onnx_model = onnx.load(model_name)

# モデルをTVMの中間表現に変換
target = "llvm"
input_shape = (1, 1, 28, 28)
shape_dict = {input_node.name: input_shape}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

# コンパイル
with tvm.transform.PassContext(opt_level=1):
    compiled_lib = relay.build(mod, target=target, params=params)

# コンパイルされたモデルの保存
compiled_lib.export_library("keras-tvm-cpu.so")

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


In [6]:
import tvm
print(tvm.__version__)


0.16.dev0


In [7]:
# GPU最適化
import onnx
import tvm
from tvm import relay

# ONNXモデルの読み込み
onnx_model = onnx.load(model_name)

# モデルをTVMの中間表現に変換
target = "cuda"  # NVIDIA GPUを使用
input_shape = (1, 1, 28, 28)
shape_dict = {input_node.name: input_shape}
mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)

# コンパイル
with tvm.transform.PassContext(opt_level=1):
    compiled_lib = relay.build(mod, target=target, params=params)

# コンパイルされたモデルの保存
compiled_lib.export_library("keras-tvm-gpu.so")

### 形式比較

In [8]:
import numpy as np
# テストデータの前処理（適当なデータを生成）
input_data = np.random.rand(1, 28, 28).astype("float32")

In [9]:
import onnxruntime as ort
# ONNXモデルのパス
onnx_model_path = 'model_tensorflow.onnx'

# ONNXモデルを読み込む
onnx_model = onnx.load(onnx_model_path)

# ONNXランタイムをセットアップ
ort_session = ort.InferenceSession(onnx_model_path)

# ONNX形式に変換したモデルでの推論
onnx_output = ort_session.run(None, {input_node.name: input_data})

# keras.Sequential形式のモデルの推論
keras_output = model.predict(input_data)



In [10]:
import tvm
from tvm.contrib import graph_executor
import numpy as np

# コンパイルされたモジュールをロード
lib = tvm.runtime.load_module("keras-tvm-cpu.so")
dev = tvm.cpu(0)

# GraphModuleを作成
gmodc = graph_executor.GraphModule(lib["default"](dev))

# 入力を設定して実行
gmodc.set_input(input_node.name, tvm.nd.array(input_data))
gmodc.run()

# 出力を取得
tvm_output_cpu = gmodc.get_output(0).asnumpy()

In [11]:
import tvm
from tvm.contrib import graph_executor
import numpy as np

# コンパイルされたモジュールをロード
lib = tvm.runtime.load_module("keras-tvm-gpu.so")
dev = tvm.cuda(0)

# GraphModuleを作成
gmodg = graph_executor.GraphModule(lib["default"](dev))

# 入力を設定して実行
gmodg.set_input(input_node.name, tvm.nd.array(input_data))
gmodg.run()

# 出力を取得
tvm_output_gpu = gmodg.get_output(0).asnumpy()

In [18]:
import pandas as pd


# ONNX出力をNumPy配列に変換して1次元に平坦化
onnx_output = np.array(onnx_output).flatten()

# Keras出力が複数次元の場合は1次元に平坦化
keras_output = keras_output.flatten()

# Keras出力が複数次元の場合は1次元に平坦化
tvm_output_cpu = tvm_output_cpu.flatten()
tvm_output_gpu = tvm_output_gpu.flatten()

# 元は同じモデルだが推論結果が微妙に異なる(最適化などの影響の模様)
df = pd.DataFrame({
    'Keras Output': keras_output,
    'ONNX Output': onnx_output,
    'tvm Output-cpu': tvm_output_cpu,
    'tvm Output-gpu': tvm_output_gpu,
    'onnx_output Difference': np.abs(keras_output - onnx_output),
    'tvm_output_cpu Difference': np.abs(keras_output - tvm_output_cpu),
    'tvm_output_gpu Difference': np.abs(onnx_output - tvm_output_gpu)
})
df

Unnamed: 0,Keras Output,ONNX Output,tvm Output-cpu,tvm Output-gpu,onnx_output Difference,tvm_output_cpu Difference,tvm_output_gpu Difference
0,-8.470961,-8.47096,-8.470962,-8.470961,9.536743e-07,9.536743e-07,9.536743e-07
1,-23.718079,-23.718092,-23.718086,-23.718079,1.335144e-05,7.629395e-06,1.335144e-05
2,0.338314,0.338309,0.338316,0.338313,5.424023e-06,2.086163e-06,4.708767e-06
3,2.017379,2.017372,2.017377,2.017378,6.198883e-06,1.430511e-06,5.722046e-06
4,-29.983334,-29.98333,-29.983328,-29.983332,3.814697e-06,5.722046e-06,1.907349e-06
5,9.280708,9.28071,9.280706,9.280709,1.907349e-06,1.907349e-06,9.536743e-07
6,-15.105251,-15.105246,-15.105249,-15.10525,5.722046e-06,1.907349e-06,4.768372e-06
7,3.33061,3.330611,3.330606,3.33061,1.430511e-06,4.529953e-06,1.907349e-06
8,-11.94764,-11.947638,-11.947636,-11.947641,2.861023e-06,4.768372e-06,3.814697e-06
9,-8.173767,-8.173758,-8.173763,-8.173768,9.536743e-06,3.814697e-06,1.049042e-05


### 実行時間比較

In [19]:
%%timeit
# kerasをそのまま実行
output = model.predict(input_data, verbose=0)

44.9 ms ± 838 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
%%timeit
# ONNXモデルを実行
output = ort_session.run(None, {input_node.name: input_data})

16.8 µs ± 211 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [22]:
gmodc.set_input(input_node.name, tvm.nd.array(input_data))

In [23]:
%%timeit
# tvmモデル(cpu)を実行
gmodc.run()

42.2 µs ± 3.07 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [24]:
gmodg.set_input(input_node.name, tvm.nd.array(input_data))

In [25]:
%%timeit
# tvmモデル(gpu)を実行
# 入力が小さい場合、GPUに転送するコストのほうが大きくなる
gmodg.run()

75.7 µs ± 644 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [10]:
# MLflow実行の手動終了
mlflow.end_run()