
Commit

Merge pull request #15 from fabio-sim/fabio/flash
Support Flash Attention
fabio-sim committed Jul 13, 2023
2 parents 75ec0e8 + ee0dc33 commit 1735313
Showing 7 changed files with 32 additions and 10 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -8,6 +8,7 @@ Open Neural Network Exchange (ONNX) compatible implementation of [LightGlue: Loc

## Updates

- **13 July 2023**: Add support for Flash Attention.
- **11 July 2023**: Add support for mixed precision.
- **4 July 2023**: Add inference time comparisons.
- **1 July 2023**: Add support for extractor `max_num_keypoints`.
@@ -16,7 +17,7 @@ Open Neural Network Exchange (ONNX) compatible implementation of [LightGlue: Loc

## ONNX Export

Prior to exporting the ONNX models, please install the [requirements](/requirements.txt) of the original LightGlue repository. ([Flash Attention](https://github.com/HazyResearch/flash-attention) does not need to be installed.)
Prior to exporting the ONNX models, please install the [requirements](/requirements.txt) of the original LightGlue repository.

To convert the DISK or SuperPoint and LightGlue models to ONNX, run [`export.py`](/export.py). We provide two types of ONNX exports: individual standalone models, and a combined end-to-end pipeline (recommended for convenience) with the `--end2end` flag.

@@ -32,6 +33,7 @@ python export.py \
- Exporting individually can be useful when intermediate outputs can be cached or precomputed. On the other hand, the end-to-end pipeline can be more convenient.
- Although dynamic axes have been specified, it is recommended to export your own ONNX model with the input image sizes appropriate for your use case.
- Use the `--mp` option to export in mixed precision for more speed gains.
- Enable flash attention with the `--flash` option for even faster speeds. ([Flash Attention](https://github.com/HazyResearch/flash-attention) must be installed for export but is not required during inference.)

If you would like to try out inference right away, you can download ONNX models that have already been exported [here](https://github.com/fabio-sim/LightGlue-ONNX/releases).
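
For a quick sanity check after downloading (or exporting) a model, a minimal sketch like the following confirms that the ONNX file loads and shows its input/output signature; the file name below is an assumption based on the naming scheme used by `export.py` and may differ for your export:

```python
import onnxruntime as ort

# File name assumed; substitute the model you downloaded or exported.
session = ort.InferenceSession(
    "weights/superpoint_lightglue_end2end.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Inspect the expected inputs and outputs before wiring up real images.
for tensor in session.get_inputs():
    print("input:", tensor.name, tensor.shape, tensor.type)
for tensor in session.get_outputs():
    print("output:", tensor.name, tensor.shape, tensor.type)
```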

@@ -91,7 +93,6 @@ As the ONNX Runtime has limited support for features like dynamic control flow,
### LightGlue Keypoint Matching

- Since dynamic control flow has limited support in ONNX tracing, by extension, early stopping and adaptive point pruning (the `depth_confidence` and `width_confidence` parameters) are also difficult to export to ONNX.
- Flash Attention is turned off.
- Note that the end-to-end version, despite its name, still requires the postprocessing (filtering valid matches) function outside the ONNX model since the `scales` variables need to be passed.

Additionally, the outputs of the ONNX models differ slightly from the original PyTorch models (by a small error on the magnitude of `1e-6` to `1e-5` for the scores/descriptors). Although the cause is still unclear, this could be due to differing implementations or modified dtypes.
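
To put a number on that discrepancy, the two sets of outputs can be compared directly; a small helper along these lines (the function name and tolerance are chosen here for illustration, matching the observed `1e-6` to `1e-5` range) is enough:

```python
import numpy as np

def compare_outputs(torch_out: np.ndarray, onnx_out: np.ndarray, atol: float = 1e-5) -> float:
    """Return the maximum absolute deviation and assert it stays within tolerance."""
    max_abs_err = float(np.max(np.abs(torch_out - onnx_out)))
    assert np.allclose(torch_out, onnx_out, atol=atol), (
        f"deviation {max_abs_err:.2e} exceeds atol={atol}"
    )
    return max_abs_err
```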
Binary file modified assets/latency.png
4 changes: 3 additions & 1 deletion docs/README.zh.md
@@ -8,6 +8,7 @@

## Updates

- **13 July 2023**: Add support for Flash Attention.
- **11 July 2023**: Add support for mixed precision.
- **4 July 2023**: Add inference time comparisons.
- **1 July 2023**: Add support for extractor `max_num_keypoints`.
@@ -16,7 +17,7 @@

## ONNX Export

Prior to exporting the ONNX models, please install the [requirements](/requirements.txt) of the original LightGlue repository. ([Flash Attention](https://github.com/HazyResearch/flash-attention) does not need to be installed.)
Prior to exporting the ONNX models, please install the [requirements](/requirements.txt) of the original LightGlue repository.

To convert the DISK or SuperPoint and LightGlue models to ONNX, run [`export.py`](/export.py). Two types of ONNX exports are provided: individual standalone models, and a combined end-to-end model (use `--end2end`; more convenient).

@@ -31,6 +32,7 @@ python export.py \

- Although `--dynamic` has been specified, it is recommended to export with the image sizes appropriate to your use case.
- Use the `--mp` option to export in mixed precision.
- Use the `--flash` option to enable Flash Attention. ([Flash Attention](https://github.com/HazyResearch/flash-attention) must be installed for export but is not required during inference.)

If you would like to try out ONNX inference right away, you can download the already-exported [ONNX models](https://github.com/fabio-sim/LightGlue-ONNX/releases).

15 changes: 13 additions & 2 deletions eval.py
@@ -52,6 +52,11 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Whether to enable mixed precision (CUDA only).",
)
parser.add_argument(
"--flash",
action="store_true",
help="Whether to use Flash Attention (CUDA only). Flash Attention must be installed.",
)

# ONNXRuntime-specific args
parser.add_argument(
@@ -85,6 +90,7 @@ def create_models(
max_num_keypoints=512,
device="cuda",
mp=False,
flash=False,
extractor_path=None,
lightglue_path=None,
):
@@ -96,7 +102,7 @@
elif extractor_type == "disk":
extractor = DISK(max_num_keypoints=max_num_keypoints).eval().to(device)

lightglue = LightGlue(extractor_type, mp=mp).eval().to(device)
lightglue = LightGlue(extractor_type, mp=mp, flash=flash).eval().to(device)
elif framework == "ort":
sess_opts = ort.SessionOptions()
sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
@@ -119,7 +125,10 @@

if lightglue_path is None:
lightglue_path = (
f"weights/{extractor_type}_lightglue" f"{'_mp' if mp else ''}" ".onnx"
f"weights/{extractor_type}_lightglue"
f"{'_mp' if mp else ''}"
f"{'_flash' if flash else ''}"
".onnx"
)
lightglue = ort.InferenceSession(
lightglue_path,
@@ -201,6 +210,7 @@ def evaluate(
max_num_keypoints=512,
device="cuda",
mp=False,
flash=False,
extractor_path=None,
lightglue_path=None,
):
Expand All @@ -213,6 +223,7 @@ def evaluate(
max_num_keypoints=max_num_keypoints,
device=device,
mp=mp,
flash=flash,
extractor_path=extractor_path,
lightglue_path=lightglue_path,
)
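
For reference, the default weight-file naming produced by `create_models` above can be factored out as a small helper (mirroring the f-string in the diff; the helper name itself is hypothetical):

```python
def default_lightglue_path(extractor_type: str, mp: bool = False, flash: bool = False) -> str:
    # Mirrors the filename construction in create_models():
    # e.g. ("superpoint", mp=True, flash=True) -> "weights/superpoint_lightglue_mp_flash.onnx"
    return (
        f"weights/{extractor_type}_lightglue"
        f"{'_mp' if mp else ''}"
        f"{'_flash' if flash else ''}"
        ".onnx"
    )

assert default_lightglue_path("superpoint", mp=True, flash=True) == "weights/superpoint_lightglue_mp_flash.onnx"
```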
2 changes: 1 addition & 1 deletion evaluation/EVALUATION.md
@@ -16,6 +16,6 @@ The measured run times are plotted in the figure below.

<p align="center"><a href="https://github.com/fabio-sim/LightGlue-ONNX/blob/main/evaluation/EVALUATION.md"><img src="../assets/latency.png" alt="Latency Comparison" width=100%></a>

<table align="center"><thead><tr><th>Number of Keypoints</th><th></th><th>512</th><th>1024</th><th>2048</th><th>4096</th></tr><tr><th>Model</th><th>Device</th><th colspan="4">Latency (ms)</th></tr></thead><tbody><tr><td>LightGlue</td><td>CUDA</td><td>35.42</td><td>47.36</td><td>112.87</td><td>187.51</td></tr><tr><td>LightGlue-ONNX</td><td>CUDA</td><td>30.44</td><td>82.24</td><td>269.39</td><td>519.41</td></tr><tr><td>LightGlue-MP</td><td>CUDA</td><td>36.32</td><td>37.10</td><td>61.58</td><td>127.59</td></tr><tr><td>LightGlue-ONNX-MP</td><td>CUDA</td><td>24.2</td><td>66.27</td><td>227.91</td><td>473.71</td></tr><tr><td>LightGlue</td><td>CPU</td><td>1121</td><td>3818</td><td>15968</td><td>37587</td></tr><tr><td>LightGlue-ONNX</td><td>CPU</td><td>759</td><td>2961</td><td>10493</td><td>20143</td></tr></tbody></table>
<table align="center"><thead><tr><th>Number of Keypoints</th><th></th><th>512</th><th>1024</th><th>2048</th><th>4096</th></tr><tr><th>Model</th><th>Device</th><th colspan="4">Latency (ms)</th></tr></thead><tbody><tr><td>LightGlue</td><td>CUDA</td><td>35.42</td><td>47.36</td><td>112.87</td><td>187.51</td></tr><tr><td>LightGlue-ONNX</td><td>CUDA</td><td>30.44</td><td>82.24</td><td>269.39</td><td>519.41</td></tr><tr><td>LightGlue-MP</td><td>CUDA</td><td>36.32</td><td>37.10</td><td>61.58</td><td>127.59</td></tr><tr><td>LightGlue-ONNX-MP</td><td>CUDA</td><td>24.2</td><td>66.27</td><td>227.91</td><td>473.71</td></tr><tr><td>LightGlue-MP-Flash</td><td>CUDA</td><td>38.3</td><td>38.8</td><td>42.9</td><td>55.9</td></tr><tr><td>LightGlue-ONNX-MP-Flash</td><td>CUDA</td><td>21.2</td><td>57.4</td><td>191.1</td><td>368.9</td></tr><tr><td>LightGlue</td><td>CPU</td><td>1121</td><td>3818</td><td>15968</td><td>37587</td></tr><tr><td>LightGlue-ONNX</td><td>CPU</td><td>759</td><td>2961</td><td>10493</td><td>20143</td></tr></tbody></table>

At smaller numbers of keypoints, the difference between the CUDA ONNX and PyTorch latencies is small; however, it becomes much more noticeable at higher keypoint numbers, where PyTorch is faster. The cause remains to be investigated (different operator implementations?). On the other hand, ONNX is faster overall for CPU inference.
3 changes: 2 additions & 1 deletion evaluation/lightglue-onnx.ipynb
@@ -47,7 +47,8 @@
"outputs": [],
"source": [
"!pip install -q torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
"!pip install -q kornia==0.6.12 onnx==1.14.0 onnxruntime-gpu==1.15.1 numpy opencv-python matplotlib einops"
"!pip install -q kornia==0.6.12 onnx==1.14.0 onnxruntime-gpu==1.15.1 numpy opencv-python matplotlib einops\n",
"# !pip install -q flash-attn==1.0.8 --no-build-isolation # Time-consuming (~30 minutes)"
]
},
{
13 changes: 10 additions & 3 deletions export.py
@@ -59,6 +59,11 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Whether to use mixed precision (CUDA only). Not supported when using the --safe option.",
)
parser.add_argument(
"--flash",
action="store_true",
help="Whether to use Flash Attention (CUDA only). Flash Attention must be installed. Not supported when using the --safe option.",
)

# Extractor-specific args:
parser.add_argument(
@@ -83,6 +88,7 @@ def export_onnx(
safe=False,
dynamic=False,
mp=False,
flash=False,
max_num_keypoints=None,
):
# Handle args
@@ -102,6 +108,7 @@
f"{'_end2end' if end2end else ''}"
f"{'_safe' if safe else ''}"
f"{'_mp' if mp else ''}"
f"{'_flash' if flash else ''}"
".onnx"
)

@@ -115,10 +122,10 @@
image0 = rgb_to_grayscale(image0)
image1 = rgb_to_grayscale(image1)
extractor = SuperPoint(max_num_keypoints=max_num_keypoints).eval()
lightglue = LightGlue(extractor_type).eval()
lightglue = LightGlue(extractor_type, flash=flash).eval()
elif extractor_type == "disk":
extractor = DISK(max_num_keypoints=max_num_keypoints).eval()
lightglue = LightGlue(extractor_type).eval()
lightglue = LightGlue(extractor_type, flash=flash).eval()

if torch.__version__ < "2.1":
patch_disk_convolution_mode(extractor)
@@ -133,7 +140,7 @@
):
register_aten_sdpa(opset_version=14)

if mp:
if mp or flash:
assert torch.cuda.is_available(), "Mixed precision and Flash Attention require CUDA."
image0, image1 = image0.to("cuda"), image1.to("cuda")
extractor, lightglue = extractor.to("cuda"), lightglue.to("cuda")
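
Put together, the flash export path added by this commit reduces to roughly the sketch below; the dummy input shapes, the assumed four-tensor forward signature (keypoints and descriptors for both images), and the `torch.onnx.export` arguments (input/output names, opset) are illustrative assumptions rather than the exact values used in `export.py`:

```python
import torch

from lightglue_onnx import LightGlue  # import path assumed; adjust to the repository layout

extractor_type, flash = "superpoint", True
lightglue = LightGlue(extractor_type, flash=flash).eval()

# Like mixed precision, Flash Attention only runs on CUDA, so the model and
# dummy inputs are moved to the GPU before tracing.
assert torch.cuda.is_available(), "Flash Attention export requires CUDA."
lightglue = lightglue.to("cuda")

# Dummy keypoints and descriptors; shapes assumed for illustration only.
kpts0 = torch.randn(1, 512, 2, device="cuda")
kpts1 = torch.randn(1, 512, 2, device="cuda")
desc0 = torch.randn(1, 512, 256, device="cuda")
desc1 = torch.randn(1, 512, 256, device="cuda")

torch.onnx.export(
    lightglue,
    (kpts0, kpts1, desc0, desc1),
    "weights/superpoint_lightglue_flash.onnx",
    input_names=["kpts0", "kpts1", "desc0", "desc1"],
    output_names=["matches0", "mscores0"],
    opset_version=16,
)
```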
