Use cutlass for memory-efficient attention #362

Merged: 59 commits (Aug 25, 2022)

Commits (59)
f526651
Enable masking in memory-efficient attention (#333)
fmassa May 12, 2022
b0a6c91
Enable dropout in memory-efficient attention (#334)
fmassa May 23, 2022
5de87c4
Fix masking corner case when full block is masked (#339)
fmassa May 24, 2022
267cc4e
Add cutlass 2.9 - 858c735856a7f17bd33fe438ec76d3c9f0234e7f
danthe3rd May 24, 2022
21ab567
Option to load from shared memory for PredicatedTileIterator
danthe3rd May 24, 2022
921c637
Add cutlass include dir
danthe3rd May 24, 2022
7078b1e
Ignore files in third-party for flake8/coverage
danthe3rd May 24, 2022
9079d7e
third-party -> third_party
danthe3rd May 24, 2022
01c8edc
Address comments
danthe3rd May 24, 2022
0282b39
Revert some un-needed mods
danthe3rd May 24, 2022
3d0e645
Add attention_forward_generic.cu
danthe3rd May 24, 2022
f6e0c8c
Add tests
danthe3rd May 24, 2022
4080f90
Fix duplicate calculations on baseline for mem efficient transformers
danthe3rd May 24, 2022
3426755
Always run all linters in CI
danthe3rd May 25, 2022
f8cb6d9
clang-format attention_forward_generic.cu
danthe3rd May 25, 2022
0b46be0
Benchmark: Add possibility to compare benchmarks
fmassa Aug 3, 2022
9611baa
[isort] Ignore third_party
danthe3rd May 25, 2022
f698a5e
black autoformat
danthe3rd May 25, 2022
1f26b59
Black again + ignore third_party properly
danthe3rd May 25, 2022
cbfef46
black
danthe3rd May 25, 2022
fd424e3
Fix memory leak between the 2 benchmarks in backward
danthe3rd May 25, 2022
9fb88bd
Exclude third_party/ without using pyproject.toml as it imposes isola…
danthe3rd May 25, 2022
f79c017
Remove progress bar when finished
danthe3rd May 25, 2022
fe5f615
mypy
danthe3rd May 25, 2022
216fa27
flake8
danthe3rd May 25, 2022
4fbe4e9
Save results to shared folder in home location
danthe3rd May 25, 2022
b1cd83c
run black
danthe3rd May 31, 2022
0d05f69
clang-format with 'run-clang-format.py'
danthe3rd May 31, 2022
feae957
Fix cutlass build for arch>=75
danthe3rd May 31, 2022
1907b68
Set tests precision for gradient more accurately
danthe3rd May 24, 2022
c32053f
Fix precision margin
danthe3rd May 31, 2022
3602c06
Revert changes to black
danthe3rd May 31, 2022
f187e25
[feat] Fix importing xformers when not built (#351)
danthe3rd May 31, 2022
c8d488e
Update black to 22.3.0
danthe3rd May 31, 2022
93a75b7
Tweak precision for mem_eff_attention test
danthe3rd Jun 1, 2022
256f2d4
mem-efficient impl for f16 (#352)
danthe3rd Jun 8, 2022
1cce7fc
Add support for f16 with tensorcores [sm70/sm75/sm80] (#354)
danthe3rd Jun 13, 2022
19d1cce
Optimize backward of memory-efficient attention by ~20% (#355)
fmassa Jun 17, 2022
a04fae4
Display results as we progress during benchmark (#357)
danthe3rd Jun 30, 2022
2568a84
RFC: Ops dispatch (#356)
danthe3rd Jun 30, 2022
71c2eab
[A100/f32] Use TensorCores for Q.K_t matmul with FastF32 (#358)
danthe3rd Jul 5, 2022
573ed14
FlashAttention implem and dispatch (#360)
danthe3rd Jul 7, 2022
36cf435
Misc performance improvements for generic mem-efficient attention (#361)
danthe3rd Jul 8, 2022
daac694
Update flashattention to support bf16 (#363)
danthe3rd Jul 12, 2022
ccf7d15
Flashattn causal (#364)
danthe3rd Jul 15, 2022
db0b9a7
Option to disable flashattention (long to build) (#362)
danthe3rd Jul 21, 2022
7d11238
Remove code duplicate in attention_scaling_coefs_updater.h (#367)
danthe3rd Jul 21, 2022
579eace
Update .gitmodules (#366)
danthe3rd Jul 25, 2022
8b61b0b
MemoryEff attention forward: Properly fuse matmul and enable TensorCo…
danthe3rd Jul 25, 2022
ff52718
Update install instructions with submodule (#365)
fmassa Jul 27, 2022
bb616fa
Generic backward implem with cutlass (#371)
danthe3rd Aug 1, 2022
67ecf34
Cutlass as submodule (#375)
fmassa Aug 4, 2022
4bc3588
Fix bad rebase
fmassa Aug 4, 2022
0de2f12
Bump tolerance for backward (#377)
fmassa Aug 10, 2022
4ef8439
Add verbose flag to CI builds (#376)
fmassa Aug 5, 2022
3946ab8
Fix for FlashAttention dispatch
fmassa Aug 10, 2022
b1dd378
Remove generated file
fmassa Aug 11, 2022
c601866
Address some reviewer feedback
fmassa Aug 12, 2022
1e17161
Perf improvement on backward (#378)
danthe3rd Aug 16, 2022
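
Taken together, these commits wire the cutlass and FlashAttention kernels into a single memory-efficient attention op behind a dispatcher. As a rough orientation before the file-by-file diff, here is a minimal usage sketch; the op name `xformers.ops.memory_efficient_attention`, its signature, and the tensor layout are assumptions based on later releases rather than anything shown in this PR.

```python
# Hypothetical usage sketch; the op name, signature and tensor layout are
# assumptions based on later xformers releases, not something shown in this diff.
import torch
import xformers.ops as xops

q = torch.randn(2, 1024, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 1024, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 1024, 64, device="cuda", dtype=torch.float16)

# The dispatcher picks a kernel (generic cutlass, FlashAttention, ...) based on
# dtype, device capability and problem shape.
out = xops.memory_efficient_attention(q, k, v)
print(out.shape)  # expected: torch.Size([2, 1024, 64])
```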
38 changes: 31 additions & 7 deletions .circleci/config.yml
@@ -85,6 +85,11 @@ install_dep: &install_dep

Contributor: thanks for all the CI changes, LGTM and pretty useful

Have you run into any numerical instability issues during training? Or, have you done any checks as was done

hi @lucidrains, did you run into this kind of precision loss during your training? I hit it when training SWIN-T.
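
A minimal sketch of the kind of numerical check this thread is asking about, assuming the op is exposed as `xformers.ops.memory_efficient_attention`; the helper name, shapes, and tolerances are illustrative and are not the tests used in this PR.

```python
# Minimal precision-check sketch (helper name, shapes and tolerances are
# illustrative assumptions, not the tests used in this PR).
import torch
import xformers.ops as xops

def reference_attention(q, k, v):
    # Plain softmax attention in f32 as the reference.
    scale = q.shape[-1] ** -0.5
    attn = (q @ k.transpose(-2, -1) * scale).softmax(dim=-1)
    return attn @ v

q, k, v = (torch.randn(2, 1024, 64, device="cuda", dtype=torch.float16) for _ in range(3))
out = xops.memory_efficient_attention(q, k, v)
ref = reference_attention(q.float(), k.float(), v.float()).to(out.dtype)
# f16 kernels accumulate differently from an f32 reference, so a loose tolerance
# (like the bumps in this PR's test commits) is expected rather than bit-exactness.
print(torch.allclose(out, ref, atol=2e-2, rtol=1e-2))
```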

# start installing
source activate /home/circleci/venv

# for faster builds
conda install ninja
echo "Ninja version $(ninja --version)"

conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -q
$CONDA_PYTHON -m pip install -r requirements-benchmark.txt --progress-bar off

@@ -108,100 +113,118 @@ install_dep_exp: &install_dep_exp
install_repo: &install_repo
- run:
name: Install Repository
no_output_timeout: 30m
command: |
$CONDA_PYTHON -m pip install -e .
source $BASH_ENV
source activate /home/circleci/venv
git submodule update --init --recursive
$CONDA_PYTHON -m pip install -v -e .

# Test import.
$CONDA_PYTHON -c 'import sys; sys.path = sys.path[1:]; import xformers'

install_experimental_repo: &install_experimental_repo
- run:
name: Install Repository
no_output_timeout: 30m
command: |
git submodule update --init --recursive
source $BASH_ENV

cd experimental
$CONDA_PYTHON -m pip install -e .
$CONDA_PYTHON -m pip install -v -e .

run_isort: &run_isort
- run:
name: Run Linter (isort)
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m isort . --check --profile black

run_black: &run_black
- run:
name: Run Linter (black)
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m black --check .
$CONDA_PYTHON -m black --check . --exclude "third_party/"

run_mypy: &run_mypy
- run:
name: Run type-checking (mypy)
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m mypy --ignore-missing-imports --scripts-are-modules --pretty --exclude build/ --exclude stubs/ .
$CONDA_PYTHON -m mypy --ignore-missing-imports --scripts-are-modules --pretty --exclude "(build|stubs|third_party|docs|setup.py)" .

run_flake8: &run_flake8
- run:
name: Run Linter (flake8)
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m flake8 --config .flake8 --show-source --statistics

run_clang_format: &run_clang_format
- run:
name: Run Linter (clang-format)
when: always
command: |
# install clang-format here, so that it gets cached
sudo apt-get update
sudo apt-get install clang-format
clang-format --version

# apply to our files
./.circleci/run-clang-format.py -r xformers/components/attention/csrc

run_coverage: &run_coverage
- run:
name: Run Unit Tests With Coverage
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m pytest --junitxml=test-results/junit.xml --verbose --timeout 600 --cov-report=xml --cov=./ tests
CUDA_LAUNCH_BLOCKING=1 $CONDA_PYTHON -m pytest --junitxml=test-results/junit.xml --verbose --timeout 600 --cov-report=xml --cov=./ tests
#Uploading test coverage for Python code
bash <(curl -s https://codecov.io/bash) -f coverage.xml -cF Python

run_unittests: &run_unittests
- run:
name: Run Unit Tests
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m pytest --junitxml=test-results/junit.xml --verbose --timeout 600 tests
CUDA_LAUNCH_BLOCKING=1 $CONDA_PYTHON -m pytest --junitxml=test-results/junit.xml --verbose --timeout 600 tests

run_experimental_unittests: &run_experimental_unittests
- run:
name: Run Unit Tests
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON -m pytest experimental/tests
CUDA_LAUNCH_BLOCKING=1 $CONDA_PYTHON -m pytest experimental/tests

run_benchmarks: &run_benchmarks
- run:
name: Run Benchmarks
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON xformers/benchmarks/benchmark_encoder.py --activations gelu --plot -emb 128 -bs 16 -heads 4

run_pytorch_benchmark: &run_pytorch_benchmark
- run:
name: Run Pytorch benchmark
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON xformers/benchmarks/benchmark_pytorch_transformer.py

run_vit_benchmark: &run_vit_benchmark
- run:
name: Run ViT Timm benchmark
when: always
command: |
source $BASH_ENV
$CONDA_PYTHON xformers/benchmarks/benchmark_vit_timm.py
@@ -211,6 +234,7 @@ run_vit_benchmark: &run_vit_benchmark
run_doc_build: &run_doc_build
- run:
name: Testing doc build
when: always
command: |
source $BASH_ENV
cd docs
4 changes: 2 additions & 2 deletions .circleci/run-clang-format.py
@@ -82,8 +82,8 @@ def make_diff(file, original, reformatted):
difflib.unified_diff(
original,
reformatted,
fromfile="{}\t(original)".format(file),
tofile="{}\t(reformatted)".format(file),
fromfile="a/{}\t(original)".format(file),
tofile="b/{}\t(reformatted)".format(file),
n=3,
)
)
1 change: 1 addition & 0 deletions .coveragerc
@@ -6,3 +6,4 @@ omit =
xformers/benchmarks/*
xformers/triton/k_*
stubs/*
third_party/*
1 change: 1 addition & 0 deletions .flake8
@@ -2,6 +2,7 @@
exclude =
.git
,.circleci/run-clang-format.py
,third_party
max-line-length = 120
copyright-check = True
select = E,F,W,C
2 changes: 2 additions & 0 deletions .gitignore
@@ -52,3 +52,5 @@ examples/data
# Hydra default output dir
multirun
outputs

.benchmarks
7 changes: 7 additions & 0 deletions .gitmodules
@@ -0,0 +1,7 @@
[submodule "third_party/flash-attention"]
path = third_party/flash-attention
url = https://github.com/HazyResearch/flash-attention.git
[submodule "third_party/cutlass"]
path = third_party/cutlass
url = https://github.com/fmassa/cutlass.git
branch = updates_for_mha
1 change: 1 addition & 0 deletions .isort.cfg
@@ -1,2 +1,3 @@
[settings]
known_third_party =fvcore,hydra,input_pipeline,matplotlib,numpy,omegaconf,pandas,pl_bolts,pyre_extensions,pytest,pytorch_lightning,ragged_inference,recommonmark,seaborn,setuptools,sklearn,submitit,tensorflow,timm,torch,torchmetrics,torchvision,tqdm,triton,typing_extensions
skip_glob=third_party/*
3 changes: 3 additions & 0 deletions README.md
@@ -53,6 +53,9 @@ There are two ways you can install xFormers locally:

```bash
git clone git@github.com:facebookresearch/xformers.git
git submodule update --init --recursive
Contributor: nice, I was checking that when I saw that xformers now has two submodules, perfect. Thanks.

conda create --name xformer_env python=3.8
conda activate xformer_env
cd xformers
pip install -r requirements.txt
pip install -e .
99 changes: 96 additions & 3 deletions setup.py
@@ -10,7 +10,9 @@
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path

import setuptools
import torch
@@ -44,6 +46,84 @@ def find_version(version_file_path):
raise RuntimeError("Unable to find version string.")


def get_cuda_version(cuda_dir) -> int:
nvcc_bin = "nvcc" if cuda_dir is None else cuda_dir + "/bin/nvcc"
raw_output = subprocess.check_output([nvcc_bin, "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = int(release[0])
bare_metal_minor = int(release[1][0])

assert bare_metal_minor < 100
return bare_metal_major * 100 + bare_metal_minor
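
As an aside, here is a worked example of the encoding this helper returns; the sample nvcc output string is illustrative, not taken from this PR.

```python
# Illustrative only: a typical `nvcc -V` output contains "release 11.3, V11.3.109".
raw = "Cuda compilation tools, release 11.3, V11.3.109"
tokens = raw.split()
release = tokens[tokens.index("release") + 1].split(".")  # ["11", "3,"]
version = int(release[0]) * 100 + int(release[1][0])      # 11 * 100 + 3
assert version == 1103  # so `cuda_version >= 1102` later means "CUDA 11.2 or newer"
```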


def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
# Figure out default archs to target
DEFAULT_ARCHS_LIST = ""
if cuda_version > 1100:
DEFAULT_ARCHS_LIST = "7.5;8.0;8.6"
elif cuda_version >= 1100:
Contributor: nit, but cuda_version == 1100 in that case, right?
DEFAULT_ARCHS_LIST = "7.5;8.0"
else:
return []

archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST", DEFAULT_ARCHS_LIST)
nvcc_archs_flags = []
for arch in archs_list.split(";"):
assert len(arch) >= 3, f"Invalid sm version: {arch}"

num = 10 * int(arch[0]) + int(arch[2])
# Need at least 7.5
if num < 75:
continue
Contributor: could we print out some warnings here (or in the main setup) to recap what's being built and possibly why? I feel like a lot of issues could be raised around this, with the build process silently skipping FlashAttention because of an old CUDA version and users not seeing it.

Contributor (author): Good idea, I'll add some log messages.

But in general, we need to improve the packaging of xformers, especially now that a lot of hardware-specific kernels are being used. @bottler might look into improving this.
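
A minimal sketch of the kind of warning discussed above, assuming a small helper called from setup.py; the function name and message wording are hypothetical and not the code that eventually landed.

```python
# Hypothetical sketch of the warning discussed above; the helper name and message
# wording are assumptions, not the code that landed in this PR.
def warn_if_flash_attention_skipped(cuda_version: int, archs_list: str) -> None:
    # Same sm >= 7.5 rule as the loop above.
    supported = [a for a in archs_list.split(";")
                 if len(a) >= 3 and 10 * int(a[0]) + int(a[2]) >= 75]
    if cuda_version < 1100:
        print(f"WARNING: building without FlashAttention "
              f"(requires CUDA >= 11.0, found {cuda_version})")
    elif not supported:
        print(f"WARNING: building without FlashAttention "
              f"(no sm_75+ arch in TORCH_CUDA_ARCH_LIST={archs_list!r})")
```

For users who want to opt out deliberately, the changes to `get_extensions` further down also add an `XFORMERS_DISABLE_FLASH_ATTN` environment variable.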

nvcc_archs_flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
if arch.endswith("+PTX"):
nvcc_archs_flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
if not nvcc_archs_flags:
return []

this_dir = os.path.dirname(os.path.abspath(__file__))
flash_root = os.path.join(this_dir, "third_party", "flash-attention")
return [
CUDAExtension(
name="xformers._C_flashattention",
sources=[
os.path.join(this_dir, "third_party", "flash-attention", path)
for path in [
"csrc/flash_attn/fmha_api.cpp",
"csrc/flash_attn/src/fmha_fprop_fp16_kernel.sm80.cu",
"csrc/flash_attn/src/fmha_dgrad_fp16_kernel_loop.sm80.cu",
"csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.cu",
"csrc/flash_attn/src/fmha_block_dgrad_fp16_kernel_loop.sm80.cu",
]
],
extra_compile_args={
**extra_compile_args,
"nvcc": extra_compile_args.get("nvcc", [])
+ [
"-O3",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
"--ptxas-options=-v",
"-lineinfo",
]
+ nvcc_archs_flags,
},
include_dirs=[
Path(flash_root) / "csrc" / "flash_attn",
Path(flash_root) / "csrc" / "flash_attn" / "src",
# Path(flash_root) / 'csrc' / 'flash_attn' / 'cutlass' / 'include',
Path(this_dir) / "third_party" / "cutlass" / "include",
],
)
]
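
For reference, a standalone rerun of the gencode expansion above with an illustrative `TORCH_CUDA_ARCH_LIST` value, not one taken from this PR.

```python
# Standalone rerun of the gencode expansion above, with an illustrative
# TORCH_CUDA_ARCH_LIST value (not one taken from this PR).
archs = "6.1;7.5;8.0+PTX"
flags = []
for arch in archs.split(";"):
    num = 10 * int(arch[0]) + int(arch[2])
    if num < 75:  # FlashAttention needs sm_75 or newer, so 6.1 is dropped
        continue
    flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
    if arch.endswith("+PTX"):
        flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
assert flags == [
    "-gencode=arch=compute_75,code=sm_75",
    "-gencode=arch=compute_80,code=sm_80",
    "-gencode=arch=compute_80,code=compute_80",
]
```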


def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(
@@ -57,9 +137,11 @@ def get_extensions():
)

sources = main_file + source_cpu

source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")

extension = CppExtension

@@ -73,31 +155,42 @@ def get_extensions():
extra_compile_args["cxx"].append("-fopenmp")

include_dirs = [extensions_dir]
ext_modules = []

if (torch.cuda.is_available() and ((CUDA_HOME is not None))) or os.getenv(
"FORCE_CUDA", "0"
) == "1":
extension = CUDAExtension
sources += source_cuda
include_dirs += [sputnik_dir]
include_dirs += [sputnik_dir, cutlass_dir]
nvcc_flags = os.getenv("NVCC_FLAGS", "")
if nvcc_flags == "":
nvcc_flags = []
else:
nvcc_flags = nvcc_flags.split(" ")
cuda_version = get_cuda_version(CUDA_HOME)
if cuda_version >= 1102:
nvcc_flags += ["--threads", "4", "--ptxas-options=-v"]
extra_compile_args["nvcc"] = nvcc_flags
if (
cuda_version >= 1100
and os.getenv("XFORMERS_DISABLE_FLASH_ATTN", "0") == "0"
):
ext_modules += get_flash_attention_extensions(
cuda_version=cuda_version, extra_compile_args=extra_compile_args
)

sources = [os.path.join(extensions_dir, s) for s in sources]

ext_modules = [
ext_modules.append(
extension(
"xformers._C",
sorted(sources),
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
)

return ext_modules
