Attention mask in Bert #46
Comments
The current BERT example is intended only for benchmarking on fixed-length inputs without a mask.
We are currently working with the CUTLASS team on a grouped attention optimization, which will remove padding and masks for dynamic sequence lengths. It will appear in the next CUTLASS and AIT release.
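To illustrate the idea behind that optimization: instead of padding every sequence to a common length and masking the padded positions, variable-length sequences are packed into one flat token buffer plus a table of offsets. The sketch below is purely illustrative (plain PyTorch, with hypothetical names such as cu_seqlens); it is not the actual CUTLASS or AIT interface:

import torch

# Three sequences of different lengths, hidden size 768 (example values).
seq_lens = [5, 3, 7]
seqs = [torch.randn(n, 768) for n in seq_lens]

# Pack all tokens into one contiguous buffer: no padding, hence no mask.
packed = torch.cat(seqs, dim=0)  # shape: (sum(seq_lens), 768)

# Cumulative offsets tell a grouped kernel where each sequence starts/ends.
cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32)
cu_seqlens[1:] = torch.cumsum(torch.tensor(seq_lens), dim=0)

# Sequence i is recovered as packed[cu_seqlens[i] : cu_seqlens[i + 1]].
assert packed[cu_seqlens[1] : cu_seqlens[2]].shape == (3, 768)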
On Sun, Oct 16, 2022 at 05:56 Michaël Benesty ***@***.***> wrote:
Hi,
I'm trying to use an attention mask in the BERT demo script, but when I add the tensor to the input dict, it crashes.
How can I provide this mask?
Reproduction script (run on the docker image):
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time

import click
import torch

from benchmark_ait import compile_module
from modeling.torch_model import BertBaseUncased as BertPt
def run_model(activation: str, graph_mode: bool, use_fp16_acc: bool, verify: bool):
    f = open("measures.txt", mode="w")
    shape = (1, 128)
    inputs_pt = {
        "input_ids": torch.randint(2, 1000, size=shape, dtype=torch.int64, device="cuda"),
        "position_ids": torch.arange(shape[1], dtype=torch.int64).expand(shape).contiguous().cuda(),
        "attention_mask": torch.ones(shape, dtype=torch.int64, device="cuda"),
        "token_type_ids": torch.ones(size=shape, dtype=torch.int64, device="cuda"),
    }
    batch_size, seq_len = inputs_pt["input_ids"].size()
    pt_model = BertPt(pretrained=True)._model
    pt_model.eval()
    hidden_size = pt_model.config.hidden_size

    mod = compile_module(batch_size, seq_len, hidden_size, activation, use_fp16_acc, False, pt_model)
    outputs = [torch.empty(mod.get_output_maximum_shape(0)).half().cuda()]

    # warmup
    for _ in range(10):
        mod.run_with_tensors(inputs_pt, outputs, graph_mode=graph_mode)
    torch.cuda.synchronize()

    timings = list()
    for _ in range(10):
        start = time.time()
        mod.run_with_tensors(inputs_pt, outputs, graph_mode=graph_mode)
        torch.cuda.synchronize()
        timings.append(time.time() - start)

    f.write(f"{shape}: {torch.median(torch.tensor(timings)):.4f}\n")
    f.flush()
    print(f"Logits: {outputs[0]}")

    if verify:
        pt_outputs = pt_model.bert(**inputs_pt)
        torch.allclose(outputs[0], pt_outputs.last_hidden_state, 1e-1, 1e-1)
        print("Verification done!")
    f.close()
@click.command()
@click.option(
    "--activation",
    type=str,
    default="gelu",
    help="Activation function applied on BERT, currently only support gelu and fast_gelu",
)
@click.option(
    "--graph_mode",
    type=bool,
    default=True,
    help="Use CUDA graph or not. (hipGraph is not supported yet)",
)
@click.option(
    "--use_fp16_acc",
    type=bool,
    default=False,
    help="Use fp16 accumulation or not (TensorRT is using fp16_acc)",
)
@click.option(
    "--verify",
    type=bool,
    default=True,
    help="Verify AIT outputs against PT",
)
def run_demo(
    activation: str,
    graph_mode: bool,
    use_fp16_acc: bool,
    verify: bool,
):
    run_model(activation, graph_mode, use_fp16_acc, verify)


if __name__ == "__main__":
    torch.manual_seed(4896)
    run_demo()
Produces:
...
2022-10-16 12:51:44,784 INFO <aitemplate.backend.builder> Building ./tmp/BERT_gelu_1_128/model_interface.obj
2022-10-16 12:52:03,348 INFO <aitemplate.backend.builder> Building ./tmp/BERT_gelu_1_128/test.so
[12:52:03] ./tmp/BERT_gelu_1_128/model-generated.h:225: Init AITemplate Runtime.
Traceback (most recent call last):
File "./examples/03_bert/demo_new.py", line 101, in <module>
run_demo()
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "./examples/03_bert/demo_new.py", line 96, in run_demo
run_model(activation, graph_mode, use_fp16_acc, verify)
File "./examples/03_bert/demo_new.py", line 45, in run_model
mod.run_with_tensors(inputs_pt, outputs, graph_mode=graph_mode)
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 483, in run_with_tensors
outputs_ait = self.run(
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 438, in run
return self._run_impl(
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 367, in _run_impl
inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 327, in _dict_to_ordered_list
raise ValueError(
ValueError: Did not get correct number of inputs expected 3, got 4
Since the module apparently expects exactly three inputs, I replaced position_ids with attention_mask, and got:
[12:54:38] ./tmp/BERT_gelu_1_128/model-generated.h:225: Init AITemplate Runtime.
Traceback (most recent call last):
File "./examples/03_bert/demo_new.py", line 101, in <module>
run_demo()
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/local/lib/python3.8/dist-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "./examples/03_bert/demo_new.py", line 96, in run_demo
run_model(activation, graph_mode, use_fp16_acc, verify)
File "./examples/03_bert/demo_new.py", line 45, in run_model
mod.run_with_tensors(inputs_pt, outputs, graph_mode=graph_mode)
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 483, in run_with_tensors
outputs_ait = self.run(
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 438, in run
return self._run_impl(
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 367, in _run_impl
inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
File "/usr/local/lib/python3.8/dist-packages/aitemplate/compiler/model.py", line 334, in _dict_to_ordered_list
raise ValueError(
ValueError: Got unexpected input: attention_mask
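Both failures point at the same root cause: the compiled module's input list is fixed at compile time and contains only input_ids, token_type_ids, and position_ids, so an attention_mask entry is rejected. A minimal workaround sketch, reusing the names from the reproduction script above and assuming a full-length all-ones mask is acceptable (which is what the mask-free AIT graph effectively computes):

# Drop the key the compiled graph does not declare; the remaining
# three inputs match the module's expected input list.
inputs_ait = {k: v for k, v in inputs_pt.items() if k != "attention_mask"}
mod.run_with_tensors(inputs_ait, outputs, graph_mode=graph_mode)

# For verification, give PyTorch an explicit all-ones mask (no padded
# positions), matching what the mask-free AIT graph computes.
with torch.no_grad():
    pt_outputs = pt_model.bert(
        input_ids=inputs_pt["input_ids"],
        token_type_ids=inputs_pt["token_type_ids"],
        position_ids=inputs_pt["position_ids"],
        attention_mask=torch.ones(shape, dtype=torch.int64, device="cuda"),
    )
print(torch.allclose(outputs[0].float(), pt_outputs.last_hidden_state.float(), rtol=1e-1, atol=1e-1))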
--
Bing Xu
Thank you for your fast answer.
tissue3 pushed a commit to tissue3/AITemplate-1 that referenced this issue on Feb 7, 2023: "…or#46) Although it seems to be useless, just made it compatible with others like pytorch and numpy."