#### Onnx Model conversion and Quantization

###### Note: load the material about ONNX Runtime from the machine translation tutorial.

In [None]:
from pathlib import Path


In [None]:
model_path = Path.cwd().joinpath('models')
model_id = 'bio-gpt-qa'
model_path = model_path.joinpath(model_id)

In [None]:
# Fail early if the model directory itself is missing. The previous check
# looked two levels up (the current working directory), which always exists.
assert model_path.exists(), f"Model not found at {model_path}"

In [None]:
from transformers import BioGptForCausalLM, BioGptTokenizer, set_seed

tokenizer = BioGptTokenizer.from_pretrained(model_path,  local_files_only=True)
model = BioGptForCausalLM.from_pretrained(model_path, local_files_only=True)

In [None]:
def encode_input(input):
    """Tokenize one text prompt with the notebook-level ``tokenizer``.

    The prompt is wrapped in a one-element batch, truncated to at most
    1024 tokens and returned as PyTorch tensors.
    """
    batch = [input]
    encoding = tokenizer(batch,
                         truncation=True,
                         max_length=1024,
                         return_tensors='pt')
    return encoding

In [None]:
feature = "seq2seq"

In [None]:
model.save_pretrained(model_path)

In [None]:
input = "'question:what is the cause of covid ? context: the cause of covid is a virus'"
# Reuse the shared helper so the tokenization settings live in one place.
encoded_input = encode_input(input)

In [None]:
import torch

In [None]:
encoded_input.keys()

In [None]:
set_seed(42)

In [None]:
%%script false --no-raise-error
generate_tokens = model.generate(**encoded_input,
                                 num_beams=5,
                                 do_sample=True,
                                 top_k=50,
                                 top_p=0.95,
                                 max_length=512)

In [None]:
onnx_path = Path.cwd().joinpath('models_repository', "generator", "generator_model", "1", )

In [None]:
onnx_path.mkdir(parents=True, exist_ok=True)

In [None]:
generated_text = tokenizer.decode(generate_tokens[0], skip_special_tokens=True)
print(generated_text)

In [None]:
onnx_path

In [None]:
past[0][1].shape

#### Export with Optimum

This seems to be impossible; let us use the Optimum library instead.

In [None]:
from optimum.exporters.onnx import main_export

In [None]:
from typing import Any, Dict, List, OrderedDict

from optimum.exporters.onnx.model_configs import GPT2OnnxConfig

In [None]:



from transformers import PretrainedConfig


class CustomBioGPTConfig(GPT2OnnxConfig):
    """ONNX export configuration for BioGPT, derived from the GPT-2 one.

    Declares the dynamic axes of the exported graph: ``input_ids``,
    ``attention_mask`` and ``position_ids`` plus one key/value cache pair
    per layer as inputs, and ``logits`` plus the updated cache as outputs.
    """

    def __init__(self, config: PretrainedConfig,
                 task: str = "text-generation-with-past",
                 int_dtype: str = "int32",
                 float_dtype: str = "fp16",
                 use_past: bool = True,
                 use_past_in_inputs: bool = True,
                 preprocessors: List[Any] | None = None, legacy: bool = False):
        """Build the export config and alias BioGPT's size attributes.

        All arguments are forwarded positionally, unchanged, to the parent
        constructor.
        """
        super().__init__(config, task, int_dtype, float_dtype, use_past, use_past_in_inputs, preprocessors, legacy)
        # Mirror BioGPT's attribute names under n_layer / n_head
        # (presumably the names the GPT-2 ONNX config looks up -- confirm
        # against the installed optimum version).
        self._config.n_layer = config.num_hidden_layers
        self._config.n_head = config.num_attention_heads

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Return the dynamic-axes mapping of every graph input."""
        # NOTE(review): attention_mask uses the axis name "sequence_length"
        # while the other two inputs use "sequence"; kept as-is -- confirm
        # whether the distinct name is intentional.
        common_inputs = {"input_ids": {0: "batch_size", 1: "sequence"},
                        "attention_mask": {0: "batch_size", 1: "sequence_length"},
                        "position_ids": {0: "batch_size", 1: "sequence"}}

        self.add_past_key_values(common_inputs, direction="inputs")
        return common_inputs

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Return the dynamic-axes mapping of every graph output."""
        common_outputs = OrderedDict({"logits": {0: "batch_size", 1: "sequence"}})
        self.add_past_key_values(common_outputs, direction="outputs")

        return common_outputs

    def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str):
        """
        Fills `inputs_or_outputs` mapping with past_key_values dynamic axes considering the direction.

        Args:
            inputs_or_outputs (`Dict[str, Dict[int, str]]`):
                The mapping to fill (modified in place).
            direction (`str`):
                either "inputs" or "outputs", it specifies whether `inputs_or_outputs` is the input mapping or the
                output mapping, this is important for axes naming.

        Raises:
            ValueError: if `direction` is neither "inputs" nor "outputs".
        """
        if direction == "inputs":
            decoder_sequence_name = "past_seq_len"
            name = "past_key_values"
        elif direction == "outputs":
            decoder_sequence_name = "total_seq_len"
            name = "present"
        else:
            raise ValueError(
                f'direction must either be "inputs" or "outputs", but {direction} was given')

        for i in range(self._normalized_config.num_layers):
            # Cache tensors are laid out as (batch, heads, seq_len, head_dim),
            # so the axis that grows during generation is 2. Axis 3 is the
            # fixed per-head size and must not be declared dynamic.
            inputs_or_outputs[f"{name}.{i}.key"] = {
                0: "batch_size", 2: decoder_sequence_name}
            inputs_or_outputs[f"{name}.{i}.value"] = {
                0: "batch_size", 2: decoder_sequence_name}

In [None]:
from transformers import AutoConfig

In [None]:
config = AutoConfig.from_pretrained(model_path, local_files_only=True)

In [None]:
custom_config = CustomBioGPTConfig(config=config)

In [None]:
dummy_inputs = custom_config.generate_dummy_inputs()

The trick is to find a way to override the model and add the attention id as an input to it.

In [None]:
# Export the model for the "text-generation-with-past" task, using the
# custom ONNX config defined above for the "model" sub-graph.
main_export(
    model_name_or_path=model_path,
    output=onnx_path.joinpath('bio-gpt-model-with-past'),
    task="text-generation-with-past",
    custom_onnx_configs={"model": custom_config},
    model_kwargs={"output_attentions": True},
)

### Use with Optimum library

In [None]:
%%script false --no-raise-error
model.config.save_pretrained(onnx_path)

In [None]:
encoder_path  = onnx_path.parent.parent.joinpath('tokenizer_encoder', '1')
decoder_path  = onnx_path.parent.parent.joinpath('tokenizer_decoder', '1')

In [None]:
tokenizer.save_pretrained(encoder_path)
tokenizer.save_pretrained(decoder_path)

In [None]:
model.config.vocab_size

With our model converted to onnx, we will move to the next step which is to perform quantization on the model.

Next step will be exploring quantization approaches to reduce the size of the model and improve the latency for inference.

Resources:

- https://www.philschmid.de/static-quantization-optimum.
- https://lilianweng.github.io/posts/2023-01-10-inference-optimization/
- https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb

#### Quantization

Quantization is a technique to reduce the size of neural networks by using lower-precision data types to represent the weights and activations of the network. In general, weights and activations are represented as 32-bit floating-point numbers, but with quantization we can represent them as 16-bit floating-point numbers, or sometimes as int16 or int8.

Quantization has been shown to reduce the size of language models, and hence the inference latency, by half while keeping a large percentage of the model accuracy for some downstream tasks. [Source](https://www.philschmid.de/static-quantization-optimum).

The image below illustrates the effect of quantization on the size and inference time of a BERT model.


We can see that the model size and the inference time are reduced by a third using 8-bit quantization, while the performance of the model remains the same.

Quantization does not always keep the same accuracy of the model, so before choosing it we need to make sure we evaluate the performance of the model on the whole dataset.

![image](./images/quantization.webp)



For our model we will convert 32-bit floating-point values to 16-bit, using the onnx library.

In [None]:
from onnxruntime.transformers import optimizer

In [None]:
model.config.num_attention_heads

In [None]:
model_path.__str__()

In [None]:
# Run the onnxruntime transformer optimizer on the exported graph, using
# the head count and hidden size taken from the loaded model's config.
optimized_model = optimizer.optimize_model(
    str(onnx_path.joinpath('bio-gpt-model.onnx')),
    model_type='gpt2',
    num_heads=model.config.num_attention_heads,
    hidden_size=model.config.hidden_size,
)

In [None]:
optimized_model.convert_float_to_float16()

In [None]:
quantized_model_path = model_path.parent.joinpath(
    'decoder_model_quantized.onnx')

In [None]:
optimized_model.save_model_to_file(quantized_model_path)

In [None]:
# Print the on-disk size of every ONNX file next to the model directory.
# The loop variable is deliberately NOT called `model`: that name holds the
# loaded transformers model, which later cells still call.
for onnx_file in model_path.parent.glob("*.onnx"):
    print(f"the size of {onnx_file.stem} the model in MB is: {onnx_file.stat().st_size / (1024 * 1024)}")

We can clearly see that the size of our model has been reduced by 50% by converting float32 to float16.

We see that with this approach we applied dynamic quantization to the model and it reduced the size of the model! However, we could also apply static quantization to the model, but I haven't yet learned about it. In [this blog](https://www.philschmid.de/static-quantization-optimum) it has been shown that static quantization improves the inference of the model.

### Using the Quantized model

In [None]:
from optimum.onnxruntime import ORTModelForCausalLM

In [None]:
from pathlib import Path

In [None]:
model_path = Path.cwd().joinpath('models', 'onnx', 'decoder_model_quantized.onnx')

In [None]:
model_path.exists()

In [None]:
quantized_model = ORTModelForCausalLM.from_pretrained(model_path.parent,
                                                      decoder_file_name=model_path,
                                                      use_cache=False,
                                                      use_io_binding=False)

In [None]:
input = "question: Is cytokeratin immunoreactivity useful in the diagnosis of short-segment Barrett's oesophagus in Korea? context: Cytokeratin 7/20 staining has been reported to be helpful in diagnosing Barrett's oesophagus and gastric intestinal metaplasia. However, this is still a matter of some controversy. To determine the diagnostic usefulness of cytokeratin 7/20 immunostaining for short-segment Barrett's oesophagus in Korea. In patients with Barrett's oesophagus, diagnosed endoscopically, at least two biopsy specimens were taken from just below the squamocolumnar junction. If goblet cells were found histologically with alcian blue staining, cytokeratin 7/20 immunohistochemical stains were performed. Intestinal metaplasia at the cardia was diagnosed whenever biopsy specimens taken from within 2 cm below the oesophagogastric junction revealed intestinal metaplasia. Barrett's cytokeratin 7/20 pattern was defined as cytokeratin 20 positivity in only the superficial gland, combined with cytokeratin 7 positivity in both the superficial and deep glands. Barrett's cytokeratin 7/20 pattern was observed in 28 out of 36 cases (77.8%) with short-segment Barrett's oesophagus, 11 out of 28 cases (39.3%) with intestinal metaplasia at the cardia, and nine out of 61 cases (14.8%) with gastric intestinal metaplasia. The sensitivity and specificity of Barrett's cytokeratin 7/20 pattern were 77.8 and 77.5%, respectively. answer: Barrett's cytokeratin 7/20 pattern can be a useful marker for the diagnosis of short-segment Barrett's oesophagus, although the false positive or false negative rate is approximately 25%."
# Reuse the shared helper so the tokenization settings live in one place.
encoded_input = encode_input(input)

In [None]:
# Display the tensors produced by the previous cell (not the helper function).
encoded_input


In [None]:
%%script false --no-raise-error
with torch.no_grad():
    generated_text = model.generate(**encoded_input,
                                min_length=50,
                                max_length=1024,
                                num_beams=5,
                                early_stopping=True)

In [None]:
tokenizer.decode(generated_text[0], skip_special_tokens=True,)

###  Converting GPT2 to ONNX with Beam Search

I have found a way to convert the gpt model to Onnx with the support of beam search.

I will be using it tomorrow.

https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_generation.py#L81

In [None]:
from onnxruntime.transformers.convert_generation import (
    GenerationType,
    convert_generation_model,
    parse_arguments,
)

In [None]:
from pathlib import Path

model_path = Path.cwd().joinpath('models')
model_id = 'bio-gpt-qa'
model_path = model_path.joinpath(model_id)

In [None]:
onnx_path = Path.cwd().joinpath('models_repository',
                                "generator", "generator_model", "1", )

In [None]:
onnx_model_with_past = onnx_path.joinpath('bio-gpt-model-with-past')

In [None]:
from argparse import Namespace

In [None]:
onnx_path

In [None]:
model_path

In [None]:
onnx_model_with_past

In [None]:
# Options for the beam-search conversion. Every value is a string because
# the namespace is flattened into a CLI-style argument list further down.
arguments = Namespace(
    model_name_or_path=str(model_path),
    output=str(onnx_model_with_past.parent.joinpath("biogpt-model-with-past-and-beam")),
    model_type="gpt2",
    num_beams="5",
    temperature="0.25",
    model_class="BioGptModel",
)

In [None]:
onnx_model_with_past.joinpath("model.onnx").exists()

In [None]:
# Flatten the namespace into ["--name", value, "--name", value, ...].
arguments_list = []

for option, option_value in vars(arguments).items():
    arguments_list.extend((f"--{option}", option_value))

In [None]:
arguments_list

In [None]:
args = parse_arguments(arguments_list)

In [None]:
args.model_name_or_path

In [None]:
args

Need to come back to understand the input generation.

PS: the issue seems to be the positional embedding; that is what we need to fix.

In [None]:
convert_generation_model(args=args, generation_type=GenerationType.BEAMSEARCH)

Need to come back and debug this hidden state issue.

Now the model is working with beam search on the onnx runtime, we need to set it up and use it with the triton inference server.

###  Testing Inference with the Model

Need to come back to the error.

In [None]:
model_path = onnx_model_with_past.parent.joinpath(
    "bio-gpt-model-with-past").joinpath("model.onnx")

In [None]:
model_with_beam = onnx_model_with_past.parent.joinpath(
    "biogpt-model-with-past-and-beam.onnx")

In [None]:
from onnxruntime import InferenceSession

In [None]:
model_path

In [None]:
# A bare path is not valid Python; kept as a comment for reference:
# /Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/generator_model/1/bio-gpt-model-with-past/model.onnx

In [None]:
model_path.exists()

In [None]:
# Pass the path as a plain string so this also works on onnxruntime
# versions that do not accept pathlib.Path objects.
inference_session = InferenceSession(str(model_with_beam))

The model with past works, but not the model with beam search.

In [None]:
test_input = {'input_ids': torch.tensor([[2,  4617,  2969,    20,  1994,    21,     6,   533,     5,  1181,
                                          17270,   927,  1544,    20,     6,   533,     5,  1181, 17270,    21,
                                          14,  8493,  2402,   104]]),
            'attention_mask': torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
test_input.get('input_ids').shape

In [None]:
outputs = model(**test_input)

In [None]:
outputs.past_key_values[0][0].shape

In [None]:
outputs.past_key_values[0][0].shape

In [None]:
attention_mask = torch.ones(
    [1, 2], dtype=torch.float16)

In [None]:
positions = (torch.cumsum(attention_mask, dim=1).type_as(
    attention_mask) * attention_mask).long() - 1

In [None]:
# Mask the running count first, then cast the whole product to integers,
# so the resulting position ids are an integer tensor like `positions`.
position = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

In [None]:
positions[:, 2:]

In [None]:
past_shape = [1, 16, 1, 64]

In [None]:
(torch.rand(past_shape, dtype=torch.float16, ) * 2.0 - 1.0).shape

So after hacky ways to make the beam search work, we need to make sure the validation of the model passes.

We will save the attention mask and positions_ids.