#### Onnx Model conversion and Quantization

###### Note load the stuff about ONNX runtime from the machine translation tutorial.

In [1]:
from pathlib import Path
from transformers.convert_graph_to_onnx import convert

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = Path.cwd().joinpath('models')
model_id = 'bio-gpt-qa'
model_path = model_path.joinpath(model_id)

In [3]:
assert model_path.parent.parent.exists(
), f"Model not found at {model_path.parent.parent}"

In [4]:
from torch.onnx import export

In [5]:
from transformers import BioGptTokenizer, BioGptForCausalLM, set_seed


tokenizer = BioGptTokenizer.from_pretrained(model_path,  local_files_only=True)
model = BioGptForCausalLM.from_pretrained(model_path, local_files_only=True)

In [6]:
def encode_input(input):
    return tokenizer([input],
                     return_tensors='pt',
                     max_length=1024,
                     truncation=True)

In [7]:
from transformers.onnx import FeaturesManager

In [8]:
feature = "seq2seq"

In [9]:
input = f"'question:what is the cause of covid ? context: the cause of covid is a virus'"
encoded_input = tokenizer([input],
                          return_tensors='pt',
                          max_length=1024,
                          truncation=True)

In [10]:
import torch

In [12]:
encoded_input.keys()

dict_keys(['input_ids', 'attention_mask'])

In [13]:
import torch

In [14]:
set_seed(42)

In [16]:
generate_tokens = model.generate(**encoded_input,
                                 num_beams=5,
                                 do_sample=True,
                                 top_k=50,
                                 top_p=0.95,
                                 max_length=512)

KeyboardInterrupt: 

In [15]:
onnx_path = Path.cwd().joinpath('models_repository', "generator", "generator_model", "1", )

In [16]:
onnx_path.mkdir(parents=True, exist_ok=True)

In [17]:
generated_text = tokenizer.decode(generate_tokens[0], skip_special_tokens=True)
print(generated_text)

NameError: name 'generate_tokens' is not defined

In [18]:
onnx_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/generator_model/1')

In [29]:
past_shape = [
            2, # first is for the key , second is for the value
            1, # batch size
            model.config.num_attention_heads,
            1, # past sequence length
            int(model.config.hidden_size / model.config.num_attention_heads)]
past = [((torch.rand(past_shape, dtype=torch.float32) * 2.0 )- 1.0 ) for _ in range(model.config.num_hidden_layers)]

In [30]:
past[0][1].shape

torch.Size([1, 16, 1, 64])

In [31]:
%%script false --no-raise-error
print()

# Friday: I have fixed the issue with past key values, need to comeback and export the model with it.

In [32]:

export(
    model,
    tuple([encoded_input.get('input_ids'), None, past]),
    f=onnx_path.joinpath('bio-gpt-model.onnx'),
    input_names=['input_ids', 'past_key_values'],
    output_names=['logits', "past_key_values"],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                  'past_key_values': {0: 'batch_size', 2: 'sequence'},
                  'logits': {0: 'batch_size', 1: 'sequence'}},
    do_constant_folding=True,
    opset_version=13,
)

the attention mask should be none None


  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if layer_head_mask.size() != (self.num_heads,):


ValueError: Head mask for a single layer should be of size (16,), but is torch.Size([2, 1, 16, 1, 64])

In [20]:
%%script false --no-raise-error
model.config.save_pretrained(onnx_path)

In [23]:
encoder_path  = onnx_path.parent.parent.joinpath('tokenizer_encoder', '1')
decoder_path  = onnx_path.parent.parent.joinpath('tokenizer_decoder', '1')

In [24]:
tokenizer.save_pretrained(encoder_path)
tokenizer.save_pretrained(decoder_path)

('/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/tokenizer_decoder/1/tokenizer_config.json',
 '/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/tokenizer_decoder/1/special_tokens_map.json',
 '/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/tokenizer_decoder/1/vocab.json',
 '/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/tokenizer_decoder/1/merges.txt',
 '/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/tokenizer_decoder/1/added_tokens.json')

In [25]:
model.config.vocab_size

42393

With our model converted to onnx, we will move to the next step which is to perform quantization on the model.

Next step will be exploring quantization approaches to reduce the size of the model and improve the latency for inference.

Ressources: 

- https://www.philschmid.de/static-quantization-optimum.
- https://lilianweng.github.io/posts/2023-01-10-inference-optimization/
- https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb

#### Quantization

Quantization is a technique to reduce the the size of neural networks by using lower precision datatype to represent the weight and activation function in the neural network. In general weights and activation are represented as 32-bit floating points, but with quantization we can represent those floating points as 16-bit floating point or sometime using int16 or int8.

Quantization have proven to reduce the size of language model hence the inference latency by half while keeping a huge percentage of model accuracy for some downstream tasks. [Source](https://www.philschmid.de/static-quantization-optimum).

The bellow image illustrates the effect of the size and inference of quantization on a BERT model.


We can see that the model size and the inference time is reduce by third size using 8 bit quantization while the performance of the model remain the same.

Quantization does not always keep the same accuracy of the model, so before choosing it we need to make sure we evaluate the performance of the model on the whole dataset.

![image](./images/quantization.webp)



For our model we will convert 32 bits floating points to 16 bits, using the onnx library. 

In [32]:
from onnxruntime.transformers import optimizer

In [33]:
getattr(model.config, "num_attention_heads")

16

In [34]:
model_path.__str__()

'/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa'

In [36]:
optimized_model =  optimizer.optimize_model(onnx_path.joinpath('bio-gpt-model.onnx').__str__(), 
                                            model_type='gpt2', 
                                            num_heads=model.config.num_attention_heads,
                                            hidden_size=model.config.hidden_size)

In [37]:
optimized_model.convert_float_to_float16()

In [18]:
quantized_model_path = model_path.parent.joinpath(
    'decoder_model_quantized.onnx')

In [19]:
optimized_model.save_model_to_file(quantized_model_path)

In [20]:
for model in model_path.parent.glob("*.onnx"):
    print(f"the size of {model.stem} the model in MB is: {model.stat().st_size / (1024 * 1024)}")

the size of decoder_model_quantized the model in MB is: 744.6517105102539
the size of bio-gpt the model in MB is: 1488.90811252594


We can clearly see that the size of our model have been reduced by 50% using the conversion of floats32 to float 16.

We see with this approach that we applied dynamic quantization of the model and it reduce the size of the model! However we could also aplly dynamic quantization to the model but I haven't yet learned about it.  But in [this blog](https://www.philschmid.de/static-quantization-optimum) it have been shown that static quantization improve the inference of the model.

### Using the Quantized model

In [21]:
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

In [22]:
from pathlib import Path

In [23]:
model_path = Path.cwd().joinpath('models', 'onnx', 'decoder_model_quantized.onnx')

In [25]:
model_path.exists()

True

In [92]:
quantized_model = ORTModelForCausalLM.from_pretrained(model_path.parent,
                                                      decoder_file_name=model_path,
                                                      use_cache=False,
                                                      use_io_binding=False)

/Users/esp.py/Projects/Personal/end-to-end-rag/models/onnx/decoder_model_quantized.onnx ******* the path form dir ***** /Users/esp.py/Projects/Personal/end-to-end-rag/models/onnx/decoder_model_quantized.onnx True


Generation config file not found, using a generation config created from the model config.


In [45]:
input = f"question: Is cytokeratin immunoreactivity useful in the diagnosis of short-segment Barrett's oesophagus in Korea? context: Cytokeratin 7/20 staining has been reported to be helpful in diagnosing Barrett's oesophagus and gastric intestinal metaplasia. However, this is still a matter of some controversy. To determine the diagnostic usefulness of cytokeratin 7/20 immunostaining for short-segment Barrett's oesophagus in Korea. In patients with Barrett's oesophagus, diagnosed endoscopically, at least two biopsy specimens were taken from just below the squamocolumnar junction. If goblet cells were found histologically with alcian blue staining, cytokeratin 7/20 immunohistochemical stains were performed. Intestinal metaplasia at the cardia was diagnosed whenever biopsy specimens taken from within 2 cm below the oesophagogastric junction revealed intestinal metaplasia. Barrett's cytokeratin 7/20 pattern was defined as cytokeratin 20 positivity in only the superficial gland, combined with cytokeratin 7 positivity in both the superficial and deep glands. Barrett's cytokeratin 7/20 pattern was observed in 28 out of 36 cases (77.8%) with short-segment Barrett's oesophagus, 11 out of 28 cases (39.3%) with intestinal metaplasia at the cardia, and nine out of 61 cases (14.8%) with gastric intestinal metaplasia. The sensitivity and specificity of Barrett's cytokeratin 7/20 pattern were 77.8 and 77.5%, respectively. answer: Barrett's cytokeratin 7/20 pattern can be a useful marker for the diagnosis of short-segment Barrett's oesophagus, although the false positive or false negative rate is approximately 25%."
encoded_input = tokenizer([input],
                          return_tensors='pt',
                          max_length=1024,
                          truncation=True)

NameError: name 'tokenizer' is not defined

In [44]:
encode_input


NameError: name 'encode_input' is not defined

In [1]:
with torch.no_grad():
    generated_text = model.generate(**encoded_input,
                                min_length=50,
                                max_length=1024,
                                num_beams=5,
                                early_stopping=True)

NameError: name 'torch' is not defined

In [117]:
tokenizer.decode(generated_text[0], skip_special_tokens=True,)

'what is the cause of Covid-19? A case report and review of the literature on Covid-19 in patients with chronic kidney disease (CKD) and end-stage renal disease (ESRD) on hemodialysis (HD) and peritoneal dialysis (PD).'

###  Converting GPT2 to ONNX with Beam Search

I have found a way to convert the gpt model to Onnx with the support of beam search.

I will be using it tomorrow.

https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/convert_generation.py#L81

In [1]:
from onnxruntime.transformers.convert_generation  import convert_generation_model, parse_arguments, GenerationType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path

model_path = Path.cwd().joinpath('models')
model_id = 'bio-gpt-qa'
model_path = model_path.joinpath(model_id)

In [3]:
onnx_path = Path.cwd().joinpath('models_repository',
                                "generator", "generator_model", "1", )

In [4]:
from  argparse import Namespace

In [5]:
onnx_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/generator_model/1')

In [6]:
model_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa')

In [7]:
arguments = Namespace(
    model_name_or_path=model_path.__str__(),
    output=onnx_path.joinpath("bio-gpt-model-with-beam.onnx").__str__(),
    model_type="gpt2",
    num_beams="5",
    min_len="100",
    max_len="512",
    temperature="0.25",
    model_class="BioGPT"
)

In [8]:
arguments_list = []

for key, value in vars(arguments).items():
    arguments_list.append(f"--{key}")
    arguments_list.append(value)

In [9]:
arguments_list

['--model_name_or_path',
 '/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa',
 '--output',
 '/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/generator_model/1/bio-gpt-model-with-beam.onnx',
 '--model_type',
 'gpt2',
 '--num_beams',
 '5',
 '--min_len',
 '100',
 '--max_len',
 '512',
 '--temperature',
 '0.25',
 '--model_class',
 'BioGPT']

In [10]:
args = parse_arguments(arguments_list)

In [11]:
args.model_name_or_path

'/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa'

In [12]:
args

Namespace(model_name_or_path='/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa', model_type='gpt2', cache_dir='./cache_models', decoder_onnx='', encoder_decoder_init_onnx='', verbose=False, output='/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/generator_model/1/bio-gpt-model-with-beam.onnx', model_class='BioGPT', precision=<Precision.FLOAT32: 'fp32'>, op_block_list=['auto'], use_external_data_format=False, run_shape_inference=False, disable_pad_vocab_size=False, disable_separate_gpt2_decoder_for_init_run=False, disable_shared_initializers=False, output_sequences_scores=False, output_token_scores=False, early_stopping=False, no_repeat_ngram_size=0, vocab_mask=False, past_present_share_buffer=False, use_decoder_masked_attention=False, prefix_vocab_mask=False, custom_attention_mask=False, presence_mask=False, seed=False, min_length=100, max_length=512, num_beams=5, num_return_sequences=1, length_penalty=1, repetition_penalty=1, temperature=0.25

Need to come back to understand the input generation.

PS: the issues seems to be the postional embedding that waht we need to fix

In [13]:
convert_generation_model(args=args, generation_type=GenerationType.BEAMSEARCH)

Arguments:Namespace(model_name_or_path='/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa', model_class='BioGPT', cache_dir='./cache_models', output='/Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa_past_fp32.onnx', optimize_onnx=True, use_gpu=False, provider=None, tolerance=0.0005, input_test_file='', precision=<Precision.FLOAT32: 'fp32'>, test_cases=10, test_runs=1, verbose=False, use_external_data_format=False, overwrite=True, use_int64_inputs=False, stage=0, auto_mixed_precision=False, keep_io_types=False, io_block_list=[], op_block_list=[], node_block_list=[], force_fp16_initializers=False)
PyTorch Version:2.2.0
Transformers Version:4.38.2
Onnxruntime Version:1.15.1


Exporting ONNX model to /Users/esp.py/Projects/Personal/end-to-end-rag/models/bio-gpt-qa_BioGPT_past.onnx


in the dummy_input the attention mask shape is torch.Size([1, 16, 1, 64])


Shapes: input_ids=torch.Size([1, 1]) past=torch.Size([16, 1, 64]) output=torch.Size([1, 1, 42393]) present=torch.Size([1, 16, 2, 64])
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):


AttributeError: 'tuple' object has no attribute 'size'

NEed to come back and debug this hidden state issue

Now the model is working with beam search on the onnx runtime, we need to set it up and use it with the triton inference server.

###  Testing Inference with the Model

In [15]:
model_path = onnx_path.joinpath("bio-gpt-model-with-beam.onnx")

In [16]:
from onnxruntime import InferenceSession

In [17]:
model_path

PosixPath('/Users/esp.py/Projects/Personal/end-to-end-rag/models_repository/generator/generator_model/1/bio-gpt-model-with-beam.onnx')

In [18]:
inference_session = InferenceSession(model_path)

the provider options are [] set()


2024-03-01 20:32:44.552453 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/biogpt/Shape_7_output_0'. It is not used by any node and should be removed from the model.
2024-03-01 20:32:44.552496 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/biogpt/Shape_4_output_0'. It is not used by any node and should be removed from the model.
2024-03-01 20:32:44.552504 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/biogpt/Constant_21_output_0'. It is not used by any node and should be removed from the model.
2024-03-01 20:32:44.552511 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/biogpt/Constant_16_output_0'. It is not used by any node and should be removed from the model.
2024-03-01 20:32:44.562975 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '/biogpt/Constant_10_output_0'. It is not use

Fail: [ONNXRuntimeError] : 1 : FAIL : subgraph_gpt.cc:131 Validate Invalid GPT-2 subgraph: number of inputs shall be number of outputs plus 2 or 3 (if past_present_share_buffer) or 5 (if past_present_share_buffer and use_decoder_masked_self_attention for BeamSearch)

Need to come back and sort the input list stuff.

In [14]:
import torch

In [29]:
test_input = {'input_ids': torch.tensor([[2,  4617,  2969,    20,  1994,    21,     6,   533,     5,  1181,
                                          17270,   927,  1544,    20,     6,   533,     5,  1181, 17270,    21,
                                          14,  8493,  2402,   104]]), 
            'attention_mask': torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
test_input.get('input_ids').shape

torch.Size([1, 24])

In [35]:
outputs = model(**test_input)

I am here with the input shape torch.Size([1, 24]) the last element is 24


In [36]:
outputs.past_key_values[0][0].shape

torch.Size([1, 16, 24, 64])

In [46]:
outputs.past_key_values[0][0].shape

torch.Size([1, 16, 24, 64])

In [17]:
attention_mask = torch.ones(
    [1, 2], dtype=torch.float16)

In [20]:
positions = (torch.cumsum(attention_mask, dim=1).type_as(
    attention_mask) * attention_mask).long() - 1

In [28]:
position = torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask.long() - 1

In [36]:
positions[:, 2:]

tensor([], size=(1, 0), dtype=torch.int64)

In [15]:
past_shape = [1, 16, 1, 64]

In [18]:
(torch.rand(past_shape, dtype=torch.float16, ) * 2.0 - 1.0).shape

torch.Size([1, 16, 1, 64])