In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

In [5]:
# Model identifier: the repository name under the `sentence-transformers/`
# Hub namespace, also reused as the local directory the tokenizer is saved to.
name = 'paraphrase-multilingual-mpnet-base-v2'

In [6]:
# Build the Hub repository id once, then load the tokenizer and the encoder from it.
repo_id = f'sentence-transformers/{name}'
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id)

In [7]:
# Write the tokenizer files into a local directory named after the model;
# the paths of the written files are echoed as this cell's output.
tokenizer.save_pretrained(name)

('paraphrase-multilingual-mpnet-base-v2/tokenizer_config.json',
 'paraphrase-multilingual-mpnet-base-v2/special_tokens_map.json',
 'paraphrase-multilingual-mpnet-base-v2/sentencepiece.bpe.model',
 'paraphrase-multilingual-mpnet-base-v2/added_tokens.json',
 'paraphrase-multilingual-mpnet-base-v2/tokenizer.json')

In [8]:
# Tokenize a one-sentence batch into PyTorch tensors; the resulting
# `input_ids` and `attention_mask` are the example inputs for the ONNX export.
encoded_input = tokenizer(
    ['中文输入'],
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [9]:
# Export the encoder to ONNX, using the example tokenized batch as model input.
torch.onnx.export(
    model,
    # Positional example inputs, listed in the same order as `input_names`.
    tuple(encoded_input[key] for key in ('input_ids', 'attention_mask')),
    "sts.onnx",                # destination file for the exported graph
    export_params=True,        # embed the trained weights in the file
    opset_version=11,          # ONNX operator-set version to target
    do_constant_folding=True,  # apply constant folding during export
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    # Axes 0 and 1 of every named tensor are declared variable-length.
    dynamic_axes={
        tensor_name: {0: 'batch_size', 1: 'sequence_length'}
        for tensor_name in ('input_ids', 'attention_mask', 'output')
    },
)

In [1]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamically quantize the exported model's weights to unsigned 8-bit integers.
# quantize_dynamic writes its result to the output path and returns nothing,
# so it is called for its side effect only instead of being bound to a name.
quantize_dynamic(
    './sts.onnx',   # input: the float model written by the ONNX export step
    './stsq.onnx',  # output: the quantized model
    weight_type=QuantType.QUInt8,
)