In [11]:
# import ipywidgets as widgets
from pathlib import Path

import openvino as ov
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
from transformers import AutoProcessor, pipeline

In [12]:
core = ov.Core()
device = "NPU" if "NPU" in core.available_devices else "CPU"

model_id = (
    "distil-whisper/distil-large-v3"
    if device == "NPU"
    else "distil-whisper/distil-large-v2"
)

model_id

'distil-whisper/distil-large-v2'

In [13]:
model_name = model_id.split("/")[1]

In [14]:
from datetime import datetime
import os

ov_config = {"CACHE_DIR": ""}

# モデルを保存するディレクトリ
model_dir = Path(f"../../model/{model_name}")
fp16_model_dir = model_dir / "FP16"  # float 16bitモデルの保存先
int8_model_dir = model_dir / "INT8"  # 量子化モデルの保存先(8bit)

export_command_base = (
    "optimum-cli export openvino --model {} --task automatic-speech-recognition".format(
        model_id
    )
)


def convert_to_fp16() -> None:
    global export_command_base
    export_command = ""
    # すでに存在する場合はスキップ
    if (fp16_model_dir / "openvino_encoder_model.xml").exists():
        return
    export_command = export_command_base + " --weight-format fp16"
    export_command += " " + str(fp16_model_dir)
    # ダウンロード開始
    start_model_download = datetime.now()
    print("export_command:", export_command)
    os.system(export_command)
    # ダウンロード完了
    end_model_download = datetime.now() - start_model_download
    print("export done", end_model_download.total_seconds())


def convert_to_int8() -> None:
    global export_command_base
    export_command = ""
    # すでに存在する場合はスキップ
    if (int8_model_dir / "openvino_encoder_model.xml").exists():
        return
    export_command = export_command_base + " --weight-format int8"
    export_command += " " + str(int8_model_dir)
    # ダウンロード開始
    start_model_download = datetime.now()
    print("export_command:", export_command)
    os.system(export_command)
    # ダウンロード完了
    end_model_download = datetime.now() - start_model_download
    print("export done", end_model_download.total_seconds())

In [15]:
# deviceがNPUならFP16, それ以外はINT8
convert_to_fp16() if device == "NPU" else convert_to_int8()

export_command: optimum-cli export openvino --model distil-whisper/distil-large-v2 --task automatic-speech-recognition --weight-format int8 ../../model/distil-large-v2/INT8
INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


Framework not specified. Using pt to export the model.


In [None]:
# CLIではなく、クラスからモデルを読み込む場合

# from pathlib import Path
# from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# # モデルを保存するディレクトリ
# model_dir = Path(f'../../model/{model_name}')
# fp16_model_dir = model_dir / "FP16"  # float 16bitモデルの保存先
# int8_model_dir = model_dir / "INT8"  # 量子化モデルの保存先(8bit)

# ov_config = {"CACHE_DIR": ""}

# # model_path = model_path / 'INT8'

# if not (int8_model_dir / 'openvino_encoder_model.xml').exists():
#     ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
#         model_id,
#         ov_config=ov_config,
#         export=True,
#         compile=False,
#         # load_in_8bit=False,
#         load_in_8bit=True,
#     )
#     # ov_model.half()
#     ov_model.save_pretrained(int8_model_dir)
# else:
#     ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
#        int8_model_dir, ov_config=ov_config, compile=False
#    )

In [None]:
from ipywidgets import widgets

available_models = []
if int8_model_dir.exists():
    available_models.append("INT8")
if fp16_model_dir.exists():
    available_models.append("FP16")

model_to_run = widgets.Dropdown(
    options=available_models,
    value=available_models[0],
    description="Model to run:",
    disabled=False,
)

model_to_run

Dropdown(description='Model to run:', options=('INT8',), value='INT8')

In [None]:
model_to_run.value

'INT8'

In [None]:
if model_to_run.value == "INT8":  # 8bitモデルを使う場合
    model_dir = int8_model_dir
elif model_to_run.value == "FP16":
    model_dir = fp16_model_dir  # 16bitモデルを使う場合
else:
    raise ValueError("Model not found")
print(f"Loading model from {model_dir}")

Loading model from ../../model/distil-small.en/INT8


In [None]:
processor = AutoProcessor.from_pretrained(model_id)
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
    model_dir, ov_config=ov_config, compile=False
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


RuntimeError: Exception from src/inference/src/cpp/core.cpp:92:
Exception from src/inference/src/model_reader.cpp:154:
Unable to read the model: ../../model/distil-small.en/INT8/openvino_decoder_with_past_model.xml Please check that model format: xml is supported and the model is correct. Available frontends: tf ir onnx paddle pytorch tflite 



In [None]:
ov_model.to(device)
ov_model.compile()

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


In [None]:
# 音声認識のパイプラインの設定
pipe = pipeline(
    "automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=25,
)

In [None]:
audio_path = "test.wav"
result = pipe(audio_path)

mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function not implemented
mbind failed: Function no

In [None]:
import IPython.display as ipd

display(ipd.Audio(audio_path, autoplay=True))

In [None]:
result["text"]

" Yeah, but the other thing I thought about money on the other side. I think that's good at that. Yeah, I think that the same play card, I thought it was a good thing. I don't know, I did, I thought, yeah, I thought, yeah, I was, I don't know, I was, you know, I was, you know, I was, you know, I was, I mean, I was a bad, I was a bad, I was a bad, I said, I, I was a bad, I was, I, I was a bad, I"