In [1]:
# import ipywidgets as widgets
from pathlib import Path
from typing import Tuple

import openvino as ov
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
from optimum.modeling_base import OptimizedModel
from transformers import AutoProcessor, pipeline

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


In [2]:
core = ov.Core()
device = "NPU" if "NPU" in core.available_devices else "CPU"

# テストで使うモデル
model_id = "openai/whisper-large-v3"

model_id

'openai/whisper-large-v3'

In [3]:
model_name = model_id.split("/")[1]

In [4]:
# CLIではなく、クラスからモデルを読み込む場合
from datetime import datetime

# モデルを保存するディレクトリ
model_dir = Path(f"../../model/{model_name}")
fp16_model_dir = model_dir / "FP16"  # float 16bitモデルの保存先
int8_model_dir = model_dir / "INT8"  # 量子化モデルの保存先(8bit)

ov_config = {"CACHE_DIR": ""}


def download_and_convert_to_fp16() -> OptimizedModel:
    # ダウンロード開始
    start_model_download = datetime.now()
    if not (fp16_model_dir / "openvino_encoder_model.xml").exists():
        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            ov_config=ov_config,
            export=True,
            compile=False,
            load_in_8bit=False,
        )
        ov_model.half()
        ov_model.save_pretrained(fp16_model_dir)
    else:
        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
            fp16_model_dir, ov_config=ov_config, compile=False
        )
    # ダウンロード完了
    end_model_download = datetime.now() - start_model_download
    print("export done", end_model_download.total_seconds())
    return ov_model


def download_and_convert_to_int8() -> OptimizedModel:
    # ダウンロード開始
    start_model_download = datetime.now()
    if not (int8_model_dir / "openvino_encoder_model.xml").exists():
        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            ov_config=ov_config,
            export=True,
            compile=False,
            load_in_8bit=True,
        )
        ov_model.save_pretrained(int8_model_dir)
    else:
        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
            int8_model_dir, ov_config=ov_config, compile=False
        )
    # ダウンロード完了
    end_model_download = datetime.now() - start_model_download
    print("export done", end_model_download.total_seconds())
    return ov_model

In [5]:
# deviceがNPUならFP16, それ以外はINT8
ov_model = (
    download_and_convert_to_fp16()
    if device == "NPU"
    else download_and_convert_to_int8()
)
processor = AutoProcessor.from_pretrained(model_id)

export done 4.823963


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
ov_model.to(device)
ov_model.compile()

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


In [7]:
# 音声認識のパイプラインの設定
pipe = pipeline(
    "automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=256,
    chunk_length_s=25,
)

pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language="Japanese", task="transcribe"
)

In [8]:
from decode_base64_audio import decode_and_resample_audio

In [None]:
base64_audio = ""

In [10]:
sample, rate = decode_and_resample_audio(base64_audio)

In [11]:
audio_path = "audio/sps-smp.mp3"
# result = pipe(audio_path)
result = pipe(sample)



In [12]:
import numpy as np
from pydub import AudioSegment


def load_audio_file(file_path="") -> Tuple[np.ndarray, int]:
    audio_segment = AudioSegment.from_file(file_path, format="mp3")
    samples = np.array(audio_segment.get_array_of_samples())
    return samples, audio_segment.frame_rate


load_audio_file(audio_path)

(array([0, 0, 0, ..., 0, 0, 0], dtype=int16), 16000)

In [13]:
import IPython.display as ipd

display(ipd.Audio(audio_path, autoplay=True))

In [14]:
import IPython.display as ipd

display(ipd.Audio(sample, rate=rate))

In [15]:
sample, rate

(array([0, 0, 0, ..., 0, 0, 0], dtype=int16), 16000)

In [16]:
result

{'text': 'あ、あ、はじめまして、あ、はじめましていやー、僕、ちょっと、新しくコンテストに入らせていただく伊藤ですお願いします、あ、お願いしますえ、なんか今回、やりたいとこやりたいとこって決まってるんですか?特に話題とか決まってなくてあー、自分もちょっとあんまり決まってないんだよね?う~ん'}

In [17]:
result["text"]

'あ、あ、はじめまして、あ、はじめましていやー、僕、ちょっと、新しくコンテストに入らせていただく伊藤ですお願いします、あ、お願いしますえ、なんか今回、やりたいとこやりたいとこって決まってるんですか?特に話題とか決まってなくてあー、自分もちょっとあんまり決まってないんだよね?う~ん'