In [None]:
'''
optimum = {extras = ["openvino"], version = "^1.19.2"}
nncf = "^2.10.0"
torch = "^2.3.0"
datasets = "^2.19.1"
accelerate = "^0.30.1"
openvino-nightly = "^2024.2.0.dev20240515"
gradio = "^4.31.3"
onnx = "^1.16.0"
transformers = "^4.40.2"
einops = "^0.8.0"
transformers-stream-generator = "^0.0.5"
tiktoken = "^0.7.0"
bitsandbytes = "^0.43.1"
'''

In [1]:
from pathlib import Path
import logging
import os

import openvino as ov
import nncf


INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


In [2]:
# ## login to huggingfacehub to get access to pretrained model 

# from huggingface_hub import notebook_login, whoami

# try:
#     whoami()
#     print('Authorization token already provided')
# except OSError:
#     notebook_login()

Authorization token already provided


In [2]:
nncf.set_log_level(logging.ERROR)

# ダウンロードするモデル
company = "microsoft"
# company = "google"
# company = "rinna"

model_name = "Phi-3-mini-4k-instruct"
# model_name = "gemma-7b-it"
# model_name = "youri-7b-chat"

model_id = f'{company}/{model_name}'

remote_code = True  # Phi-3はコメントアウト
# remote_code = False

# モデルを保存するディレクトリ
model_dir = Path(f'../../model/{model_name}')
fp16_model_dir = model_dir / "FP16"  # float 16bitモデルの保存先
int8_model_dir = model_dir / "INT8"  # 量子化モデルの保存先(8bit)
int4_model_dir = model_dir / "INT4"  # 量子化モデルの保存先(4bit)

In [3]:
compression_configs = {
    # ここのパラメータは要調整
    # "sym":          対称量子化の利用
    # 'group_size':  グループサイズ  (64, 128が無難？)
    # 'ratio':       量子化後のパラメータの割合  (0.5~0.8で試す)
    "gemma-2b-it": {
        "sym": True,
        "group_size": 64,
        "ratio": 0.6,
    },
    "llama-2-chat-7b": {
        "sym": True,
        "group_size": 128,
        "ratio": 0.8,
    },
    "gemma-7b-it": {
        "sym": True,
        "group_size": 128,
        "ratio": 0.8,
    },
    "default": {
        "sym": False,
        "group_size": 128,
        "ratio": 0.8,
    },
}

In [4]:
model_id

'microsoft/Phi-3-mini-4k-instruct'

In [5]:
fp16_model_dir

PosixPath('../../model/Phi-3-mini-4k-instruct/FP16')

In [6]:
core = ov.Core()
# optimum-cliでモデルをopenvino形式でダウンロード
export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past".format(model_id)

def convert_to_fp16():
    global export_command_base
    export_command = ''
    # すでに存在する場合はスキップ
    if (fp16_model_dir / "openvino_model.xml" ).exists():
        return
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " --weight-format fp16"
    export_command += " " + str(fp16_model_dir)
    print('export_command:', export_command)
    os.system(export_command)
    print('export done')


def convert_to_int8():
    global export_command_base
    if (int8_model_dir / "openvino_model.xml").exists():
        return
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " --weight-format int8"
    export_command += " " + str(int8_model_dir)
    print('export_command:', export_command)
    os.system('poetry run ' + export_command)
    print('export done')

def convert_to_int4():
    global export_command_base
    if (int4_model_dir / "openvino_model.xml").exists():
        return
    if remote_code:
        export_command_base += " --trust-remote-code"
    # 量子化の設定
    model_compression_params  = compression_configs.get(model_name, compression_configs["default"])
    export_command = export_command_base + " --weight-format int4"
    int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"])
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    export_command += int4_compression_args + " " + str(int4_model_dir)
    print('export_command:', export_command)
    os.system('poetry run ' + export_command)
    print('export done')


In [7]:
convert_to_int4()

export_command: optimum-cli export openvino --model microsoft/Phi-3-mini-4k-instruct --task text-generation-with-past --trust-remote-code --weight-format int4 --group-size 128 --ratio 0.8 ../../model/Phi-3-mini-4k-instruct/INT4
INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


Framework not specified. Using pt to export the model.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.66s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using framework PyTorch: 2.3.0+cu121
The model type phi3 is not yet supported to be used with BetterTransformer. Feel free to open an issue at https://github.com/huggingface/opti

export done


Killed


In [None]:
convert_to_int8()

export_command: optimum-cli export openvino --model google/gemma-2b-it --task text-generation-with-past --weight-format int8 ../../model/gemma-2b-it/INT8
INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


Framework not specified. Using pt to export the model.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.3.0+cu121
Overriding 1 configuration item(s)
	- use_cache -> True
  if sequence_length != 1:
  op1 = operator(*args, **kwargs)


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│   Num bits (N) │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│              8 │ 100% (127 / 127)            │ 100% (127 / 127)                       │
┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━���━━━━━━━━━━━━━━━┙
[2KApplying Weight Compression [90m━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [36m127/127[0m • [36m0:00:26[0m • [36m0:00:00[0m00:01[0m00:02[0m
[?25hexport done


In [None]:
onvert_to_fp16()

export_command: optimum-cli export openvino --model google/gemma-2b-it --task text-generation-with-past --weight-format fp16 ../../model/gemma-2b-it/FP16
INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


Framework not specified. Using pt to export the model.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.3.0+cu121
Overriding 1 configuration item(s)
	- use_cache -> True
  if sequence_length != 1:
  op1 = operator(*args, **kwargs)


export done


In [8]:
# モデルが保存されているディレクトリのサイズを確認

fp16_weights = fp16_model_dir / "openvino_model.bin"
int8_weights = int8_model_dir / "openvino_model.bin"
int4_weights = int4_model_dir / "openvino_model.bin"

if fp16_weights.exists():
    print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB")

for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]):
    if compressed_weights.exists():
        print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB")
    if compressed_weights.exists() and fp16_weights.exists():
        print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")

In [9]:
# デバイスの選択
support_devices = core.available_devices
device = 'NPU' if 'NPU' in support_devices else 'CPU'  # 実機のNPUが使えればいいのだけれど。。。

In [10]:
device

'CPU'

In [11]:
from ipywidgets import widgets

available_models = []
if int4_model_dir.exists():
    available_models.append("INT4")
if int8_model_dir.exists():
    available_models.append("INT8")
if fp16_model_dir.exists():
    available_models.append("FP16")

model_to_run = widgets.Dropdown(
    options=available_models,
    value=available_models[0],
    description="Model to run:",
    disabled=False,
)

model_to_run

Dropdown(description='Model to run:', options=('INT4',), value='INT4')

In [12]:
model_to_run.value

'INT4'

In [13]:
from transformers import AutoConfig, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM

if model_to_run.value == "INT4":  # 4bitモデルを使う場合
    model_dir = int4_model_dir
elif model_to_run.value == "INT8":  # 8bitモデルを使う場合
    model_dir = int8_model_dir
else:
    model_dir = fp16_model_dir  # 16bitモデルを使う場合
print(f"Loading model from {model_dir}")

ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

# On a GPU device a model is executed in FP16 precision. For red-pajama-3b-chat model there known accuracy
# issues caused by this, which we avoid by setting precision hint to "f32".
if model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and device in ["GPU", "AUTO"]:
    ov_config["INFERENCE_PRECISION_HINT"] = "f32"

tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

model_name = model_id
ov_model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device=device,
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
    trust_remote_code=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading model from ../../model/Phi-3-mini-4k-instruct/INT4


The argument `trust_remote_code` is to be used along with export=True. It will be ignored.


RuntimeError: Exception from src/inference/src/cpp/core.cpp:92:
Exception from src/inference/src/model_reader.cpp:154:
Unable to read the model: ../../model/Phi-3-mini-4k-instruct/INT4/openvino_model.xml Please check that model format: xml is supported and the model is correct. Available frontends: tf ir onnx paddle pytorch tflite 



In [None]:
import torch.cuda as cuda
device_arch = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
# モデルの動作確認
tokenizer_kwargs = {"add_special_tokens": False} if model_name == 'youri-7b-chat' else {}

test_string = "こんにちは、あなたは誰？"
input_tokens = tok(test_string, return_tensors="pt", **tokenizer_kwargs).to(device_arch)

answer = ov_model.generate(
    **input_tokens,
    max_new_tokens=256,
    temperature=0.2,
    do_sample=True
)

print(tok.batch_decode(answer, skip_special_tokens=True)[0])

こんにちは、あなたは誰？

こんにちは！私は日本語で話せる人です。私は何名の人か知っていますか？

私はまだ特定の人の名前を記憶していませんが、いくつかの質問を聞いています。

1. あなたは誰ですか？
2. あなたはどの分野の仕事をしていると考えられますか？
3. あなたはどのような趣味や興味を持っていると考えられますか？

これらの質問を聞いて、あなたの回答をください。
