ggerganov · ggerganov · Oct 4, 2023 · Sep 17, 2023
diff --git a/models/convert-h5-to-coreml.py b/models/convert-h5-to-coreml.py
@@ -0,0 +1,117 @@
+import argparse
+import importlib.util
+
+# The sibling converter's file name contains hyphens, so it cannot be brought in
+# with a plain `import` statement; load it from its file path instead.
+# The path is relative to the current working directory, so this script has to
+# be started from the directory that contains `models/`.
+spec = importlib.util.spec_from_file_location('whisper_to_coreml', 'models/convert-whisper-to-coreml.py')
+whisper_to_coreml = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(whisper_to_coreml)
+
+from whisper import load_model
+
+from copy import deepcopy
+import torch
+from transformers import WhisperForConditionalGeneration
+from huggingface_hub import metadata_update
+
+# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets/blob/main/src/multiple_datasets/hub_default_utils.py
+# Substring rewrites that turn Hugging Face Whisper parameter names into the
+# names found in the checkpoint that `whisper.load_model` reads back below.
+#
+# Order matters: rename_keys() walks this dict in insertion order and tests every
+# pattern against the *original* key, so the specific "decoder.layer_norm." and
+# "encoder.layer_norm." rules must stay ahead of the generic "layer_norm"
+# fallback at the very end.
+WHISPER_MAPPING = {
+    "layers": "blocks",
+    "fc1": "mlp.0",
+    "fc2": "mlp.2",
+    "final_layer_norm": "mlp_ln",
+    ".self_attn.q_proj": ".attn.query",
+    ".self_attn.k_proj": ".attn.key",
+    ".self_attn.v_proj": ".attn.value",
+    ".self_attn_layer_norm": ".attn_ln",
+    ".self_attn.out_proj": ".attn.out",
+    ".encoder_attn.q_proj": ".cross_attn.query",
+    ".encoder_attn.k_proj": ".cross_attn.key",
+    ".encoder_attn.v_proj": ".cross_attn.value",
+    ".encoder_attn_layer_norm": ".cross_attn_ln",
+    ".encoder_attn.out_proj": ".cross_attn.out",
+    "decoder.layer_norm.": "decoder.ln.",
+    "encoder.layer_norm.": "encoder.ln_post.",
+    "embed_tokens": "token_embedding",
+    "encoder.embed_positions.weight": "encoder.positional_embedding",
+    "decoder.embed_positions.weight": "decoder.positional_embedding",
+    "layer_norm": "ln_post",
+}
+
+# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets/blob/main/src/multiple_datasets/hub_default_utils.py
+def rename_keys(s_dict):
+    """Rename every key of ``s_dict`` in place according to WHISPER_MAPPING.
+
+    Each mapping pattern is tested against the original key; when it occurs
+    there, the replacement is applied to the name built up so far. Every
+    rename is printed as ``old -> new``. The same (mutated) dict is returned.
+    """
+    # Snapshot the keys first: the dict is modified while we walk over them.
+    for old_name in tuple(s_dict):
+        renamed = old_name
+        for hf_part, whisper_part in WHISPER_MAPPING.items():
+            if hf_part not in old_name:
+                continue
+            renamed = renamed.replace(hf_part, whisper_part)
+
+        print(f"{old_name} -> {renamed}")
+
+        # Pop and re-insert even when the name is unchanged, as the original does.
+        s_dict[renamed] = s_dict.pop(old_name)
+    return s_dict
+
+# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets/blob/main/src/multiple_datasets/hub_default_utils.py
+def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
+    """Save a Hugging Face Whisper model as a ``{"dims", "model_state_dict"}`` file.
+
+    Args:
+        hf_model_name_or_path: passed straight to
+            ``WhisperForConditionalGeneration.from_pretrained``.
+        whisper_state_path: destination file handed to ``torch.save``.
+
+    Returns:
+        None. The result is written to ``whisper_state_path``.
+    """
+    hf_model = WhisperForConditionalGeneration.from_pretrained(hf_model_name_or_path)
+    cfg = hf_model.config
+
+    # Model dimensions, read from the Hugging Face config.
+    dims = dict(
+        n_mels=cfg.num_mel_bins,
+        n_vocab=cfg.vocab_size,
+        n_audio_ctx=cfg.max_source_positions,
+        n_audio_state=cfg.d_model,
+        n_audio_head=cfg.encoder_attention_heads,
+        n_audio_layer=cfg.encoder_layers,
+        n_text_ctx=cfg.max_target_positions,
+        n_text_state=cfg.d_model,
+        n_text_head=cfg.decoder_attention_heads,
+        n_text_layer=cfg.decoder_layers,
+    )
+
+    # Rename on a deep copy so the loaded model's own state is left untouched.
+    renamed_weights = rename_keys(deepcopy(hf_model.model.state_dict()))
+
+    checkpoint = {"dims": dims, "model_state_dict": renamed_weights}
+    torch.save(checkpoint, whisper_state_path)
+
+# Ported from models/convert-whisper-to-coreml.py
+if __name__ == "__main__":
+    def _str_to_bool(value):
+        """Parse a command-line boolean such as ``True`` / ``False``.
+
+        ``type=bool`` must not be used with argparse: ``bool("False")`` is
+        ``True`` because any non-empty string is truthy, so ``--flag False``
+        would silently enable the flag.
+        """
+        lowered = value.strip().lower()
+        if lowered in ("true", "t", "yes", "y", "1"):
+            return True
+        # The empty string stays False, as it was under ``type=bool``.
+        if lowered in ("false", "f", "no", "n", "0", ""):
+            return False
+        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
+    parser.add_argument("--encoder-only", type=_str_to_bool, help="only convert encoder", default=False)
+    parser.add_argument("--quantize",     type=_str_to_bool, help="quantize weights to F16", default=False)
+    parser.add_argument("--optimize-ane", type=_str_to_bool, help="optimize for ANE execution (currently broken)", default=False)
+    args = parser.parse_args()
+
+    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+        raise ValueError("Invalid model name")
+
+    # Write the renamed Hugging Face weights to a .pt file, then load that file
+    # back through whisper.load_model.
+    pt_target_path = f"models/hf-{args.model_name}.pt"
+    convert_hf_whisper(args.model_path, pt_target_path)
+
+    whisper = load_model(pt_target_path).cpu()
+    hparams = whisper.dims
+    print(hparams)
+
+    if args.optimize_ane:
+        # Copy the loaded weights into the ANE variant of the model.
+        whisperANE = whisper_to_coreml.WhisperANE(hparams).eval()
+        whisperANE.load_state_dict(whisper.state_dict())
+
+        encoder = whisperANE.encoder
+        decoder = whisperANE.decoder
+    else:
+        encoder = whisper.encoder
+        decoder = whisper.decoder
+
+    # Convert encoder
+    encoder = whisper_to_coreml.convert_encoder(hparams, encoder, quantize=args.quantize)
+    encoder.save(f"models/coreml-encoder-{args.model_name}.mlpackage")
+
+    if not args.encoder_only:
+        # Convert decoder
+        decoder = whisper_to_coreml.convert_decoder(hparams, decoder, quantize=args.quantize)
+        decoder.save(f"models/coreml-decoder-{args.model_name}.mlpackage")
+
+    print("done converting")
diff --git a/models/generate-coreml-model.sh b/models/generate-coreml-model.sh
@@ -1,19 +1,30 @@
 #!/bin/bash
 
 # Usage: ./generate-coreml-model.sh <model-name>
-if [ $# -eq 0 ]
-  then
-    echo "No model name supplied"
-    echo "Usage: ./generate-coreml-model.sh <model-name>"
-    exit 1
+# Argument validation: with no arguments at all, print both usages and exit 1.
+if [ $# -eq 0 ]; then
+  echo "No model name supplied"
+  echo "Usage for Whisper models: ./generate-coreml-model.sh <model-name>"
+  echo "Usage for HuggingFace models: ./generate-coreml-model.sh -h5 <model-name> <model-path>"
+  exit 1
+# The -h5 form needs exactly three arguments: the flag, the model name and the model path.
+elif [[ "$1" == "-h5" && $# != 3 ]]; then
+  echo "No model name and model path supplied for a HuggingFace model"
+  echo "Usage for HuggingFace models: ./generate-coreml-model.sh -h5 <model-name> <model-path>"
+  exit 1
 fi
 
 mname="$1"
 
 wd=$(dirname "$0")
 cd "$wd/../"
 
-python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
+# Pick the converter: "-h5" as first argument selects the HuggingFace path, where
+# the model name and model path come from the second and third arguments.
+# All expansions are quoted so a model path containing spaces or glob characters
+# reaches the Python script as a single argument.
+if [[ "$mname" == "-h5" ]]; then
+  mname="$2"
+  mpath="$3"
+  echo "$mpath"
+  python3 models/convert-h5-to-coreml.py --model-name "$mname" --model-path "$mpath" --encoder-only True
+else
+  python3 models/convert-whisper-to-coreml.py --model "$mname" --encoder-only True
+fi
 
 xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
 rm -rf models/ggml-${mname}-encoder.mlmodelc