examples/model-conversion/Makefile (30 changes: 25 additions & 5 deletions)
@@ -1,4 +1,5 @@
# Validation functions
MAKEFLAGS += --no-print-directory

define validate_model_path
	@if [ -z "$(MODEL_PATH)" ]; then \
		echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
	fi
endef

define quantize_model
@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
@echo "Export the quantized model path to $(2) variable in your environment"
endef

###
### Causal Model targets/recipes
###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-Q4_0: causal-quantize-model

# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
# token embedding and output types to Q8_0 instead of the default Q6_K.
causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
causal-quantize-qat-Q4_0: causal-quantize-model

causal-quantize-model:
	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
	$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)

causal-run-quantized-model:
	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-Q4_0: embedding-quantize-model

# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
# token embedding and output types to Q8_0 instead of the default Q6_K.
embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
embedding-quantize-qat-Q4_0: embedding-quantize-model

embedding-quantize-model:
	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)

embedding-run-quantized-model:
	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
examples/model-conversion/README.md (24 changes: 24 additions & 0 deletions)
@@ -137,6 +137,18 @@ Then the quantized model can be run using the following command:
(venv) $ make causal-run-quantized-model
```

### Quantizing QAT (Quantization Aware Training) models
When quantizing to `Q4_0`, the default data type for the token embedding weights
will be `Q6_K`. For models that will be uploaded to ggml-org, it is recommended
to use `Q8_0` for the token embedding and output tensors instead.
The reason is that although `Q6_K` is smaller in size, it requires more compute
to unpack, which can hurt performance during output generation when the entire
embedding matrix must be dequantized to compute the vocabulary logits. `Q8_0`
provides practically full quality with better computational efficiency.
```console
(venv) $ make causal-quantize-qat-Q4_0
```
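
Under the hood this target runs `scripts/utils/quantize.sh`, which forwards the
overrides to `llama-quantize` via its `--token-embedding-type` and
`--output-tensor-type` flags. As a rough sketch of the equivalent direct
invocation, assuming a converted model at `models/model.gguf` (the paths here
are hypothetical):
```console
(venv) $ ../../build/bin/llama-quantize \
    --token-embedding-type Q8_0 \
    --output-tensor-type Q8_0 \
    models/model.gguf models/model-Q4_0.gguf Q4_0
```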


## Embedding Language Model Conversion

@@ -238,6 +250,18 @@ Then the quantized model can be run using the following command:
(venv) $ make embedding-run-quantized-model
```

### Quantizing QAT (Quantization Aware Training) models
When quantizing to `Q4_0`, the default data type for the token embedding weights
will be `Q6_K`. For models that will be uploaded to ggml-org, it is recommended
to use `Q8_0` for the token embedding and output tensors instead.
The reason is that although `Q6_K` is smaller in size, it requires more compute
to unpack, which can hurt performance during output generation when the entire
embedding matrix must be dequantized to compute the vocabulary logits. `Q8_0`
provides practically full quality with better computational efficiency.
```console
(venv) $ make embedding-quantize-qat-Q4_0
```
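
To verify that the overrides took effect, the tensor types in the quantized
file can be inspected, for example with the `gguf-dump` tool from the `gguf-py`
package (a sketch; the filename is hypothetical):
```console
(venv) $ gguf-dump models/embedding-model-Q4_0.gguf | grep -E 'token_embd|output'
```
The token embedding and output tensors should be reported as `Q8_0`, while the
bulk of the weights remain `Q4_0`.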

## Perplexity Evaluation

### Simple perplexity evaluation
examples/model-conversion/scripts/utils/quantize.sh (18 changes: 16 additions & 2 deletions)
@@ -4,6 +4,8 @@ set -e

CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
QUANTIZED_MODEL=$CONVERTED_MODEL

# Final check if we have a model path
@@ -14,6 +16,11 @@ if [ -z "$CONVERTED_MODEL" ]; then
    exit 1
fi

if [ -z "$QUANTIZED_TYPE" ]; then
    echo "Error: QUANTIZED_TYPE is required" >&2
    exit 1
fi

echo $CONVERTED_MODEL

# Process the quantized model filename
@@ -26,9 +33,16 @@ else
    exit 1
fi


cmake --build ../../build --target llama-quantize -j8

../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
echo "Token embedding type: $TOKEN_EMBD_TYPE"
echo "Output tensor type: $OUTPUT_TYPE"

CMD_ARGS=("../../build/bin/llama-quantize")
[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")

"${CMD_ARGS[@]}"

echo "Quantized model saved to: $QUANTIZED_MODEL"