diff --git a/README.md b/README.md
index b800541..0555b85 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
- # 🚀 QuantLLM v2.0
+ # 🚀 QuantLLM v2.1 (pre-release)
**The Ultra-Fast LLM Quantization & Export Library**
@@ -52,9 +52,12 @@ model = AutoModelForCausalLM.from_pretrained(
```python
from quantllm import turbo
-model = turbo("meta-llama/Llama-3-8B") # Auto-quantizes
+model = turbo(
+ "meta-llama/Llama-3-8B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+) # Auto-quantizes
model.generate("Hello!") # Generate text
-model.export("gguf", quantization="Q4_K_M") # Export to GGUF
+model.export() # Export to GGUF with shared config
```
---
@@ -77,14 +80,17 @@ pip install "quantllm[full] @ git+https://github.com/codewithdark-git/QuantLLM.g
from quantllm import turbo
# Load with automatic optimization
-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
# Generate text
response = model.generate("Explain quantum computing simply")
print(response)
# Export to GGUF
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model.export("gguf", "model.Q4_K_M.gguf")
```
**QuantLLM automatically:**
@@ -102,11 +108,14 @@ model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
One unified interface for everything:
```python
-model = turbo("mistralai/Mistral-7B")
+model = turbo(
+ "mistralai/Mistral-7B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
model.generate("Hello!")
model.finetune(data, epochs=3)
-model.export("gguf", quantization="Q4_K_M")
-model.push("user/repo", format="gguf")
+model.export()
+model.push("user/repo")
```
### ⚡ Performance Optimizations
@@ -133,7 +142,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S
```
╔════════════════════════════════════════════════════════════╗
-║ 🚀 QuantLLM v2.0.0 ║
+║ 🚀 QuantLLM v2.1.0rc1 ║
║ Ultra-fast LLM Quantization & Export ║
║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║
╚════════════════════════════════════════════════════════════╝
@@ -148,7 +157,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S
Auto-generates model cards with YAML frontmatter, usage examples, and "Use this model" button:
```python
-model.push("user/my-model", format="gguf", quantization="Q4_K_M")
+model.push("user/my-model")
```
---
@@ -195,7 +204,10 @@ model.export("safetensors", "./model-hf/")
```python
from quantllm import turbo
-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
# Simple generation
response = model.generate(
@@ -267,8 +279,6 @@ model = turbo("meta-llama/Llama-3.2-3B")
# Push with auto-generated model card
model.push(
"your-username/my-model",
- format="gguf",
- quantization="Q4_K_M",
license="apache-2.0"
)
```
diff --git a/docs/api/gguf.md b/docs/api/gguf.md
index 1135c5e..12f18dc 100644
--- a/docs/api/gguf.md
+++ b/docs/api/gguf.md
@@ -10,8 +10,11 @@ Export models to GGUF format for llama.cpp, Ollama, and LM Studio.
from quantllm import turbo, convert_to_gguf, quantize_gguf
# Method 1: Via TurboModel
-model = turbo("meta-llama/Llama-3.2-3B")
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.export("gguf", "model.Q4_K_M.gguf")
# Method 2: Direct conversion
convert_to_gguf("meta-llama/Llama-3.2-3B", "model.Q4_K_M.gguf", quant_type="Q4_K_M")
diff --git a/docs/api/hub.md b/docs/api/hub.md
index 0fb1fa6..d501e56 100644
--- a/docs/api/hub.md
+++ b/docs/api/hub.md
@@ -10,8 +10,11 @@ Push models to HuggingFace Hub with auto-generated model cards.
from quantllm import turbo, QuantLLMHubManager
# Method 1: TurboModel.push() (Recommended)
-model = turbo("meta-llama/Llama-3.2-3B")
-model.push("user/my-model", format="gguf", quantization="Q4_K_M")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.push("user/my-model")
# Method 2: QuantLLMHubManager (Advanced)
manager = QuantLLMHubManager("user/my-model", hf_token="hf_...")
@@ -30,7 +33,7 @@ def push(
self,
repo_id: str,
token: Optional[str] = None,
- format: str = "safetensors",
+ format: Optional[str] = None,
quantization: Optional[str] = None,
license: str = "apache-2.0",
commit_message: str = "Upload model via QuantLLM",
@@ -44,7 +47,7 @@ def push(
|-----------|------|---------|-------------|
| `repo_id` | str | required | HuggingFace repo ID (user/model) |
| `token` | str | None | HF token (or use HF_TOKEN env) |
-| `format` | str | "safetensors" | Export format |
+| `format` | str | None | Export format (uses `config["push_format"]` when omitted) |
| `quantization` | str | None | Quantization type |
| `license` | str | "apache-2.0" | License type |
@@ -62,13 +65,14 @@ def push(
```python
from quantllm import turbo
-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
# Push as GGUF
model.push(
- "your-username/llama-3.2-3b-gguf",
- format="gguf",
- quantization="Q4_K_M"
+ "your-username/llama-3.2-3b-gguf"
)
# Push as ONNX
diff --git a/docs/api/turbo.md b/docs/api/turbo.md
index 512f8f0..ef4f0ed 100644
--- a/docs/api/turbo.md
+++ b/docs/api/turbo.md
@@ -14,6 +14,7 @@ def turbo(
max_length: Optional[int] = None,
device: Optional[str] = None,
dtype: Optional[str] = None,
+ config: Optional[Dict[str, Any]] = None,
quantize: bool = True,
trust_remote_code: bool = False,
verbose: bool = True,
@@ -32,6 +33,7 @@ def turbo(
| `max_length` | int | auto | Maximum context length |
| `device` | str | auto | Device ("cuda", "cpu", "cuda:0", "auto") |
| `dtype` | str | auto | Data type ("float16", "bfloat16") |
+| `config` | dict | None | Shared export/push defaults (`format`, `quantization`, `push_format`, `push_quantization`) |
| `quantize` | bool | True | Whether to apply quantization |
| `trust_remote_code` | bool | False | Trust remote code in model |
| `verbose` | bool | True | Show loading progress and stats |
@@ -124,7 +126,7 @@ When `verbose=True` (default), you'll see:
```
╔════════════════════════════════════════════════════════════╗
-║ 🚀 QuantLLM v2.0.0 ║
+║ 🚀 QuantLLM v2.1.0rc1 ║
╚════════════════════════════════════════════════════════════╝
📊 Loading: meta-llama/Llama-3.2-3B
diff --git a/docs/api/turbomodel.md b/docs/api/turbomodel.md
index 3542463..d091958 100644
--- a/docs/api/turbomodel.md
+++ b/docs/api/turbomodel.md
@@ -232,8 +232,8 @@ Export the model to various formats.
```python
def export(
self,
- format: str,
- output_path: str,
+ format: Optional[str] = None,
+ output_path: Optional[str] = None,
quantization: Optional[str] = None,
**kwargs
) -> str
@@ -241,14 +241,18 @@ def export(
| Parameter | Type | Description |
|-----------|------|-------------|
-| `format` | str | "gguf", "onnx", "mlx", "safetensors" |
-| `output_path` | str | Output file or directory |
+| `format` | str | "gguf", "onnx", "mlx", "safetensors" (optional, uses shared config) |
+| `output_path` | str | Output file or directory (optional; derived from the model name when omitted) |
| `quantization` | str | Quantization type (format-specific) |
**Examples:**
```python
# GGUF
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.export()
# ONNX
model.export("onnx", "./model-onnx/")
@@ -269,7 +273,7 @@ def push(
self,
repo_id: str,
token: Optional[str] = None,
- format: str = "safetensors",
+ format: Optional[str] = None,
quantization: Optional[str] = None,
license: str = "apache-2.0",
commit_message: str = "Upload model via QuantLLM",
@@ -281,9 +285,7 @@ def push(
```python
# Push as GGUF
model.push(
- "your-username/my-model",
- format="gguf",
- quantization="Q4_K_M"
+ "your-username/my-model"
)
# Push as MLX
diff --git a/docs/conf.py b/docs/conf.py
index dacb626..ebd8a42 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -3,7 +3,7 @@
project = 'QuantLLM'
copyright = '2024, Dark Coder'
author = 'Dark Coder'
-release = '2.0.0'
+release = '2.1.0rc1'
# Extensions
extensions = [
@@ -21,7 +21,7 @@
# HTML output
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']
-html_title = 'QuantLLM v2.0'
+html_title = 'QuantLLM v2.1'
html_logo = 'images/logo.png'
html_favicon = 'images/favicon.ico'
diff --git a/docs/guide/finetuning.md b/docs/guide/finetuning.md
index 654debf..3dd29e4 100644
--- a/docs/guide/finetuning.md
+++ b/docs/guide/finetuning.md
@@ -193,13 +193,13 @@ print("Fine-tuned:", model.generate("prompt"))
```python
# Export to GGUF
-model.export("gguf", "finetuned.Q4_K_M.gguf", quantization="Q4_K_M")
+model.export("gguf", "finetuned.Q4_K_M.gguf")
# Export to SafeTensors
model.export("safetensors", "./finetuned-model/")
# Push to HuggingFace
-model.push("your-username/finetuned-model", format="gguf")
+model.push("your-username/finetuned-model")
```
### Save and Load
diff --git a/docs/guide/gguf-export.md b/docs/guide/gguf-export.md
index 0c61f8d..35329d5 100644
--- a/docs/guide/gguf-export.md
+++ b/docs/guide/gguf-export.md
@@ -130,10 +130,12 @@ print(output["choices"][0]["text"])
Export and push in one step:
```python
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
model.push(
"your-username/my-model-gguf",
- format="gguf",
- quantization="Q4_K_M",
license="apache-2.0"
)
```
diff --git a/docs/guide/hub-integration.md b/docs/guide/hub-integration.md
index 176976a..c0887c6 100644
--- a/docs/guide/hub-integration.md
+++ b/docs/guide/hub-integration.md
@@ -11,14 +11,15 @@ The easiest way to share your model:
```python
from quantllm import turbo
-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
# Push with auto-generated model card
model.push(
"your-username/my-model",
- token="hf_...",
- format="gguf",
- quantization="Q4_K_M"
+ token="hf_..."
)
```
@@ -49,34 +50,18 @@ model.push("user/repo", token="hf_...")
```python
from quantllm import turbo
-model = turbo("meta-llama/Llama-3.2-3B")
-
-# Push as GGUF (for Ollama, llama.cpp, LM Studio)
-model.push(
- "your-username/my-model-gguf",
- format="gguf",
- quantization="Q4_K_M",
- license="apache-2.0"
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={
+ "format": "gguf",
+ "quantization": "Q4_K_M",
+ "push_format": "gguf",
+ },
)
-# Push as ONNX
-model.push(
- "your-username/my-model-onnx",
- format="onnx"
-)
-
-# Push as MLX (Apple Silicon)
-model.push(
- "your-username/my-model-mlx",
- format="mlx",
- quantization="4bit"
-)
-
-# Push as SafeTensors (default)
-model.push(
- "your-username/my-model",
- format="safetensors"
-)
+# Uses shared config defaults
+model.export()
+model.push("your-username/my-model-gguf", license="apache-2.0")
```
### Method 2: QuantLLMHubManager (Advanced)
diff --git a/docs/index.md b/docs/index.md
index 7fc9f0c..63a78d3 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -8,24 +8,27 @@
---
-## Welcome to QuantLLM v2.0
+## Welcome to QuantLLM v2.1 (pre-release)
QuantLLM makes working with large language models simple. Load any model, quantize it automatically, fine-tune with your data, and export to any format — all with just a few lines of code.
```python
from quantllm import turbo
-# Load with automatic 4-bit quantization
-model = turbo("meta-llama/Llama-3.2-3B")
+# Load with shared export/push defaults
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
# Generate text
print(model.generate("Explain quantum computing"))
# Export to GGUF for Ollama/llama.cpp
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model.export()
# Push to HuggingFace with auto-generated model card
-model.push("username/my-model", format="gguf", quantization="Q4_K_M")
+model.push("username/my-model")
```
---
@@ -89,7 +92,11 @@ model = turbo("microsoft/phi-3-mini")
### Export to Any Format
```python
-model.export("gguf", "model.gguf", quantization="Q4_K_M")
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.export()
model.export("onnx", "./model-onnx/")
model.export("mlx", "./model-mlx/", quantization="4bit")
```
@@ -101,7 +108,7 @@ model.finetune("training_data.json", epochs=3)
### Push to HuggingFace
```python
-model.push("username/my-model", format="gguf")
+model.push("username/my-model")
```
---
diff --git a/docs/installation.md b/docs/installation.md
index 15f4c7c..478ee94 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -90,7 +90,7 @@ Expected output:
```
╔════════════════════════════════════════════════════════════╗
║ ║
-║ 🚀 QuantLLM v2.0.0 ║
+║ 🚀 QuantLLM v2.1.0rc1 ║
║ Ultra-fast LLM Quantization & Export ║
║ ║
║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 7382236..8050787 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -128,10 +128,12 @@ Share your model with the world:
```python
# Push with auto-generated model card
+model = turbo(
+ "meta-llama/Llama-3.2-3B",
+ config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
model.push(
"your-username/my-awesome-model",
- format="gguf",
- quantization="Q4_K_M",
license="apache-2.0"
)
```
@@ -196,7 +198,7 @@ quantllm.show_banner()
```
╔════════════════════════════════════════════════════════════╗
║ ║
-║ 🚀 QuantLLM v2.0.0 ║
+║ 🚀 QuantLLM v2.1.0rc1 ║
║ Ultra-fast LLM Quantization & Export ║
║ ║
║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║
diff --git a/examples/01_quickstart.py b/examples/01_quickstart.py
index 563a6df..a40e128 100644
--- a/examples/01_quickstart.py
+++ b/examples/01_quickstart.py
@@ -1,5 +1,5 @@
"""
-QuantLLM v2.0 - Quick Start Example
+QuantLLM v2.1 - Quick Start Example
The simplest way to use QuantLLM.
"""
diff --git a/examples/02_gguf_export.py b/examples/02_gguf_export.py
index 0efa2a7..37fee24 100644
--- a/examples/02_gguf_export.py
+++ b/examples/02_gguf_export.py
@@ -1,5 +1,5 @@
"""
-QuantLLM v2.0 - GGUF Export Example
+QuantLLM v2.1 - GGUF Export Example
Export models to GGUF format for use with llama.cpp, Ollama, LM Studio.
No external dependencies required!
diff --git a/examples/03_finetuning.py b/examples/03_finetuning.py
index c2021bd..f254a7f 100644
--- a/examples/03_finetuning.py
+++ b/examples/03_finetuning.py
@@ -1,5 +1,5 @@
"""
-QuantLLM v2.0 - Fine-tuning Example
+QuantLLM v2.1 - Fine-tuning Example
Fine-tune a quantized model using LoRA.
"""
diff --git a/examples/04_hub_push.py b/examples/04_hub_push.py
index af1ef3b..6a438c9 100644
--- a/examples/04_hub_push.py
+++ b/examples/04_hub_push.py
@@ -1,5 +1,5 @@
"""
-QuantLLM v2.0 - Push to HuggingFace Hub
+QuantLLM v2.1 - Push to HuggingFace Hub
Push your models to HuggingFace Hub.
"""
diff --git a/examples/README.md b/examples/README.md
index b6a0e68..810f7d7 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,4 +1,4 @@
-# QuantLLM v2.0 Examples
+# QuantLLM v2.1 Examples
Simple examples for the new TurboModel API.
diff --git a/quantllm/__init__.py b/quantllm/__init__.py
index 5c81eb6..6f2933b 100644
--- a/quantllm/__init__.py
+++ b/quantllm/__init__.py
@@ -1,5 +1,5 @@
"""
-QuantLLM v2.0 - Ultra-fast LLM Quantization & GGUF Export
+QuantLLM v2.1 - Ultra-fast LLM Quantization & GGUF Export
The simplest way to load, quantize, fine-tune, and export LLMs.
@@ -13,16 +13,19 @@
>>> from quantllm import turbo
>>>
>>> # Load any model (auto-quantizes to 4-bit)
- >>> model = turbo("meta-llama/Llama-3.2-3B")
+ >>> model = turbo(
+ ... "meta-llama/Llama-3.2-3B",
+ ... config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+ ... )
>>>
>>> # Generate text
>>> model.generate("Hello, world!")
>>>
>>> # Export to GGUF with Q4_K_M quantization
- >>> model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+ >>> model.export()
>>>
>>> # Push to HuggingFace Hub
- >>> model.push("username/my-model", format="gguf", quantization="Q4_K_M")
+ >>> model.push("username/my-model")
"""
import os
@@ -73,7 +76,7 @@
# Configure logging (minimal by default)
configure_logging("WARNING")
-__version__ = "2.0.0"
+__version__ = "2.1.0rc1"
__title__ = "QuantLLM"
__description__ = "Ultra-fast LLM Quantization & Export (GGUF, ONNX, MLX)"
__author__ = "Dark Coder"
diff --git a/quantllm/core/export.py b/quantllm/core/export.py
index 40f517a..05dcbb5 100644
--- a/quantllm/core/export.py
+++ b/quantllm/core/export.py
@@ -1,5 +1,5 @@
"""
-Universal Export Module for QuantLLM v2.0
+Universal Export Module for QuantLLM v2.1
Provides unified export functionality to multiple formats:
- GGUF (llama.cpp, Ollama, LM Studio)
diff --git a/quantllm/core/memory.py b/quantllm/core/memory.py
index bed8196..43298b8 100644
--- a/quantllm/core/memory.py
+++ b/quantllm/core/memory.py
@@ -1,5 +1,5 @@
"""
-Memory Optimization Utilities for QuantLLM v2.0
+Memory Optimization Utilities for QuantLLM v2.1
Advanced memory management for training and inference of large models
on limited GPU memory.
diff --git a/quantllm/core/training.py b/quantllm/core/training.py
index 053ec99..7ab9932 100644
--- a/quantllm/core/training.py
+++ b/quantllm/core/training.py
@@ -1,5 +1,5 @@
"""
-Advanced Training Utilities for QuantLLM v2.0
+Advanced Training Utilities for QuantLLM v2.1
Provides auto-configuration and optimization for fine-tuning
with minimal user input.
diff --git a/quantllm/core/turbo_model.py b/quantllm/core/turbo_model.py
index c04de1d..53ec668 100644
--- a/quantllm/core/turbo_model.py
+++ b/quantllm/core/turbo_model.py
@@ -26,6 +26,12 @@
from .memory import memory_optimized_tensor_order
DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
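+# Fallback export/push settings used when turbo(..., config=...) is not provided.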
+DEFAULT_EXPORT_PUSH_CONFIG = {
+ "format": "safetensors",
+ "push_format": "safetensors",
+ "quantization": "Q4_K_M",
+ "push_quantization": None,
+}
class TurboModel:
@@ -57,6 +63,7 @@ def __init__(
model: PreTrainedModel,
tokenizer: PreTrainedTokenizer,
config: SmartConfig,
+ export_push_config: Optional[Dict[str, Any]] = None,
verbose: bool = False,
):
"""
@@ -73,9 +80,7 @@ def __init__(
self._is_quantized = False
self._is_finetuned = False
self._lora_applied = False
- self._is_quantized = False
- self._is_finetuned = False
- self._lora_applied = False
+ self.export_push_config = self._build_export_push_config(export_push_config)
self.verbose = verbose
@classmethod
@@ -92,6 +97,7 @@ def from_pretrained(
trust_remote_code: bool = True,
quantize: bool = True,
config_override: Optional[Dict[str, Any]] = None,
+ config: Optional[Dict[str, Any]] = None,
verbose: bool = True,
) -> "TurboModel":
"""
@@ -112,8 +118,7 @@ def from_pretrained(
trust_remote_code: Trust remote code in model
quantize: Whether to quantize the model
config_override: Dict to override any auto-detected settings
- quantize: Whether to quantize the model
- config_override: Dict to override any auto-detected settings
+ config: Shared export/push config (format, quantization, push_format, etc.)
verbose: Print loading progress
Returns:
@@ -268,7 +273,7 @@ def from_pretrained(
print_success("Model loaded successfully!")
logger.info("")
- instance = cls(model, tokenizer, smart_config)
+ instance = cls(model, tokenizer, smart_config, export_push_config=config)
instance._is_quantized = quantize and smart_config.bits < 16
return instance
@@ -494,6 +499,30 @@ def _get_quantization_kwargs(config: SmartConfig) -> Dict[str, Any]:
except ImportError:
logger.warning("⚠ bitsandbytes not installed, loading without quantization")
return {}
+
+ @staticmethod
+ def _build_export_push_config(config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+ """Build shared export/push config with deterministic defaults."""
+ resolved = dict(DEFAULT_EXPORT_PUSH_CONFIG)
+ if config:
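+ # Map export_format/export_quantization aliases onto the canonical config keys.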
+ aliases = {
+ "export_format": "format",
+ "export_quantization": "quantization",
+ }
+ nullable_overrides = {"push_quantization"}
+ for key, value in config.items():
+ mapped_key = aliases.get(key, key)
+ if mapped_key in resolved and (
+ value is not None or mapped_key in nullable_overrides
+ ):
+ resolved[mapped_key] = value
+
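+ # When only export settings are given, reuse them for push as well.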
+ if "format" in config and "push_format" not in config:
+ resolved["push_format"] = resolved["format"]
+ if "quantization" in config and "push_quantization" not in config:
+ resolved["push_quantization"] = resolved["quantization"]
+
+ return resolved
@staticmethod
def _enable_flash_attention(model: PreTrainedModel, verbose: bool = True) -> None:
@@ -945,7 +974,7 @@ def tokenize_function(examples):
def export(
self,
- format: str,
+ format: Optional[str] = None,
output_path: Optional[str] = None,
*,
quantization: Optional[str] = None,
@@ -961,7 +990,7 @@ def export(
- "mlx": For Apple Silicon Macs
Args:
- format: Target format (gguf, safetensors, onnx, mlx)
+ format: Target format (gguf, safetensors, onnx, mlx). Uses shared config when omitted.
output_path: Output file/directory path
quantization: Format-specific quantization:
- GGUF: Q4_K_M, Q5_K_M, Q8_0, etc.
@@ -978,7 +1007,16 @@ def export(
>>> model.export("onnx", "./my_model_onnx/")
>>> model.export("mlx", "./my_model_mlx/", quantization="4bit")
"""
- format = format.lower()
+ format = (
+ format
+ if format is not None
+ else self.export_push_config.get("format", DEFAULT_EXPORT_PUSH_CONFIG["format"])
+ ).lower()
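+ # GGUF needs a quant type; fall back to the shared config when none is passed explicitly.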
+ effective_quantization = quantization
+ if effective_quantization is None and format == "gguf":
+ effective_quantization = self.export_push_config.get(
+ "quantization", DEFAULT_EXPORT_PUSH_CONFIG["quantization"]
+ )
# Merge LoRA if applied
if self._lora_applied:
@@ -991,7 +1029,7 @@ def export(
if output_path is None:
model_name = self.model.config._name_or_path.split('/')[-1]
if format == "gguf":
- quant = quantization or self.config.quant_type or "q4_k_m"
+ quant = effective_quantization
output_path = f"{model_name}.{quant.upper()}.gguf"
elif format == "safetensors":
output_path = f"./{model_name}-quantllm/"
@@ -1012,7 +1050,7 @@ def export(
raise ValueError(f"Unknown format: {format}. Supported: {list(exporters.keys())}")
print_header(f"Exporting to {format.upper()}")
- result = exporters[format](output_path, quantization=quantization, **kwargs)
+ result = exporters[format](output_path, quantization=effective_quantization, **kwargs)
print_success(f"Exported to: {result}")
return result
@@ -1021,7 +1059,7 @@ def push_to_hub(
self,
repo_id: str,
token: Optional[str] = None,
- format: str = "safetensors",
+ format: Optional[str] = None,
quantization: Optional[str] = None,
commit_message: str = "Upload model via QuantLLM",
license: str = "apache-2.0",
@@ -1052,7 +1090,14 @@ def push_to_hub(
"""
from ..hub import QuantLLMHubManager
- format_lower = format.lower()
+ format_lower = (
+ format
+ if format is not None
+ else self.export_push_config.get("push_format", DEFAULT_EXPORT_PUSH_CONFIG["push_format"])
+ ).lower()
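+ # An explicit quantization argument wins over the shared push_quantization default.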
+ push_quantization = quantization or self.export_push_config.get(
+ "push_quantization", DEFAULT_EXPORT_PUSH_CONFIG["push_quantization"]
+ )
# Get the original base model name (full path for HuggingFace link)
base_model_full = self.model.config._name_or_path
@@ -1066,7 +1111,9 @@ def push_to_hub(
if format_lower == "gguf":
# Export GGUF directly to staging
- quant_label = quantization or (self.config.quant_type if self.config.quant_type != "GGUF" else "q4_k_m") or "q4_k_m"
+ quant_label = push_quantization or self.export_push_config.get(
+ "quantization", DEFAULT_EXPORT_PUSH_CONFIG["quantization"]
+ )
filename = f"{model_name}.{quant_label.upper()}.gguf"
save_path = os.path.join(manager.staging_dir, filename)
@@ -1085,11 +1132,11 @@ def push_to_hub(
print_info("Exporting to ONNX format...")
save_path = manager.staging_dir
- self._export_onnx(save_path, quantization=quantization, **kwargs)
+ self._export_onnx(save_path, quantization=push_quantization, **kwargs)
manager.track_hyperparameters({
"format": "onnx",
- "quantization": quantization,
+ "quantization": push_quantization,
"base_model": base_model_full,
"license": license,
})
@@ -1100,11 +1147,11 @@ def push_to_hub(
print_info("Exporting to MLX format...")
save_path = manager.staging_dir
- self._export_mlx(save_path, quantization=quantization, **kwargs)
+ self._export_mlx(save_path, quantization=push_quantization, **kwargs)
manager.track_hyperparameters({
"format": "mlx",
- "quantization": quantization,
+ "quantization": push_quantization,
"base_model": base_model_full,
"license": license,
})
@@ -1117,7 +1164,7 @@ def push_to_hub(
"base_model": base_model_full,
"license": license,
})
- manager.save_final_model(self, format=format)
+ manager.save_final_model(self, format=format_lower)
manager._generate_model_card(format=format_lower)
manager.push(commit_message=commit_message)
@@ -1852,6 +1899,7 @@ def turbo(
max_length: Optional[int] = None,
device: Optional[str] = None,
dtype: Optional[str] = None,
+ config: Optional[Dict[str, Any]] = None,
**kwargs,
) -> TurboModel:
"""
@@ -1866,6 +1914,7 @@ def turbo(
max_length: Override max sequence length (default: auto)
device: Override device (default: best GPU)
dtype: Override dtype (default: bf16/fp16)
+ config: Shared export/push config (format, quantization, push_format, etc.)
**kwargs: Additional options passed to from_pretrained
Returns:
@@ -1896,5 +1945,6 @@ def turbo(
max_length=max_length,
device=device,
dtype=dtype,
+ config=config,
**kwargs,
)
diff --git a/quantllm/hub/model_card.py b/quantllm/hub/model_card.py
index 33a887f..66d8513 100644
--- a/quantllm/hub/model_card.py
+++ b/quantllm/hub/model_card.py
@@ -427,7 +427,7 @@ def _generate_details_section(self) -> str:
| **Quantization** | {self.quantization or "Full Precision"} |
| **License** | `{self.license}` |
| **Export Date** | {datetime.now().strftime("%Y-%m-%d")} |
-| **Exported By** | [QuantLLM v2.0](https://github.com/codewithdark-git/QuantLLM) |
+| **Exported By** | [QuantLLM v2.1](https://github.com/codewithdark-git/QuantLLM) |
'''
def _generate_quantization_section(self) -> str:
diff --git a/requirements.txt b/requirements.txt
index 4a67ebe..ef5e1c8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-# QuantLLM v2.0 Requirements
+# QuantLLM v2.1 (pre-release) Requirements
# Core dependencies
torch>=2.0.0
diff --git a/setup.py b/setup.py
index c858e64..eb2ebc8 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
setup(
name="quantllm",
- version="2.0.0",
+ version="2.1.0rc1",
author="Dark Coder",
author_email="codewithdark90@gmail.com",
description="Ultra-fast LLM quantization, fine-tuning, and deployment with one line of code",
@@ -117,4 +117,4 @@
},
include_package_data=True,
zip_safe=False,
-)
\ No newline at end of file
+)
diff --git a/tests/test_export_push_config.py b/tests/test_export_push_config.py
new file mode 100644
index 0000000..dba32bf
--- /dev/null
+++ b/tests/test_export_push_config.py
@@ -0,0 +1,164 @@
+from types import SimpleNamespace
+
+from quantllm.core.turbo_model import TurboModel
+
+
+def _stub_model(name: str = "org/test-model"):
+ return SimpleNamespace(config=SimpleNamespace(_name_or_path=name))
+
+
+def _stub_turbo(export_push_config):
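+ # Build a TurboModel stub without calling __init__ so no weights or tokenizer are loaded.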
+ model = TurboModel.__new__(TurboModel)
+ model.model = _stub_model()
+ model.tokenizer = None
+ smart_config = SimpleNamespace(quant_type="Q8_0")
+ model.config = smart_config
+ model._lora_applied = False
+ model.verbose = False
+ model.export_push_config = export_push_config
+ return model
+
+
+def test_build_export_push_config_uses_deterministic_defaults():
+ resolved = TurboModel._build_export_push_config(None)
+ assert resolved["format"] == "safetensors"
+ assert resolved["push_format"] == "safetensors"
+ assert resolved["quantization"] == "Q4_K_M"
+ assert resolved["push_quantization"] is None
+
+
+def test_build_export_push_config_aligns_push_values_with_export_values():
+ resolved = TurboModel._build_export_push_config(
+ {"format": "gguf", "quantization": "Q5_K_M"}
+ )
+ assert resolved["format"] == "gguf"
+ assert resolved["push_format"] == "gguf"
+ assert resolved["quantization"] == "Q5_K_M"
+ assert resolved["push_quantization"] == "Q5_K_M"
+
+
+def test_build_export_push_config_allows_nullable_push_quantization_override():
+ resolved = TurboModel._build_export_push_config(
+ {"format": "gguf", "quantization": "Q5_K_M", "push_quantization": None}
+ )
+ assert resolved["quantization"] == "Q5_K_M"
+ assert resolved["push_quantization"] is None
+
+
+def test_export_prefers_shared_quantization_over_smart_config_quant_type():
+ model = _stub_turbo(
+ {
+ "format": "gguf",
+ "push_format": "gguf",
+ "quantization": "Q4_K_M",
+ "push_quantization": "Q4_K_M",
+ }
+ )
+
+ captured = {}
+
+ def fake_export_gguf(output_path, quantization=None, **kwargs):
+ captured["output_path"] = output_path
+ captured["quantization"] = quantization
+ return output_path
+
+ model._export_gguf = fake_export_gguf
+ model._export_safetensors = lambda *args, **kwargs: ""
+ model._export_onnx = lambda *args, **kwargs: ""
+ model._export_mlx = lambda *args, **kwargs: ""
+
+ output = model.export()
+
+ assert model.config.quant_type == "Q8_0"
+ assert output.endswith(".Q4_K_M.gguf")
+ assert captured["quantization"] == "Q4_K_M"
+
+
+def test_gguf_push_uses_shared_config_when_omitted(monkeypatch, tmp_path):
+ model = _stub_turbo({
+ "format": "gguf",
+ "push_format": "gguf",
+ "quantization": "Q4_K_M",
+ "push_quantization": "Q4_K_M",
+ })
+
+ calls = {}
+
+ def fake_export(*, format, output_path, quantization=None, **kwargs):
+ calls["export"] = {
+ "format": format,
+ "output_path": output_path,
+ "quantization": quantization,
+ }
+ return output_path
+
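+ # Stub export() to capture the arguments the GGUF push path forwards.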
+ model.export = fake_export
+
+ class FakeManager:
+ def __init__(self, repo_id, hf_token=None):
+ self.staging_dir = str(tmp_path / "quantllm-test-staging")
+
+ def track_hyperparameters(self, params):
+ calls["tracked"] = params
+
+ def _generate_model_card(self, format):
+ calls["card_format"] = format
+
+ def push(self, commit_message):
+ calls["pushed"] = commit_message
+
+ def save_final_model(self, *args, **kwargs):
+ raise AssertionError(
+ "save_final_model should not be called for GGUF push"
+ )
+
+ import quantllm.hub as hub_module
+
+ monkeypatch.setattr(hub_module, "QuantLLMHubManager", FakeManager)
+
+ model.push("user/repo")
+
+ assert calls["export"]["format"] == "gguf"
+ assert calls["export"]["quantization"] == "Q4_K_M"
+ assert calls["tracked"]["quantization"] == "Q4_K_M"
+
+
+def test_onnx_push_does_not_force_quantization(monkeypatch, tmp_path):
+ model = _stub_turbo(
+ TurboModel._build_export_push_config({"push_format": "onnx"})
+ )
+
+ calls = {}
+
+ class FakeManager:
+ def __init__(self, repo_id, hf_token=None):
+ self.staging_dir = str(tmp_path / "quantllm-test-staging")
+
+ def track_hyperparameters(self, params):
+ calls["tracked"] = params
+
+ def _generate_model_card(self, format):
+ calls["card_format"] = format
+
+ def push(self, commit_message):
+ calls["pushed"] = commit_message
+
+ def save_final_model(self, *args, **kwargs):
+ raise AssertionError(
+ "save_final_model should not be called for ONNX push"
+ )
+
+ def fake_export_onnx(output_path, quantization=None, **kwargs):
+ calls["onnx_quantization"] = quantization
+ return output_path
+
+ model._export_onnx = fake_export_onnx
+
+ import quantllm.hub as hub_module
+
+ monkeypatch.setattr(hub_module, "QuantLLMHubManager", FakeManager)
+
+ model.push("user/repo")
+
+ assert calls["onnx_quantization"] is None
+ assert calls["tracked"]["quantization"] is None