diff --git a/README.md b/README.md
index b800541..0555b85 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
QuantLLM Logo - # 🚀 QuantLLM v2.0 + # 🚀 QuantLLM v2.1 (pre-release) **The Ultra-Fast LLM Quantization & Export Library** @@ -52,9 +52,12 @@ model = AutoModelForCausalLM.from_pretrained( ```python from quantllm import turbo -model = turbo("meta-llama/Llama-3-8B") # Auto-quantizes +model = turbo( + "meta-llama/Llama-3-8B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) # Auto-quantizes model.generate("Hello!") # Generate text -model.export("gguf", quantization="Q4_K_M") # Export to GGUF +model.export() # Export to GGUF with shared config ``` --- @@ -77,14 +80,17 @@ pip install "quantllm[full] @ git+https://github.com/codewithdark-git/QuantLLM.g from quantllm import turbo # Load with automatic optimization -model = turbo("meta-llama/Llama-3.2-3B") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) # Generate text response = model.generate("Explain quantum computing simply") print(response) # Export to GGUF -model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M") +model.export("gguf", "model.Q4_K_M.gguf") ``` **QuantLLM automatically:** @@ -102,11 +108,14 @@ model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M") One unified interface for everything: ```python -model = turbo("mistralai/Mistral-7B") +model = turbo( + "mistralai/Mistral-7B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) model.generate("Hello!") model.finetune(data, epochs=3) -model.export("gguf", quantization="Q4_K_M") -model.push("user/repo", format="gguf") +model.export() +model.push("user/repo") ``` ### ⚡ Performance Optimizations @@ -133,7 +142,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S ``` ╔════════════════════════════════════════════════════════════╗ -║ 🚀 QuantLLM v2.0.0 ║ +║ 🚀 QuantLLM v2.1.0rc1 ║ ║ Ultra-fast LLM Quantization & Export ║ ║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║ ╚════════════════════════════════════════════════════════════╝ @@ -148,7 +157,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S Auto-generates model cards with YAML frontmatter, usage examples, and "Use this model" button: ```python -model.push("user/my-model", format="gguf", quantization="Q4_K_M") +model.push("user/my-model") ``` --- @@ -195,7 +204,10 @@ model.export("safetensors", "./model-hf/") ```python from quantllm import turbo -model = turbo("meta-llama/Llama-3.2-3B") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) # Simple generation response = model.generate( @@ -267,8 +279,6 @@ model = turbo("meta-llama/Llama-3.2-3B") # Push with auto-generated model card model.push( "your-username/my-model", - format="gguf", - quantization="Q4_K_M", license="apache-2.0" ) ``` diff --git a/docs/api/gguf.md b/docs/api/gguf.md index 1135c5e..12f18dc 100644 --- a/docs/api/gguf.md +++ b/docs/api/gguf.md @@ -10,8 +10,11 @@ Export models to GGUF format for llama.cpp, Ollama, and LM Studio. 
from quantllm import turbo, convert_to_gguf, quantize_gguf # Method 1: Via TurboModel -model = turbo("meta-llama/Llama-3.2-3B") -model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) +model.export("gguf", "model.Q4_K_M.gguf") # Method 2: Direct conversion convert_to_gguf("meta-llama/Llama-3.2-3B", "model.Q4_K_M.gguf", quant_type="Q4_K_M") diff --git a/docs/api/hub.md b/docs/api/hub.md index 0fb1fa6..d501e56 100644 --- a/docs/api/hub.md +++ b/docs/api/hub.md @@ -10,8 +10,11 @@ Push models to HuggingFace Hub with auto-generated model cards. from quantllm import turbo, QuantLLMHubManager # Method 1: TurboModel.push() (Recommended) -model = turbo("meta-llama/Llama-3.2-3B") -model.push("user/my-model", format="gguf", quantization="Q4_K_M") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) +model.push("user/my-model") # Method 2: QuantLLMHubManager (Advanced) manager = QuantLLMHubManager("user/my-model", hf_token="hf_...") @@ -30,7 +33,7 @@ def push( self, repo_id: str, token: Optional[str] = None, - format: str = "safetensors", + format: Optional[str] = None, quantization: Optional[str] = None, license: str = "apache-2.0", commit_message: str = "Upload model via QuantLLM", @@ -44,7 +47,7 @@ def push( |-----------|------|---------|-------------| | `repo_id` | str | required | HuggingFace repo ID (user/model) | | `token` | str | None | HF token (or use HF_TOKEN env) | -| `format` | str | "safetensors" | Export format | +| `format` | str | None | Export format (uses `config["push_format"]` when omitted) | | `quantization` | str | None | Quantization type | | `license` | str | "apache-2.0" | License type | @@ -62,13 +65,14 @@ def push( ```python from quantllm import turbo -model = turbo("meta-llama/Llama-3.2-3B") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) # Push as GGUF model.push( - "your-username/llama-3.2-3b-gguf", - format="gguf", - quantization="Q4_K_M" + "your-username/llama-3.2-3b-gguf" ) # Push as ONNX diff --git a/docs/api/turbo.md b/docs/api/turbo.md index 512f8f0..ef4f0ed 100644 --- a/docs/api/turbo.md +++ b/docs/api/turbo.md @@ -14,6 +14,7 @@ def turbo( max_length: Optional[int] = None, device: Optional[str] = None, dtype: Optional[str] = None, + config: Optional[Dict[str, Any]] = None, quantize: bool = True, trust_remote_code: bool = False, verbose: bool = True, @@ -32,6 +33,7 @@ def turbo( | `max_length` | int | auto | Maximum context length | | `device` | str | auto | Device ("cuda", "cpu", "cuda:0", "auto") | | `dtype` | str | auto | Data type ("float16", "bfloat16") | +| `config` | dict | None | Shared export/push defaults (`format`, `quantization`, `push_format`, `push_quantization`) | | `quantize` | bool | True | Whether to apply quantization | | `trust_remote_code` | bool | False | Trust remote code in model | | `verbose` | bool | True | Show loading progress and stats | @@ -124,7 +126,7 @@ When `verbose=True` (default), you'll see: ``` ╔════════════════════════════════════════════════════════════╗ -║ 🚀 QuantLLM v2.0.0 ║ +║ 🚀 QuantLLM v2.1.0rc1 ║ ╚════════════════════════════════════════════════════════════╝ 📊 Loading: meta-llama/Llama-3.2-3B diff --git a/docs/api/turbomodel.md b/docs/api/turbomodel.md index 3542463..d091958 100644 --- a/docs/api/turbomodel.md +++ b/docs/api/turbomodel.md 
@@ -232,8 +232,8 @@ Export the model to various formats. ```python def export( self, - format: str, - output_path: str, + format: Optional[str] = None, + output_path: Optional[str] = None, quantization: Optional[str] = None, **kwargs ) -> str @@ -241,14 +241,18 @@ def export( | Parameter | Type | Description | |-----------|------|-------------| -| `format` | str | "gguf", "onnx", "mlx", "safetensors" | -| `output_path` | str | Output file or directory | +| `format` | str | "gguf", "onnx", "mlx", "safetensors" (optional, uses shared config) | +| `output_path` | str | Output file or directory (optional) | | `quantization` | str | Quantization type (format-specific) | **Examples:** ```python # GGUF -model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) +model.export() # ONNX model.export("onnx", "./model-onnx/") @@ -269,7 +273,7 @@ def push( self, repo_id: str, token: Optional[str] = None, - format: str = "safetensors", + format: Optional[str] = None, quantization: Optional[str] = None, license: str = "apache-2.0", commit_message: str = "Upload model via QuantLLM", @@ -281,9 +285,7 @@ def push( ```python # Push as GGUF model.push( - "your-username/my-model", - format="gguf", - quantization="Q4_K_M" + "your-username/my-model" ) # Push as MLX diff --git a/docs/conf.py b/docs/conf.py index dacb626..ebd8a42 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,7 +3,7 @@ project = 'QuantLLM' copyright = '2024, Dark Coder' author = 'Dark Coder' -release = '2.0.0' +release = '2.1.0rc1' # Extensions extensions = [ @@ -21,7 +21,7 @@ # HTML output html_theme = 'sphinx_rtd_theme' html_static_path = ['_static'] -html_title = 'QuantLLM v2.0' +html_title = 'QuantLLM v2.1' html_logo = 'images/logo.png' html_favicon = 'images/favicon.ico' diff --git a/docs/guide/finetuning.md b/docs/guide/finetuning.md index 654debf..3dd29e4 100644 --- a/docs/guide/finetuning.md +++ b/docs/guide/finetuning.md @@ -193,13 +193,13 @@ print("Fine-tuned:", model.generate("prompt")) ```python # Export to GGUF -model.export("gguf", "finetuned.Q4_K_M.gguf", quantization="Q4_K_M") +model.export("gguf", "finetuned.Q4_K_M.gguf") # Export to SafeTensors model.export("safetensors", "./finetuned-model/") # Push to HuggingFace -model.push("your-username/finetuned-model", format="gguf") +model.push("your-username/finetuned-model") ``` ### Save and Load diff --git a/docs/guide/gguf-export.md b/docs/guide/gguf-export.md index 0c61f8d..35329d5 100644 --- a/docs/guide/gguf-export.md +++ b/docs/guide/gguf-export.md @@ -130,10 +130,12 @@ print(output["choices"][0]["text"]) Export and push in one step: ```python +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) model.push( "your-username/my-model-gguf", - format="gguf", - quantization="Q4_K_M", license="apache-2.0" ) ``` diff --git a/docs/guide/hub-integration.md b/docs/guide/hub-integration.md index 176976a..c0887c6 100644 --- a/docs/guide/hub-integration.md +++ b/docs/guide/hub-integration.md @@ -11,14 +11,15 @@ The easiest way to share your model: ```python from quantllm import turbo -model = turbo("meta-llama/Llama-3.2-3B") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) # Push with auto-generated model card model.push( "your-username/my-model", - token="hf_...", - format="gguf", - quantization="Q4_K_M" + 
token="hf_..." ) ``` @@ -49,34 +50,18 @@ model.push("user/repo", token="hf_...") ```python from quantllm import turbo -model = turbo("meta-llama/Llama-3.2-3B") - -# Push as GGUF (for Ollama, llama.cpp, LM Studio) -model.push( - "your-username/my-model-gguf", - format="gguf", - quantization="Q4_K_M", - license="apache-2.0" +model = turbo( + "meta-llama/Llama-3.2-3B", + config={ + "format": "gguf", + "quantization": "Q4_K_M", + "push_format": "gguf", + }, ) -# Push as ONNX -model.push( - "your-username/my-model-onnx", - format="onnx" -) - -# Push as MLX (Apple Silicon) -model.push( - "your-username/my-model-mlx", - format="mlx", - quantization="4bit" -) - -# Push as SafeTensors (default) -model.push( - "your-username/my-model", - format="safetensors" -) +# Uses shared config defaults +model.export() +model.push("your-username/my-model-gguf", license="apache-2.0") ``` ### Method 2: QuantLLMHubManager (Advanced) diff --git a/docs/index.md b/docs/index.md index 7fc9f0c..63a78d3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,24 +8,27 @@ --- -## Welcome to QuantLLM v2.0 +## Welcome to QuantLLM v2.1 (pre-release) QuantLLM makes working with large language models simple. Load any model, quantize it automatically, fine-tune with your data, and export to any format — all with just a few lines of code. ```python from quantllm import turbo -# Load with automatic 4-bit quantization -model = turbo("meta-llama/Llama-3.2-3B") +# Load with shared export/push defaults +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) # Generate text print(model.generate("Explain quantum computing")) # Export to GGUF for Ollama/llama.cpp -model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M") +model.export() # Push to HuggingFace with auto-generated model card -model.push("username/my-model", format="gguf", quantization="Q4_K_M") +model.push("username/my-model") ``` --- @@ -89,7 +92,11 @@ model = turbo("microsoft/phi-3-mini") ### Export to Any Format ```python -model.export("gguf", "model.gguf", quantization="Q4_K_M") +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) +model.export() model.export("onnx", "./model-onnx/") model.export("mlx", "./model-mlx/", quantization="4bit") ``` @@ -101,7 +108,7 @@ model.finetune("training_data.json", epochs=3) ### Push to HuggingFace ```python -model.push("username/my-model", format="gguf") +model.push("username/my-model") ``` --- diff --git a/docs/installation.md b/docs/installation.md index 15f4c7c..478ee94 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -90,7 +90,7 @@ Expected output: ``` ╔════════════════════════════════════════════════════════════╗ ║ ║ -║ 🚀 QuantLLM v2.0.0 ║ +║ 🚀 QuantLLM v2.1.0rc1 ║ ║ Ultra-fast LLM Quantization & Export ║ ║ ║ ║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║ diff --git a/docs/quickstart.md b/docs/quickstart.md index 7382236..8050787 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -128,10 +128,12 @@ Share your model with the world: ```python # Push with auto-generated model card +model = turbo( + "meta-llama/Llama-3.2-3B", + config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, +) model.push( "your-username/my-awesome-model", - format="gguf", - quantization="Q4_K_M", license="apache-2.0" ) ``` @@ -196,7 +198,7 @@ quantllm.show_banner() ``` ╔════════════════════════════════════════════════════════════╗ ║ ║ -║ 🚀 QuantLLM v2.0.0 ║ +║ 🚀 QuantLLM 
v2.1.0rc1 ║ ║ Ultra-fast LLM Quantization & Export ║ ║ ║ ║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║ diff --git a/examples/01_quickstart.py b/examples/01_quickstart.py index 563a6df..a40e128 100644 --- a/examples/01_quickstart.py +++ b/examples/01_quickstart.py @@ -1,5 +1,5 @@ """ -QuantLLM v2.0 - Quick Start Example +QuantLLM v2.1 - Quick Start Example The simplest way to use QuantLLM. """ diff --git a/examples/02_gguf_export.py b/examples/02_gguf_export.py index 0efa2a7..37fee24 100644 --- a/examples/02_gguf_export.py +++ b/examples/02_gguf_export.py @@ -1,5 +1,5 @@ """ -QuantLLM v2.0 - GGUF Export Example +QuantLLM v2.1 - GGUF Export Example Export models to GGUF format for use with llama.cpp, Ollama, LM Studio. No external dependencies required! diff --git a/examples/03_finetuning.py b/examples/03_finetuning.py index c2021bd..f254a7f 100644 --- a/examples/03_finetuning.py +++ b/examples/03_finetuning.py @@ -1,5 +1,5 @@ """ -QuantLLM v2.0 - Fine-tuning Example +QuantLLM v2.1 - Fine-tuning Example Fine-tune a quantized model using LoRA. """ diff --git a/examples/04_hub_push.py b/examples/04_hub_push.py index af1ef3b..6a438c9 100644 --- a/examples/04_hub_push.py +++ b/examples/04_hub_push.py @@ -1,5 +1,5 @@ """ -QuantLLM v2.0 - Push to HuggingFace Hub +QuantLLM v2.1 - Push to HuggingFace Hub Push your models to HuggingFace Hub. """ diff --git a/examples/README.md b/examples/README.md index b6a0e68..810f7d7 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,4 +1,4 @@ -# QuantLLM v2.0 Examples +# QuantLLM v2.1 Examples Simple examples for the new TurboModel API. diff --git a/quantllm/__init__.py b/quantllm/__init__.py index 5c81eb6..6f2933b 100644 --- a/quantllm/__init__.py +++ b/quantllm/__init__.py @@ -1,5 +1,5 @@ """ -QuantLLM v2.0 - Ultra-fast LLM Quantization & GGUF Export +QuantLLM v2.1 - Ultra-fast LLM Quantization & GGUF Export The simplest way to load, quantize, fine-tune, and export LLMs. @@ -13,16 +13,19 @@ >>> from quantllm import turbo >>> >>> # Load any model (auto-quantizes to 4-bit) - >>> model = turbo("meta-llama/Llama-3.2-3B") + >>> model = turbo( + ... "meta-llama/Llama-3.2-3B", + ... config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}, + ... 
) >>> >>> # Generate text >>> model.generate("Hello, world!") >>> >>> # Export to GGUF with Q4_K_M quantization - >>> model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M") + >>> model.export() >>> >>> # Push to HuggingFace Hub - >>> model.push("username/my-model", format="gguf", quantization="Q4_K_M") + >>> model.push("username/my-model") """ import os @@ -73,7 +76,7 @@ # Configure logging (minimal by default) configure_logging("WARNING") -__version__ = "2.0.0" +__version__ = "2.1.0rc1" __title__ = "QuantLLM" __description__ = "Ultra-fast LLM Quantization & Export (GGUF, ONNX, MLX)" __author__ = "Dark Coder" diff --git a/quantllm/core/export.py b/quantllm/core/export.py index 40f517a..05dcbb5 100644 --- a/quantllm/core/export.py +++ b/quantllm/core/export.py @@ -1,5 +1,5 @@ """ -Universal Export Module for QuantLLM v2.0 +Universal Export Module for QuantLLM v2.1 Provides unified export functionality to multiple formats: - GGUF (llama.cpp, Ollama, LM Studio) diff --git a/quantllm/core/memory.py b/quantllm/core/memory.py index bed8196..43298b8 100644 --- a/quantllm/core/memory.py +++ b/quantllm/core/memory.py @@ -1,5 +1,5 @@ """ -Memory Optimization Utilities for QuantLLM v2.0 +Memory Optimization Utilities for QuantLLM v2.1 Advanced memory management for training and inference of large models on limited GPU memory. diff --git a/quantllm/core/training.py b/quantllm/core/training.py index 053ec99..7ab9932 100644 --- a/quantllm/core/training.py +++ b/quantllm/core/training.py @@ -1,5 +1,5 @@ """ -Advanced Training Utilities for QuantLLM v2.0 +Advanced Training Utilities for QuantLLM v2.1 Provides auto-configuration and optimization for fine-tuning with minimal user input. diff --git a/quantllm/core/turbo_model.py b/quantllm/core/turbo_model.py index c04de1d..53ec668 100644 --- a/quantllm/core/turbo_model.py +++ b/quantllm/core/turbo_model.py @@ -26,6 +26,12 @@ from .memory import memory_optimized_tensor_order DEFAULT_CHUNKED_SHARD_SIZE = "2GB" +DEFAULT_EXPORT_PUSH_CONFIG = { + "format": "safetensors", + "push_format": "safetensors", + "quantization": "Q4_K_M", + "push_quantization": None, +} class TurboModel: @@ -57,6 +63,7 @@ def __init__( model: PreTrainedModel, tokenizer: PreTrainedTokenizer, config: SmartConfig, + export_push_config: Optional[Dict[str, Any]] = None, verbose: bool = False, ): """ @@ -73,9 +80,7 @@ def __init__( self._is_quantized = False self._is_finetuned = False self._lora_applied = False - self._is_quantized = False - self._is_finetuned = False - self._lora_applied = False + self.export_push_config = self._build_export_push_config(export_push_config) self.verbose = verbose @classmethod @@ -92,6 +97,7 @@ def from_pretrained( trust_remote_code: bool = True, quantize: bool = True, config_override: Optional[Dict[str, Any]] = None, + config: Optional[Dict[str, Any]] = None, verbose: bool = True, ) -> "TurboModel": """ @@ -112,8 +118,7 @@ def from_pretrained( trust_remote_code: Trust remote code in model quantize: Whether to quantize the model config_override: Dict to override any auto-detected settings - quantize: Whether to quantize the model - config_override: Dict to override any auto-detected settings + config: Shared export/push config (format, quantization, push_format, etc.) 
verbose: Print loading progress Returns: @@ -268,7 +273,7 @@ def from_pretrained( print_success("Model loaded successfully!") logger.info("") - instance = cls(model, tokenizer, smart_config) + instance = cls(model, tokenizer, smart_config, export_push_config=config) instance._is_quantized = quantize and smart_config.bits < 16 return instance @@ -494,6 +499,30 @@ def _get_quantization_kwargs(config: SmartConfig) -> Dict[str, Any]: except ImportError: logger.warning("⚠ bitsandbytes not installed, loading without quantization") return {} + + @staticmethod + def _build_export_push_config(config: Optional[Dict[str, Any]]) -> Dict[str, Any]: + """Build shared export/push config with deterministic defaults.""" + resolved = dict(DEFAULT_EXPORT_PUSH_CONFIG) + if config: + aliases = { + "export_format": "format", + "export_quantization": "quantization", + } + nullable_overrides = {"push_quantization"} + for key, value in config.items(): + mapped_key = aliases.get(key, key) + if mapped_key in resolved and ( + value is not None or mapped_key in nullable_overrides + ): + resolved[mapped_key] = value + + if "format" in config and "push_format" not in config: + resolved["push_format"] = resolved["format"] + if "quantization" in config and "push_quantization" not in config: + resolved["push_quantization"] = resolved["quantization"] + + return resolved @staticmethod def _enable_flash_attention(model: PreTrainedModel, verbose: bool = True) -> None: @@ -945,7 +974,7 @@ def tokenize_function(examples): def export( self, - format: str, + format: Optional[str] = None, output_path: Optional[str] = None, *, quantization: Optional[str] = None, @@ -961,7 +990,7 @@ def export( - "mlx": For Apple Silicon Macs Args: - format: Target format (gguf, safetensors, onnx, mlx) + format: Target format (gguf, safetensors, onnx, mlx). Uses shared config when omitted. output_path: Output file/directory path quantization: Format-specific quantization: - GGUF: Q4_K_M, Q5_K_M, Q8_0, etc. @@ -978,7 +1007,16 @@ def export( >>> model.export("onnx", "./my_model_onnx/") >>> model.export("mlx", "./my_model_mlx/", quantization="4bit") """ - format = format.lower() + format = ( + format + if format is not None + else self.export_push_config.get("format", DEFAULT_EXPORT_PUSH_CONFIG["format"]) + ).lower() + effective_quantization = quantization + if effective_quantization is None and format == "gguf": + effective_quantization = self.export_push_config.get( + "quantization", DEFAULT_EXPORT_PUSH_CONFIG["quantization"] + ) # Merge LoRA if applied if self._lora_applied: @@ -991,7 +1029,7 @@ def export( if output_path is None: model_name = self.model.config._name_or_path.split('/')[-1] if format == "gguf": - quant = quantization or self.config.quant_type or "q4_k_m" + quant = effective_quantization output_path = f"{model_name}.{quant.upper()}.gguf" elif format == "safetensors": output_path = f"./{model_name}-quantllm/" @@ -1012,7 +1050,7 @@ def export( raise ValueError(f"Unknown format: {format}. 
Supported: {list(exporters.keys())}") print_header(f"Exporting to {format.upper()}") - result = exporters[format](output_path, quantization=quantization, **kwargs) + result = exporters[format](output_path, quantization=effective_quantization, **kwargs) print_success(f"Exported to: {result}") return result @@ -1021,7 +1059,7 @@ def push_to_hub( self, repo_id: str, token: Optional[str] = None, - format: str = "safetensors", + format: Optional[str] = None, quantization: Optional[str] = None, commit_message: str = "Upload model via QuantLLM", license: str = "apache-2.0", @@ -1052,7 +1090,14 @@ def push_to_hub( """ from ..hub import QuantLLMHubManager - format_lower = format.lower() + format_lower = ( + format + if format is not None + else self.export_push_config.get("push_format", DEFAULT_EXPORT_PUSH_CONFIG["push_format"]) + ).lower() + push_quantization = quantization or self.export_push_config.get( + "push_quantization", DEFAULT_EXPORT_PUSH_CONFIG["push_quantization"] + ) # Get the original base model name (full path for HuggingFace link) base_model_full = self.model.config._name_or_path @@ -1066,7 +1111,9 @@ def push_to_hub( if format_lower == "gguf": # Export GGUF directly to staging - quant_label = quantization or (self.config.quant_type if self.config.quant_type != "GGUF" else "q4_k_m") or "q4_k_m" + quant_label = push_quantization or self.export_push_config.get( + "quantization", DEFAULT_EXPORT_PUSH_CONFIG["quantization"] + ) filename = f"{model_name}.{quant_label.upper()}.gguf" save_path = os.path.join(manager.staging_dir, filename) @@ -1085,11 +1132,11 @@ def push_to_hub( print_info("Exporting to ONNX format...") save_path = manager.staging_dir - self._export_onnx(save_path, quantization=quantization, **kwargs) + self._export_onnx(save_path, quantization=push_quantization, **kwargs) manager.track_hyperparameters({ "format": "onnx", - "quantization": quantization, + "quantization": push_quantization, "base_model": base_model_full, "license": license, }) @@ -1100,11 +1147,11 @@ def push_to_hub( print_info("Exporting to MLX format...") save_path = manager.staging_dir - self._export_mlx(save_path, quantization=quantization, **kwargs) + self._export_mlx(save_path, quantization=push_quantization, **kwargs) manager.track_hyperparameters({ "format": "mlx", - "quantization": quantization, + "quantization": push_quantization, "base_model": base_model_full, "license": license, }) @@ -1117,7 +1164,7 @@ def push_to_hub( "base_model": base_model_full, "license": license, }) - manager.save_final_model(self, format=format) + manager.save_final_model(self, format=format_lower) manager._generate_model_card(format=format_lower) manager.push(commit_message=commit_message) @@ -1852,6 +1899,7 @@ def turbo( max_length: Optional[int] = None, device: Optional[str] = None, dtype: Optional[str] = None, + config: Optional[Dict[str, Any]] = None, **kwargs, ) -> TurboModel: """ @@ -1866,6 +1914,7 @@ def turbo( max_length: Override max sequence length (default: auto) device: Override device (default: best GPU) dtype: Override dtype (default: bf16/fp16) + config: Shared export/push config (format, quantization, push_format, etc.) 
**kwargs: Additional options passed to from_pretrained Returns: @@ -1896,5 +1945,6 @@ def turbo( max_length=max_length, device=device, dtype=dtype, + config=config, **kwargs, ) diff --git a/quantllm/hub/model_card.py b/quantllm/hub/model_card.py index 33a887f..66d8513 100644 --- a/quantllm/hub/model_card.py +++ b/quantllm/hub/model_card.py @@ -427,7 +427,7 @@ def _generate_details_section(self) -> str: | **Quantization** | {self.quantization or "Full Precision"} | | **License** | `{self.license}` | | **Export Date** | {datetime.now().strftime("%Y-%m-%d")} | -| **Exported By** | [QuantLLM v2.0](https://github.com/codewithdark-git/QuantLLM) | +| **Exported By** | [QuantLLM v2.1](https://github.com/codewithdark-git/QuantLLM) | ''' def _generate_quantization_section(self) -> str: diff --git a/requirements.txt b/requirements.txt index 4a67ebe..ef5e1c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -# QuantLLM v2.0 Requirements +# QuantLLM v2.1 (pre-release) Requirements # Core dependencies torch>=2.0.0 diff --git a/setup.py b/setup.py index c858e64..eb2ebc8 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name="quantllm", - version="2.0.0", + version="2.1.0rc1", author="Dark Coder", author_email="codewithdark90@gmail.com", description="Ultra-fast LLM quantization, fine-tuning, and deployment with one line of code", @@ -117,4 +117,4 @@ }, include_package_data=True, zip_safe=False, -) \ No newline at end of file +) diff --git a/tests/test_export_push_config.py b/tests/test_export_push_config.py new file mode 100644 index 0000000..dba32bf --- /dev/null +++ b/tests/test_export_push_config.py @@ -0,0 +1,164 @@ +from types import SimpleNamespace + +from quantllm.core.turbo_model import TurboModel + + +def _stub_model(name: str = "org/test-model"): + return SimpleNamespace(config=SimpleNamespace(_name_or_path=name)) + + +def _stub_turbo(export_push_config): + model = TurboModel.__new__(TurboModel) + model.model = _stub_model() + model.tokenizer = None + smart_config = SimpleNamespace(quant_type="Q8_0") + model.config = smart_config + model._lora_applied = False + model.verbose = False + model.export_push_config = export_push_config + return model + + +def test_build_export_push_config_uses_deterministic_defaults(): + resolved = TurboModel._build_export_push_config(None) + assert resolved["format"] == "safetensors" + assert resolved["push_format"] == "safetensors" + assert resolved["quantization"] == "Q4_K_M" + assert resolved["push_quantization"] is None + + +def test_build_export_push_config_aligns_push_values_with_export_values(): + resolved = TurboModel._build_export_push_config( + {"format": "gguf", "quantization": "Q5_K_M"} + ) + assert resolved["format"] == "gguf" + assert resolved["push_format"] == "gguf" + assert resolved["quantization"] == "Q5_K_M" + assert resolved["push_quantization"] == "Q5_K_M" + + +def test_build_export_push_config_allows_nullable_push_quantization_override(): + resolved = TurboModel._build_export_push_config( + {"format": "gguf", "quantization": "Q5_K_M", "push_quantization": None} + ) + assert resolved["quantization"] == "Q5_K_M" + assert resolved["push_quantization"] is None + + +def test_export_prefers_shared_quantization_over_smart_config_quant_type(): + model = _stub_turbo( + { + "format": "gguf", + "push_format": "gguf", + "quantization": "Q4_K_M", + "push_quantization": "Q4_K_M", + } + ) + + captured = {} + + def fake_export_gguf(output_path, quantization=None, **kwargs): + captured["output_path"] = output_path + 
captured["quantization"] = quantization + return output_path + + model._export_gguf = fake_export_gguf + model._export_safetensors = lambda *args, **kwargs: "" + model._export_onnx = lambda *args, **kwargs: "" + model._export_mlx = lambda *args, **kwargs: "" + + output = model.export() + + assert model.config.quant_type == "Q8_0" + assert output.endswith(".Q4_K_M.gguf") + assert captured["quantization"] == "Q4_K_M" + + +def test_gguf_push_uses_shared_config_when_omitted(monkeypatch, tmp_path): + model = _stub_turbo({ + "format": "gguf", + "push_format": "gguf", + "quantization": "Q4_K_M", + "push_quantization": "Q4_K_M", + }) + + calls = {} + + def fake_export(*, format, output_path, quantization=None, **kwargs): + calls["export"] = { + "format": format, + "output_path": output_path, + "quantization": quantization, + } + return output_path + + model.export = fake_export + + class FakeManager: + def __init__(self, repo_id, hf_token=None): + self.staging_dir = str(tmp_path / "quantllm-test-staging") + + def track_hyperparameters(self, params): + calls["tracked"] = params + + def _generate_model_card(self, format): + calls["card_format"] = format + + def push(self, commit_message): + calls["pushed"] = commit_message + + def save_final_model(self, *args, **kwargs): + raise AssertionError( + "save_final_model should not be called for GGUF push" + ) + + import quantllm.hub as hub_module + + monkeypatch.setattr(hub_module, "QuantLLMHubManager", FakeManager) + + model.push("user/repo") + + assert calls["export"]["format"] == "gguf" + assert calls["export"]["quantization"] == "Q4_K_M" + assert calls["tracked"]["quantization"] == "Q4_K_M" + + +def test_onnx_push_does_not_force_quantization(monkeypatch, tmp_path): + model = _stub_turbo( + TurboModel._build_export_push_config({"push_format": "onnx"}) + ) + + calls = {} + + class FakeManager: + def __init__(self, repo_id, hf_token=None): + self.staging_dir = str(tmp_path / "quantllm-test-staging") + + def track_hyperparameters(self, params): + calls["tracked"] = params + + def _generate_model_card(self, format): + calls["card_format"] = format + + def push(self, commit_message): + calls["pushed"] = commit_message + + def save_final_model(self, *args, **kwargs): + raise AssertionError( + "save_final_model should not be called for ONNX push" + ) + + def fake_export_onnx(output_path, quantization=None, **kwargs): + calls["onnx_quantization"] = quantization + return output_path + + model._export_onnx = fake_export_onnx + + import quantllm.hub as hub_module + + monkeypatch.setattr(hub_module, "QuantLLMHubManager", FakeManager) + + model.push("user/repo") + + assert calls["onnx_quantization"] is None + assert calls["tracked"]["quantization"] is None