36 changes: 23 additions & 13 deletions README.md
@@ -1,7 +1,7 @@
<div align="center">
<img src="docs/images/1.png" alt="QuantLLM Logo" />

# 🚀 QuantLLM v2.0
# 🚀 QuantLLM v2.1 (pre-release)

**The Ultra-Fast LLM Quantization & Export Library**

@@ -52,9 +52,12 @@ model = AutoModelForCausalLM.from_pretrained(
```python
from quantllm import turbo

model = turbo("meta-llama/Llama-3-8B") # Auto-quantizes
model = turbo(
"meta-llama/Llama-3-8B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
) # Auto-quantizes
model.generate("Hello!") # Generate text
model.export("gguf", quantization="Q4_K_M") # Export to GGUF
model.export() # Export to GGUF with shared config
```

---
@@ -77,14 +80,17 @@ pip install "quantllm[full] @ git+https://github.com/codewithdark-git/QuantLLM.g
from quantllm import turbo

# Load with automatic optimization
model = turbo("meta-llama/Llama-3.2-3B")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)

# Generate text
response = model.generate("Explain quantum computing simply")
print(response)

# Export to GGUF
model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
model.export("gguf", "model.Q4_K_M.gguf")
```

**QuantLLM automatically:**
@@ -102,11 +108,14 @@ model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
One unified interface for everything:

```python
model = turbo("mistralai/Mistral-7B")
model = turbo(
"mistralai/Mistral-7B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.generate("Hello!")
model.finetune(data, epochs=3)
model.export("gguf", quantization="Q4_K_M")
model.push("user/repo", format="gguf")
model.export()
model.push("user/repo")
```

### ⚡ Performance Optimizations
@@ -133,7 +142,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S

```
╔════════════════════════════════════════════════════════════╗
║ 🚀 QuantLLM v2.0.0
║ 🚀 QuantLLM v2.1.0rc1
║ Ultra-fast LLM Quantization & Export ║
║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║
╚════════════════════════════════════════════════════════════╝
@@ -148,7 +157,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S
Auto-generates model cards with YAML frontmatter, usage examples, and a "Use this model" button:

```python
model.push("user/my-model", format="gguf", quantization="Q4_K_M")
model.push("user/my-model")
```

---
@@ -195,7 +204,10 @@ model.export("safetensors", "./model-hf/")
```python
from quantllm import turbo

model = turbo("meta-llama/Llama-3.2-3B")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)

# Simple generation
response = model.generate(
@@ -267,8 +279,6 @@ model = turbo("meta-llama/Llama-3.2-3B")
# Push with auto-generated model card
model.push(
"your-username/my-model",
format="gguf",
quantization="Q4_K_M",
license="apache-2.0"
)
```
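The thread running through these README changes is that `turbo()` now takes a shared `config` dict whose entries act as defaults for later `export()` and `push()` calls, with explicit keyword arguments still taking precedence. A minimal sketch of that resolution order — the helper `_resolve_option` is illustrative only, not part of the QuantLLM API:

```python
from typing import Any, Dict, Optional

def _resolve_option(
    explicit: Optional[str],
    config: Dict[str, Any],
    key: str,
    default: Optional[str] = None,
) -> Optional[str]:
    """Explicit argument wins, then the shared config entry, then the default."""
    return explicit if explicit is not None else config.get(key, default)

# export() with no arguments resolves config["format"] / config["quantization"];
# push() resolves config["push_format"] / config["push_quantization"].
config = {"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"}
assert _resolve_option(None, config, "format") == "gguf"
assert _resolve_option("onnx", config, "format") == "onnx"  # override still wins
```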
7 changes: 5 additions & 2 deletions docs/api/gguf.md
@@ -10,8 +10,11 @@ Export models to GGUF format for llama.cpp, Ollama, and LM Studio.
from quantllm import turbo, convert_to_gguf, quantize_gguf

# Method 1: Via TurboModel
model = turbo("meta-llama/Llama-3.2-3B")
model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.export("gguf", "model.Q4_K_M.gguf")

# Method 2: Direct conversion
convert_to_gguf("meta-llama/Llama-3.2-3B", "model.Q4_K_M.gguf", quant_type="Q4_K_M")
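The exported `model.Q4_K_M.gguf` can then be served by any llama.cpp-compatible runtime. A short usage sketch with `llama-cpp-python`, which is a separate install rather than a QuantLLM dependency:

```python
# pip install llama-cpp-python  (separate from QuantLLM)
from llama_cpp import Llama

llm = Llama(model_path="model.Q4_K_M.gguf", n_ctx=2048)
output = llm("Explain quantum computing simply", max_tokens=128)
print(output["choices"][0]["text"])
```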
20 changes: 12 additions & 8 deletions docs/api/hub.md
@@ -10,8 +10,11 @@ Push models to HuggingFace Hub with auto-generated model cards.
from quantllm import turbo, QuantLLMHubManager

# Method 1: TurboModel.push() (Recommended)
model = turbo("meta-llama/Llama-3.2-3B")
model.push("user/my-model", format="gguf", quantization="Q4_K_M")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.push("user/my-model")

# Method 2: QuantLLMHubManager (Advanced)
manager = QuantLLMHubManager("user/my-model", hf_token="hf_...")
@@ -30,7 +33,7 @@ def push(
self,
repo_id: str,
token: Optional[str] = None,
format: str = "safetensors",
format: Optional[str] = None,
quantization: Optional[str] = None,
license: str = "apache-2.0",
commit_message: str = "Upload model via QuantLLM",
@@ -44,7 +47,7 @@ def push(
|-----------|------|---------|-------------|
| `repo_id` | str | required | HuggingFace repo ID (user/model) |
| `token` | str | None | HF token (or use HF_TOKEN env) |
| `format` | str | "safetensors" | Export format |
| `format` | str | None | Export format (uses `config["push_format"]` when omitted) |
| `quantization` | str | None | Quantization type |
| `license` | str | "apache-2.0" | License type |
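
Because `token` defaults to `None`, authentication can come from the `HF_TOKEN` environment variable instead of the call site, and `format` can be omitted entirely once `config["push_format"]` is set. A small sketch of that minimal call — the token value is a placeholder:

```python
import os

os.environ["HF_TOKEN"] = "hf_..."  # placeholder; or export it in your shell

# No token/format arguments: the token comes from HF_TOKEN,
# format and quantization from the shared config.
model.push("user/my-model")
```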

@@ -62,13 +65,14 @@ def push(
```python
from quantllm import turbo

model = turbo("meta-llama/Llama-3.2-3B")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)

# Push as GGUF
model.push(
"your-username/llama-3.2-3b-gguf",
format="gguf",
quantization="Q4_K_M"
"your-username/llama-3.2-3b-gguf"
)

# Push as ONNX
4 changes: 3 additions & 1 deletion docs/api/turbo.md
@@ -14,6 +14,7 @@ def turbo(
max_length: Optional[int] = None,
device: Optional[str] = None,
dtype: Optional[str] = None,
config: Optional[Dict[str, Any]] = None,
quantize: bool = True,
trust_remote_code: bool = False,
verbose: bool = True,
@@ -32,6 +33,7 @@ def turbo(
| `max_length` | int | auto | Maximum context length |
| `device` | str | auto | Device ("cuda", "cpu", "cuda:0", "auto") |
| `dtype` | str | auto | Data type ("float16", "bfloat16") |
| `config` | dict | None | Shared export/push defaults (`format`, `quantization`, `push_format`, `push_quantization`) |
| `quantize` | bool | True | Whether to apply quantization |
| `trust_remote_code` | bool | False | Trust remote code in model |
| `verbose` | bool | True | Show loading progress and stats |
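
The export and push defaults are independent keys, so a model can be exported in one format locally while pushed in another. An illustrative call using all four documented config keys (the values are examples, not requirements):

```python
from quantllm import turbo

model = turbo(
    "meta-llama/Llama-3.2-3B",
    config={
        "format": "gguf",               # default for model.export()
        "quantization": "Q4_K_M",       # default export quantization
        "push_format": "gguf",          # default for model.push()
        "push_quantization": "Q4_K_M",  # default push quantization
    },
)
```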
@@ -124,7 +126,7 @@ When `verbose=True` (default), you'll see:

```
╔════════════════════════════════════════════════════════════╗
║ 🚀 QuantLLM v2.0.0
║ 🚀 QuantLLM v2.1.0rc1
╚════════════════════════════════════════════════════════════╝

📊 Loading: meta-llama/Llama-3.2-3B
20 changes: 11 additions & 9 deletions docs/api/turbomodel.md
@@ -232,23 +232,27 @@ Export the model to various formats.
```python
def export(
self,
format: str,
output_path: str,
format: Optional[str] = None,
output_path: Optional[str] = None,
quantization: Optional[str] = None,
**kwargs
) -> str
```

| Parameter | Type | Description |
|-----------|------|-------------|
| `format` | str | "gguf", "onnx", "mlx", "safetensors" |
| `output_path` | str | Output file or directory |
| `format` | str | "gguf", "onnx", "mlx", "safetensors" (optional, uses shared config) |
| `output_path` | str | Output file or directory (optional) |
| `quantization` | str | Quantization type (format-specific) |

**Examples:**
```python
# GGUF
model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.export()

# ONNX
model.export("onnx", "./model-onnx/")
@@ -269,7 +273,7 @@ def push(
self,
repo_id: str,
token: Optional[str] = None,
format: str = "safetensors",
format: Optional[str] = None,
quantization: Optional[str] = None,
license: str = "apache-2.0",
commit_message: str = "Upload model via QuantLLM",
@@ -281,9 +285,7 @@ def push(
```python
# Push as GGUF
model.push(
"your-username/my-model",
format="gguf",
quantization="Q4_K_M"
"your-username/my-model"
)

# Push as MLX
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -3,7 +3,7 @@
project = 'QuantLLM'
copyright = '2024, Dark Coder'
author = 'Dark Coder'
release = '2.0.0'
release = '2.1.0rc1'

# Extensions
extensions = [
@@ -21,7 +21,7 @@
# HTML output
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']
html_title = 'QuantLLM v2.0'
html_title = 'QuantLLM v2.1'
html_logo = 'images/logo.png'
html_favicon = 'images/favicon.ico'

4 changes: 2 additions & 2 deletions docs/guide/finetuning.md
@@ -193,13 +193,13 @@ print("Fine-tuned:", model.generate("prompt"))

```python
# Export to GGUF
model.export("gguf", "finetuned.Q4_K_M.gguf", quantization="Q4_K_M")
model.export("gguf", "finetuned.Q4_K_M.gguf")

# Export to SafeTensors
model.export("safetensors", "./finetuned-model/")

# Push to HuggingFace
model.push("your-username/finetuned-model", format="gguf")
model.push("your-username/finetuned-model")
```
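With the shared config set at load time, the fine-tune → export → push flow above needs no per-call format arguments. A condensed sketch, assuming `data` is a dataset in whatever form `finetune()` accepts:

```python
from quantllm import turbo

model = turbo(
    "meta-llama/Llama-3.2-3B",
    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.finetune(data, epochs=3)                 # data: your training set
model.export("gguf", "finetuned.Q4_K_M.gguf")  # quantization from config
model.push("your-username/finetuned-model")    # format from config
```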

### Save and Load
Expand Down
6 changes: 4 additions & 2 deletions docs/guide/gguf-export.md
@@ -130,10 +130,12 @@ print(output["choices"][0]["text"])
Export and push in one step:

```python
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.push(
"your-username/my-model-gguf",
format="gguf",
quantization="Q4_K_M",
license="apache-2.0"
)
```
45 changes: 15 additions & 30 deletions docs/guide/hub-integration.md
@@ -11,14 +11,15 @@ The easiest way to share your model:
```python
from quantllm import turbo

model = turbo("meta-llama/Llama-3.2-3B")
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)

# Push with auto-generated model card
model.push(
"your-username/my-model",
token="hf_...",
format="gguf",
quantization="Q4_K_M"
token="hf_..."
)
```

@@ -49,34 +50,18 @@ model.push("user/repo", token="hf_...")
```python
from quantllm import turbo

model = turbo("meta-llama/Llama-3.2-3B")

# Push as GGUF (for Ollama, llama.cpp, LM Studio)
model.push(
"your-username/my-model-gguf",
format="gguf",
quantization="Q4_K_M",
license="apache-2.0"
model = turbo(
"meta-llama/Llama-3.2-3B",
config={
"format": "gguf",
"quantization": "Q4_K_M",
"push_format": "gguf",
},
)

# Push as ONNX
model.push(
"your-username/my-model-onnx",
format="onnx"
)

# Push as MLX (Apple Silicon)
model.push(
"your-username/my-model-mlx",
format="mlx",
quantization="4bit"
)

# Push as SafeTensors (default)
model.push(
"your-username/my-model",
format="safetensors"
)
# Uses shared config defaults
model.export()
model.push("your-username/my-model-gguf", license="apache-2.0")
```
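
Explicit keyword arguments still override the shared defaults, so one loaded model can be pushed in several formats without rebuilding the config — a sketch reusing the format names documented above:

```python
# Per-call overrides win over config["push_format"].
model.push("your-username/my-model-onnx", format="onnx")
model.push("your-username/my-model", format="safetensors")
```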

### Method 2: QuantLLMHubManager (Advanced)