In [1]:
import requests
import json
from typing import Dict, Any, List, Optional, Iterator
from dataclasses import dataclass

# Announce what this notebook builds; a single call emits both banner lines.
print(
    "✅ Creating LlamaCPP API Wrapper",
    "This provides a clean interface to http://ollama.lan:8080/",
    sep="\n",
)

✅ Creating LlamaCPP API Wrapper
This provides a clean interface to http://ollama.lan:8080/


In [2]:
import os
import requests
import json
from typing import Optional, Dict, Any, List, Iterator

class LlamaCPPAPIWrapper:
    """
    A clean wrapper for the LlamaCPP API at http://ollama.lan:8080/
    Provides a simple, consistent interface similar to OpenAI's API

    The wrapper talks to the server's `/health`, `/v1/models`,
    `/v1/completions` and `/v1/chat/completions` endpoints. On construction
    it queries `/v1/models` and remembers the first model id as the default
    model for later calls.
    """

    def __init__(self, base_url: str = "http://ollama.lan:8080", timeout: int = 300):
        """
        Args:
            base_url: Server root URL; a trailing slash is stripped
            timeout: Timeout in seconds handed to every `requests` call
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.model_name: Optional[str] = None
        self._discover_model()

    def _discover_model(self) -> None:
        """Automatically discover available model (best effort, never raises)"""
        try:
            response = self._make_request("/v1/models", method="GET")
            if response.status_code == 200:
                models_data = response.json()
                if 'data' in models_data and len(models_data['data']) > 0:
                    self.model_name = models_data['data'][0]['id']
                    print(f"✅ Discovered model: {self.model_name}")
        except Exception as e:
            # Discovery is optional: callers can still pass `model=` explicitly.
            print(f"⚠️  Could not auto-discover model: {e}")

    def _make_request(
        self,
        endpoint: str,
        data: Optional[Dict] = None,
        method: str = "GET",
        stream: bool = False
    ) -> "requests.Response":
        """
        Helper method for making HTTP requests

        Args:
            endpoint: Path appended to `base_url` (e.g. "/v1/models")
            data: JSON-serialisable body, only sent for POST
            method: "GET" or "POST" (case-insensitive)
            stream: When True the body is not downloaded up front, so the
                caller can consume it incrementally and must close the response

        Raises:
            ValueError: If `method` is neither GET nor POST
            ConnectionError: If `requests` reports any transport-level failure
        """
        url = f"{self.base_url}{endpoint}"
        verb = method.upper()
        if verb not in ("GET", "POST"):
            raise ValueError(f"Unsupported method: {method}")

        try:
            if verb == "GET":
                return requests.get(url, timeout=self.timeout, stream=stream)
            return requests.post(
                url,
                json=data,
                headers={"Content-Type": "application/json"},
                timeout=self.timeout,
                stream=stream,
            )
        except requests.exceptions.RequestException as e:
            # Chain the cause so the original requests traceback is preserved.
            raise ConnectionError(f"Failed to connect to {url}: {e}") from e

    def _resolve_model(self, model: Optional[str]) -> str:
        """Return `model`, or the auto-discovered one; raise ValueError if neither is set"""
        chosen = model or self.model_name
        if not chosen:
            raise ValueError("No model specified and none auto-discovered")
        return chosen

    def _post_json(self, endpoint: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST `payload` and return the decoded JSON body; raise RuntimeError on non-200"""
        response = self._make_request(endpoint, payload, "POST")
        if response.status_code != 200:
            raise RuntimeError(f"API error: {response.status_code} - {response.text}")
        return response.json()

    @staticmethod
    def _iter_sse_chunks(lines: Iterator[Any]) -> Iterator[Any]:
        """
        Decode server-sent-event lines into parsed JSON payloads

        Only `data:` lines are considered. Blank lines, other fields and
        payloads that are not valid JSON are skipped. Iteration stops at the
        `[DONE]` sentinel.

        Args:
            lines: Raw lines (bytes or str) as produced by `Response.iter_lines()`
        """
        prefix = 'data:'
        for raw in lines:
            if not raw:
                continue
            text = raw.decode('utf-8') if isinstance(raw, (bytes, bytearray)) else raw
            if not text.startswith(prefix):
                continue
            body = text[len(prefix):].strip()
            if body == '[DONE]':
                return
            try:
                chunk = json.loads(body)
            except json.JSONDecodeError:
                continue
            yield chunk

    @staticmethod
    def _first_choice(chunk: Any) -> Optional[Dict[str, Any]]:
        """Return `chunk['choices'][0]` when it exists and is a dict, else None"""
        if not isinstance(chunk, dict):
            return None
        choices = chunk.get('choices')
        if not isinstance(choices, list) or not choices:
            return None
        first = choices[0]
        return first if isinstance(first, dict) else None

    def _stream_chunks(self, endpoint: str, payload: Dict[str, Any]) -> Iterator[Any]:
        """POST `payload` with a streamed body and yield each parsed SSE chunk"""
        # stream=True is what makes tokens arrive incrementally; without it
        # requests downloads the entire body before iter_lines() yields anything.
        response = self._make_request(endpoint, payload, "POST", stream=True)
        try:
            if response.status_code != 200:
                raise RuntimeError(f"API error: {response.status_code} - {response.text}")
            yield from self._iter_sse_chunks(response.iter_lines())
        finally:
            # Also runs when the consumer abandons the generator early.
            response.close()

    def health_check(self) -> bool:
        """Check if the API server is healthy (True only for HTTP 200 on /health)"""
        try:
            response = self._make_request("/health")
            return response.status_code == 200
        except Exception:
            # Any failure means "not healthy"; Ctrl-C is no longer swallowed.
            return False

    def list_models(self) -> List[Dict]:
        """List available models; returns [] on a non-200 reply or on any error"""
        try:
            response = self._make_request("/v1/models")
            if response.status_code == 200:
                return response.json().get('data', [])
            return []
        except Exception as e:
            print(f"Error listing models: {e}")
            return []

    def complete(
        self,
        prompt: str,
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        top_p: float = 0.95,
        stop: Optional[List[str]] = None,
        stream: bool = False
    ) -> Dict[str, Any]:
        """
        Generate text completion

        Args:
            prompt: Input text prompt
            model: Model name (uses auto-discovered if None)
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0 to 2.0)
            top_p: Nucleus sampling parameter
            stop: Stop sequences
            stream: Forwarded to the server as-is. NOTE(review): the reply is
                always decoded with `response.json()`, so a streamed
                (event-stream) reply presumably cannot be parsed here; use
                `stream_complete()` for streaming.

        Returns:
            The decoded JSON response body

        Raises:
            ValueError: If no model is given and none was auto-discovered
            RuntimeError: If the server answers with a non-200 status
        """
        data = {
            "model": self._resolve_model(model),
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "stream": stream
        }

        if stop:
            data["stop"] = stop

        return self._post_json("/v1/completions", data)

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        top_p: float = 0.95,
        stream: bool = False
    ) -> Dict[str, Any]:
        """
        Generate chat completion

        Args:
            messages: List of message dicts with 'role' and 'content'
            model: Model name (uses auto-discovered if None)
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            top_p: Nucleus sampling parameter
            stream: Forwarded to the server as-is. NOTE(review): the reply is
                always decoded with `response.json()`, so a streamed
                (event-stream) reply presumably cannot be parsed here; use
                `stream_chat()` for streaming.

        Returns:
            The decoded JSON response body

        Raises:
            ValueError: If no model is given and none was auto-discovered
            RuntimeError: If the server answers with a non-200 status
        """
        data = {
            "model": self._resolve_model(model),
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "stream": stream
        }

        return self._post_json("/v1/chat/completions", data)

    def stream_complete(
        self,
        prompt: str,
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        **kwargs
    ) -> Iterator[str]:
        """
        Stream text completion tokens

        Yields each non-empty `choices[0].text` fragment as it is received.
        Extra keyword arguments are added to the request body.

        Raises (on first iteration, since this is a generator):
            ValueError: If no model is given and none was auto-discovered
            RuntimeError: If the server answers with a non-200 status
        """
        data = {
            "model": self._resolve_model(model),
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            **kwargs,
            # Placed last so a stray `stream=` in kwargs cannot disable streaming.
            "stream": True,
        }

        for chunk in self._stream_chunks("/v1/completions", data):
            choice = self._first_choice(chunk)
            if choice is None:
                continue
            content = choice.get('text', '')
            if content:
                yield content

    def stream_chat(
        self,
        messages: List[Dict[str, str]],
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        **kwargs
    ) -> Iterator[str]:
        """
        Stream chat completion tokens

        Yields each non-empty `choices[0].delta.content` fragment as it is
        received. Extra keyword arguments are added to the request body.

        Raises (on first iteration, since this is a generator):
            ValueError: If no model is given and none was auto-discovered
            RuntimeError: If the server answers with a non-200 status
        """
        data = {
            "model": self._resolve_model(model),
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            **kwargs,
            # Placed last so a stray `stream=` in kwargs cannot disable streaming.
            "stream": True,
        }

        for chunk in self._stream_chunks("/v1/chat/completions", data):
            choice = self._first_choice(chunk)
            if choice is None:
                continue
            # A chunk may carry `"delta": null`; treat it as an empty delta.
            delta = choice.get('delta') or {}
            content = delta.get('content', '') if isinstance(delta, dict) else ''
            if content:
                yield content

# Initialize the wrapper (the constructor also attempts model auto-discovery)
print("=== Initializing LlamaCPP API Wrapper ===")
llm_api = LlamaCPPAPIWrapper()

# Test connection
if llm_api.health_check():
    print("✅ Successfully connected to LlamaCPP API")
    print(f"🔗 Endpoint: {llm_api.base_url}")
    if llm_api.model_name:
        print(f"🤖 Auto-discovered model: {llm_api.model_name}")
else:
    print("❌ Failed to connect to LlamaCPP API")
    # Report the endpoint that was actually configured instead of a
    # hard-coded URL, so the hint stays correct for a custom base_url.
    print(f"Please check that the server is running at {llm_api.base_url}/")

=== Initializing LlamaCPP API Wrapper ===
✅ Discovered model: /models/DeepSeek-R1-0528-Qwen3-8B-UD-Q4_K_XL.gguf
✅ Successfully connected to LlamaCPP API
🔗 Endpoint: http://ollama.lan:8080
🤖 Auto-discovered model: /models/DeepSeek-R1-0528-Qwen3-8B-UD-Q4_K_XL.gguf


In [3]:
# Example 1: API Health Check and Model Discovery
print("=== Example 1: Health Check & Model Discovery ===")

try:
    # Probe the health endpoint and turn the boolean into a status label
    health_label = '✅ Healthy' if llm_api.health_check() else '❌ Unhealthy'
    print(f"API Health: {health_label}")

    # Ask the server which models it exposes
    models = llm_api.list_models()
    print(f"Available models: {len(models)}")

    # Only the first three entries are shown, numbered from 1
    for position, entry in enumerate(models[:3], start=1):
        print(f"  {position}. {entry.get('id', 'Unknown')}")
        if 'object' in entry:
            print(f"     Type: {entry['object']}")

    # Report which model the wrapper picked up during construction, if any
    active_model = llm_api.model_name
    if not active_model:
        print("\n⚠️  No model auto-discovered")
    else:
        print(f"\n🎯 Using model: {active_model}")

except Exception as err:
    print(f"❌ Error: {err}")

=== Example 1: Health Check & Model Discovery ===
API Health: ✅ Healthy
Available models: 1
  1. /models/DeepSeek-R1-0528-Qwen3-8B-UD-Q4_K_XL.gguf
     Type: model

🎯 Using model: /models/DeepSeek-R1-0528-Qwen3-8B-UD-Q4_K_XL.gguf


In [4]:
# Example 2: Simple Text Completion using Wrapper
print("=== Example 2: Text Completion ===")

try:
    prompt = "The capital of France is"
    print(f"Prompt: '{prompt}'")

    # Non-streaming completion; generation halts at a newline, "." or "!"
    stop_sequences = ["\n", ".", "!"]
    result = llm_api.complete(prompt=prompt, max_tokens=50, temperature=0.7, stop=stop_sequences)

    if 'choices' not in result or len(result['choices']) == 0:
        print("❌ No completion returned")
    else:
        completion = result['choices'][0]['text']
        print(f"✅ Completion: '{prompt}{completion}'")

        # Token accounting is optional in the response
        if 'usage' in result:
            usage = result['usage']
            prompt_tokens = usage.get('prompt_tokens', 'N/A')
            completion_tokens = usage.get('completion_tokens', 'N/A')
            total_tokens = usage.get('total_tokens', 'N/A')
            print(f"📊 Tokens - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {total_tokens}")

except Exception as err:
    print(f"❌ Error during completion: {err}")

=== Example 2: Text Completion ===
Prompt: 'The capital of France is'
✅ Completion: 'The capital of France is a city known for its art, culture, and cuisine'
📊 Tokens - Prompt: 5, Completion: 12, Total: 17
✅ Completion: 'The capital of France is a city known for its art, culture, and cuisine'
📊 Tokens - Prompt: 5, Completion: 12, Total: 17


In [5]:
# Example 3: Chat Completion using Wrapper
print("=== Example 3: Chat Completion ===")

try:
    system_turn = {"role": "system", "content": "You are a helpful assistant that gives concise answers."}
    user_turn = {"role": "user", "content": "Explain quantum computing in simple terms."}
    messages = [system_turn, user_turn]

    # Echo the conversation that is about to be sent
    print("💬 Chat Messages:")
    for turn in messages:
        speaker = turn['role'].title()
        print(f"  {speaker}: {turn['content']}")

    print("\nAssistant: ", end="", flush=True)

    # Single blocking call; the whole reply arrives at once
    result = llm_api.chat(messages=messages, max_tokens=100, temperature=0.7)

    if 'choices' not in result or len(result['choices']) == 0:
        print("❌ No response returned")
    else:
        response_message = result['choices'][0]['message']['content']
        print(response_message)

        # Usage statistics are printed verbatim when the server supplies them
        if 'usage' in result:
            usage = result['usage']
            print(f"\n📊 Usage: {usage}")

except Exception as err:
    print(f"❌ Error during chat completion: {err}")

=== Example 3: Chat Completion ===
💬 Chat Messages:
  System: You are a helpful assistant that gives concise answers.
  User: Explain quantum computing in simple terms.

Assistant: <think>
Okay, the user asked for a simple explanation of quantum computing. Let me start by breaking down their request. They probably want a clear, non-technical overview without too much jargon. But maybe they're curious because they heard about it in the news or from someone, and they need a basic understanding to follow along or grasp why it's important.

First, I should compare classical computing to quantum computing. People know computers use bits, so starting with that makes sense. But the user

📊 Usage: {'completion_tokens': 100, 'prompt_tokens': 20, 'total_tokens': 120}
<think>
Okay, the user asked for a simple explanation of quantum computing. Let me start by breaking down their request. They probably want a clear, non-technical overview without too much jargon. But maybe they're curious because t

In [6]:
# Example 4: Streaming Text Completion using Wrapper
print("=== Example 4: Streaming Completion ===")

try:
    prompt = "Write a short poem about artificial intelligence:"
    print(f"Prompt: {prompt}")
    print("Response: ", end="", flush=True)

    # The generator does no work until iteration starts
    token_stream = llm_api.stream_complete(prompt=prompt, max_tokens=80, temperature=0.8)

    # Echo every fragment immediately while also keeping the full text
    full_response = ""
    for token in token_stream:
        print(token, end="", flush=True)
        full_response += token

    print("\n\n✅ Streaming completed!")
    print(f"📝 Total response: {len(full_response)} characters")

except Exception as err:
    print(f"❌ Error during streaming: {err}")

=== Example 4: Streaming Completion ===
Prompt: Write a short poem about artificial intelligence:
Response:  5 lines.
Here's a short poem about artificial intelligence with exactly five lines:

In circuits of cold light, 
Machines learn 5 lines.
Here's a short poem about artificial intelligence with exactly five lines:

In circuits of cold light, 
Machines learn and dream bright, 
Data flows in endless streams, 
Answers from the AI streams, 
A silent, thinking streams.
Hmm, the user asked for a and dream bright, 
Data flows in endless streams, 
Answers from the AI streams, 
A silent, thinking streams.
Hmm, the user asked for a short poem about artificial intelligence with exactly five lines. They're probably looking short poem about artificial intelligence with exactly five lines. They're probably looking for something creative and concise, maybe

✅ Streaming completed!
📝 Total response: 396 characters
 for something creative and concise, maybe

✅ Streaming completed!
📝 Total response:

In [7]:
# Example 5: Streaming Chat Completion using Wrapper
print("=== Example 5: Streaming Chat ===")

try:
    system_turn = {"role": "system", "content": "You are a creative writing assistant."}
    user_turn = {"role": "user", "content": "Tell me a short story about a robot discovering emotions."}
    messages = [system_turn, user_turn]

    print("💬 Starting streaming chat...")
    # The user turn sits at index 1, right after the system prompt
    print(f"User: {messages[1]['content']}")
    print("Assistant: ", end="", flush=True)

    # The generator does no work until iteration starts
    token_stream = llm_api.stream_chat(messages=messages, max_tokens=150, temperature=0.8)

    # Echo every fragment immediately while also keeping the full text
    full_response = ""
    for token in token_stream:
        print(token, end="", flush=True)
        full_response += token

    print("\n\n✅ Streaming chat completed!")
    print(f"📝 Response length: {len(full_response)} characters")

except Exception as err:
    print(f"❌ Error during streaming chat: {err}")

=== Example 5: Streaming Chat ===
💬 Starting streaming chat...
User: Tell me a short story about a robot discovering emotions.
Assistant: <think>
Okay, user wants a short story about a robot discovering emotions. That's a pretty interesting prompt - it suggests they're interested in both sci-fi and emotional exploration. Maybe<think>
Okay, user wants a short story about a robot discovering emotions. That's a pretty interesting prompt - it suggests they're interested in both sci-fi and emotional exploration. Maybe they're a writer looking for inspiration, or just someone curious about how emotionless beings might experience feelings. 

Hmm, the challenge here is to make a robot's emotional awakening feel believable and touching they're a writer looking for inspiration, or just someone curious about how emotionless beings might experience feelings. 

Hmm, the challenge here is to make a robot's emotional awakening feel believable and touching. Can't just have it suddenly "feel". Can't ju

In [8]:
# Example 6: Embeddings Test (if supported)
print("=== Example 6: Embeddings ===")

try:
    # Request body for the embeddings endpoint
    embeddings_data = {
        "model": llm_api.model_name,
        "input": "Hello, world! This is a test sentence for embeddings."
    }

    print(f"Input text: '{embeddings_data['input']}'")
    print("Testing embeddings endpoint...")

    try:
        # The wrapper has no public embeddings method, so the low-level
        # request helper is called directly.
        response = llm_api._make_request("/v1/embeddings", embeddings_data, "POST")

        if response.status_code != 200:
            print(f"❌ Embeddings not supported: {response.status_code} - {response.text}")
        else:
            result = response.json()
            if 'data' not in result or len(result['data']) == 0:
                print("❌ No embedding data in response")
            else:
                embedding = result['data'][0]['embedding']
                print("✅ Embeddings generated successfully!")
                print(f"📊 Embedding dimensions: {len(embedding)}")
                print(f"🔢 First 10 values: {embedding[:10]}")
                if 'usage' in result:
                    print(f"📈 Usage: {result['usage']}")

    except Exception as embed_error:
        # Failures of the request itself get their own message and hint
        print(f"❌ Embeddings endpoint error: {embed_error}")
        print("💡 This model/server may not support embeddings")

except Exception as err:
    print(f"❌ Error: {err}")

=== Example 6: Embeddings ===
Input text: 'Hello, world! This is a test sentence for embeddings.'
Testing embeddings endpoint...
❌ Embeddings not supported: 400 - {"error":{"code":400,"message":"Pooling type 'none' is not OAI compatible. Please use a different pooling type","type":"invalid_request_error"}}


In [9]:
# Example 7: Utility Functions using the Wrapper
# Prints the section banner before the helper functions of this cell are defined.
print("=== Example 7: Utility Functions ===")

def quick_chat(prompt: str, max_tokens: int = 100, temperature: float = 0.7) -> str:
    """
    Send `prompt` as a single user message and return the reply text

    Uses the module-level `llm_api` wrapper. Any exception is converted
    into the string "Error: <message>" instead of being raised.
    """
    conversation = [{"role": "user", "content": prompt}]
    try:
        reply = llm_api.chat(conversation, max_tokens=max_tokens, temperature=temperature)
        first_choice = reply['choices'][0]
        return first_choice['message']['content']
    except Exception as exc:
        return f"Error: {exc}"

def quick_complete(prompt: str, max_tokens: int = 50, temperature: float = 0.7) -> str:
    """
    Run a plain text completion for `prompt` and return the generated text

    Uses the module-level `llm_api` wrapper. Any exception is converted
    into the string "Error: <message>" instead of being raised.
    """
    try:
        reply = llm_api.complete(prompt, max_tokens=max_tokens, temperature=temperature)
        first_choice = reply['choices'][0]
        return first_choice['text']
    except Exception as exc:
        return f"Error: {exc}"

def get_api_info() -> Dict[str, Any]:
    """
    Collect a summary of the module-level `llm_api` wrapper

    Returns a dict with the endpoint, health flag, model count, current
    model and the ids of at most the first five models. If anything
    raises, returns {"error": <message>} instead.
    """
    try:
        catalog = llm_api.list_models()
        info: Dict[str, Any] = {"endpoint": llm_api.base_url}
        info["health"] = llm_api.health_check()
        info["model_count"] = len(catalog)
        info["current_model"] = llm_api.model_name
        # Cap the listing at five ids to keep the summary short
        info["available_models"] = [entry.get('id', 'Unknown') for entry in catalog[:5]]
        return info
    except Exception as exc:
        return {"error": str(exc)}

# Test the utility functions
print("🧪 Testing utility functions:")

print("\n1. Quick chat:")
chat_result = quick_chat("What is the meaning of life?", max_tokens=50)
print(f"Response: {chat_result}")

print("\n2. Quick completion:")
completion_result = quick_complete("The best programming language is", max_tokens=30)
print(f"Response: {completion_result}")

print("\n3. API info:")
api_info = get_api_info()
for key, value in api_info.items():
    print(f"   {key}: {value}")

# Closing summary: static text plus the endpoint and model actually in use
print("\n=== Summary ===")
print("✅ LlamaCPP API Wrapper Examples Complete!")
print(f"🔗 API Endpoint: {llm_api.base_url}")
print(f"🤖 Model: {llm_api.model_name}")

feature_lines = (
    "\n🔧 Wrapper Features Demonstrated:",
    "   - ✅ Automatic model discovery",
    "   - ✅ Health checking",
    "   - ✅ Text completion",
    "   - ✅ Chat completion",
    "   - ✅ Streaming completion",
    "   - ✅ Streaming chat",
    "   - ✅ Error handling",
    "   - ✅ Clean, consistent API",
)
for line in feature_lines:
    print(line)

benefit_lines = (
    "\n💡 Wrapper Benefits:",
    "   - Simple, intuitive interface",
    "   - Automatic error handling",
    "   - Type hints for better IDE support",
    "   - Consistent parameter naming",
    "   - Built-in streaming support",
    "   - Connection management",
)
for line in benefit_lines:
    print(line)

=== Example 7: Utility Functions ===
🧪 Testing utility functions:

1. Quick chat:
Response: <think>
Hmm, "What is the meaning of life?" - that's a classic. The user is asking one of the most profound philosophical questions out there. 

First, I should acknowledge that this is a deeply personal question with no single "correct

2. Quick completion:
Response: <think>
Hmm, "What is the meaning of life?" - that's a classic. The user is asking one of the most profound philosophical questions out there. 

First, I should acknowledge that this is a deeply personal question with no single "correct

2. Quick completion:
Response:  the one that compiles the fastest. — This is a common misconception in the programming community. In reality, the performance of a compiled language like C

3. API info:
   endpoint: http://ollama.lan:8080
   health: True
   model_count: 1
   current_model: /models/DeepSeek-R1-0528-Qwen3-8B-UD-Q4_K_XL.gguf
   available_models: ['/models/DeepSeek-R1-0528-Qwen3-8B-UD-Q

# LlamaCPP API Wrapper Examples

This notebook demonstrates a **custom Python wrapper** for the LlamaCPP API running at `http://ollama.lan:8080/`.

## 🎯 Wrapper Benefits

### Why Use This Wrapper?
- **🐍 Pythonic Interface**: Clean, object-oriented API design
- **🔄 Auto-Discovery**: Automatically finds available models
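- **🛡️ Error Handling**: Connection failures are wrapped in `ConnectionError` and non-200 completion responses raise `RuntimeError` (no automatic retries are performed)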
- **📡 Streaming Support**: Built-in streaming for real-time responses
- **🎛️ Consistent API**: Unified interface for all operations
- **⚡ Connection Management**: Efficient HTTP connection handling

### vs. Raw HTTP Requests:
- **No Manual JSON**: Automatic request/response handling
- **Type Safety**: Type hints for better development experience
- **Error Recovery**: Graceful handling of network issues
- **Code Reusability**: Clean methods for common operations

## 🔧 Wrapper Features

### Core Methods:
- `health_check()` - Check API availability
- `list_models()` - Discover available models
- `complete()` - Text completion
- `chat()` - Chat completion
- `stream_complete()` - Streaming text generation
- `stream_chat()` - Streaming chat responses

### Advanced Features:
- **Automatic Model Discovery**: Finds and uses available models
- **Flexible Parameters**: Support for temperature, top_p, stop sequences
- **Streaming Iterators**: Python generators for real-time responses
- **Connection Handling**: Each call issues a standalone `requests.get`/`requests.post`; connections are not pooled through a shared `requests.Session`
- **Timeout Management**: Configurable request timeouts

## 📊 Examples Included:

1. **API Health & Model Discovery** - Check connection and find models
2. **Simple Text Completion** - Basic text generation
3. **Chat Completion** - Conversational AI with system prompts
4. **Streaming Completion** - Real-time text generation
5. **Streaming Chat** - Real-time conversation
6. **Embeddings Test** - Vector representations (if supported)
7. **Utility Functions** - Helper methods for common tasks

## 🚀 Getting Started:

1. Ensure LlamaCPP server is running at `http://ollama.lan:8080/`
2. Run the wrapper initialization cell
3. Execute examples to see the wrapper in action

The wrapper automatically discovers available models and provides a clean, consistent interface for all LlamaCPP API operations!