In [9]:
from typing import Optional, List, Iterator, Any

import json
import httpx
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import AIMessage, AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.callbacks import CallbackManagerForChainRun
from pydantic import Field

# Base URL of the HTTP server that the chat model below posts to
# (a loopback address; presumably a locally running LM Studio server).
# Hard-coded here in place of the project-level setting:
# from app.global_settings import API_BASE_URL
API_BASE_URL = "http://127.0.0.1:1234"


class LmStudioChatModel(BaseChatModel):
    """LangChain chat model that calls a chat-completions HTTP endpoint.

    Requests are sent with ``httpx`` to ``{api_base_url}/v1/chat/completions``
    using an OpenAI-style JSON body (``model``, ``messages``, ``temperature``,
    ``max_tokens``, ``stop`` and, for streaming, ``stream``). Options left as
    ``None`` are omitted from the body so the server applies its own defaults.

    Construct it with the ``model`` alias, e.g.
    ``LmStudioChatModel(model="gemma-3-4b-it", temperature=0.7)``.
    """

    # Model identifier placed in the request body; populated via the "model" alias.
    model_name: str = Field(alias="model")
    # Sampling temperature; omitted from the request when None.
    temperature: Optional[float] = None
    # Upper bound on generated tokens; omitted from the request when None.
    max_tokens: Optional[int] = None
    # Client-side HTTP timeout in seconds (30 is used when unset).
    timeout: Optional[int] = None
    # Default stop sequences, used when a call does not supply its own.
    stop: Optional[List[str]] = None
    # NOTE(review): declared but not applied anywhere in this class yet.
    max_retries: int = 2
    # Server root URL; "/v1/chat/completions" is appended per request.
    api_base_url: str = API_BASE_URL

    @staticmethod
    def _format_messages(messages: List[BaseMessage]) -> List[dict]:
        """Convert LangChain messages into ``{"role", "content"}`` dicts.

        Human, AI and system messages map to ``user``/``assistant``/``system``.
        Messages that carry their own ``role`` attribute keep it; anything
        else falls back to ``user``.
        """
        role_by_type = {"human": "user", "ai": "assistant", "system": "system"}
        return [
            {
                "role": (
                    role_by_type.get(message.type)
                    or getattr(message, "role", None)
                    or "user"
                ),
                "content": message.content,
            }
            for message in messages
        ]

    def _build_payload(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]],
        stream: bool = False,
    ) -> dict:
        """Assemble the JSON request body, leaving out unset options."""
        payload = {
            "model": self.model_name,
            "messages": self._format_messages(messages),
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            # A per-call stop list takes precedence over the configured default.
            "stop": stop if stop is not None else self.stop,
        }
        if stream:
            payload["stream"] = True
        return {key: value for key, value in payload.items() if value is not None}

    def _endpoint(self) -> str:
        """Return the chat-completions URL for the configured server."""
        return f"{self.api_base_url.rstrip('/')}/v1/chat/completions"

    def _request_timeout(self) -> int:
        """Return the HTTP timeout in seconds, defaulting to 30 when unset."""
        return self.timeout or 30

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        # NOTE(review): LangChain hands chat models an LLM-run callback manager;
        # the annotation is kept as imported by this file.
        run_manager: Optional[CallbackManagerForChainRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Send one blocking chat-completion request and wrap the reply.

        Args:
            messages: Conversation to send.
            stop: Stop sequences for this call; falls back to ``self.stop``.
            run_manager: Unused for non-streaming calls.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            A ``ChatResult`` holding a single ``AIMessage`` with the text of
            the first choice.

        Raises:
            Exception: Any transport, HTTP-status or response-shape error is
                printed and re-raised unchanged.
        """
        payload = self._build_payload(messages, stop)
        try:
            response = httpx.post(
                self._endpoint(),
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=self._request_timeout(),
            )
            response.raise_for_status()
            data = response.json()
            # The server may report a null content; AIMessage needs a string.
            content = data["choices"][0]["message"]["content"] or ""
        except Exception as e:
            print(e)
            raise
        return ChatResult(
            generations=[ChatGeneration(message=AIMessage(content=content))]
        )

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForChainRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Stream a chat completion, yielding one chunk per content delta.

        The response is read as server-sent events: each ``data:`` line holds
        a JSON chunk, and the literal ``[DONE]`` ends the stream.

        Args:
            messages: Conversation to send.
            stop: Stop sequences for this call; falls back to ``self.stop``.
            run_manager: When given, notified of every new token.
            **kwargs: Accepted for interface compatibility; ignored.

        Yields:
            ``ChatGenerationChunk`` objects carrying each non-empty delta.

        Raises:
            Exception: Any transport or HTTP-status error is printed and
                re-raised unchanged.
        """
        payload = self._build_payload(messages, stop, stream=True)
        try:
            with httpx.stream(
                "POST",
                self._endpoint(),
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=self._request_timeout(),
            ) as response:
                response.raise_for_status()
                for line in response.iter_lines():
                    # Skip blank keep-alive lines and non-data SSE fields.
                    if not line.startswith("data:"):
                        continue
                    data = line[len("data:"):].strip()
                    if data == "[DONE]":
                        break
                    try:
                        event = json.loads(data)
                    except json.JSONDecodeError:
                        # Best effort: ignore lines that are not valid JSON.
                        continue
                    # Some chunks carry no choices; nothing to emit for those.
                    choices = event.get("choices") or []
                    if not choices:
                        continue
                    delta = (choices[0].get("delta") or {}).get("content")
                    if not delta:
                        continue
                    chunk = ChatGenerationChunk(
                        message=AIMessageChunk(content=delta)
                    )
                    if run_manager:
                        run_manager.on_llm_new_token(delta, chunk=chunk)
                    yield chunk
        except Exception as e:
            print(f"Streaming error: {e}")
            raise

    def invoke(
        self,
        input: Any,
        config: Optional[Any] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        """Stream a reply, echoing tokens to stdout, and return the full text.

        This override bypasses the base-class callback and caching machinery
        and talks to ``_stream`` directly.

        Args:
            input: A prompt string, an object exposing ``to_messages()``
                (such as a prompt value), or a sequence of messages.
            config: Accepted so Runnable-style callers can pass it; ignored.
            stop: Stop sequences for this call; falls back to ``self.stop``.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            An ``AIMessage`` containing the concatenated streamed content.
        """
        print("Streaming response:")
        if isinstance(input, str):
            messages = [HumanMessage(content=input)]
        elif hasattr(input, "to_messages"):
            messages = input.to_messages()
        else:
            from langchain_core.messages import convert_to_messages

            messages = convert_to_messages(input)

        pieces = []
        for chunk in self._stream(messages, stop=stop):
            token = chunk.message.content
            print(token, end="", flush=True)
            pieces.append(token)

        print()  # Newline after stream ends
        return AIMessage(content="".join(pieces))

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this model type."""
        return "lmstudio"


def get_chat_model(
    temperature: float = 0.7,
    model_name: str = "gemma-3-4b-it",
    api_base_url: Optional[str] = None,
) -> LmStudioChatModel:
    """
    Get a configured LmStudioChatModel instance.

    Args:
        temperature: Sampling temperature forwarded to the model
        model_name: The name of the model to use
        api_base_url: Optional custom API base URL; the module-level
            API_BASE_URL is used when this is None or empty

    Returns:
        LmStudioChatModel: Configured chat model instance
    """
    return LmStudioChatModel(
        temperature=temperature,
        # The field is declared with alias "model"; passing the alias is valid
        # whether or not population by field name is enabled.
        model=model_name,
        # Same keyword the module-level example uses for the base URL.
        api_base_url=api_base_url or API_BASE_URL,
    )


# Demo run: build a model pointed at API_BASE_URL and ask it for a blog post.
model = LmStudioChatModel(
    temperature=0.7, model_name="gemma-3-4b-it", api_base_url=API_BASE_URL
)

# `invoke` echoes tokens to stdout while streaming and returns the full message.
response = model.invoke(
    "Write a blog post about the benefits of using LM Studio."
)
print("\nFinal full response:\n", response.content)


Streaming response:
Okay, here's a blog post about the benefits of using LM Studio, aiming for around 600-700 words and suitable for a tech-savvy audience interested in local AI:

---

## Stop Relying on APIs: Why You Need to Try LM Studio for Local LLM Exploration

For months now, we’ve been bombarded with the incredible potential of Large Language Models (LLMs) like ChatGPT. But let's be honest – relying solely on cloud-based APIs can feel limiting. High costs, data privacy concerns, and occasional outages are all valid frustrations.  That's where LM Studio comes in, and it’s rapidly becoming a game-changer for anyone wanting to dive deeper into the world of local AI.

**What is LM Studio?**

LM Studio is essentially a desktop application that allows you to download, run, and experiment with various open-source Large Language Models *directly on your computer*.  No more API keys, no more worrying about server uptime, just pure, unadulterated LLM power at your fingertips. It’s built a