In [None]:
from langchain_openai.chat_models import ChatOpenAI
from pydantic import SecretStr, Field
from typing import Optional, Dict, Any
import os

class ChatOpenRouter(ChatOpenAI):
    """Chat model for OpenRouter.ai compatibility"""
    

    def __init__(
        self,
        model_name: str,
        openai_api_key: str,
        openai_api_base: str = "https://openrouter.ai/api/v1",
        **kwargs: Dict[str, Any]
    ):
        # Convert API key to SecretStr
        secret_key = SecretStr(openai_api_key.strip())

        super().__init__(
            openai_api_key=secret_key,
            openai_api_base=openai_api_base,
            model_name=model_name,
            **kwargs
        )

    @property
    def _client_params(self) -> Dict[str, Any]:
        """Override to include our custom headers"""
        params = super()._client_params
        params["default_headers"] = self.default_headers
        return params

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Initialize with proper headers
llm = ChatOpenRouter(
    model_name='deepseek/deepseek-r1:free',
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    # http_referer="https://your.site",
    # x_title="My AI App"
)

# Create and run chain
template = """Question: {question}\nAnswer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = prompt|llm

question = "What NFL team won the Super Bowl when Justin Bieber was born?"
print(llm_chain.run(question))

In [None]:
from typing import Literal,Callable,Type
from pydantic import BaseModel, model_validator
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq

class llmClient(BaseModel):
    id:str
    source:Literal['openrouter','groq','google_gen_ai']
    


LLMoptions={
    'qwen Reasoning':llmClient(
        id='qwen-qwq-32b',
        client=ChatGroq,
        source='groq'),
    'deepseek r1 distilled':llmClient(
        id='deepseek-r1-distill-llama-70b',
        client=ChatGroq,
        source='groq'),
    
    'Deepseek R1':llmClient(
        id='deepseek/deepseek-r1:free',
        client=ChatOpenRouter,
        source='openrouter'),
    
    'gemini reasoning':llmClient(
        id= 'gemini-2.0-flash-thinking-exp-01-21',
        client=ChatGoogleGenerativeAI,
        source='gemini'),
    'gemini 2.0 pro':llmClient(
        id='gemini-2.0-pro-exp-02-05',
        client=ChatGoogleGenerativeAI,
        source='gemini'),
    'gemini image generation':llmClient(
        id='gemini-2.0-flash-exp-image-generation',
        client=ChatGoogleGenerativeAI,
        source='gemini'),
    'gemini 2.0 flash':llmClient(
        id='gemini-2.0-flash',
        client=ChatGoogleGenerativeAI,
        source='gemini'),
}

In [21]:
LLMoptions={
    'qwen Reasoning':('qwen-qwq-32b','groq'),
    'deepseek r1 distilled':('deepseek-r1-distill-llama-70b','groq'),
    'Deepseek R1':('deepseek/deepseek-r1:free','openrouter]'),
    'gemini reasoning':( 'gemini-2.0-flash-thinking-exp-01-21','gemini'),
    'gemini 2.0 pro':('gemini-2.0-pro-exp-02-05','gemini'),
    'gemini image generation':('gemini-2.0-flash-exp-image-generation','gemini'),
    'gemini 2.0 flash':('gemini-2.0-flash','gemini'),
}

In [12]:
from dotenv import load_dotenv


load_dotenv()

True

In [None]:

# Usage
llm = ChatOpenRouter(
    model_name="deepseek/deepseek-r1:free",
    openai_api_key=os.getenv("OPENROUTER_API_KEY")
)

# Test with simple prompt
print(llm.invoke("Explain quantum computing in 3 sentences"))

<class 'pydantic.types.SecretStr'>
content='Quantum computing leverages quantum bits (qubits), which can exist in superposition (representing 0 and 1 simultaneously) and entanglement (correlated states across qubits), enabling parallel processing of vast computational possibilities. This allows quantum computers to solve specific problems—like factoring large numbers, simulating quantum systems, or optimizing complex processes—exponentially faster than classical computers. While still in early stages with technical challenges (e.g., error correction, scalability), advancements could revolutionize fields such as cryptography, materials science, and AI.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 257, 'prompt_tokens': 13, 'total_tokens': 270, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'deepseek/deepseek-r1', 'system_fingerprint': None, 'id': 'gen-1742330315-dM5NlAPW0u7vSo6QXWpr', 'finish_reason': 'stop

In [19]:
from openai import OpenAI

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.getenv("OPENROUTER_API_KEY"),
)



# print(completion.choices[0].message.content)
print(client.models.list().model_dump_json(indent=4))


{
    "data": [
        {
            "id": "mistralai/mistral-small-3.1-24b-instruct",
            "created": 1742238937,
            "object": null,
            "owned_by": null,
            "name": "Mistral: Mistral Small 3.1 24B",
            "description": "Mistral Small 3.1 24B Instruct is an upgraded variant of Mistral Small 3 (2501), featuring 24 billion parameters with advanced multimodal capabilities. It provides state-of-the-art performance in text-based reasoning and vision tasks, including image analysis, programming, mathematical reasoning, and multilingual support across dozens of languages. Equipped with an extensive 128k token context window and optimized for efficient local inference, it supports use cases such as conversational agents, function calling, long-document comprehension, and privacy-sensitive deployments.",
            "context_length": 128000,
            "architecture": {
                "modality": "text+image->text",
                "tokenizer": "Mistr

In [3]:
from typing import Literal
llmOptions=Literal['qwen Reasoning',
'deepseek r1 distilled',
'Deepseek R1',
'gemini reasoning',
'gemini 2.0 pro',
'gemini image generation',]

In [None]:
llmClient(id='gemini-2.0-pro',client=ChatGoogleGenerativeAI,source='gemini')

Data(id='gemini-2.0-pro', client=<class 'langchain_google_genai.chat_models.ChatGoogleGenerativeAI'>, source='gemini')

In [None]:
class llmFactory(BaseModel):
    pass

In [None]:
LLMoptions={
    'qwen Reasoning':llmClient(
        id='qwen-qwq-32b',
        source='groq'),
    'deepseek r1 distilled':llmClient(
        id='deepseek-r1-distill-llama-70b',
        source='groq'),
    
    'Deepseek R1':llmClient(
        id='deepseek/deepseek-r1:free',
        source='openrouter'),
    
    'gemini reasoning':llmClient(
        id= 'gemini-2.0-flash-thinking-exp-01-21',
        source='google_genai'),
    'gemini 2.0 pro':llmClient(
        id='gemini-2.0-pro-exp-02-05',
        client=ChatGoogleGenerativeAI,
        source='google_genai'),
    'gemini image generation':llmClient(
        client=ChatGoogleGenerativeAI,
        source='gemini'),
    'gemini 2.0 flash':llmClient(
        id='gemini-2.0-flash',
        source='gemini'),
}

In [None]:
class LLMFactory:
    def __init__(self, options: Dict[str, llmClient]):
        self.options = options

    def get_llm(self, model_name: str, temperature: Optional[float] = None) -> BaseChatModel:
        if model_name not in self.options:
            raise ValueError(f"Model {model_name} not found in options")
        
        llm_client = self.options[model_name]
        client_class = llm_client.client
        
        # Initialize the LLM with the specified temperature if provided
        if temperature is not None:
            return client_class(model_name=llm_client.id, temperature=temperature)
        else:
            return client_class(model_name=llm_client.id)

# Usage
llm_factory = LLMFactory(LLMoptions)
llm_instance = llm_factory.get_llm('Deepseek R1', temperature=0.7)
print(llm_instance.invoke("Explain quantum computing in 3 sentences"))

- View options from st and get the options  and the data.
- get the chat_client from the

In [None]:

from langchain.chat_models import init_chat_model

load_dotenv()
init_chat_model(model='deepseek/deepseek-r1:free', temperature=0.7,model_provider='openai',
    api_key=os.getenv("OPENROUTER_API_KEY"),
    base_url='https://openrouter.ai/api/v1'
).invoke('Think and explain QLORA and fineTuning')

AIMessage(content='**QLORA and Fine-Tuning Explained**\n\n**1. Fine-Tuning Overview:**\n- **Purpose:** Adapt pre-trained language models (e.g., GPT, LLaMA) to specific tasks (e.g., sentiment analysis, chatbots) using a smaller, task-specific dataset.\n- **Methods:**\n  - **Full Fine-Tuning:** Updates all model parameters, requiring significant computational resources.\n  - **Parameter-Efficient Fine-Tuning (PEFT):** Adjusts only a subset of parameters (e.g., LoRA, Adapters), reducing resource demands.\n\n**2. LoRA (Low-Rank Adaptation):**\n- **Concept:** Introduces low-rank matrices to model layers, capturing weight updates (ΔW) as the product of smaller matrices \\( A \\) (d×r) and \\( B \\) (r×d), where \\( r \\ll d \\).\n- **Efficiency:** Reduces trainable parameters (e.g., 2×d×r vs. d²), enabling training on modest hardware.\n\n**3. QLORA: Quantized LoRA**\n- **Innovation:** Combines 4-bit quantization with LoRA for extreme efficiency.\n- **Key Components:**\n  - **4-bit NormalFloa

In [14]:
from langchain.chat_models import init_chat_model
from langchain.chat_models.base import _ConfigurableModel
from typing import Literal
import os
from dotenv import load_dotenv
from IPython.display import display_markdown

load_dotenv()

# Define the available LLM options 
LLMOptions = Literal[
    'qwen Reasoning',
    'deepseek r1 distilled',
    'Deepseek R1',
    'gemini reasoning',
    'gemini 2.0 pro',
    'gemini image generation',
    'gemini 2.0 flash'
]

class LLMFactory:
    # Initialize chat models with their respective configurations
    chat_models = {
        'qwen Reasoning': init_chat_model(
            model='qwen-qwq-32b', 
            model_provider='groq', 
            configurable_fields='any'
        ),
        'deepseek r1 distilled': init_chat_model(
            model='deepseek-r1-distill-llama-70b', 
            model_provider='groq', 
            configurable_fields='any'
        ),
        'Deepseek R1': init_chat_model(
            model='deepseek/deepseek-r1:free', 
            model_provider='openai', 
            api_key=os.getenv('OPENROUTER_API_KEY'), 
            base_url='https://openrouter.ai/api/v1', 
            configurable_fields='any'
        ),
        'gemini reasoning': init_chat_model(
            model='gemini-2.0-flash-thinking-exp-01-21', 
            model_provider='google_genai', 
            configurable_fields='any'
        ),
        'gemini 2.0 pro': init_chat_model(
            model='gemini-2.0-pro-exp-02-05', 
            model_provider='google_genai', 
            configurable_fields='any'
        ),
        'gemini image generation': init_chat_model(
            model='gemini-2.0-flash-exp-image-generation', 
            model_provider='google_genai', 
            configurable_fields='any'
        ),
        'gemini 2.0 flash': init_chat_model(
            model='gemini-2.0-flash', 
            model_provider='google_genai', 
            configurable_fields='any'
        ),
    }

    @staticmethod
    def get_chat_model_options():
        """Return a list of available chat model options"""
        return list(LLMFactory.chat_models.keys())

    @staticmethod
    def get_client(model_choice: LLMOptions) -> _ConfigurableModel:
        """Retrieve the client for the specified model choice"""
        if model_choice not in LLMFactory.chat_models:
            raise ValueError(
                f"Model {model_choice} not found. Available models are {LLMFactory.get_chat_model_options()}"
            )
        return LLMFactory.chat_models[model_choice]

# Usage
client = LLMFactory.get_client('Deepseek R1')

respone=client.invoke("Elaborate on QLORA and give me code to explain it. Think and answer")


In [17]:
from IPython.display import display_markdown

display_markdown(respone.content,raw=True)

QLORA (Quantized Low-Rank Adaptation) is an efficient fine-tuning technique that combines **4-bit quantization** with **Low-Rank Adapters (LoRA)**. It enables training large language models (LLMs) on consumer GPUs by drastically reducing memory usage while maintaining model performance. Here's how it works:

1. **4-bit Quantization**: The base model weights are stored in 4-bit precision (vs. 16/32-bit), reducing memory footprint. During computation, weights are dequantized to 16-bit for forward/backward passes.
2. **LoRA Adapters**: Small trainable matrices are added to the model layers. Only these matrices are updated during training, while the base model remains frozen.
3. **Memory Optimization**: Techniques like NF4 quantization, double quantization, and paged optimizers further minimize memory usage.

---

### Code Example with Hugging Face Libraries
Below is a practical implementation using `bitsandbytes` (quantization), `peft` (LoRA), and `transformers`:

```python
# Install required libraries
!pip install -q -U bitsandbytes transformers accelerate peft trl

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import torch

# 1. Configure 4-bit quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",       # Use 4-bit NormalFloat
    bnb_4bit_use_double_quant=True,  # Apply double quantization
    bnb_4bit_compute_dtype=torch.bfloat16  # Compute dtype for efficiency
)

# 2. Load base model (e.g., Llama-2-7B)
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"  # Automatic device placement
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Fix padding

# 3. Configure LoRA
lora_config = LoraConfig(
    r=8,              # Rank of the adapter matrices
    lora_alpha=32,    # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target layers in transformer
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 4. Apply LoRA to the quantized model
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()  # Output: ~3.5M trainable params

# 5. Set up training (example with dummy dataset)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,  # Mixed precision training
    logging_steps=10,
    save_strategy="steps",
    save_steps=500
)

# Dummy dataset (replace with your data)
train_dataset = [{"text": "QLORA is..."}] * 100  # Sample data

# 6. Initialize trainer
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer
)

# 7. Start training (only LoRA params updated)
trainer.train()
```

---

### Key Features:
1. **Memory Efficiency**:
   - 4-bit weights reduce memory usage by ~4x compared to 16-bit.
   - Example: A 7B model uses ~10GB VRAM instead of ~35GB.

2. **Performance Retention**:
   - NF4 quantization preserves information better than standard 4-bit types.
   - LoRA adapters capture task-specific updates without altering the base model.

3. **Practical Benefits**:
   - Fine-tune multi-billion parameter models on consumer GPUs (e.g., RTX 3090).
   - Merge adapters into the base model post-training for easy deployment.

---

### Customization Tips:
- **Target Modules**: Choose layers to attach LoRA (common: `q_proj`, `v_proj` in transformers).
- **Rank (`r`)**: Higher values increase adapter capacity but use more memory (typical: 8–64).
- **Quantization**: Adjust `bnb_4bit_quant_type` and `compute_dtype` based on hardware support.

For more details, refer to the [QLoRA paper](https://arxiv.org/abs/2305.14314) and [Hugging Face PEFT documentation](https://huggingface.co/docs/peft/en/index).

In [16]:
print(respone.model_dump_json(indent=1))

{
 "content": "QLORA (Quantized Low-Rank Adaptation) is an efficient fine-tuning technique that combines **4-bit quantization** with **Low-Rank Adapters (LoRA)**. It enables training large language models (LLMs) on consumer GPUs by drastically reducing memory usage while maintaining model performance. Here's how it works:\n\n1. **4-bit Quantization**: The base model weights are stored in 4-bit precision (vs. 16/32-bit), reducing memory footprint. During computation, weights are dequantized to 16-bit for forward/backward passes.\n2. **LoRA Adapters**: Small trainable matrices are added to the model layers. Only these matrices are updated during training, while the base model remains frozen.\n3. **Memory Optimization**: Techniques like NF4 quantization, double quantization, and paged optimizers further minimize memory usage.\n\n---\n\n### Code Example with Hugging Face Libraries\nBelow is a practical implementation using `bitsandbytes` (quantization), `peft` (LoRA), and `transformers`:\n

# Mongo ODM tests

In [None]:
from typing import Annotated
from uuid import UUID, uuid4
from beanie import Document, Indexed,BeanieObjectId
from pydantic import Field

class Session(Document):    
    id:UUID=Field(default_factory=uuid4)
    userId:str
    Title:str

class MasterSession(Document):
    id:UUID=Field(default_factory=uuid4)
    userId:str
    Title:str


class Users(Document):
    id:UUID=Field(default_factory=uuid4)
    name:str
    email:Annotated[str, Indexed(unique=True)]
    password:str



In [28]:
from motor.motor_asyncio import AsyncIOMotorClient
from beanie import init_beanie
from dotenv import load_dotenv

load_dotenv()

async def init_db():
    client=AsyncIOMotorClient(os.getenv('MONGO_URL'))
    print(client.db_name)
    await init_beanie(database=client.db_name, document_models=[Session,Users],multiprocessing_mode=True)



In [38]:
client=AsyncIOMotorClient(os.getenv('MONGO_URL'))
print(client.db_name)

AsyncIOMotorDatabase(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=False, driver=DriverInfo(name='Motor', version='3.7.0', platform='asyncio')), 'db_name'))


In [34]:
data=await init_db()

AsyncIOMotorDatabase(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=False, driver=DriverInfo(name='Motor', version='3.7.0', platform='asyncio')), 'db_name'))


In [40]:
'701a0a04-4956-481b-bda8-d24665fe7503'==UUID('701a0a04-4956-481b-bda8-d24665fe7503')

False

In [41]:
uuid4().__str__()

'739817ef-09af-4957-b8bf-c1e7bb118f26'

AttributeError: 'NoneType' object has no attribute 'db_name'