<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/DEEPSEEK_Kimi_K2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install openai -q
!pip install unsloth -q

## INTEGRA-LLM

In [1]:
import os
import torch
from typing import List, Dict
from datetime import datetime, timedelta
from unsloth import FastLanguageModel
from openai import OpenAI
from google.colab import userdata

# ====================================================================
# DeepSeek Model (Local execution with Unsloth)
# ====================================================================

# Assume a global variable for the loaded model and tokenizer
deepseek_model = None
deepseek_tokenizer = None

def load_deepseek_model():
    """Loads the DeepSeek model and tokenizer using Unsloth."""
    global deepseek_model, deepseek_tokenizer

    if deepseek_model is None or deepseek_tokenizer is None:
        # Check for CUDA and bfloat16 support
        if not torch.cuda.is_available():
            print("CUDA is not available. DeepSeek model cannot be loaded.")
            return False

        max_seq_length = 4096
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        load_in_4bit = True

        print("Loading DeepSeek model...")
        try:
            deepseek_model, deepseek_tokenizer = FastLanguageModel.from_pretrained(
                model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
                max_seq_length=max_seq_length,
                dtype=dtype,
                load_in_4bit=load_in_4bit,
            )
            print("DeepSeek model loaded successfully.")
            return True
        except Exception as e:
            print(f"Failed to load DeepSeek model: {e}")
            return False
    return True

def generate_with_deepseek(prompt):
    """Generates text using the loaded DeepSeek model."""
    if deepseek_model is None or deepseek_tokenizer is None:
        print("DeepSeek model is not loaded. Skipping generation.")
        return

    # Assuming a simple chat template for demonstration
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    formatted_prompt = deepseek_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    print("\n--- Generating with DeepSeek R1 (Local) ---")
    inputs = deepseek_tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

    outputs = deepseek_model.generate(
        **inputs,
        max_new_tokens=512,
        use_cache=True,
        pad_token_id=deepseek_tokenizer.eos_token_id, # Set pad_token_id to EOS token
    )

    decoded_output = deepseek_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Extract only the assistant's response
    response_start_tag = "<|assistant|>"
    response_start_index = decoded_output.find(response_start_tag)
    if response_start_index != -1:
        clean_output = decoded_output[response_start_index + len(response_start_tag):].strip()
    else:
        clean_output = decoded_output.strip()

    print(clean_output)

# ====================================================================
# Kimi K2 (Remote execution via OpenRouter API)
# ====================================================================

def generate_with_kimi(prompt, openrouter_api_key):
    """Generates text using the Kimi K2 model via the OpenRouter API."""
    print("\n--- Generating with Kimi K2 (OpenRouter API) ---")

    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=openrouter_api_key,
    )

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    # Define the desired streaming behavior before the API call
    use_streaming = False

    try:
        stream = client.chat.completions.create(
            model="moonshotai/kimi-dev-72b:free",
            messages=messages,
            temperature=0.3,
            stream=use_streaming, # Use the variable here
            max_tokens=512
        )

        if not use_streaming: # Now checking the boolean variable, not the response object
            print(stream.choices[0].message.content)
        else:
            for chunk in stream:
                if chunk.choices[0].delta.content is not None:
                    print(chunk.choices[0].delta.content, end="")
            print()

    except Exception as e:
        print(f"An error occurred with the Kimi API call: {e}")

# ====================================================================
# Main script execution
# ====================================================================

if __name__ == "__main__":
    # Test prompt for both models
    test_prompt = "What are the benefits of using a Mixture-of-Experts (MoE) architecture in large language models?"

    # --- Demo DeepSeek ---
    if load_deepseek_model():
        generate_with_deepseek(test_prompt)

    # --- Demo Kimi K2 ---
    try:
        openrouter_api_key = userdata.get('OPENROUTER_API_KEY')
        if openrouter_api_key:
            generate_with_kimi(test_prompt, openrouter_api_key)
        else:
            print("\nOPENROUTER_API_KEY not found. Skipping Kimi K2 demo.")
    except Exception as e:
        print(f"\nCould not retrieve OPENROUTER_API_KEY from userdata: {e}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading DeepSeek model...
==((====))==  Unsloth 2025.8.1: Fast Llama patching. Transformers: 4.54.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
DeepSeek model loaded successfully.

--- Generating with DeepSeek R1 (Local) ---
You are a helpful assistant.<｜User｜>What are the benefits of using a Mixture-of-Experts (MoE) architecture in large language models?<｜Assistant｜><think>
Okay, so I'm trying to understand the benefits of using a Mixture-of-Experts (MoE) architecture in large language models. I've heard about this te

## AGENTIC

In [2]:
import os
import torch
from typing import List, Dict
from datetime import datetime, timedelta
from unsloth import FastLanguageModel
from openai import OpenAI
# Assuming google.colab.userdata is available in the environment
# If running outside Colab, you'd need to load API key differently (e.g., from environment variables)
try:
    from google.colab import userdata
except ImportError:
    print("google.colab.userdata not found. Please ensure you are in a Colab environment or handle API key loading manually.")
    # Fallback for non-Colab environments (e.g., set OPENROUTER_API_KEY as an environment variable)
    class UserDataMock:
        def get(self, key):
            return os.getenv(key)
    userdata = UserDataMock()


# ====================================================================
# DeepSeek Model (Local execution with Unsloth)
# This model will act as our "Summarizer Agent".
# ====================================================================

# Global variables for the loaded model and tokenizer
deepseek_model = None
deepseek_tokenizer = None

def load_deepseek_model():
    """
    Loads the DeepSeek model and tokenizer using Unsloth.
    Returns True if successful, False otherwise.
    """
    global deepseek_model, deepseek_tokenizer

    if deepseek_model is None or deepseek_tokenizer is None:
        # Check for CUDA availability as Unsloth requires a GPU
        if not torch.cuda.is_available():
            print("CUDA is not available. DeepSeek model cannot be loaded for local inference.")
            return False

        # Determine appropriate dtype based on CUDA support
        max_seq_length = 4096
        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        load_in_4bit = True # Load model in 4-bit for memory efficiency

        print("Loading DeepSeek model for Summarizer Agent...")
        try:
            deepseek_model, deepseek_tokenizer = FastLanguageModel.from_pretrained(
                model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
                max_seq_length=max_seq_length,
                dtype=dtype,
                load_in_4bit=load_in_4bit,
            )
            print("DeepSeek model loaded successfully for Summarizer Agent.")
            return True
        except Exception as e:
            print(f"Failed to load DeepSeek model for Summarizer Agent: {e}")
            print("Please ensure you have a compatible GPU and sufficient memory.")
            return False
    return True # Model already loaded

def summarize_with_deepseek(text_to_summarize: str) -> str:
    """
    Generates a summary of the given text using the loaded DeepSeek model.
    Acts as the "Summarizer Agent".
    """
    if deepseek_model is None or deepseek_tokenizer is None:
        print("DeepSeek model (Summarizer Agent) is not loaded. Skipping summarization.")
        return "Summarization failed: Model not loaded."

    print("\n[Summarizer Agent] - Summarizing information...")

    # Define a prompt template for summarization
    summarization_prompt = f"""Summarize the following text concisely and clearly.
Text:
---
{text_to_summarize}
---
Summary:"""

    messages = [
        {"role": "system", "content": "You are a helpful assistant that summarizes text."},
        {"role": "user", "content": summarization_prompt}
    ]

    # Apply chat template for DeepSeek
    formatted_prompt = deepseek_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = deepseek_tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

    try:
        outputs = deepseek_model.generate(
            **inputs,
            max_new_tokens=256, # Limit summary length
            use_cache=True,
            pad_token_id=deepseek_tokenizer.eos_token_id,
            temperature=0.5, # Make summarization more focused
            do_sample=True,
        )

        decoded_output = deepseek_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # Extract only the assistant's response, removing any chat template artifacts
        response_start_tag = "<|assistant|>"
        response_start_index = decoded_output.find(response_start_tag)
        if response_start_index != -1:
            clean_output = decoded_output[response_start_index + len(response_start_tag):].strip()
        else:
            clean_output = decoded_output.strip()

        # Remove any lingering "think" tags or similar internal monologue if present
        clean_output = clean_output.replace("◁think▷", "").replace("◁/think▷", "").strip()
        clean_output = clean_output.replace("<think>", "").replace("</think>", "").strip()

        return clean_output

    except Exception as e:
        print(f"Error during DeepSeek summarization: {e}")
        return f"Summarization failed due to an error: {e}"

# ====================================================================
# Kimi K2 (Remote execution via OpenRouter API)
# This model will act as our "Research Agent".
# ====================================================================

def research_with_kimi(query: str, openrouter_api_key: str) -> str:
    """
    Performs research based on the query using the Kimi K2 model via the OpenRouter API.
    Acts as the "Research Agent".
    """
    print(f"\n[Research Agent] - Researching: '{query}'...")

    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=openrouter_api_key,
    )

    messages = [
        {"role": "system", "content": "You are a helpful and informative research assistant. Provide detailed and factual information."},
        {"role": "user", "content": query}
    ]

    use_streaming = False # For this multi-agent setup, we want the full response at once

    try:
        # Make the API call to Kimi K2
        response_object = client.chat.completions.create(
            model="moonshotai/kimi-dev-72b:free", # Using the specified Kimi model
            messages=messages,
            temperature=0.3, # Keep temperature low for factual research
            stream=use_streaming,
            max_tokens=1024 # Allow for detailed research output
        )

        # Extract content from the response
        if not use_streaming:
            research_content = response_object.choices[0].message.content
        else:
            # This block would only execute if use_streaming was True, but we set it to False
            # However, including for completeness if streaming were to be enabled.
            research_content_parts = []
            for chunk in response_object:
                if chunk.choices[0].delta.content is not None:
                    research_content_parts.append(chunk.choices[0].delta.content)
            research_content = "".join(research_content_parts)

        # Clean up any internal "think" tags from the research agent's output
        research_content = research_content.replace("◁think▷", "").replace("◁/think▷", "").strip()
        research_content = research_content.replace("<think>", "").replace("</think>", "").strip()

        print("[Research Agent] - Research complete.")
        return research_content

    except Exception as e:
        print(f"Error during Kimi K2 research API call: {e}")
        return f"Research failed due to an error: {e}"

# ====================================================================
# Orchestrator (Main script logic for multi-agent coordination)
# ====================================================================

def run_multi_agent_solution(initial_query: str):
    """
    Orchestrates the multi-agent solution to research and summarize a topic.
    """
    print("--- Starting Multi-Agent Solution ---")
    print(f"Initial Query: '{initial_query}'")

    # 1. Load DeepSeek Model for Summarizer Agent (if not already loaded)
    deepseek_loaded = load_deepseek_model()
    if not deepseek_loaded:
        print("DeepSeek model could not be loaded. Summarizer Agent will not function.")
        print("Multi-agent solution cannot proceed without the Summarizer Agent.")
        return

    # 2. Get OpenRouter API Key for Research Agent
    openrouter_api_key = None
    try:
        openrouter_api_key = userdata.get('OPENROUTER_API_KEY')
        if not openrouter_api_key:
            print("OPENROUTER_API_KEY not found in userdata. Research Agent will not function.")
            print("Multi-agent solution cannot proceed without the Research Agent.")
            return
    except Exception as e:
        print(f"Could not retrieve OPENROUTER_API_KEY: {e}. Research Agent will not function.")
        return

    # 3. Research Phase (Research Agent - Kimi K2)
    researched_info = research_with_kimi(initial_query, openrouter_api_key)

    if "Research failed" in researched_info:
        print("Research phase failed. Cannot proceed to summarization.")
        return

    print("\n--- Research Information Collected ---")
    print(researched_info)
    print("------------------------------------")

    # 4. Summarization Phase (Summarizer Agent - DeepSeek R1)
    final_summary = summarize_with_deepseek(researched_info)

    print("\n--- Final Summary from Summarizer Agent ---")
    print(final_summary)
    print("-----------------------------------------")
    print("--- Multi-Agent Solution Finished ---")

# ====================================================================
# Entry point for the script
# ====================================================================

if __name__ == "__main__":
    # Define the initial problem for the multi-agent system
    problem_query = "Explain the history and significance of the Renaissance period in Europe."
    run_multi_agent_solution(problem_query)

--- Starting Multi-Agent Solution ---
Initial Query: 'Explain the history and significance of the Renaissance period in Europe.'
Loading DeepSeek model for Summarizer Agent...
==((====))==  Unsloth 2025.8.1: Fast Llama patching. Transformers: 4.54.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
DeepSeek model loaded successfully for Summarizer Agent.

[Research Agent] - Researching: 'Explain the history and significance of the Renaissance period in Europe.'...
[Research Agent] - Research complete.

--- Research Information Collected ---
Okay, let's tackle explaining the history and significance of the Renaissance period in Europe. 

First, I need to recall