In [1]:
import jupyter_black
from openai import OpenAI
import os
import json
from markitdown import MarkItDown
import re
from IPython.display import Markdown

# The API key is read from the environment so it never lives in the notebook;
# a missing DEEPSEEK_API_KEY raises KeyError right here.
API_KEY = os.environ["DEEPSEEK_API_KEY"]
# Endpoint passed as `base_url` to the OpenAI client in llm_call.
BASE_URL = "https://api.deepseek.com"
# Alternative model name, kept for quick switching:
# MODEL = "deepseek-chat"
# Model name sent with every chat completion request in llm_call.
MODEL = "deepseek-reasoner"

# Load the jupyter_black extension (Black formatting for notebook cells).
jupyter_black.load()

In [2]:
def get_text_from_arxiv_paper(url: str) -> str:
    """Convert the document at ``url`` with MarkItDown and return its text.

    Args:
        url: Location of the paper; handed unchanged to ``MarkItDown.convert``.

    Returns:
        The ``text_content`` attribute of the conversion result.
    """
    converter = MarkItDown(enable_plugins=True)
    return converter.convert(url).text_content

In [3]:
def llm_call(
    prompt: str, with_json_output: bool = False
) -> tuple[str | dict, str | None]:
    """Send a single-turn prompt to the configured chat model.

    Args:
        prompt: Text sent as the user message.
        with_json_output: When true, a system message asking for bare JSON is
            prepended, ``response_format`` is set to ``json_object`` and the
            reply is parsed with ``json.loads`` before being returned.

    Returns:
        ``(answer, reasoning)``. ``answer`` is the message content, or the
        parsed JSON value when ``with_json_output`` is set. ``reasoning`` is
        the message's ``reasoning_content``, or ``None`` when the response does
        not carry that field.

    Raises:
        ValueError: JSON output was requested but the reply was empty or could
            not be parsed (``json.JSONDecodeError`` is a ``ValueError``).
    """
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

    messages = []
    request = {"model": MODEL, "messages": messages}

    if with_json_output:
        json_prompt = """
        Output your response in JSON format with the keys specified in the prompt.
        Do not include any other text such as ```json or ```.
        The response should be directly parseable by json.loads.
        """.strip()
        # The system message must precede the user message.
        messages.append({"role": "system", "content": json_prompt})
        request["response_format"] = {"type": "json_object"}

    messages.append({"role": "user", "content": prompt})
    message = client.chat.completions.create(**request).choices[0].message

    # `reasoning_content` is a provider extension, not a standard field of the
    # OpenAI message object; read it defensively so that switching MODEL to a
    # model that does not return it cannot raise AttributeError.
    reasoning = getattr(message, "reasoning_content", None)
    content = message.content

    if not with_json_output:
        return content, reasoning

    if not content or not content.strip():
        raise ValueError("Model returned empty content; expected a JSON object.")

    # Tolerate a reply wrapped in a Markdown code fence despite the instructions.
    payload = re.sub(r"^```(?:json)?\s*|\s*```$", "", content.strip())
    return json.loads(payload), reasoning

In [4]:
def generate(prompt: str, task: str, context: str = "") -> tuple[str, str]:
    """Generate and improve a solution based on feedback.

    Builds one prompt from ``prompt``, the optional ``context`` and ``task``,
    sends it through ``llm_call`` and prints the model's thoughts and result.

    Args:
        prompt: Generator instructions placed at the top of the request.
        task: Task description, appended after a ``Task:`` label.
        context: Optional text (e.g. earlier attempts and feedback) inserted
            between the instructions and the task; omitted when empty.

    Returns:
        ``(thoughts, result)``. ``result`` is the text between the first
        ``<RESPONSE>``/``</RESPONSE>`` pair, or the whole reply when the model
        did not emit those tags.
    """
    if context:
        full_prompt = f"{prompt}\n{context}\nTask: {task}"
    else:
        full_prompt = f"{prompt}\nTask: {task}"

    response, thoughts = llm_call(full_prompt)
    # Guard against a missing message body so the regex search cannot fail
    # with TypeError.
    response = response or ""

    # The model is asked for <RESPONSE> tags but may omit them; fall back to
    # the full reply instead of raising AttributeError on a failed match.
    tagged = re.search(r"<RESPONSE>(.*?)</RESPONSE>", response, re.DOTALL)
    result = tagged.group(1) if tagged else response

    print("\n=== GENERATION START ===")

    print("\n*** THOUGHTS START ***")
    print(thoughts)
    print("\n*** THOUGHTS END ***")

    print("\n*** RESULT START ***")
    print(result)
    print("\n*** RESULT END ***")

    print("=== GENERATION END ===\n")

    return thoughts, result


def evaluate(prompt: str, content: str, task: str) -> tuple[str, str]:
    """Ask the model to grade ``content`` against ``task`` and print the verdict.

    Args:
        prompt: Evaluator instructions placed at the top of the request.
        content: Candidate text to be judged.
        task: The task the candidate was written for.

    Returns:
        ``(evaluation, feedback)`` taken from the ``"evaluation"`` and
        ``"feedback"`` keys of the model's JSON reply; a missing key yields
        ``None``.
    """
    full_prompt = f"{prompt}\nOriginal task: {task}\nContent to evaluate: {content}"
    verdict, thoughts = llm_call(full_prompt, with_json_output=True)

    evaluation = verdict.get("evaluation")
    feedback = verdict.get("feedback")

    # One print call per entry, in the order the report is shown.
    report = (
        "=== EVALUATION START ===",
        "\n*** THOUGHTS START ***",
        thoughts,
        "\n*** THOUGHTS END ***",
        "\n*** STATUS START ***",
        f"Status: {evaluation}",
        "\n*** STATUS END ***",
        "\n*** FEEDBACK START ***",
        feedback,
        "\n*** FEEDBACK END ***",
        "=== EVALUATION END ===\n",
    )
    for entry in report:
        print(entry)

    return evaluation, feedback


def loop(
    task: str,
    evaluator_prompt: str,
    generator_prompt: str,
    max_iterations: int | None = 10,
) -> tuple[str, list[dict]]:
    """Keep generating and evaluating until requirements are met.

    Each round calls ``generate`` and then ``evaluate``. When the evaluator
    does not answer PASS, all previous attempts plus the latest feedback are
    passed to the next ``generate`` call as context.

    Args:
        task: Task description forwarded to both generator and evaluator.
        evaluator_prompt: Instructions for the evaluator.
        generator_prompt: Instructions for the generator.
        max_iterations: Upper bound on generate/evaluate rounds, so an
            evaluator that never answers PASS cannot cause unbounded API
            calls. ``None`` disables the bound.

    Returns:
        ``(result, chain_of_thought)``: the last generated result and one
        ``{"thoughts", "result"}`` dict per attempt. If the bound is reached
        the last attempt is returned even though it did not pass.

    Raises:
        ValueError: ``max_iterations`` is smaller than 1.
    """
    if max_iterations is not None and max_iterations < 1:
        raise ValueError("max_iterations must be at least 1 or None")

    memory = []
    chain_of_thought = []
    context = ""  # empty on the first round, so generate() omits it
    attempt = 0

    while True:
        attempt += 1
        thoughts, result = generate(generator_prompt, task, context)
        memory.append(result)
        chain_of_thought.append({"thoughts": thoughts, "result": result})

        evaluation, feedback = evaluate(evaluator_prompt, result, task)
        # The verdict is free-form model output: compare case-insensitively
        # and ignore surrounding whitespace.
        if str(evaluation).strip().upper() == "PASS":
            return result, chain_of_thought

        if max_iterations is not None and attempt >= max_iterations:
            print(
                f"=== STOPPED: no PASS after {max_iterations} attempts; "
                "returning the last result ===\n"
            )
            return result, chain_of_thought

        context = "\n".join(
            [
                "Previous attempts:",
                *[f"- {m}" for m in memory],
                f"\nFeedback: {feedback}",
            ]
        )

In [5]:
# Instructions for the evaluator model. evaluate() appends the original task
# and the candidate content to this text and reads the "evaluation" and
# "feedback" keys from the JSON reply; loop() stops once the evaluation is PASS.
evaluator_prompt = """

Evaluate the following summary. A good summary should:

1. Be understandable by an undergraduate student
2. Formatted in markdown, with proper headings and subheadings
3. Have a title and a clear structure
4. Have at least 500 words
5. Grammar and spelling should be correct

You should be evaluating only and not attemping to solve the task.
Only output "PASS" if all criteria are met and you have no further suggestions for improvements.
Output your evaluation concisely in the following format:

EXAMPLE JSON OUTPUT:
{
    "evaluation": "PASS, NEEDS_IMPROVEMENT, or FAIL",
    "feedback": "What needs improvement and why."
}
"""

# Instructions for the generator model. generate() appends the optional
# context and the task to this text and extracts whatever the model places
# between the <RESPONSE> tags requested below.
generator_prompt = """
Your goal is to complete the task based on <user input>. If there are feedback 
from your previous generations, you should reflect on them to improve your solution

Output your answer concisely in the following format:

<RESPONSE>
Content of the response
</RESPONSE>
"""


def generate_agentic_summary_for(paper_url: str) -> tuple[str, list[dict]]:
    """Summarise the paper at ``paper_url`` with the generate/evaluate loop.

    The paper is converted to text, wrapped in a ``<user input>`` task and
    handed to ``loop`` together with the module-level evaluator and generator
    prompts.

    Args:
        paper_url: Location of the paper to summarise.

    Returns:
        ``(result, chain_of_thought)`` exactly as returned by ``loop``: the
        final summary and one ``{"thoughts", "result"}`` dict per attempt.
    """
    web_page_text = get_text_from_arxiv_paper(paper_url)

    task = f"""
    <user input>
    Write a summary of the following article:

    <article>
    {web_page_text}
    </article>

    </user input>
    """

    return loop(task, evaluator_prompt, generator_prompt)

In [6]:
# Run the generate/evaluate loop on the paper. `result` is the final summary;
# `cot` holds the thoughts and result of every attempt.
result, cot = generate_agentic_summary_for("https://arxiv.org/pdf/2506.13585")




=== GENERATION START ===

*** THOUGHTS START ***
We are writing a summary of the provided article about MiniMax-M1. The article introduces MiniMax-M1, an open-weight, large-scale hybrid-attention reasoning model. It uses a hybrid Mixture-of-Experts (MoE) architecture and a lightning attention mechanism, enabling efficient scaling of test-time compute. The model supports a context length of 1 million tokens and consumes significantly fewer FLOPs compared to models like DeepSeek R1 for long generations.

Key points from the article:
- MiniMax-M1 is based on MiniMax-Text-01 and has 456 billion parameters with 45.9 billion activated per token.
- It uses lightning attention for linear computational complexity, allowing efficient scaling to long sequences.
- The model was trained using large-scale reinforcement learning (RL) on diverse problems, including mathematical reasoning, coding, and software engineering.
- A novel RL algorithm, CISPO (Clipped IS-weight Policy Optimization), is intro

In [7]:
# Render the final summary as Markdown in the cell output.
Markdown(result)


**MiniMax-M1: Revolutionizing Efficient Long-Context Reasoning in Large Language Models**  

### Introduction  
MiniMax-M1 represents a groundbreaking advancement in open-weight large language models (LLMs), engineered to tackle complex real-world problems requiring extensive reasoning and long-context processing. Developed by MiniMax AI, it pioneers efficient scaling of "test-time compute" (computational resources used during task execution) through a hybrid architecture and novel training techniques. The model supports context windows of **1 million tokens**—8 times larger than predecessors like DeepSeek R1—and generates outputs up to **80K tokens**, enabling unprecedented capabilities in domains like software engineering, scientific research, and multi-step agent workflows.  

### Architectural Innovations  
At its core, MiniMax-M1 combines a **Mixture-of-Experts (MoE)** design with **lightning attention**, a linear-complexity mechanism that dramatically reduces computational overhead:  
- **Scalable Context Handling**: The model processes 1M-token inputs natively, allowing analysis of entire codebases, research papers, or lengthy legal documents.  
- **Computational Efficiency**: Lightning attention achieves near-linear scaling, reducing FLOPs by **75%** compared to DeepSeek R1 at 100K-token generation. In other words, generating 100K tokens with M1 consumes only about 25% of the FLOPs that DeepSeek R1 requires.  
- **Parameter Optimization**: With 456 billion total parameters, only 45.9 billion activate per token, optimizing resource use during inference. This hybrid design alternates transformer blocks with softmax attention and "transnormer" blocks with lightning attention, enabling efficient reasoning chains spanning hundreds of thousands of tokens.  

### Training Methodology  
MiniMax-M1 was trained in three stages, emphasizing efficiency and diversity:  
1. **Continual Pretraining**: The base model (MiniMax-Text-01) was refined on 7.5T tokens of reasoning-intensive data, with STEM/code content increased to 70%. A four-stage context extension strategy (32K → 1M tokens) mitigated gradient instability.  
2. **Supervised Fine-Tuning (SFT)**: High-quality chain-of-thought (CoT) examples (60% math/coding) instilled reflection-based reasoning patterns.  
3. **Reinforcement Learning (RL)**:  
   - **Novel CISPO Algorithm**: Unlike prior methods (e.g., DAPO, GRPO), CISPO clips importance sampling weights—not token updates—preserving critical low-probability tokens (e.g., "recheck" steps). This doubled convergence speed in controlled tests.  
   - **Diverse RL Environments**: RL spanned verifiable tasks (math Olympiads, SWE-bench bug fixes in sandboxed environments) and unverifiable tasks (creative writing) using generative reward models. A curriculum gradually blended these domains to prevent catastrophic forgetting.  
   - **Efficiency**: Full RL training completed in **3 weeks** on 512 H800 GPUs (cost: $534,700), accelerated by lightning attention and solutions to precision mismatches in training kernels.  

### Performance Excellence  
MiniMax-M1 sets new standards for open-weight models across 12+ benchmarks:  
- **Software Engineering**: Achieved **56% accuracy** on SWE-bench (real-world GitHub issue resolution), surpassing DeepSeek-R1 (49.2%) and Qwen3-235B (34.4%).  
- **Long-Context Understanding**: Dominated OpenAI-MRCR (needle-in-haystack retrieval) and LongBench-v2, outperforming OpenAI o3 and Claude 4 Opus. The 1M-token window enabled near-perfect data retrieval in massive documents.  
- **Agentic Tool Use**: Scored **62.8%** on TAU-bench (tool-integrated dialogue), beating Gemini 2.5 Pro (58.6%) and showcasing real-world deployment readiness.  
- **Math/Coding**: Matched top models with **86% accuracy** on AIME 2024 and strong LiveCodeBench results. The 80K-token version consistently outperformed the 40K variant, validating extended test-time compute.  
- **Efficiency Validation**: The 80K-token model reduced latency by 4× versus traditional architectures at 128K-token generation (Figure 1).  

### Challenges and Solutions  
Unique hurdles arose during training:  
- **Precision Mismatch**: Discrepancies between training/inference probabilities were resolved by switching the LM head to FP32 precision (correlation improved from 0.9× to 0.99×).  
- **Length Bias**: Generative reward models initially favored verbosity; online monitoring and reward shaping mitigated this.  
- **Repetition Collapse**: Early truncation heuristics (e.g., halting after 3,000 consecutive high-probability tokens) stabilized long generations.  

### Release and Real-World Impact  
MiniMax-M1 is publicly accessible via:  
- **Open-weight release** on [GitHub](https://github.com/MiniMax-AI/MiniMax-M1) and Hugging Face.  
- **Commercial API** at minimax.io, optimized for vLLM and Transformers frameworks.  

Applications include:  
- **Automated Workflow Agents**: Debugging, data analysis, and CI/CD pipeline management.  
- **Research Tools**: Processing massive datasets (e.g., genomic sequences or climate models).  
- **AI Assistants**: Agents requiring long-term memory and tool integration (e.g., customer support bots).  

### Conclusion  
By merging lightning attention, hybrid MoE, and the CISPO algorithm, MiniMax-M1 dramatically reduces computational barriers to advanced reasoning. It outperforms leading open-weight models in software engineering and long-context understanding while matching top performers in math and coding. As a foundation for next-generation AI agents, M1 paves the way for LLMs to solve real-world challenges—from scientific discovery to enterprise automation—with unprecedented efficiency. Future work will focus on scaling test-time compute further for multi-agent systems and real-time decision-making.  
