In [1]:
%%capture
import os
!pip install langgraph langchain_core arxiv
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
from google.colab import drive

drive.mount('/content/drive')

# 1. Load the BASE model (The big 14B one)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B", # Changed from Qwen3-14B to Qwen3-8B
    max_seq_length = 1024,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

# 2. Load your ADAPTERS on top (The files you just saved)
model.load_adapter("/content/drive/MyDrive/ai_models/lora_adapters")

# 3. Enable Inference Speedup
FastLanguageModel.for_inference(model)

print("Model loaded successfully from Drive!")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
Unsloth: Could not find `steps_per_generation` in grpo_trainer
Unsloth: Could not find `generation_batch_size` in grpo_trainer
Mounted at /content/drive
==((====))==  Unsloth 2026.1.3: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Model loaded successfully from Drive!


Collecting arxiv
  Downloading arxiv-2.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.4.0-py3-none-any.whl (12 kB)
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m81.5/81.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=5e0969273f80fb26413377d077c3e733b0f6d9cd6d16b9339b63f7a7c6e87b49
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27

In [4]:
import json
import arxiv
from typing import TypedDict, List, Annotated
from langgraph.graph import StateGraph, END
from langchain_core.messages import SystemMessage, HumanMessage

# --- 1. DEFINE THE STATE ---
# This dictionary tracks the data as it moves between agents
class ResearchState(TypedDict):
    task: str               # The user's original question
    plan: List[str]         # The list of steps to research
    content: List[str]      # The raw data gathered from ArXiv
    draft: str              # The current written report
    critique: str           # Feedback from the critic
    revision_number: int    # To track iterations
    max_revisions: int      # Limit to stop infinite loops

# --- 2. HELPER: CONNECT UNSLOTH MODEL ---
# This function wraps your loaded 'model' and 'tokenizer' to work like a chat bot
def call_local_model(messages, max_tokens=1024):
    """
    Formats messages for Qwen/Unsloth and generates a response.
    """
    # Apply the specific chat template for your model (Qwen handles this well)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Create inputs
    model_inputs = tokenizer([text], return_tensors="pt").to("cuda")

    # Generate
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_tokens,
        use_cache=True
    )

    # Decode and strip the prompt (so we only get the new response)
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response_text

# --- 3. DEFINE THE NODES (AGENTS) ---

def planner_node(state: ResearchState):
    print("--- üß† PLANNER IS THINKING ---")

    # Construct the prompt
    messages = [
        {"role": "system", "content": "You are a Research Planner. Return a Python list of 3 short, specific search queries related to the user's task. Example format: ['query 1', 'query 2', 'query 3']. Do not explain, just return the list."},
        {"role": "user", "content": f"Task: {state['task']}"}
    ]

    # Get response from your local model
    response = call_local_model(messages)

    # Simple parsing to ensure we get a list (Basic robustness)
    # If the model chats too much, we try to extract the list part
    try:
        # Try to find the bracketed list in the text
        import ast
        start = response.find('[')
        end = response.rfind(']') + 1
        plan = ast.literal_eval(response[start:end])
    except:
        # Fallback if model fails to output strict list
        plan = [f"{state['task']} generic analysis", f"{state['task']} method comparison"]

    return {"plan": plan}

def researcher_node(state: ResearchState):
    print("--- üïµÔ∏è RESEARCHER IS SEARCHING ARXIV ---")

    collected_content = []
    client = arxiv.Client()

    # Iterate through the plan generated by the previous node
    for query in state['plan']:
        print(f"Searching for: {query}")
        search = arxiv.Search(
            query=query,
            max_results=2, # Keep low for speed in demo
            sort_by=arxiv.SortCriterion.Relevance
        )

        for r in client.results(search):
            paper_summary = f"Title: {r.title}\nAbstract: {r.summary[:500]}..."
            collected_content.append(paper_summary)

    return {"content": collected_content}

def writer_node(state: ResearchState):
    print("--- ‚úçÔ∏è WRITER IS DRAFTING ---")

    # Combine all research into one context string
    context_str = "\n\n".join(state['content'])

    messages = [
        {"role": "system", "content": "You are a Research Analyst. Synthesize the provided research summaries into a clear, structured report."},
        {"role": "user", "content": f"Task: {state['task']}\n\nResearch Materials:\n{context_str}"}
    ]

    draft = call_local_model(messages)

    return {
        "draft": draft,
        "revision_number": state.get("revision_number", 0) + 1
    }

def critic_node(state: ResearchState):
    print("--- üßê CRITIC IS REVIEWING ---")

    messages = [
        {"role": "system", "content": "You are a strict Academic Reviewer. Check the draft. If it is high quality, reply with only the word 'APPROVE'. If it needs work, provide 1 sentence of feedback."},
        {"role": "user", "content": f"Draft: {state['draft']}"}
    ]

    critique = call_local_model(messages)
    return {"critique": critique}

def should_continue(state: ResearchState):
    critique = state.get('critique', '')
    rev_num = state.get('revision_number', 0)
    max_rev = state.get('max_revisions', 2)

    if rev_num >= max_rev:
        print("--- üõë MAX REVISIONS REACHED ---")
        return "end"

    if "APPROVE" in critique.upper():
        print("--- ‚úÖ DRAFT APPROVED ---")
        return "end"
    else:
        print("--- üîÑ LOOPING BACK TO WRITER ---")
        return "writer" # In a complex app, this might go back to researcher

# --- 4. BUILD THE GRAPH ---

workflow = StateGraph(ResearchState)

# Add Nodes
workflow.add_node("planner", planner_node)
workflow.add_node("researcher", researcher_node)
workflow.add_node("writer", writer_node)
workflow.add_node("critic", critic_node)

# Set Entry Point
workflow.set_entry_point("planner")

# Define Edges
workflow.add_edge("planner", "researcher")
workflow.add_edge("researcher", "writer")
workflow.add_edge("writer", "critic")

# Conditional Edge (The Logic Loop)
workflow.add_conditional_edges(
    "critic",
    should_continue,
    {
        "writer": "writer",  # If rejected, go back to writing (or researching)
        "end": END           # If approved, finish
    }
)

# Compile
app = workflow.compile()

print("Graph compiled! Ready to run.")

Graph compiled! Ready to run.


In [10]:
# Initialize the state
initial_state = {
    "task": "Compare the performance of Mamba vs Transformers in 2024",
    "max_revisions": 2,
    "revision_number": 0,
    "content": [],
    "plan": [],
    "draft": "",
    "critique": ""
}

# Variable to store the latest draft
final_draft = ""

print("Starting the Research Agent...")

# Run the graph
for output in app.stream(initial_state):
    # output looks like: {'node_name': {'key': 'value'}}

    for node_name, node_content in output.items():
        print(f"--- Finished running: {node_name} ---")

        # If this node produced a draft, save it!
        if 'draft' in node_content:
            final_draft = node_content['draft']
            print(f"Draft updated (Length: {len(final_draft)} chars)")

# Print Final Result
print("\n\n=== FINAL RESEARCH REPORT ===")
if final_draft:
    print(final_draft)
else:
    print("No draft was generated (Did the loop crash or max out?)")

Starting the Research Agent...
--- üß† PLANNER IS THINKING ---
--- Finished running: planner ---
--- üïµÔ∏è RESEARCHER IS SEARCHING ARXIV ---
Searching for: Mamba vs Transformers 2024 performance comparison
Searching for: Mamba model efficiency vs Transformers 2024
Searching for: 2024 Mamba vs Transformers benchmark results
--- Finished running: researcher ---
--- ‚úçÔ∏è WRITER IS DRAFTING ---
--- Finished running: writer ---
Draft updated (Length: 2548 chars)
--- üßê CRITIC IS REVIEWING ---
--- ‚úÖ DRAFT APPROVED ---
--- Finished running: critic ---


=== FINAL RESEARCH REPORT ===
<think>

</think>

### Comparative Analysis of Mamba vs Transformers in 2024

#### Introduction
In the realm of Large Language Models (LLMs), Transformers have traditionally been the backbone due to their ability to model long sequences effectively. However, their inference overhead grows linearly with the sequence length, posing challenges for tasks requiring processing of very long sequences. Enter Mamb