<a href="https://colab.research.google.com/github/gqcpm/scholar_stream/blob/main/research_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the agent's dependencies; %%capture hides the pip output of this cell.
import os
!pip install langgraph langchain_core arxiv
# The runtime is treated as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The packages below are installed with --no-deps and explicit version pins,
    # followed by unsloth itself, also without dependency resolution.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
import json
import arxiv
from typing import TypedDict, List, Annotated
from langgraph.graph import StateGraph, END
from langchain_core.messages import SystemMessage, HumanMessage

from unsloth import FastLanguageModel
from google.colab import drive
import re

In [None]:
# Mount Google Drive so the adapter directory loaded below is reachable under /content/drive
drive.mount('/content/drive')

# 1. Load the BASE model (The big 14B one)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B", # NOTE(review): an earlier comment here claimed a switch to Qwen3-8B, but the checkpoint actually loaded is the 14B one
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    full_finetuning = False, # Full finetuning is turned off for this load; adapters are attached below instead
    # token = "hf_...",      # use one if using gated models
)

# 2. Load your ADAPTERS on top (the adapter files previously saved to Drive)
model.load_adapter("/content/drive/MyDrive/ai_models/lora_model_thinking")

# 3. Enable Inference Speedup
FastLanguageModel.for_inference(model)

print("Model loaded successfully from Drive!")

In [None]:
# --- 1. DEFINE THE STATE ---
# This dictionary tracks the data as it moves between agents
class ResearchState(TypedDict):
    """Shared state handed from node to node; each node returns only the keys it updates."""
    task: str               # The user's original question
    plan: List[str]         # Search queries produced by planner_node
    content: List[str]      # Title/abstract snippets gathered from ArXiv by researcher_node
    draft: str              # The current written report (latest writer_node output)
    critique: str           # Feedback from the critic
    revision_number: int    # Incremented by writer_node each time it drafts
    max_revisions: int      # Cap read by should_continue to stop infinite loops

# --- 2. HELPER: CONNECT UNSLOTH MODEL ---
# This function wraps your loaded 'model' and 'tokenizer' to work like a chat bot
def call_local_model(messages, force_start=None, max_tokens=2048):
    """
    Run one chat completion against the globally loaded model/tokenizer pair.

    The messages are rendered with the tokenizer's chat template (generation
    prompt appended, thinking enabled). When ``force_start`` is given it is
    appended to the rendered prompt so the model continues from that prefix,
    and it is put back in front of the returned text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.
        force_start: Optional text the assistant turn must begin with.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The newly generated text only (the prompt is sliced off), prefixed
        with ``force_start`` when one was supplied.
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )
    if force_start:
        prompt += force_start

    # Batch of one, moved to the GPU
    encoded = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Sampling is switched on because temperature has no effect without it
    sampling_options = {
        "max_new_tokens": max_tokens,
        "use_cache": True,
        "temperature": 0.6,
        "min_p": 0.1,
        "do_sample": True,
        "repetition_penalty": 1.1,
    }
    output_ids = model.generate(**encoded, **sampling_options)

    # generate() returns prompt + completion; keep only what follows the prompt
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, output_ids):
        completions.append(full_ids[len(prompt_ids):])

    reply = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
    return force_start + reply if force_start else reply

# --- 3. DEFINE THE NODES (AGENTS) ---

def _parse_query_list(response):
    """Extract a list of non-empty query strings from the model response, or [] if none parses."""
    import ast

    # Only look at the text after the reasoning block: brackets inside
    # <think>...</think> would otherwise be mistaken for the start of the list.
    answer = response.rsplit("</think>", 1)[-1]

    start = answer.find('[')
    end = answer.rfind(']')
    if start == -1 or end < start:
        return []

    try:
        parsed = ast.literal_eval(answer[start:end + 1])
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Everything literal_eval is documented to raise on malformed input
        return []

    if not isinstance(parsed, (list, tuple)):
        return []
    # Keep only usable search strings; anything else cannot be sent to the search API
    return [q.strip() for q in parsed if isinstance(q, str) and q.strip()]


def planner_node(state: ResearchState):
    """
    Ask the model for search queries covering the user's task.

    Args:
        state: Current graph state; only ``state['task']`` is read.

    Returns:
        ``{"plan": [...]}`` with the queries parsed from the model response,
        or two generic fallback queries built from the task when the response
        does not contain a parsable list of strings.
    """
    print("--- üß† PLANNER IS THINKING ---")

    # Construct the prompt
    messages = [
        {"role": "system", "content": "You are a Research Planner. Return a Python list of 3 short, specific search queries related to the user's task. Example format: ['query 1', 'query 2', 'query 3']. Do not explain, just return the list."},
        {"role": "user", "content": f"Task: {state['task']}"}
    ]

    # Get response from your local model
    response = call_local_model(messages)

    plan = _parse_query_list(response)
    if not plan:
        # Fallback if model fails to output strict list
        plan = [f"{state['task']} generic analysis", f"{state['task']} method comparison"]

    return {"plan": plan}

def researcher_node(state: ResearchState):
    """
    Search ArXiv for every query in the plan and collect paper snippets.

    Args:
        state: Current graph state; only ``state['plan']`` is read.

    Returns:
        ``{"content": [...]}`` where each entry is a "Title/Abstract" snippet
        (abstract cut to 500 characters). A paper returned by several queries
        is included once. A query whose search raises ``arxiv.ArxivError`` is
        reported and skipped so the remaining queries still run.
    """
    print("--- üïµÔ∏è RESEARCHER IS SEARCHING ARXIV ---")

    collected_content = []
    seen_ids = set()  # entry ids already collected, to drop cross-query duplicates
    client = arxiv.Client()

    # Iterate through the plan generated by the previous node
    for query in state['plan']:
        print(f"Searching for: {query}")
        search = arxiv.Search(
            query=query,
            max_results=2, # Keep low for speed in demo
            sort_by=arxiv.SortCriterion.Relevance
        )

        try:
            for r in client.results(search):
                if r.entry_id in seen_ids:
                    continue
                seen_ids.add(r.entry_id)
                paper_summary = f"Title: {r.title}\nAbstract: {r.summary[:500]}..."
                collected_content.append(paper_summary)
        except arxiv.ArxivError as err:
            # One failed query should not discard what the other queries found
            print(f"Search failed for '{query}': {err}")

    return {"content": collected_content}

def writer_node(state: ResearchState):
    """
    Draft (or revise) the report from the collected research.

    On the first pass the prompt contains the task and the research
    materials. When the state already holds a draft and a critique (the
    critic looped back), both are appended to the prompt with their
    ``<think>`` reasoning removed, so the new draft can address the feedback
    instead of being written from scratch again.

    Args:
        state: Current graph state; reads ``task``, ``content`` and, when
            present, ``draft``, ``critique`` and ``revision_number``.

    Returns:
        ``{"draft": ..., "revision_number": ...}`` with the raw model output
        and the revision counter incremented by one.
    """
    print("--- ‚úçÔ∏è WRITER IS DRAFTING ---")

    # Combine all research into one context string
    context_str = "\n\n".join(state['content'])
    user_content = f"Task: {state['task']}\n\nResearch Materials:\n{context_str}"

    # Matches a reasoning block, including one cut off before its closing tag
    think_block = r'<think>.*?(?:</think>|\Z)'
    previous_draft = re.sub(think_block, '', state.get('draft') or '', flags=re.DOTALL).strip()
    feedback = re.sub(think_block, '', state.get('critique') or '', flags=re.DOTALL).strip()

    if previous_draft and feedback:
        # Revision pass: show the model what it wrote and what the critic objected to
        user_content += (
            f"\n\nPrevious Draft:\n{previous_draft}"
            f"\n\nCritic Feedback:\n{feedback}"
            "\n\nRevise the report so that it resolves every point in the feedback."
        )

    messages = [
        {"role": "system", "content": "You are a Research Analyst. Synthesize the provided research summaries into a clear, structured report."},
        {"role": "user", "content": user_content}
    ]

    draft = call_local_model(messages,
                             force_start="<think>\nTo write this report, I need to organize the research findings into a logical structure. First, I will outline the key points about"
                             )

    return {
        "draft": draft,
        "revision_number": state.get("revision_number", 0) + 1
    }

def critic_node(state: ResearchState):
    """
    Have the model critique the current draft.

    The writer's ``<think>`` reasoning is removed from the draft before it is
    shown to the critic, so the review targets the report itself. The prompt
    tells the critic to answer APPROVE when nothing needs fixing, which is
    the keyword ``should_continue`` looks for. The full model output is
    printed; only the text outside the critic's own ``<think>`` block is
    stored, so reasoning text cannot be mistaken for the verdict.

    Args:
        state: Current graph state; only ``state['draft']`` is read.

    Returns:
        ``{"critique": ...}`` with the critic's verdict (empty when the model
        produced nothing outside its reasoning block).
    """
    print("--- üßê CRITIC IS REVIEWING ---")

    # Matches a reasoning block, including one cut off before its closing tag
    think_block = r'<think>.*?(?:</think>|\Z)'

    raw_draft = state['draft']
    # Fall back to the raw draft if stripping the reasoning leaves nothing to review
    report = re.sub(think_block, '', raw_draft, flags=re.DOTALL).strip() or raw_draft

    user_content = (
        f"Here is the draft you need to critique:\n\n"
        f"--- START DRAFT ---\n"
        f"{report}\n"
        f"--- END DRAFT ---\n\n"
        f"CRITIQUE INSTRUCTIONS:\n"
        f"1. Identify logic errors, hallucinations, or vague claims.\n"
        f"2. Check if the tone is objective.\n"
        f"3. Provide a numbered list of specific critiques.\n"
        f"4. If the draft has no significant flaws, reply with the single word APPROVE instead of a list."
    )

    messages = [
        {"role": "system", "content": "You are a harsh Critic. Do not rewrite the draft. Only list its flaws, or reply APPROVE if it has none worth fixing."},
        {"role": "user", "content": user_content}
    ]

    critique = call_local_model(messages,
                                force_start="<think>\nTo critique this draft, I need to check for logical flow and missing citations. I will specifically look for")
    print(critique)

    verdict = re.sub(think_block, '', critique, flags=re.DOTALL).strip()
    return {"critique": verdict}

def should_continue(state: ResearchState):
    """
    Decide whether the graph ends or loops back to the writer.

    The run ends when the revision budget is used up or when the critique
    approves the draft. Approval means the word APPROVE / APPROVED / APPROVES
    appears as a whole word (any case) outside any ``<think>`` block and is
    not directly preceded by a negation, so "disapprove", "cannot approve"
    or musings inside the reasoning block do not end the loop.

    Args:
        state: Current graph state; reads ``critique``, ``revision_number``
            (default 0) and ``max_revisions`` (default 2).

    Returns:
        ``"end"`` to finish, ``"writer"`` to request another draft.
    """
    critique = state.get('critique') or ''
    rev_num = state.get('revision_number', 0)
    max_rev = state.get('max_revisions', 2)

    if rev_num >= max_rev:
        print("--- üõë MAX REVISIONS REACHED ---")
        return "end"

    # Drop reasoning text, including a block cut off before its closing tag
    verdict = re.sub(r'<think>.*?(?:</think>|\Z)', '', critique, flags=re.DOTALL)

    approved = False
    for match in re.finditer(r'(?<![A-Za-z])APPROVE[DS]?(?![A-Za-z])', verdict, flags=re.IGNORECASE):
        preceding = verdict[:match.start()].rstrip().lower()
        if preceding.endswith(("not", "n't", "cannot", "never")):
            continue  # "do not approve" and similar are rejections
        approved = True
        break

    if approved:
        print("--- ‚úÖ DRAFT APPROVED ---")
        return "end"

    print("--- üîÑ LOOPING BACK TO WRITER ---")
    return "writer" # In a complex app, this might go back to researcher

# --- 4. BUILD THE GRAPH ---

workflow = StateGraph(ResearchState)

# Register each agent under the name the edges below refer to
for agent_name, agent_fn in (
    ("planner", planner_node),
    ("researcher", researcher_node),
    ("writer", writer_node),
    ("critic", critic_node),
):
    workflow.add_node(agent_name, agent_fn)

# Execution starts at the planner
workflow.set_entry_point("planner")

# Fixed hand-offs: planner -> researcher -> writer -> critic
for source, target in (
    ("planner", "researcher"),
    ("researcher", "writer"),
    ("writer", "critic"),
):
    workflow.add_edge(source, target)

# After the critic, should_continue picks the next hop by returning one of these keys
critic_routes = {
    "writer": "writer",  # rejected: draft again
    "end": END,          # approved or out of revisions: finish
}
workflow.add_conditional_edges("critic", should_continue, critic_routes)

# Turn the graph definition into a runnable app
app = workflow.compile()

print("Graph compiled! Ready to run.")

Graph compiled! Ready to run.


In [None]:
# Initialize the state
initial_state = dict(
    task="Tell me about the newest and biggest changes in the SWE agents realm as of today (Jan 21, 2026)",
    max_revisions=3,
    revision_number=0,
    content=[],
    plan=[],
    draft="",
    critique="",
)

# Holds the most recent draft seen while streaming
final_draft = ""

print("Starting the Research Agent...")

# Stream the graph; each event maps a finished node's name to the update it returned
for event in app.stream(initial_state):
    for node_name, node_content in event.items():
        print(f"--- Finished running: {node_name} ---")

        # Updates carrying a 'messages' list: show the content of the last one
        if 'messages' in node_content:
            last_msg = node_content['messages'][-1]
            # The message may be an object with .content or a plain dict
            if hasattr(last_msg, 'content'):
                content = last_msg.content
            else:
                content = last_msg.get('content')
            print(f"FULL OUTPUT: {content}\n")

        # Updates carrying a draft: remember it and show a short preview
        if 'draft' in node_content:
            final_draft = node_content['draft']
            print(f"Draft updated (Length: {len(final_draft)} chars)")
            print(f"Preview: {final_draft[:500]}...\n")

# Final report
print("\n\n=== FINAL RESEARCH REPORT ===")
if not final_draft:
    print("No draft was generated (Did the loop crash or max out?)")
else:
    # Present the report without the <think> block
    clean_report = re.sub(r'<think>.*?</think>', '', final_draft, flags=re.DOTALL).strip()
    print(clean_report)

Starting the Research Agent...
--- üß† PLANNER IS THINKING ---
--- Finished running: planner ---
--- üïµÔ∏è RESEARCHER IS SEARCHING ARXIV ---
Searching for: latest developments in SWE agents 2026
Searching for: major updates in SWE agent technology 2026
Searching for: current trends in SWE agent advancements
--- Finished running: researcher ---
--- ‚úçÔ∏è WRITER IS DRAFTING ---
--- Finished running: writer ---
Draft updated (Length: 4502 chars)
Preview: <think>
To write this report, I need to organize the research findings into a logical structure. First, I will outline the key points about SWE-Gym and its impact on training software engineering agents. Then, I will discuss the challenges identified by SWE-rebench and how they relate to the broader context of training and evaluating these agents.

The introduction should set the stage by highlighting the importance of software engineering agents and the recent advancements in this area. The mai...

--- üßê CRITIC IS REVIEWING ---
<t

In [None]:
# Research prompts to run through the agent one after another (each becomes the "task" of a fresh state)
tasks = ["Investigate the three most popular 'Autonomous Coding Agents' released in late 2025. Create a table comparing their marketed success rates on the SWE-bench Verified benchmark against independent third-party evaluations found on GitHub or Reddit. Highlight any discrepancies greater than 5% and identify the specific failure modes (e.g., infinite loops, context window crashes) reported by users but omitted in the official launch posts.",
"Analyze the 'multi-agent orchestration' protocols used in the latest version of LangGraph and AutoGen as of Jan 2026. Don't just list features; specifically compare how each framework handles race conditions when two agents attempt to write to the shared state simultaneously. Cite the specific class names or functions responsible for this state management.",
 ]

In [None]:
# Run the agent once per task, each time from a fresh state
for task in tasks:
    initial_state = {
        "task": task,
        "max_revisions": 3,
        "revision_number": 0,
        "content": [],
        "plan": [],
        "draft": "",
        "critique": ""
    }

    # Variable to store the latest draft
    final_draft = ""

    print("Starting the Research Agent...")

    # Run the graph; each streamed item maps a finished node's name to its update
    for output in app.stream(initial_state):
        for node_name, node_content in output.items():
            print(f"--- Finished running: {node_name} ---")

            # 1. Check if there are messages
            if 'messages' in node_content:
                # Get the last message's content and print it directly
                last_msg = node_content['messages'][-1]
                # Handle if it's an object or a dict
                content = last_msg.content if hasattr(last_msg, 'content') else last_msg.get('content')
                print(f"FULL OUTPUT: {content}\n")

            # 2. Check if there is a draft
            if 'draft' in node_content:
                final_draft = node_content['draft']
                print(f"Draft updated (Length: {len(final_draft)} chars)")
                print(f"Preview: {final_draft[:500]}...\n")

    # Print Final Result
    print("\n\n=== FINAL RESEARCH REPORT ===")
    if final_draft:
        # Remove the <think> block so the report is presented the same way
        # as in the single-task run (reasoning text is not part of the report)
        clean_report = re.sub(r'<think>.*?</think>', '', final_draft, flags=re.DOTALL).strip()
        print(clean_report)
    else:
        print("No draft was generated (Did the loop crash or max out?)")

Starting the Research Agent...
--- üß† PLANNER IS THINKING ---
--- Finished running: planner ---
--- üïµÔ∏è RESEARCHER IS SEARCHING ARXIV ---
Searching for: Top Autonomous Coding Agents 2025
Searching for: SWE-bench Verified benchmark results 2025
Searching for: User-reported issues with 2025 coding agents
--- Finished running: researcher ---
--- ‚úçÔ∏è WRITER IS DRAFTING ---
--- Finished running: writer ---
Draft updated (Length: 3074 chars)
Preview: <think>
To write this report, I need to organize the research findings into a logical structure. First, I will outline the key points about the three autonomous coding agents mentioned in the research materials. Then, I'll create a comparison table highlighting their performance on the SWE-bench Verified benchmark and any discrepancies between their claimed success rates and independent evaluations found online. Finally, I'll analyze the potential reasons for these discrepancies and suggest area...

--- üßê CRITIC IS REVIEWING ---
<thi