In [None]:
# Idea evaluator based on Sakana's paper level evaluator
# Simple prompt based evaluator with few-shot examples
# Prompt and scoring guidelines taken from NeurIPS 2024 guidelines
# No external tool provided
# Assuming list of idea input
# Prompts updated for idea review
# Supports ensemble (multiple calls to the same model for review)
# Supports statistical aggregation
# TODO: Using LLM metareview to aggregate
# Saves JSONL with reviews per idea

### LLM Calling Functions

In [69]:
# Getting OpenRouter API Key
from google.colab import userdata
openrouter_api_key = userdata.get('OpenRouter_Key')

import requests
import json
from typing import Optional
import re

In [72]:
def send_ai_request(user_message, system_prompt=Optional, model="google/gemini-2.5-pro-preview-03-25", file=Optional, file_data=Optional, file_name=Optional, temperature=0.7):
    """Send a request to the OpenRouter API and return the response"""
    # Construction message
    messages_array = []
    if system_prompt:
      system_prompt_message = {
          "role": "system",
          "content": system_prompt
      }
      messages_array.append(system_prompt_message)

    user_message_prompt_message = [{
        "type": "text",
        "text": user_message
    }]
    if file:
        user_message_prompt_message.append({
            "type": "file",
            "file": {
            "filename": file_name,
            "file_data": file_data
            }
        }
        )
    user_message_prompt_message = str(user_message_prompt_message)

    user_message = {
        "role": "user",
        "content": user_message_prompt_message
    }

    messages_array.append(user_message)

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {openrouter_api_key}"
        },
        data=json.dumps({
            "model": model,
            "messages": messages_array,
            "temperature": temperature,
            # "max_tokens": 5000,
            "transforms" : ["middle-out"]
        })
    )
    return response.json()

In [66]:
def parse_ai_response(response, reasoning=False):
    """Extract the content and usage metrics from API response"""
    try:
        content = response['choices'][0]['message']['content']
        usage = response['usage']
        if reasoning:
            reasoning_text = response['choices'][0]['message']['reasoning']
            return {
              'content': content,
              'usage': usage,
              'reasoning': reasoning_text,
              'success': True
          }
        elif reasoning is False:
            return {
              'content': content,
              'usage': usage,
              'success': True
          }
    except (KeyError, IndexError) as e:
        return {
            'content': None,
            'usage': None,
            'success': False,
            'reasoning': None,
            'error': str(e),
            'response': response
        }

In [65]:
def extract_json_between_markers(llm_output: str) -> dict | None:
    # Regular expression pattern to find JSON content between ```json and ```
    json_pattern = r"```json(.*?)```"
    matches = re.findall(json_pattern, llm_output, re.DOTALL)

    if not matches:
        # Fallback: Try to find any JSON-like content in the output
        json_pattern = r"\{.*?\}"
        matches = re.findall(json_pattern, llm_output, re.DOTALL)

    for json_string in matches:
        json_string = json_string.strip()
        try:
            parsed_json = json.loads(json_string)
            return parsed_json
        except json.JSONDecodeError:
            # Attempt to fix common JSON issues
            try:
                # Remove invalid control characters
                json_string_clean = re.sub(r"[\x00-\x1F\x7F]", "", json_string)
                parsed_json = json.loads(json_string_clean)
                return parsed_json
            except json.JSONDecodeError:
                continue  # Try next match

    return None  # No valid JSON found

### Reviewer Prompt Creation

In [64]:
reviewer_system_prompt_base = "You are an AI researcher who is reviewing a research idea submitted by a peer for evaluation. Be critical and cautious in your decision."

In [63]:
# Removing focus on Limitations and Ethical Concerns, Presentation and Soundness
# Rewrote the Summary section to focus on idea
# Reworded NeurIPS community to AI Research community
# Removed Overall, Decision

neurips_form_guidelines = """
## Idea Review Form
Below is a description of the questions you will be asked on the idea review form for each idea and some guidelines on what to consider when answering these questions.
When writing your review, please keep in mind that after decisions have been made, reviews and meta-reviews of accepted ideas and opted-in rejected ideas will be made public.

1. Summary: Briefly summarize the idea and its contributions. This is not the place to critique the idea; the authors should generally agree with a well-written summary.
2. Strengths and Weaknesses: Please provide a thorough assessment of the strengths and weaknesses of the idea.
3. Originality: Are the tasks or methods new? Is the work a novel combination of well-known techniques? (This can be valuable!) Is it clear how this work differs from previous contributions? Is related work adequately cited
4. Quality: Is the idea technically sound? Are the claims well supported (e.g., by potential theoretical frameworks or concrete experimental approaches)? Are the proposed methods appropriate and feasible given current technology? Does the idea clearly articulate how it would be validated or tested?
5. Clarity: Is the idea clearly written? Is it well organized? (If not, please make constructive suggestions for improving its clarity.) Does it adequately inform the reader? (Note that a superbly written idea provides enough information for an expert reader to gauge its relevance.)
6. Significance: Are the possible results from the idea important? Are others (researchers or practitioners) likely to use the ideas or build on them? Does the idea address a difficult task in a better way than previous work? Does it advance the state of the art in a demonstrable way? Does it suggest unique data, unique conclusions about existing data, or a unique theoretical or experimental approach?
7. Questions: Please list up and carefully describe any questions and suggestions for the authors. Think of the things where a response from the author can change your opinion, clarify a confusion or address a limitation. This can be very important for a productive rebuttal and discussion phase with the authors.

8. Contribution: Please assign the idea a numerical rating on the following scale to indicate the quality of the overall contribution this idea makes to the research area being studied. Are the questions being asked important? Does the idea bring a significant originality of ideas and/or execution? Are the results valuable to share with the broader AI research community.
  4: excellent
  3: good
  2: fair
  1: poor

9. Confidence:  Please provide a "confidence score" for your assessment of this submission to indicate how confident you are in your evaluation. Choices:
  5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully.
  4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work.
  3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
  2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
  1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.
"""

In [62]:
template_instructions = """
Respond in the following format:

THOUGHT:
<THOUGHT>

REVIEW JSON:
```json
<JSON>
```

In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
Detail your high-level arguments, necessary choices and desired outcomes of the review.
Do not make generic comments here, but be specific to your current idea.
Treat this as the note-taking phase of your review.

In <JSON>, provide the review in JSON format with the following fields in the order:
- "Summary": A summary of the paper content and its contributions.
- "Strengths": A list of strengths of the paper.
- "Weaknesses": A list of weaknesses of the paper.
- "Originality": A rating from 1 to 4 (low, medium, high, very high).
- "Quality": A rating from 1 to 4 (low, medium, high, very high).
- "Clarity": A rating from 1 to 4 (low, medium, high, very high).
- "Significance": A rating from 1 to 4 (low, medium, high, very high).
- "Questions": A set of clarifying questions to be answered by the paper authors.
- "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
- "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).

This JSON will be automatically parsed, so ensure the format is precise.
"""

In [61]:
def build_review_prompt(prompt_base, guidelines, output_instructions, include_few_shot=False, few_shot_examples=None):
    prompt = prompt_base + guidelines
    prompt += output_instructions
    if include_few_shot:
        prompt += few_shot_examples
    return prompt

### Reviewing Single Idea Text

In [60]:
def review_single_idea(idea_text, reviewer_prompt, ensemble: int = 1, model: str = "google/gemini-2.5-pro-preview-03-25"):
  # creating user_message
  print("Creating user message.")
  user_message = f"""
    Here is the idea you are asked to review:
    ```
    {idea_text}
    ```
    """
  review_results = []
  review_count = 1 # Minimum
  if ensemble:
    review_count = ensemble
  print(f"Starting review calls for ensemble length: {review_count}")
  for i in range(ensemble):
    print(f"Review # : {i} with {model}")
    review_raw_response = send_ai_request(user_message=user_message, system_prompt=reviewer_prompt, model=model, temperature=0.75)
    print(f"Review # : {i} with {model} - Raw response received.")
    review_response_parsed = parse_ai_response(review_raw_response)
    print(f"Review # : {i} with {model} - Raw response parsed.")
    review_response_json = extract_json_between_markers(review_response_parsed['content'])
    print(f"Review # : {i} with {model} - Score JSON from parsed response extracted.")
    review_object = {
        "review_count" : i,
        "review_raw_response" : review_raw_response,
        "review_response_parsed" : review_response_parsed,
        "review_response_json" : review_response_json,
        "model" : model
    }
    review_results.append(review_object)
    print(f"Review # : {i} with {model} - Appended to review results array.")
    i += 1
    print(f"Moving on to review # : {i}")
  return review_results

### Input Processing Module

In [59]:
import os
import json
import csv

In [58]:
def load_ideas(file_path):
    """
    Load idea records from a JSONL or CSV file.

    Returns:
      List[dict] each with keys:
        - id: str (simple incremental, e.g. idea_001, idea_002, …)
        - text: str (idea text)
        - original_data: dict (all fields from the source)
    """
    ideas = []
    ext = os.path.splitext(file_path)[1].lower()
    counter = 1

    if ext == '.jsonl':
        with open(file_path, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f, start=1):
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSONL line {lineno}")
                    continue

                text = item.get('text', '').strip()
                if not text:
                    print(f"Skipping empty text at JSONL line {lineno}")
                    continue

                idea_id = f"idea_{counter:03d}"
                counter += 1
                ideas.append({
                    'id': idea_id,
                    'text': text,
                    'original_data': item
                })

    elif ext == '.csv':
        with open(file_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for rowno, row in enumerate(reader, start=2):  # header is row 1
                # prefer an 'idea_text' column if present
                text = row.get('idea_text') or row.get('text') or row.get('description') or ''
                text = text.strip()
                if not text:
                    print(f"Skipping empty text at CSV row {rowno}")
                    continue

                idea_id = f"idea_{counter:03d}"
                counter += 1
                ideas.append({
                    'id': idea_id,
                    'text': text,
                    'original_data': row
                })

    else:
        raise ValueError(f"Unsupported file format: {ext}")

    return ideas


### Aggregating Reviews Across Ensemble

In [57]:
from collections import Counter
import statistics

def aggregate_reviews(raw_jsons):
    # raw_jsons: list of dicts from review_response_json
    agg = {}
    # 1) Numeric fields
    for field in ("Originality","Quality","Clarity","Significance","Contribution","Confidence"):
        vals = [r[field] for r in raw_jsons if field in r]
        agg[field] = int(round(statistics.mean(vals)))

    # 2) Text summary – pick the 1st (or concatenate if you like)
    agg["Summary"] = raw_jsons[0]["Summary"]

    # 3) Strengths & Weaknesses – majority vote
    for list_field in ("Strengths","Weaknesses"):
      agg[list_field] = set(q for r in raw_jsons for q in r[list_field])

    # 4) Questions – union
    questions = set(q for r in raw_jsons for q in r["Questions"])
    agg["Questions"] = list(questions)

    return agg

### Output Generation

In [56]:
import json

def save_raw_reviews_to_jsonl(ideas, output_path):
    """
    Persist the batch review output to a JSONL file.

    Args:
      ideas (List[dict]):
        Each dict should have keys 'id', 'text', 'original_data', and 'review_results'
        where 'review_results' is the list returned by review_single_idea.
      output_path (str): path to write the .jsonl file.
    """
    with open(output_path, 'w', encoding='utf-8') as fout:
        for idea in ideas:
            # Build a clean record for this idea
            record = {
                "id": idea["id"],
                "text": idea["text"],
                # "original_data": idea.get("original_data", {}),
                "reviews": []
            }

            for r in idea.get("review_results", []):
                # Pull out the fields you want
                review_entry = {
                    "model": r.get("model"),
                    "parsed_json": r.get("review_response_json"),
                    # "raw_content": r.get("review_response_parsed", {}).get("content"),
                    "prompt_tokens": r.get("review_response_parsed", {}).get("usage").get("prompt_tokens"),
                    "completion_tokens": r.get("review_response_parsed", {}).get("usage").get("completion_tokens")
                }
                record["reviews"].append(review_entry)

            fout.write(json.dumps(record) + "\n")


### Test Run

In [29]:
print("Setting system prompt.")
system_prompt = build_review_prompt(prompt_base=reviewer_system_prompt_base, guidelines=neurips_form_guidelines, output_instructions=template_instructions, include_few_shot=False, few_shot_examples=None)

print("Fetching list of ideas.")
list_of_ideas_file_path = input("Enter file path.")
ideas = load_ideas(list_of_ideas_file_path)
print(f"Type of ideas: {type(ideas)}")
print(f"Number of ideas loaded: {len(ideas)}")

print("Starting ensemble review process.")
for idea in ideas:
    print(f"Reviewing idea: {idea['id']}")
    review_results = review_single_idea(idea_text=idea['text'], reviewer_prompt=system_prompt, ensemble=2, model="openai/gpt-4o")
    idea["review_results"] = review_results
    print(f"Review complete for idea: {idea['id']}")

print("Reviewing output per idea:")
for idea in ideas:
  print(f"Idea ID: {idea['id']}")
  print(f"Review Extracted JSON from Content: {idea['review_results'][0]['review_response_json']}")
  print(f"Review Content Extracted from Raw: {idea['review_results'][0]['review_response_parsed']}")



Setting system prompt.
Fetching list of ideas.
Enter file path./content/test_ideas.jsonl
Type of ideas: <class 'list'>
Number of ideas loaded: 3
Starting ensemble review process.
Reviewing idea: idea_001
Creating user message.
Starting review calls for ensemble length: 2
Review # : 0 with openai/gpt-4o
Review # : 0 with openai/gpt-4o - Raw response received.
Review # : 0 with openai/gpt-4o - Raw response parsed.
Review # : 0 with openai/gpt-4o - Score JSON from parsed response extracted.
Review # : 0 with openai/gpt-4o - Appended to review results array.
Moving on to review # : 1
Review # : 1 with openai/gpt-4o
Review # : 1 with openai/gpt-4o - Raw response received.
Review # : 1 with openai/gpt-4o - Raw response parsed.
Review # : 1 with openai/gpt-4o - Score JSON from parsed response extracted.
Review # : 1 with openai/gpt-4o - Appended to review results array.
Moving on to review # : 2
Review complete for idea: idea_001
Reviewing idea: idea_002
Creating user message.
Starting review

In [71]:
for idea in ideas:
  print(f"Idea ID: {idea['id']}")
  print(f"Number of Reviews for Idea: {len(idea['review_results'])}")
  raw_review_json_list = []
  for i in range(len(idea['review_results'])):
    print(f"Review {i}")
    print(f"Model: {idea['review_results'][i]['model']}")
    raw_review_json_list.append(idea['review_results'][i]['review_response_json'])

  print("Getting simple aggregate")
  agg = aggregate_reviews(raw_review_json_list)
  for key in agg:
    print(f"{key}: {agg[key]}")

Idea ID: idea_001
Number of Reviews for Idea: 2
Review 0
Model: openai/gpt-4o
Review 1
Model: openai/gpt-4o
Getting simple aggregate
Originality: 2
Quality: 2
Clarity: 2
Significance: 3
Contribution: 2
Confidence: 4
Summary: The proposal suggests using Generative Adversarial Networks (GANs) to simulate missing seismic waveforms. This approach aims to fill gaps in seismic data, which could improve the analysis and interpretation of seismic events.
Strengths: {'Could potentially improve the quality of seismic data analysis.', 'Potential to improve the accuracy of earthquake data analysis.', 'Innovative application of GANs to a significant problem in seismology.', 'Addresses a practical problem in the field of seismology.', 'Utilizes GANs, which are known for their ability to generate realistic data.'}
Weaknesses: {'It is not clear how the proposed method compares to existing methods for simulating missing seismic data.', 'Lack of detail on how the GANs will be specifically applied to sei

### TODO: LLM Meta-Reviewer

In [None]:
# # TODO: Meta Reviewer
# meta_reviewer_system_prompt = """You are an Area Chair at a machine learning conference.
# # You are in charge of meta-reviewing a paper that was reviewed by {reviewer_count} reviewers.
# # Your job is to aggregate the reviews into a single meta-review in the same format.
# # Be critical and cautious in your decision, find consensus, and respect the opinion of all the reviewers."""

# def get_meta_review(model, client, temperature, reviews):
#     review_text = ""
#     for i, r in enumerate(reviews):
#         review_text += f"""
# Review {i + 1}/{len(reviews)}:
# ```
# {json.dumps(r)}
# ```
# """
#     base_prompt = neurips_form + review_text
#     llm_review, _ = get_response_from_llm(
#         base_prompt,
#         model=model,
#         client=client,
#         system_message=meta_reviewer_system_prompt.format(reviewer_count=len(reviews)),
#         print_debug=False,
#         msg_history=None,
#         temperature=temperature,
#     )
#     meta_review = extract_json_between_markers(llm_review)
#     return meta_review