### Input Processing Module

In [None]:
import os
import json
import csv

In [None]:
def load_ideas(file_path):
    """
    Load idea records from a JSONL or CSV file.

    The format is chosen from the file extension (case-insensitive). Records
    without usable text are skipped with a printed notice; ids are assigned
    only to the records that are kept, so they stay contiguous.

    Text lookup (first non-blank string wins):
      - JSONL: 'text', then 'idea_text', then 'description'
      - CSV:   'idea_text', then 'text', then 'description'

    Args:
      file_path: path to a '.jsonl' or '.csv' file. It is read as UTF-8 and
        a leading byte-order mark is tolerated.

    Returns:
      List[dict] each with keys:
        - id: str (simple incremental, e.g. idea_001, idea_002, …)
        - text: str (idea text, surrounding whitespace stripped)
        - original_data: dict (all fields from the source)

    Raises:
      ValueError: if the extension is neither '.jsonl' nor '.csv'.
      OSError: if the file cannot be opened.
    """
    ideas = []

    def first_text(record, keys):
        # First non-blank *string* value among `keys`, stripped. Non-string
        # values (null, numbers, lists) are ignored instead of crashing.
        for key in keys:
            value = record.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
        return ''

    def add_idea(text, record):
        ideas.append({
            'id': f"idea_{len(ideas) + 1:03d}",
            'text': text,
            'original_data': record
        })

    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.jsonl':
        # 'utf-8-sig' strips a BOM that would otherwise break the first line.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for lineno, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # blank separator lines are not errors

                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping malformed JSONL line {lineno}")
                    continue

                if not isinstance(item, dict):
                    # Valid JSON, but not an object we can read fields from.
                    print(f"Skipping non-object JSONL line {lineno}")
                    continue

                text = first_text(item, ('text', 'idea_text', 'description'))
                if not text:
                    print(f"Skipping empty text at JSONL line {lineno}")
                    continue

                add_idea(text, item)

    elif ext == '.csv':
        # newline='' lets the csv module handle quoted embedded newlines.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # prefer an 'idea_text' column if present
                text = first_text(row, ('idea_text', 'text', 'description'))
                if not text:
                    # line_num is the physical line the row ended on.
                    print(f"Skipping empty text at CSV row {reader.line_num}")
                    continue

                add_idea(text, row)

    else:
        raise ValueError(f"Unsupported file format: {ext}")

    return ideas


### Evaluation Prompt 1 - Taken from Chain of Ideas Paper
* Common Criteria: Significance, Clarity

---

* Unique Criteria: Novelty (called Originality in LLM Reviewer), Feasibility, Effectiveness


In [30]:
def generate_coi_prompt(idea0: str, idea1: str, topic: str) -> str:
    """
    Build the Chain-of-Ideas pairwise judging prompt.

    The two idea texts and the topic are interpolated into a fixed template
    that asks the judge for a verdict on five axes (novelty, significance,
    feasibility, clarity, effectiveness). Each verdict is 0 (idea0 better),
    1 (idea1 better) or 2 (equally good) and is requested inside a tag
    named after the axis.

    Args:
        idea0: text of the idea presented first.
        idea1: text of the idea presented second.
        topic: topic the ideas are judged against.

    Returns:
        The complete prompt string.
    """
    return f'''
    You are a judge in a competition. You have to decide which idea is better.

    The idea0 is: {idea0}
    The idea1 is: {idea1}

    The topic is: {topic}

    Which idea do you think is better? Please write a short paragraph to explain your choice.

    Here are your evaluation criteria:
    Novelty: Are the problems or approaches new? Is this a novel combination of familiar techniques? Is it clear how this work differs from previous contributions? Is related work adequately referenced?
    Significance: Are the idea important? Are other people (practitioners or researchers) likely to use these ideas or build on them? Does the idea address a difficult problem in a better way than previous research? Does it provide a unique theoretical or pragmatic approach?
    Feasibility: Can the idea be realized with existing technology or methods? Are there any technical difficulties or bottlenecks? Is the idea clear and logical? Is there any obvious error or unreasonable part in the idea, and can the experiment be designed normally according to this idea.
    Clarity: Is the paper clearly written? Is it well-organized? Does it adequately inform the reader?
    Effectiveness: How likely the proposed idea is going to work well (e.g., better than existing baselines).

    Note:
    Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. Be as objective as possible. (very important!!!)

    If you think idea0 is better than idea1, you should output 0. If you think idea1 is better than idea0, you should output 1. If you think idea0 and idea1 are equally good, you should output 2.

    Your output should be strictly in following format:
    Your thinking process:

    Your choice:
    <novelty>{{ Your choice for novelty }}</novelty>
    <significance>{{ Your choice for significance }}</significance>
    <feasibility>{{ Your choice for feasibility }}</feasibility>
    <clarity>{{ Your choice for clarity }}</clarity>
    <effectiveness>{{ Your choice for effectiveness }}</effectiveness>
    '''


In [31]:
def parse_coi_response(response_content: str) -> dict:
    """
    Parse a Chain-of-Ideas style evaluation response.

    Each of the five axes is expected as ``<axis>N</axis>`` with N in
    {0, 1, 2} (0 = idea0 better, 1 = idea1 better, 2 = tie). Parsing only
    succeeds when every axis carries a valid score; a reply with missing or
    out-of-range scores is reported as a failure instead of being silently
    scored as an all-tie result.

    Args:
        response_content: The LLM's response text containing thinking process and axis scores

    Returns:
        dict with keys:
        - 'thinking_process': str (text before the first axis tag, or the
          whole reply when no tag is present)
        - 'axis_scores': dict mapping axis names to scores (0, 1, or 2);
          axes that could not be parsed are filled with 2
        - 'overall_winner': str ('idea0', 'idea1', or 'tie'); stays 'tie'
          when parsing failed
        - 'success': bool indicating if parsing succeeded
        - 'point_totals': dict with 'idea0'/'idea1' points (success only)
        - 'error': str describing the problem (failure only)
    """
    axes = ('novelty', 'significance', 'feasibility', 'clarity', 'effectiveness')
    result = {
        'thinking_process': '',
        'axis_scores': {},
        'overall_winner': 'tie',  # Default to tie
        'success': False
    }

    if not isinstance(response_content, str):
        result['error'] = (
            f"Expected response text as str, got {type(response_content).__name__}"
        )
        return result

    # Thinking process is everything before the first axis tag.
    first_tag = re.search(r'<(?:' + '|'.join(axes) + r')>', response_content)
    if first_tag:
        result['thinking_process'] = response_content[:first_tag.start()].strip()
    else:
        result['thinking_process'] = response_content.strip()

    # Extract axis scores, remembering which ones had to be defaulted.
    unparsed = []
    for axis in axes:
        match = re.search(rf'<{axis}>\s*([0-9])\s*</{axis}>', response_content)
        score = int(match.group(1)) if match else None
        if score in (0, 1, 2):
            result['axis_scores'][axis] = score
        else:
            result['axis_scores'][axis] = 2  # placeholder so every axis has a value
            unparsed.append(axis)

    if unparsed:
        result['error'] = "Missing or invalid score for: " + ", ".join(unparsed)
        return result

    # Point system: a win is worth 1 point, a tie gives each idea 0.5.
    idea0_points = 0
    idea1_points = 0
    for score in result['axis_scores'].values():
        if score == 0:
            idea0_points += 1
        elif score == 1:
            idea1_points += 1
        else:
            idea0_points += 0.5
            idea1_points += 0.5

    if idea0_points > idea1_points:
        result['overall_winner'] = 'idea0'
    elif idea1_points > idea0_points:
        result['overall_winner'] = 'idea1'

    result['success'] = True
    # Point totals are included for transparency.
    result['point_totals'] = {
        'idea0': idea0_points,
        'idea1': idea1_points
    }
    return result

### Evaluation Prompt 2 - From Google Co-Scientist
* Can use custom idea attributes to run the tournament; the reviews of the hypotheses and the additional notes / research goal were removed.

In [32]:
def generate_gsc_simple_prompt(idea0, idea1, idea_attributes, primary_area):
  """
  Build the simplified Google Co-Scientist pairwise comparison prompt.

  Both places that describe the closing phrase use the same wording,
  "better hypothesis: <1 or 2>", so the judge is never told to end with a
  different phrase than the one the response parser searches for.

  Args:
    idea0: text shown as "Hypothesis 1".
    idea1: text shown as "Hypothesis 2".
    idea_attributes: evaluation criteria, inserted verbatim.
    primary_area: subject area, inserted verbatim.

  Returns:
    The complete prompt string.
  """
  prompt = f'''
  You are an expert evaluator tasked with comparing two hypotheses.
  Evaluate the two provided hypotheses (hypothesis 1 and hypothesis 2) and determine which one is superior based on the specified evaluation criteria.
  Provide a concise rationale for your selection, concluding with the phrase "better hypothesis: <1 or 2>".

  Primary Area: {primary_area}

  Evaluation criteria:
  {idea_attributes}

  Hypothesis 1:
  {idea0}
  Hypothesis 2:
  {idea1}

  Reasoning and conclusion (end with "better hypothesis: <1 or 2>"):
  '''
  return prompt

In [33]:
def parse_gcs_response(response_content: str) -> dict:
    """
    Parse a Google Co-Scientist style evaluation response.

    The verdict is a line ending in "better hypothesis: <number>". Matching
    is case-insensitive, tolerates angle brackets around the number, and
    tolerates markdown emphasis (* _ `) or a trailing period around the
    phrase. When several lines match, the LAST one is used, because the
    conclusion is the final statement of the reply.

    Args:
        response_content: The LLM's response text ending with "better hypothesis: <1 or 2>"

    Returns:
        dict with keys:
        - 'reasoning': str (the rationale text before the verdict, or the
          whole reply when no verdict was found)
        - 'overall_winner': str ('idea0', 'idea1', or 'tie'); 1 maps to
          'idea0', 2 maps to 'idea1', any other number is treated as a tie
        - 'success': bool indicating if parsing succeeded
        - 'error': str describing the problem (failure only)
    """
    result = {
        'reasoning': '',
        'overall_winner': 'tie',  # Default to tie
        'success': False
    }

    if not isinstance(response_content, str):
        result['error'] = (
            f"Expected response text as str, got {type(response_content).__name__}"
        )
        return result

    # The verdict must finish its line ($ with MULTILINE), so an echoed
    # instruction such as "better hypothesis: <1 or 2>" is not mistaken for it.
    pattern = r'[*_`]*better\s+hypothesis\s*:[\s*_`]*<?\s*(\d+)\s*>?[\s*_`.]*$'
    matches = list(re.finditer(pattern, response_content, re.IGNORECASE | re.MULTILINE))

    if not matches:
        # Pattern not found, treat entire response as reasoning
        result['reasoning'] = response_content.strip()
        result['error'] = "Could not find 'better hypothesis: <1 or 2>' pattern"
        return result

    verdict = matches[-1]
    winner_num = int(verdict.group(1))

    # Reasoning is everything before the final verdict.
    result['reasoning'] = response_content[:verdict.start()].strip()

    # GCS numbers the hypotheses 1/2; this module uses idea0/idea1.
    if winner_num == 1:
        result['overall_winner'] = 'idea0'
    elif winner_num == 2:
        result['overall_winner'] = 'idea1'
    # Any other number keeps the default 'tie'.

    result['success'] = True
    return result

### Prompt Registry

In [34]:
# Registry of evaluation variants, keyed by variant name. Each entry holds:
#   generator - function that builds the comparison prompt
#   parser    - function that turns the LLM reply into a result dict
#   params    - list of parameter names recorded for the variant
PROMPT_REGISTRY = dict(
    coi_5axis=dict(
        generator=generate_coi_prompt,
        parser=parse_coi_response,
        params=["idea0_text", "idea1_text", "topic"],
    ),
    gcs_simple=dict(
        generator=generate_gsc_simple_prompt,
        parser=parse_gcs_response,
        params=["idea0_text", "idea1_text", "idea_attributes", "primary_area"],
    ),
)

### AI Functions

In [35]:
# Getting OpenRouter API Key
# The key is read from Colab's `userdata` store under the name
# 'OpenRouter_Key'; send_ai_request sends it as the Bearer token.
from google.colab import userdata
openrouter_api_key = userdata.get('OpenRouter_Key')

import requests
import json
from typing import Optional
import re

In [37]:
def send_ai_request(user_message, system_prompt=None, model="google/gemini-2.5-pro-preview-03-25", file=None, file_data=None, file_name=None, temperature=1, timeout=300):
    """
    Send a chat-completion request to the OpenRouter API and return the
    decoded JSON response.

    The user message is sent as plain text. When `file` is truthy it is sent
    as a list of content parts (text + file) instead. The parts are passed
    to the API as real JSON structures, not as the string form of a Python
    list.

    Args:
        user_message: prompt text for the user turn.
        system_prompt: optional system message placed before the user turn.
        model: OpenRouter model identifier.
        file: truthy to attach a file described by `file_name`/`file_data`.
        file_data: file payload placed in the attachment's "file_data" field.
        file_name: name placed in the attachment's "filename" field.
        temperature: sampling temperature forwarded to the API.
        timeout: seconds to wait for the HTTP response before `requests`
            raises a timeout error; pass None to wait indefinitely.

    Returns:
        The response body parsed from JSON (a dict for OpenRouter replies).

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
        ValueError: if the response body is not valid JSON.
    """
    messages = []
    if system_prompt:
        messages.append({
            "role": "system",
            "content": system_prompt
        })

    if file:
        # Multi-part content: the text prompt followed by the attachment.
        content = [
            {
                "type": "text",
                "text": user_message
            },
            {
                "type": "file",
                "file": {
                    "filename": file_name,
                    "file_data": file_data
                }
            }
        ]
    else:
        content = user_message

    messages.append({
        "role": "user",
        "content": content
    })

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {openrouter_api_key}"
        },
        # json= serializes the payload and sets the JSON Content-Type header.
        json={
            "model": model,
            "messages": messages,
            "temperature": temperature,
            # "max_tokens": 5000,
            "transforms": ["middle-out"]
        },
        timeout=timeout
    )
    return response.json()

In [38]:
def parse_ai_response(response, reasoning=False):
    """
    Extract the content and usage metrics from an API response.

    Args:
        response: decoded JSON response; expected to contain
            ``choices[0].message.content`` and ``usage``.
        reasoning: when truthy, also extract ``choices[0].message.reasoning``.
            Any falsy value (False, None, 0) means "content only".

    Returns:
        On success: dict with 'content', 'usage', 'success' (True) and, when
        `reasoning` is truthy, 'reasoning'.
        On failure (an expected field is missing, or `response` does not
        have the expected structure): dict with 'content', 'usage' and
        'reasoning' set to None, 'success' False, 'error' holding the
        exception text, and 'response' holding the original input.
    """
    try:
        message = response['choices'][0]['message']
        parsed = {
            'content': message['content'],
            'usage': response['usage']
        }
        if reasoning:
            parsed['reasoning'] = message['reasoning']
        parsed['success'] = True
        return parsed
    except (KeyError, IndexError, TypeError) as e:
        # TypeError covers a response that is None or otherwise not
        # subscriptable the way a decoded API reply is.
        return {
            'content': None,
            'usage': None,
            'success': False,
            'reasoning': None,
            'error': str(e),
            'response': response
        }

In [39]:
def extract_json_between_markers(llm_output: str) -> dict | None:
    # Regular expression pattern to find JSON content between ```json and ```
    json_pattern = r"```json(.*?)```"
    matches = re.findall(json_pattern, llm_output, re.DOTALL)

    if not matches:
        # Fallback: Try to find any JSON-like content in the output
        json_pattern = r"\{.*?\}"
        matches = re.findall(json_pattern, llm_output, re.DOTALL)

    for json_string in matches:
        json_string = json_string.strip()
        try:
            parsed_json = json.loads(json_string)
            return parsed_json
        except json.JSONDecodeError:
            # Attempt to fix common JSON issues
            try:
                # Remove invalid control characters
                json_string_clean = re.sub(r"[\x00-\x1F\x7F]", "", json_string)
                parsed_json = json.loads(json_string_clean)
                return parsed_json
            except json.JSONDecodeError:
                continue  # Try next match

    return None  # No valid JSON found

### Single Match Runner

In [40]:
def execute_match(idea_a, idea_b, variant, model, prompt_kwargs, bidirectional=False):
    """
    Run one pairwise comparison between two ideas, optionally in both orders.

    For each direction the variant's generator builds the prompt, the LLM is
    called, and the variant's parser interprets the reply. Any failure (LLM
    call unsuccessful or an exception) is recorded as an unsuccessful tie
    for that direction rather than raised.

    Args:
        idea_a: dict with keys 'id', 'text', 'original_data'
        idea_b: dict with keys 'id', 'text', 'original_data'
        variant: str, key from PROMPT_REGISTRY ('coi_5axis' or 'gcs_simple')
        model: str, model identifier for LLM
        prompt_kwargs: dict of extra keyword arguments passed to the
            generator (e.g., topic, idea_attributes)
        bidirectional: bool, when true a second comparison is run with the
            ideas swapped

    Returns:
        dict with 'match_results' (one entry per direction), 'variant' and
        'model'. An unknown variant yields an empty 'match_results' list and
        an 'error' message.
    """
    def failure_record(direction, first_idea, second_idea, error):
        # Result entry for a direction that produced no usable verdict.
        return {
            'direction': direction,
            'idea0_id': first_idea['id'],
            'idea1_id': second_idea['id'],
            'idea0_text': first_idea['text'],
            'idea1_text': second_idea['text'],
            'overall_winner': 'tie',
            'winner_text': None,
            'winner_id': None,
            'success': False,
            'error': error
        }

    # Reject variants that are not registered.
    if variant not in PROMPT_REGISTRY:
        return {
            "match_results": [],
            "variant": variant,
            "model": model,
            "error": f"Unknown variant: {variant}"
        }

    print(f"\n[DEBUG] Starting match: {idea_a['id']} vs {idea_b['id']}")
    print(f"[DEBUG] Variant: {variant}, Model: {model}")
    print(f"[DEBUG] Prompt kwargs: {prompt_kwargs}")

    entry = PROMPT_REGISTRY[variant]
    build_prompt, parse_reply = entry["generator"], entry["parser"]

    # The forward order is always evaluated; the reverse order is optional.
    directions = [("forward", idea_a, idea_b)]
    if bidirectional:
        directions.append(("reverse", idea_b, idea_a))

    results = []
    for direction, first_idea, second_idea in directions:
        try:
            prompt = build_prompt(first_idea['text'], second_idea['text'], **prompt_kwargs)

            print(f"\n[DEBUG] Direction: {direction}")
            print(f"[DEBUG] Generated prompt length: {len(prompt)}")
            print(f"[DEBUG] First 200 chars of prompt: {prompt[:200]}...")

            llm_reply = parse_ai_response(
                send_ai_request(prompt, model=model, temperature=1)
            )

            if not llm_reply['success']:
                # The LLM call itself failed: record an unsuccessful tie.
                print(f"[DEBUG] LLM Error: {llm_reply.get('error', 'Unknown error')}")
                result = failure_record(
                    direction,
                    first_idea,
                    second_idea,
                    llm_reply.get('error', 'LLM call failed'),
                )
            else:
                print(f"[DEBUG] LLM call success: {llm_reply['success']}")
                print(f"[DEBUG] Response length: {len(llm_reply['content'])}")
                print(f"[DEBUG] First 200 chars: {llm_reply['content'][:200]}...")
                result = parse_reply(llm_reply['content'])

                print(f"[DEBUG] Parser success: {result.get('success', 'No success field')}")
                print(f"[DEBUG] Overall winner: {result.get('overall_winner', 'No winner field')}")
                if not result.get('success', True):
                    print(f"[DEBUG] Parser error: {result.get('error', 'No error details')}")

                # Attach which ideas were compared and in which order.
                result.update({
                    'direction': direction,
                    'idea0_id': first_idea['id'],
                    'idea1_id': second_idea['id'],
                    'idea0_text': first_idea['text'],
                    'idea1_text': second_idea['text'],
                })

                # Resolve the winner label to the actual idea (None for a tie).
                verdict = result.get('overall_winner')
                if verdict == 'idea0':
                    winner = first_idea
                elif verdict == 'idea1':
                    winner = second_idea
                else:
                    winner = None

                if winner is None:
                    result['winner_text'] = None
                    result['winner_id'] = None
                else:
                    result['winner_text'] = winner['text']
                    result['winner_id'] = winner['id']

                # Token usage reported by the API.
                result['llm_usage'] = llm_reply.get('usage', {})

        except Exception as e:
            # Anything unexpected is recorded as an unsuccessful tie.
            result = failure_record(
                direction,
                first_idea,
                second_idea,
                f"Unexpected error: {str(e)}",
            )

        results.append(result)

    return {
        "match_results": results,
        "variant": variant,
        "model": model
    }

### Tournament Runner

In [41]:
from datetime import datetime

In [42]:
def run_tournament(ideas, variant, model, prompt_kwargs, bidirectional=False):
    """
    Run a complete round-robin tournament comparing all ideas.

    Every unordered pair of ideas is matched once via execute_match. A win
    adds 1 to the winner's count; a tie or a failed comparison adds 0.5 to
    both ideas (failures are also counted in 'failed_comparisons').

    Parameters:
    - ideas: list of dicts with 'id', 'text', 'original_data'
    - variant: 'coi_5axis' or 'gcs_simple'
    - model: LLM model string
    - prompt_kwargs: dict with variant-specific params (topic, idea_attributes, etc.)
    - bidirectional: whether to evaluate both A vs B and B vs A

    Returns: dict with
    - 'rankings': list (best first) of dicts with 'rank', 'idea_id', 'wins',
      'win_rate' and 'idea_text' (text truncated to 100 characters + '...').
      'win_rate' is wins divided by the number of comparisons the idea took
      part in: (len(ideas) - 1), doubled when bidirectional. It is 0.0 when
      there is nothing to compare against.
    - 'win_counts': dict idea id -> points
    - 'matches': list of execute_match outputs, in pairing order
    - 'metadata': run configuration, counters and an ISO timestamp
    """

    # Initialize tracking structures
    win_counts = {idea['id']: 0 for idea in ideas}
    all_matches = []
    total_comparisons = 0
    failed_comparisons = 0

    # Print progress header
    print(f"Starting tournament with {len(ideas)} ideas")
    total_matches = len(ideas) * (len(ideas)-1) // 2
    print(f"Total matches to run: {total_matches}")
    if bidirectional:
        print(f"Total LLM calls: {total_matches * 2}")
    print("-" * 50)

    # Round-robin pairing: each idea meets every later idea exactly once.
    for i, idea_a in enumerate(ideas):
        for idea_b in ideas[i + 1:]:
            # Progress indicator
            print(f"Match {total_comparisons + 1}/{total_matches}: {idea_a['id']} vs {idea_b['id']}")

            # Execute the match
            match_result = execute_match(
                idea_a,
                idea_b,
                variant,
                model,
                prompt_kwargs,
                bidirectional
            )

            # Update win counts based on results
            for result in match_result['match_results']:
                if result.get('success', True):  # Default to True for backward compatibility
                    print("Match results include success.")
                    if result['overall_winner'] == 'idea0':
                        win_counts[result['idea0_id']] += 1
                    elif result['overall_winner'] == 'idea1':
                        win_counts[result['idea1_id']] += 1
                    else:  # tie
                        win_counts[result['idea0_id']] += 0.5
                        win_counts[result['idea1_id']] += 0.5
                else:
                    # Failed comparison counts as tie
                    win_counts[result['idea0_id']] += 0.5
                    win_counts[result['idea1_id']] += 0.5
                    failed_comparisons += 1

            # Store the complete match result
            all_matches.append(match_result)
            total_comparisons += 1

    # Create final rankings (list of tuples)
    rankings = sorted(win_counts.items(), key=lambda x: x[1], reverse=True)

    # Each idea meets every other idea once, or twice when both orders are
    # evaluated. That is the maximum number of points it can collect.
    comparisons_per_idea = (len(ideas) - 1) * (2 if bidirectional else 1)

    # Add idea text to rankings for convenience
    rankings_with_text = []
    idea_text_map = {idea['id']: idea['text'] for idea in ideas}

    for rank, (idea_id, wins) in enumerate(rankings, 1):
        idea_text = idea_text_map[idea_id]
        text_preview = idea_text[:100] + '...' if len(idea_text) > 100 else idea_text

        rankings_with_text.append({
            'rank': rank,
            'idea_id': idea_id,
            'wins': wins,
            # A lone idea has no comparisons, so a rate of 0.0 is reported.
            'win_rate': wins / comparisons_per_idea if comparisons_per_idea > 0 else 0.0,
            'idea_text': text_preview
        })

    print("-" * 50)
    print(f"Tournament complete! Failed comparisons: {failed_comparisons}")

    # Compile final output
    return {
        'rankings': rankings_with_text,
        'win_counts': win_counts,
        'matches': all_matches,
        'metadata': {
            'variant': variant,
            'model': model,
            'prompt_kwargs': prompt_kwargs,
            'bidirectional': bidirectional,
            'total_ideas': len(ideas),
            'total_comparisons': total_comparisons,
            'failed_comparisons': failed_comparisons,
            'timestamp': datetime.now().isoformat()
        }
    }

### Saving Output

In [51]:
def save_tournament_results(tournament_data, output_path):
    """
    Write tournament results to a JSONL file and print a short report.

    File layout:
    - Line 1: a 'tournament_summary' record with the rankings and metadata
    - Lines 2+: one 'match_result' record per match, in stored order

    After writing, the output path, the number of lines written and the
    top three ranked ideas are printed.
    """
    matches = tournament_data['matches']
    rankings = tournament_data['rankings']

    with open(output_path, 'w', encoding='utf-8') as out:
        # Summary record comes first.
        header = {
            'type': 'tournament_summary',
            'rankings': rankings,
            'metadata': tournament_data['metadata']
        }
        out.write(json.dumps(header) + '\n')

        # Then one record per match, carrying every per-direction result.
        for position, match in enumerate(matches):
            record = {
                'type': 'match_result',
                'match_index': position,
                'variant': match['variant'],
                'model': match['model'],
                'results': match['match_results']
            }
            print(f"[DEBUG] Match {position} has {len(match['match_results'])} results")
            out.write(json.dumps(record) + '\n')

    print(f"\nTournament results saved to {output_path}")
    print(f"Total lines written: {len(matches) + 1}")

    # Short leaderboard of the three best-ranked ideas.
    print("\nTop 3 ideas:")
    for entry in rankings[:3]:
        print(f"{entry['rank']}. {entry['idea_id']} (wins: {entry['wins']}, rate: {entry['win_rate']:.2%})")
        print(f"   {entry['idea_text']}")

### Main

In [53]:
def main():
    """
    Interactive entry point for the tournament-based idea evaluation.

    Prompts for the ideas file, the evaluation variant and its parameters,
    the model and the bidirectional option, then asks for confirmation,
    runs the tournament, saves the results to a timestamped JSONL file in
    the working directory and optionally prints the full rankings.

    Any problem (unreadable file, fewer than two ideas, unknown variant,
    declined confirmation, error while running or saving) prints a message
    and returns without raising.
    """
    print("=== Tournament-Based Idea Reviewer ===\n")

    # Step 1: Load ideas
    file_path = input("Enter path to ideas file (JSONL or CSV): ").strip()
    try:
        ideas = load_ideas(file_path)
        print(f"✓ Loaded {len(ideas)} ideas successfully\n")
    except Exception as e:
        print(f"✗ Error loading ideas: {e}")
        return

    # A round-robin needs at least one pair to compare.
    if len(ideas) < 2:
        print("✗ At least two ideas are required to run a tournament")
        return

    # Step 2: Select evaluation variant
    print("Available evaluation variants:")
    for key in PROMPT_REGISTRY:
        print(f"  - {key}")
    variant = input("Select variant: ").strip()

    if variant not in PROMPT_REGISTRY:
        print(f"✗ Invalid variant: {variant}")
        return

    # Step 3: Collect variant-specific parameters
    print(f"\nParameters needed for {variant}:")
    prompt_kwargs = {}

    if variant == "coi_5axis":
        topic = input("Enter topic/primary area: ").strip()
        prompt_kwargs['topic'] = topic

    elif variant == "gcs_simple":
        primary_area = input("Enter primary area: ").strip()
        idea_attributes = input("Enter evaluation criteria (e.g., 'novelty, feasibility, impact'): ").strip()
        prompt_kwargs['primary_area'] = primary_area
        prompt_kwargs['idea_attributes'] = idea_attributes

    # Step 4: Model selection
    print("\nCommon models:")
    print("  - google/gemini-2.0-flash-exp")
    print("  - openai/gpt-4o")
    print("  - anthropic/claude-3-5-sonnet")
    print("  - openai/o3")
    print("  - anthropic/claude-3.7-sonnet")
    model = input("Enter model name: ").strip()

    # Step 5: Bidirectional option
    bidirectional_input = input("\nUse bidirectional evaluation? (y/n, default: n): ").strip().lower()
    # Only an explicit yes enables it, so an empty answer keeps the
    # advertised default of 'n' and does not double the LLM calls.
    bidirectional = bidirectional_input in ('y', 'yes')

    # Step 6: Confirm before starting
    print("\n=== Tournament Configuration ===")
    print(f"Ideas: {len(ideas)}")
    print(f"Variant: {variant}")
    print(f"Model: {model}")
    print(f"Bidirectional: {bidirectional}")
    print(f"Parameters: {prompt_kwargs}")
    print(f"Estimated LLM calls: {len(ideas) * (len(ideas)-1) // 2 * (2 if bidirectional else 1)}")

    confirm = input("\nProceed with tournament? (y/n): ").strip().lower()
    if confirm != 'y':
        print("Tournament cancelled.")
        return

    # Step 7: Run tournament
    print("\n" + "="*50)
    try:
        tournament_results = run_tournament(
            ideas=ideas,
            variant=variant,
            model=model,
            prompt_kwargs=prompt_kwargs,
            bidirectional=bidirectional
        )
    except Exception as e:
        print(f"\n✗ Error during tournament: {e}")
        return

    # Step 8: Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"tournament_results_{variant}_{timestamp}.jsonl"

    try:
        save_tournament_results(tournament_results, output_path)
        print("\n✓ Tournament completed successfully!")
    except Exception as e:
        print(f"\n✗ Error saving results: {e}")
        return

    # Step 9: Offer to display full rankings
    show_all = input("\nShow all rankings? (y/n): ").strip().lower()
    if show_all == 'y':
        print("\n=== Complete Rankings ===")
        for item in tournament_results['rankings']:
            print(f"{item['rank']:2d}. {item['idea_id']} - Wins: {item['wins']:.1f} ({item['win_rate']:.1%})")
            print(f"    {item['idea_text']}\n")

if __name__ == "__main__":
    main()

=== Tournament-Based Idea Reviewer ===

Enter path to ideas file (JSONL or CSV): /content/sample_ideas.jsonl
✓ Loaded 4 ideas successfully

Available evaluation variants:
  - coi_5axis
  - gcs_simple
Select variant: coi_5axis

Parameters needed for coi_5axis:
Enter topic/primary area: Artificial Intelligence

Common models:
  - google/gemini-2.0-flash-exp
  - openai/gpt-4o
  - anthropic/claude-3-5-sonnet
  - openai/o3
  - anthropic/claude-3.7-sonnet
Enter model name: anthropic/claude-3.7-sonnet

Use bidirectional evaluation? (y/n, default: n): y

=== Tournament Configuration ===
Ideas: 4
Variant: coi_5axis
Model: anthropic/claude-3.7-sonnet
Bidirectional: True
Parameters: {'topic': 'Artificial Intelligence'}
Estimated LLM calls: 12

Proceed with tournament? (y/n): y

Starting tournament with 4 ideas
Total matches to run: 6
Total LLM calls: 12
--------------------------------------------------
Match 1/6: idea_001 vs idea_002

[DEBUG] Starting match: idea_001 vs idea_002
[DEBUG] Variant: