# Benchmark Test Results

This document records benchmark test results for various language models evaluating article huntability scores.

In [5]:
import pandas as pd
import numpy as np

# Benchmark table, assembled one column group at a time.
# Every list holds six entries, one per article, in the order given by the
# 'Article' column. Insertion order below is the column order of the frame.
data = {}

# Article identifiers (stored as strings) and the 'Hunt Score' of each.
data['Article'] = ['1974', '1909', '1866', '1860', '1937', '1794']
data['Hunt Score'] = [81.3, 99.6, 96.7, 97.9, 93.1, 95.3]

# String-valued columns; a cell may list several comma-separated scores.
data['Huntable GPT'] = ['7', '7', '5, 9', '7', '8', '6']
data['GPT4o Median of 5 runs'] = ['6', '6', '5', '5', '3', '6']

# gpt-oss-20b (base column holds median scores)
data['gpt-oss-20b'] = [1, 1, 2, 7, 6, 10]
data['gpt-oss-20b-mean'] = [3.40, 1.00, 3.80, 7.80, 7.20, 8.80]
data['gpt-oss-20b-variance'] = [10.80, 0.00, 13.20, 1.70, 6.70, 4.70]

# Claude Sonnet 3.5
data['Claude Sonnet 3.5'] = [7, 6, 6, 8, 7, 6]
data['Claude Sonnet 3.5-mean'] = [7.0, 6, 6, 8, 7, 6.2]
data['Claude Sonnet 3.5-variance'] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.2]

# Meta-llama-3.1-8b-instruct: no value recorded for any article.
data['Meta-llama-3.1-8b-instruct'] = [None] * 6

# codellama-7b-instruct
data['codellama-7b-instruct'] = [7, 7, 9, 8, 9, 8]
data['codellama-7b-instruct-mean'] = [7.0, 7.0, 9.0, 8.0, 9.0, 8.0]
data['codellama-7b-instruct-variance'] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# qwen2-7b-instruct
data['qwen2-7b-instruct'] = [9, 7, 8, 8, 8, 8]
data['qwen2-7b-instruct-mean'] = [9.0, 6.6, 8.0, 8.0, 8.0, 8.4]
data['qwen2-7b-instruct-variance'] = [0.0, 4.8, 0.0, 0.0, 0.0, 0.3]

# nous-hermes-2-mistral-7b-dpo: the third article has no value. The None
# entries become NaN, so all three columns are stored as floats.
data['nous-hermes-2-mistral-7b-dpo'] = [8, 7, None, 2, 6, 1]
data['nous-hermes-2-mistral-7b-dpo-mean'] = [8.0, 7.0, None, 2.0, 6.0, 1.0]
data['nous-hermes-2-mistral-7b-dpo-variance'] = [0.0, 0.0, None, 0.0, 0.0, 0.0]

# qwen/qwen2.5-coder-32b
data['qwen/qwen2.5-coder-32b'] = [7, 8, 5, 8, 7, 7]
data['qwen/qwen2.5-coder-32b-mean'] = [7.0, 8.0, 5.8, 8.0, 7.0, 7.0]
data['qwen/qwen2.5-coder-32b-variance'] = [0.0, 0.0, 1.2, 0.0, 0.0, 0.0]

# Mistral 7B Instruct v0.3 (base column holds median scores)
data['Mistral 7B Instruct v0.3'] = [8, 7, 8, 6, 7, 7]
data['Mistral 7B Instruct v0.3-mean'] = [7.8, 7.2, 7.6, 6.0, 6.8, 7.0]
data['Mistral 7B Instruct v0.3-variance'] = [0.2, 0.2, 0.3, 0.0, 0.2, 0.0]

# One row per article, one column per key above.
df = pd.DataFrame(data)
df


Unnamed: 0,Article,Hunt Score,Huntable GPT,GPT4o Median of 5 runs,gpt-oss-20b,gpt-oss-20b-mean,gpt-oss-20b-variance,Claude Sonnet 3.5,Claude Sonnet 3.5-mean,Claude Sonnet 3.5-variance,...,qwen2-7b-instruct-variance,nous-hermes-2-mistral-7b-dpo,nous-hermes-2-mistral-7b-dpo-mean,nous-hermes-2-mistral-7b-dpo-variance,qwen/qwen2.5-coder-32b,qwen/qwen2.5-coder-32b-mean,qwen/qwen2.5-coder-32b-variance,Mistral 7B Instruct v0.3,Mistral 7B Instruct v0.3-mean,Mistral 7B Instruct v0.3-variance
0,1974,81.3,7,6,1,3.4,10.8,7,7.0,0.0,...,0.0,8.0,8.0,0.0,7,7.0,0.0,8,7.8,0.2
1,1909,99.6,7,6,1,1.0,0.0,6,6.0,0.0,...,4.8,7.0,7.0,0.0,8,8.0,0.0,7,7.2,0.2
2,1866,96.7,"5, 9",5,2,3.8,13.2,6,6.0,0.0,...,0.0,,,,5,5.8,1.2,8,7.6,0.3
3,1860,97.9,7,5,7,7.8,1.7,8,8.0,0.0,...,0.0,2.0,2.0,0.0,8,8.0,0.0,6,6.0,0.0
4,1937,93.1,8,3,6,7.2,6.7,7,7.0,0.0,...,0.0,6.0,6.0,0.0,7,7.0,0.0,7,6.8,0.2
5,1794,95.3,6,6,10,8.8,4.7,6,6.2,0.2,...,0.3,1.0,1.0,0.0,7,7.0,0.0,7,7.0,0.0


## Notes

- Scores are integers on a scale (typically 0-10 or as defined by the evaluation prompt)
- "Median of 5 runs" indicates the median score across 5 separate evaluations
- Multiple scores may be comma-separated when multiple evaluations are recorded
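- To add new results, edit the `data` dictionary in the cell above and re-run it; then, in the cell below, uncomment the `export_to_markdown(df)` call and run the cell to export the table back to markdown (`BENCHMARKS.md` by default)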

In [4]:
# Export to markdown format
def export_to_markdown(df, filename='BENCHMARKS.md'):
    """Export DataFrame to markdown table format.

    The written file consists of a fixed title and introduction, a
    "Test Results" section holding ``df`` rendered as a markdown table
    (index omitted, missing values shown as empty cells), a horizontal rule,
    and a fixed "Notes" section.

    Args:
        df: DataFrame to render. The table is produced by
            ``DataFrame.to_markdown``, which pandas implements through the
            optional ``tabulate`` package.
        filename: Destination path. An existing file is overwritten.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    # Replace NaN with empty string for markdown
    df_md = df.fillna('')

    # One entry per output line; "" entries are the blank lines that separate
    # markdown blocks.
    lines = [
        "# Benchmark Test Results",
        "",
        "This document records benchmark test results for various language models evaluating article huntability scores.",
        "",
        "## Test Results",
        "",
        df_md.to_markdown(index=False),
        "",
        "---",
        "",
        "## Notes",
        "",
        "- Scores are integers on a scale (typically 0-10 or as defined by the evaluation prompt)",
        "- \"Median of 5 runs\" indicates the median score across 5 separate evaluations",
        "- Multiple scores may be comma-separated when multiple evaluations are recorded",
    ]
    # End the file with a newline so it is a well-formed text file.
    markdown_content = "\n".join(lines) + "\n"

    # Explicit UTF-8: the locale default encoding cannot represent every
    # character on all platforms, which would make the write fail for
    # non-ASCII cell values or column names.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"✅ Exported to {filename}")

# Uncomment to export:
# export_to_markdown(df)