# Benchmark Test Results

This document records benchmark test results for various language models evaluating article huntability scores.

In [3]:
import pandas as pd
import numpy as np

# Benchmark table source: one row per article, one column per scorer/model.
article_urls = [
    'http://127.0.0.1:8001/articles/1974',
    'http://127.0.0.1:8001/articles/1909',
    'http://127.0.0.1:8001/articles/1866',
    'http://127.0.0.1:8001/articles/1860',
    'http://127.0.0.1:8001/articles/1937',
    'http://127.0.0.1:8001/articles/1794',
]

# Length used for the all-None placeholder columns below.
row_count = len(article_urls)

# Column order here is the column order of the resulting DataFrame.
data = {
    'Article': article_urls,
    'Hunt Score': [81.3, 99.6, 96.7, 97.9, 93.1, 95.3],
    # Strings, since one cell holds two comma-separated scores ('5, 9').
    'Huntable GPT': ['7', '7', '5, 9', '7', '8', '6'],
    'GPT4o Median of 5 runs': ['6', '6', '5', '5', '3', '6'],
    'gpt-oss-20b': [None] * row_count,
    'Claude Sonnet 3.5': [7, 6, 6, 8, 7, 6],
    'Claude Sonnet 3.5-mean': [7.0, 6, 6, 8, 7, 6.2],
    'Claude Sonnet 3.5-variance': [0.0, 0.0, 0.0, 0.0, 0.0, 0.2],
    'Mistral 7B Instruct v0.2': [None] * row_count,
    'Mixtral 8x 7B Instruct': [None] * row_count,
    'LLaMA 3 8B Instruct': [None] * row_count,
    'LLaMA 3 70B Instruct': [None] * row_count,
    'codellama-7b-instruct': [7, 7, 9, 8, 9, 8],
    'codellama-7b-instruct-mean': [7, 7, 9, 8, 9, 8],
    'codellama-7b-instruct-variance': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    'qwen2-7b-instruct': [9, 7, 8, 8, 8, 8],
    'qwen2-7b-instruct-mean': [9.0, 6.6, 8.0, 8.0, 8.0, 8.4],
    'qwen2-7b-instruct-variance': [0.0, 4.8, 0.0, 0.0, 0.0, 0.3],
    'nous-hermes-2-mistral-7b-dpo': [None] * row_count,
    'qwen/qwen2.5-coder-32b': [None] * row_count,
    'Phi-3 Medium (14 B)': [None] * row_count,
}

# Build the table and leave it as the last expression so the cell displays it.
df = pd.DataFrame(data)
df

Unnamed: 0,Article,Hunt Score,Huntable GPT,GPT4o Median of 5 runs,gpt-oss-20b,Claude Sonnet 3.5,Claude Sonnet 3.5-mean,Claude Sonnet 3.5-variance,Mistral 7B Instruct v0.2,Mixtral 8x 7B Instruct,...,LLaMA 3 70B Instruct,codellama-7b-instruct,codellama-7b-instruct-mean,codellama-7b-instruct-variance,qwen2-7b-instruct,qwen2-7b-instruct-mean,qwen2-7b-instruct-variance,nous-hermes-2-mistral-7b-dpo,qwen/qwen2.5-coder-32b,Phi-3 Medium (14 B)
0,http://127.0.0.1:8001/articles/1974,81.3,7,6,,7,7.0,0.0,,,...,,7,7,0.0,9,9.0,0.0,,,
1,http://127.0.0.1:8001/articles/1909,99.6,7,6,,6,6.0,0.0,,,...,,7,7,0.0,7,6.6,4.8,,,
2,http://127.0.0.1:8001/articles/1866,96.7,"5, 9",5,,6,6.0,0.0,,,...,,9,9,0.0,8,8.0,0.0,,,
3,http://127.0.0.1:8001/articles/1860,97.9,7,5,,8,8.0,0.0,,,...,,8,8,0.0,8,8.0,0.0,,,
4,http://127.0.0.1:8001/articles/1937,93.1,8,3,,7,7.0,0.0,,,...,,9,9,0.0,8,8.0,0.0,,,
5,http://127.0.0.1:8001/articles/1794,95.3,6,6,,6,6.2,0.2,,,...,,8,8,0.0,8,8.4,0.3,,,


## Notes

- Scores are integers on a scale (typically 0-10 or as defined by the evaluation prompt)
- "Median of 5 runs" indicates the median score across 5 separate evaluations
- Multiple scores may be comma-separated when multiple evaluations are recorded
- Edit the `data` dictionary in the cell above to add new results and re-run that cell, then uncomment and run the `export_to_markdown(df)` call in the cell below to export back to markdown

In [4]:
# Export to markdown format
def export_to_markdown(df, filename='BENCHMARKS.md'):
    """Write the benchmark table and its notes to a markdown file.

    The document consists of a fixed title and introduction, the table
    rendered by ``df.to_markdown(index=False)``, a horizontal rule, and a
    fixed "Notes" section.

    Args:
        df: Table to export. Missing values are replaced with empty strings
            before rendering; the object passed in is not modified.
        filename: Path of the markdown file to write. An existing file is
            overwritten.

    Returns:
        None. The file is written and a confirmation line is printed.

    Note:
        pandas' ``to_markdown`` relies on the optional ``tabulate`` package.
    """
    # Blank out missing values so empty cells render as empty, not "nan".
    table = df.fillna('').to_markdown(index=False)

    header = (
        "# Benchmark Test Results\n\n"
        "This document records benchmark test results for various language models evaluating article huntability scores.\n\n"
        "## Test Results\n\n"
    )
    notes = (
        "## Notes\n\n"
        "- Scores are integers on a scale (typically 0-10 or as defined by the evaluation prompt)\n"
        "- \"Median of 5 runs\" indicates the median score across 5 separate evaluations\n"
        "- Multiple scores may be comma-separated when multiple evaluations are recorded"
    )
    markdown_content = header + table + "\n\n---\n\n" + notes

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # non-ASCII table content, and the output should not vary by machine.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"✅ Exported to {filename}")

# Uncomment to export:
# export_to_markdown(df)