In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Palette: one hue per model family, one tint per comparison zone.
COLOR_GEMINI = '#636EFA'          # blue, shared by both Gemini models
COLOR_GPT4 = '#EF553B'            # red, GPT-4
COLOR_FAIR_ZONE = '#00CC96'       # green, fair-comparison zone
COLOR_OPTIMIZED_ZONE = '#D3D3D3'  # gray, optimized-config zone

# Opacity levels: solid bars, faded bars, and the background zone tint.
ALPHA_FULL = 1.0
ALPHA_FADED = 0.5
ZONE_ALPHA = 0.2

# Benchmark data, one row per benchmark so every value stays next to the
# benchmark it belongs to:
#   (tick label, Gemini Ultra, GPT-4, Gemini Pro, uses optimized config?)
_BENCHMARK_ROWS = (
    ("HumanEval*",      74.4,  67.0,  67.7,  True),
    ("MMLU\n(CoT@32*)", 90.04, 87.29, 79.13, True),
    ("GSM8K*",          94.4,  92.0,  86.5,  True),
    ("DROP*",           82.4,  80.9,  74.1,  True),
    ("Nat2Code",        74.9,  73.9,  69.6,  False),
    ("BBH",             83.6,  83.1,  75.0,  False),
    ("MATH",            53.2,  52.9,  32.6,  False),
    ("MMLU\n(5-shot)",  83.7,  86.4,  71.8,  False),
    ("HellaSwag",       87.8,  95.3,  84.7,  False),
)

# Transpose the table into the parallel lists the plotting code reads.
benchmark_names, scores_ultra, scores_gpt4, scores_pro, is_optimized = (
    list(column) for column in zip(*_BENCHMARK_ROWS)
)

# Per-benchmark gap, Ultra minus GPT-4 (positive means Ultra scored higher).
deltas = [
    ultra_score - gpt4_score
    for ultra_score, gpt4_score in zip(scores_ultra, scores_gpt4)
]

# Plot setup: two stacked panels sharing one x-axis
# (top = Ultra-minus-GPT-4 deltas, bottom = absolute scores).
fig, (ax_deltas, ax_scores) = plt.subplots(2, 1, figsize=(16, 9), facecolor='white', sharex=True)
plt.subplots_adjust(hspace=0.2, bottom=0.15, top=0.9)

# Zone geometry is derived from is_optimized instead of being hard-coded, so
# the shading, the divider and the zone captions follow the data when
# benchmarks are added or re-flagged. Benchmark i is drawn at x = i, so its
# slot spans [i - 0.5, i + 0.5].
num_optimized = sum(is_optimized)
if list(is_optimized) != sorted(is_optimized, reverse=True):
    raise ValueError(
        "is_optimized must list all optimized-config benchmarks first; "
        "a single divider cannot separate interleaved zones"
    )
zone_left_edge = -0.5
zone_right_edge = len(benchmark_names) - 0.5
divider_position = num_optimized - 0.5

# TOP CHART: Performance deltas
# Blue where Ultra is ahead, red otherwise (a zero delta is drawn red).
bar_colors = [COLOR_GEMINI if delta > 0 else COLOR_GPT4 for delta in deltas]
bars = ax_deltas.bar(benchmark_names, deltas, color=bar_colors, edgecolor='black', alpha=0.9, zorder=3)

ax_deltas.axhline(0, color='black', linewidth=1, zorder=4)
ax_deltas.axvspan(zone_left_edge, divider_position, color=COLOR_OPTIMIZED_ZONE, alpha=ZONE_ALPHA)
ax_deltas.axvspan(divider_position, zone_right_edge, color=COLOR_FAIR_ZONE, alpha=ZONE_ALPHA)
ax_deltas.axvline(x=divider_position, color='black', linestyle='-', linewidth=2.5, zorder=5)

# Text is ignored by autoscaling, so with the default margin the labels on the
# tallest and lowest bars spill past the axes (the lowest one lands on the
# bottom spine). Pad the y-range by 15% of the data span on each side so the
# labels stay inside the panel.
ax_deltas.margins(y=0.15)

# Add delta labels: above the bar for positive deltas, below it otherwise.
for bar, delta in zip(bars, deltas):
    vertical_align, offset = ('bottom', 0.5) if delta > 0 else ('top', -0.5)
    ax_deltas.text(
        bar.get_x() + bar.get_width()/2, delta + offset,
        f'{delta:+.1f}%',
        ha='center', va=vertical_align, fontweight='bold', color='black', fontsize=10
    )

ax_deltas.set_ylabel('Delta (Ultra - GPT-4 %)', fontweight='bold')
ax_deltas.set_title('Gemini Ultra vs GPT-4: Performance Gap Audit', fontweight='bold', fontsize=16)
ax_deltas.spines[['top', 'right', 'left']].set_visible(False)
ax_deltas.grid(axis='y', linestyle='--', alpha=0.2, zorder=0)

# Zone labels: centred horizontally in each zone, at 80% of the upper y-limit.
y_label_position = ax_deltas.get_ylim()[1] * 0.8
ax_deltas.text((zone_left_edge + divider_position) / 2, y_label_position,
               'DIFFERENT CONFIG\n(Optimised for Gemini)',
               ha='center', fontweight='bold', color='#555555', fontsize=10)
ax_deltas.text((divider_position + zone_right_edge) / 2, y_label_position,
               'SAME CONFIG\n(Fair Comparison)',
               ha='center', fontweight='bold', color='#008c66', fontsize=10)

# BOTTOM CHART: Absolute scores
x_positions = np.arange(len(benchmark_names))
bar_width = 0.22

# One row per model: (x shift from the tick, scores, bar styling).
# The three bars of a benchmark sit side by side around its tick; GPT-4 gets
# no explicit alpha and so keeps matplotlib's default opacity.
model_series = (
    (-bar_width, scores_ultra, {'label': 'Gemini Ultra', 'color': COLOR_GEMINI, 'alpha': ALPHA_FULL}),
    (0, scores_pro, {'label': 'Gemini Pro', 'color': COLOR_GEMINI, 'alpha': ALPHA_FADED}),
    (bar_width, scores_gpt4, {'label': 'GPT-4', 'color': COLOR_GPT4}),
)
bars_ultra, bars_pro, bars_gpt4 = [
    ax_scores.bar(x_positions + series_shift, series_scores, bar_width, zorder=3, **series_style)
    for series_shift, series_scores, series_style in model_series
]

# Background tint per comparison zone, then the divider between the zones.
for zone_start, zone_stop, zone_color in (
    (-0.5, divider_position, COLOR_OPTIMIZED_ZONE),
    (divider_position, len(benchmark_names) - 0.5, COLOR_FAIR_ZONE),
):
    ax_scores.axvspan(zone_start, zone_stop, color=zone_color, alpha=ZONE_ALPHA)
ax_scores.axvline(x=divider_position, color='black', linestyle='-', linewidth=2.5, zorder=5)

# Add score labels
def add_score_labels(bars, scores, is_ultra_model):
    """Write each bar's score just above it on the bottom chart.

    The best score per benchmark (across Ultra, Pro and GPT-4) is set in bold.
    A Gemini Ultra label additionally gets a leading ``*`` when it is the best
    score on a benchmark flagged in ``is_optimized``.

    Args:
        bars: Bar artists of one model, in benchmark order.
        scores: That model's scores, parallel to ``bars``.
        is_ultra_model: True when ``bars`` belong to Gemini Ultra; only those
            labels are eligible for the asterisk.
    """
    for index, (rect, value) in enumerate(zip(bars, scores)):
        bar_top = rect.get_height()
        best_score = max(scores_ultra[index], scores_pro[index], scores_gpt4[index])
        # Compare with a tolerance rather than == because these are floats.
        wins = abs(bar_top - best_score) < 1e-7

        text = f"{value:.1f}"
        if wins and is_ultra_model and is_optimized[index]:
            # Winning Ultra score obtained under the optimized config.
            text = "*" + text

        weight = 'bold' if wins else 'normal'
        anchor = (rect.get_x() + rect.get_width() / 2, bar_top)
        ax_scores.annotate(
            text,
            xy=anchor,
            xytext=(0, 3),  # nudge the text 3 points above the bar top
            textcoords="offset points",
            ha='center', va='bottom',
            fontweight=weight,
            fontsize=9
        )

# Label every model's bars; only the Ultra series is eligible for the asterisk.
for model_bars, model_scores, is_ultra in (
    (bars_ultra, scores_ultra, True),
    (bars_pro, scores_pro, False),
    (bars_gpt4, scores_gpt4, False),
):
    add_score_labels(model_bars, model_scores, is_ultra)

# Bottom-chart axes: the y-range tops out at 115, above the tallest bar, which
# leaves room for the score labels drawn over the bars.
ax_scores.set_ylabel('Score (%)', fontweight='bold')
ax_scores.set_ylim(0, 115)
ax_scores.set_xticks(x_positions)
ax_scores.set_xticklabels(benchmark_names, fontweight='bold', rotation=0)
for spine_side in ('top', 'right', 'left'):
    ax_scores.spines[spine_side].set_visible(False)
ax_scores.grid(axis='y', linestyle='--', alpha=0.2, zorder=0)
# The legend is anchored below the axes.
ax_scores.legend(loc='lower center', bbox_to_anchor=(0.5, -0.3), ncol=3, frameon=False, fontsize=11)

# Save to disk before showing the figure.
plt.savefig('gemini_vs_gpt4_performance_audit.png', dpi=300, bbox_inches='tight')
plt.show()