# Token Histogram per Document

In [None]:
from utils.count_tokens import get_all_token_counts
import matplotlib.pyplot as plt
import numpy as np

# Token counts as returned by the project helper (one entry per file,
# judging by the axis labels below).
token_counts = get_all_token_counts()

# Histogram of the raw counts.
plt.figure(figsize=(12, 6))
plt.hist(token_counts, bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Token Count')
plt.ylabel('Number of Files')
plt.title('Distribution of Token Counts Across All Markdown Files')
plt.grid(True, alpha=0.3)

# Mark mean and median with dashed vertical lines; the legend carries the
# rounded values.
mean_tokens = np.mean(token_counts)
median_tokens = np.median(token_counts)
for stat_value, stat_color, stat_label in (
    (mean_tokens, 'red', f'Mean: {mean_tokens:,.0f}'),
    (median_tokens, 'green', f'Median: {median_tokens:,.0f}'),
):
    plt.axvline(stat_value, color=stat_color, linestyle='--', label=stat_label)
plt.legend()

plt.tight_layout()
plt.show()

# LLM Judge Validation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion-matrix counts, one nested dict per LLM judge. After the transpose
# in the plotting loop, the outer keys become heatmap rows and the inner keys
# become columns; the shared figure labels below call the rows "True Labels"
# and the columns "Predicted Labels".
data_A = {
    "Correct":   {"Correct": 1022, "Incorrect": 107, "No answer": 6},
    "Incorrect": {"Correct": 27,   "Incorrect": 467, "No answer": 34},
    "No answer": {"Correct": 25,   "Incorrect": 41,  "No answer": 671}
}

data_B = {
    "Correct":   {"Correct": 1061, "Incorrect": 67,  "No answer": 7},
    "Incorrect": {"Correct": 58,   "Incorrect": 417, "No answer": 53},
    "No answer": {"Correct": 22,   "Incorrect": 17,  "No answer": 698}
}

data_C = {
    "Correct":   {"Correct": 1076, "Incorrect": 54,  "No answer": 5},
    "Incorrect": {"Correct": 54,   "Incorrect": 401, "No answer": 73},
    "No answer": {"Correct": 29,   "Incorrect": 27,  "No answer": 681}
}

data_D = {
    "Correct":   {"Correct": 1013, "Incorrect": 118, "No answer": 3},
    "Incorrect": {"Correct": 32,   "Incorrect": 458, "No answer": 34},
    "No answer": {"Correct": 24,   "Incorrect": 117, "No answer": 596}
}

# Matrices and their panel titles, in plotting order.
matrices = [data_A, data_B, data_C, data_D]
titles = ["DeepSeek-R1", "GPT-5", "GPT-5-mini", "GPT-5-nano"]

# Apply the seaborn theme BEFORE creating the figure: set_theme() only changes
# matplotlib's global rcParams, and those are read when the figure and axes
# are created. Calling it afterwards leaves the existing axes unstyled.
sns.set_theme(style="white", font_scale=1.2)

# Create figure: a single row of four panels.
fig, axes = plt.subplots(1, 4, figsize=(21, 6))

# Plot each matrix
for ax, matrix, title in zip(axes.flat, matrices, titles):
    # DataFrame(dict-of-dicts) puts the outer keys on the columns; transpose
    # so that the outer keys end up on the rows.
    df = pd.DataFrame(matrix).T
    sns.heatmap(df,
                annot=True,
                fmt="d",
                cmap="Greens",
                linewidths=0.8,
                linecolor='white',
                cbar=False,
                annot_kws={"size": 12, "weight": "bold"},
                ax=ax)

    ax.set_title(title, fontsize=16, weight='bold')
    ax.set_xlabel("")  # remove individual axis titles
    ax.set_ylabel("")

# Add shared big labels for the whole figure
fig.text(0.04, 0.5, "True Labels", va='center', rotation='vertical',
         fontsize=20, weight='bold')
fig.text(0.5, 0.04, "Predicted Labels", ha='center',
         fontsize=20, weight='bold')
fig.suptitle("LLM Judgment vs. Financial Analyst Ground Truth", fontsize=22, weight='bold', y=0.97)
plt.tight_layout(rect=[0.06, 0.06, 1, 1])  # Leave space for big labels
plt.savefig("confusion_matrices_1x4_single_labels.pdf", format="pdf", bbox_inches='tight')
# NOTE(review): the EPS name still says "2x2" although the layout is 1x4. The
# name is kept unchanged because other material may reference this file;
# rename it together with those references.
plt.savefig("confusion_matrices_2x2_single_labels.eps", format="eps", bbox_inches='tight')
plt.show()

# FinanceBench

In [None]:
# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby

# Judgment categories, in the order their segments are stacked in each bar.
CATEGORIES = ["correct", "coherent", "deviated", "incorrect", "no_answer"]

# Fill colour per category, paired positionally with CATEGORIES:
# one green, two yellows, two reds.
COLORS = dict(zip(
    CATEGORIES,
    (
        "#4CAF50",  # correct: green
        "#FFF59D",  # coherent: pale yellow (lighter)
        "#FFB000",  # deviated: vivid amber (darker yellow)
        "#E74C3C",  # incorrect: red
        "#A93226",  # no_answer: dark red
    ),
))

def _to_float(x):
    try: return float(x)
    except: return 0.0

def _normalize_to_fractions(vals):
    s = sum(vals)
    if s == 0: return [0.0 for _ in vals]
    if 0.98 <= s <= 1.02: return vals
    if 98 <= s <= 102:    return [v/100 for v in vals]
    return [v/s for v in vals]

def plot_judgment_bars(
    data_dict,
    model_order=None,
    figsize=(15, 6),  # slightly wider to accommodate the gaps between groups
    edgecolor="black",
    show_values=False,
    value_fmt="{:.0%}",
    group_labels=False,
    group_gap=0.6,
    group_sizes=None,
    separators=None,
    vline_kwargs=None,
    method_display_names=None,
):
    """Draw one stacked bar per entry showing the share of each judgment category.

    The figure is saved to ``financebench_results.pdf`` and then shown.

    Args:
        data_dict: Mapping of bar key -> mapping of category name -> value.
            Missing categories count as 0. Values are passed through
            ``_to_float`` and each row is rescaled with
            ``_normalize_to_fractions``.
        model_order: Optional iterable of keys giving the bar order; keys
            not present in ``data_dict`` are dropped. Defaults to the
            insertion order of ``data_dict``.
        figsize: Figure size handed to ``plt.subplots``.
        edgecolor: Edge colour of the bar segments.
        show_values: If true, write the formatted fraction inside every
            segment taller than 0.035.
        value_fmt: ``str.format`` template used for those in-bar values.
        group_labels: If true, each key is split at its first newline into
            ``method`` and ``model``. Consecutive keys with the same method
            form a group, groups are separated by ``group_gap``, the tick
            label shows only the model, and the method is written centred
            below each group.
        group_gap: Edge-to-edge distance between the last bar of one group
            and the first bar of the next (only used with ``group_labels``).
        group_sizes: Optional list of group sizes; a dashed separator is
            drawn after every group except the last. Defaults to the
            automatically detected groups when ``group_labels`` is true.
        separators: Optional extra separator positions. ``int`` entries
            mean ``value + 0.5``; anything else is used as an x coordinate.
        vline_kwargs: Style overrides for the separator lines.
        method_display_names: Optional mapping from method name to the text
            shown below its group.

    Raises:
        ValueError: If there is no bar to plot.
    """
    # Imported here so this function does not depend on a file-level import.
    from matplotlib.ticker import PercentFormatter

    models_to_plot = (
        list(data_dict.keys())
        if model_order is None
        else [m for m in model_order if m in data_dict]
    )
    if not models_to_plot:
        raise ValueError("No models found to plot.")

    width = 0.8
    x_coords = np.arange(len(models_to_plot))  # default: one bar per integer
    minor_labels = models_to_plot
    major_labels_info = []

    if group_labels:
        # Bars inside a group sit 1.0 apart, which already leaves (1 - width)
        # between their edges. To obtain an edge-to-edge gap of `group_gap`
        # between groups, the next group starts this much further right.
        # (Previously a hard-coded 0.4, which matched only group_gap=0.6.)
        extra_offset = group_gap - (1 - width)

        parsed_models = []
        for key in models_to_plot:
            method, newline, model = key.partition('\n')
            if not newline:
                # No newline in the key: no method, the whole key is the model.
                method, model = "", key
            parsed_models.append({'key': key, 'method': method, 'model': model})

        temp_x = []
        minor_labels = []
        final_model_order = []
        auto_group_sizes = []
        current_x_start = 0

        # groupby only merges *consecutive* entries with the same method, so
        # the caller's ordering is preserved.
        for method, group in groupby(parsed_models, key=lambda item: item['method']):
            group_list = list(group)
            size = len(group_list)
            auto_group_sizes.append(size)

            group_x_positions = [current_x_start + i for i in range(size)]
            temp_x.extend(group_x_positions)

            for item in group_list:
                final_model_order.append(item['key'])
                minor_labels.append(item['model'])

            # The group label is centred under the group's bars.
            display_method = (method_display_names or {}).get(method, method)
            major_labels_info.append(
                {'text': display_method, 'pos': np.mean(group_x_positions)}
            )

            current_x_start += size + extra_offset

        x_coords = np.array(temp_x)
        models_to_plot = final_model_order
        if group_sizes is None:
            group_sizes = auto_group_sizes

    fractions = []
    for m in models_to_plot:
        vals = [_to_float(data_dict[m].get(cat, 0)) for cat in CATEGORIES]
        fractions.append(_normalize_to_fractions(vals))
    fractions = np.array(fractions)

    fig, ax = plt.subplots(figsize=figsize)

    # Legend text per category (built once, not on every loop iteration).
    label_map = {"correct": "Correct", "coherent": "Coherent", "deviated": "Deviated", "incorrect": "Incorrect", "no_answer": "No Answer"}

    bottom = np.zeros(len(models_to_plot))
    for idx, cat in enumerate(CATEGORIES):
        heights = fractions[:, idx]
        bars = ax.bar(x_coords, heights, width, bottom=bottom,
                      color=COLORS[cat], edgecolor=edgecolor, linewidth=1.0, alpha=0.8,
                      label=label_map.get(cat, cat.replace("_", " ").title()))
        if show_values:
            for rect, h in zip(bars, heights):
                if h > 0.035:  # skip segments too thin to hold a label
                    ax.text(rect.get_x() + rect.get_width() / 2, rect.get_y() + h / 2,
                            value_fmt.format(h), ha="center", va="center", fontsize=10, color="black")
        bottom += heights

    ax.set_ylim(0, 1.0)
    ax.set_ylabel("% of Evaluation Data", fontsize=12)
    # The data are fractions in [0, 1]; show the ticks as percentages so they
    # agree with the axis label and the in-bar values.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
    ax.set_xticks(x_coords)
    ax.set_xticklabels(minor_labels, fontsize=10)
    ax.grid(axis="y", linestyle=":", alpha=0.6)
    ax.set_xlim(-0.6, max(x_coords) + 0.6)
    ax.legend(loc="upper center", bbox_to_anchor=(0.5, 1.12), ncol=5, frameon=True)

    if group_labels:
        ax.tick_params(axis='x', which='major', length=0)
        # Slightly below the axis to avoid overlapping the tick labels.
        y_pos = -0.05
        for info in major_labels_info:
            # x in data coordinates, y in axes coordinates.
            ax.text(info['pos'], y_pos, info['text'],
                    ha='center', va='top', fontsize=11, fontweight="bold",
                    transform=ax.get_xaxis_transform())

    # ----- Vertical grey dashed lines -----
    style = dict(color="grey", linestyle="--", linewidth=1.5, alpha=0.8)
    if vline_kwargs:
        style.update(vline_kwargs)

    positions = []
    if group_sizes:
        # Put each separator halfway between the last bar of a group and the
        # first bar of the next one. This is correct for both gapped and
        # evenly spaced bars. Boundaries beyond the plotted bars are skipped.
        n_bars = len(x_coords)
        for boundary in np.cumsum(group_sizes[:-1]):
            boundary = int(boundary)
            if 0 < boundary < n_bars:
                positions.append((x_coords[boundary - 1] + x_coords[boundary]) / 2)

    if separators:  # less common with auto-gapping but retained for flexibility
        for s in separators:
            positions.append(float(s) + 0.5 if isinstance(s, int) else float(s))

    for xpos in positions:
        ax.axvline(xpos, **style)

    plt.tight_layout(rect=[0, 0.05, 1, 1])  # leave room at the bottom for group labels
    plt.savefig("financebench_results.pdf", format="pdf", bbox_inches="tight")
    plt.show()

# -------- Example usage with method display mapping --------
# Keys have the form "<method>\n<model>"; each row lists the values in the
# column order given by _FIELDS.
_FIELDS = ("correct", "coherent", "deviated", "incorrect", "no_answer")
example_data = {
    label: dict(zip(_FIELDS, values))
    for label, values in (
        ("finMapReduce\ngpt-4o-mini", (63, 5, 18, 13, 1)),
        ("finMapReduce\ngpt-5-nano", (74, 5, 9, 9, 3)),
        ("finMapReduce\ngpt-5-mini", (81, 2, 9, 5, 3)),
        ("finMapReduce\ngpt-5", (79, 2, 13, 6, 0)),
        ("Base MapReduce\ngpt-4o-mini", (57, 8, 10, 16, 9)),
        ("Tail Truncation\ngpt-4o-mini", (64, 3, 19, 11, 3)),
        ("Tail Truncation\ngpt-5-nano", (74, 2, 17, 6, 1)),
        ("Tail Truncation\ngpt-5-mini", (86, 1, 9, 3, 1)),
        ("Head Truncation\ngpt-4o-mini", (53, 6, 18, 17, 6)),
        ("Long-Context\ngpt-4.1-nano", (65, 6, 15, 11, 3)),
        ("Long-Context\ngpt-4.1-mini", (75, 5, 12, 8, 0)),
    )
}

# Text shown below each group instead of the raw method name.
method_names = dict([
    ("finMapReduce", "Ours"),
    ("Base MapReduce", "Baseline\nMapReduce"),
    ("Tail Truncation", "Tail Truncation"),
    ("Head Truncation", "Head Truncation"),
    ("Long-Context", "Long-Context"),
])

# Grouped, gapped bars with in-bar percentages and custom group captions.
plot_judgment_bars(
    example_data,
    show_values=True,
    group_labels=True,
    group_gap=0.6,
    method_display_names=method_names,
)

# FinQA

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Judgment categories in stacking order, and one fill colour for each
# (paired positionally): 1 green, 2 yellows, 2 reds.
CATEGORIES = ["correct", "coherent", "deviated", "incorrect", "no_answer"]
COLORS = dict(zip(
    CATEGORIES,
    (
        "#4CAF50",  # correct: green
        "#FFF59D",  # coherent: pale yellow (lighter)
        "#FFB000",  # deviated: vivid amber (darker yellow)
        "#E74C3C",  # incorrect: red
        "#A93226",  # no_answer: dark red
    ),
))

def normalize(vals):
    """Rescale *vals* to fractions of a whole, guessing the input scale.

    A zero sum yields a list of ``0``; a sum of roughly 1 (0.98-1.02) means
    the values are already fractions and *vals* is returned unchanged; a sum
    of roughly 100 (98-102) means percentages, divided by 100; any other sum
    means counts, divided by that sum.
    """
    total = sum(vals)
    if total == 0:
        return [0] * len(vals)
    if 0.98 <= total <= 1.02:
        return vals
    # Percentages use a fixed divisor; raw counts are divided by their sum.
    divisor = 100 if 98 <= total <= 102 else total
    return [v / divisor for v in vals]

# Imported here so this cell does not depend on a file-level import.
from matplotlib.ticker import PercentFormatter

# Example data for two bars. Each row may hold counts, percentages or
# fractions; normalize() guesses the scale from the row sum.
data = {
    "finMapReduce": {
        "correct": 68,
        "coherent": 0,
        "deviated": 12,
        "incorrect": 61,
        "no_answer": 9
    },
    "FinGEAR": {
        "correct": 50,
        "coherent": 0,
        "deviated": 0,
        "incorrect": 50,
        "no_answer": 0
    }
}

models = list(data.keys())
# One row per model, one column per category, each row rescaled to fractions.
fractions = np.array([normalize([data[m].get(c, 0) for c in CATEGORIES]) for m in models])
y = np.arange(len(models))

fig, ax = plt.subplots(figsize=(9, 3.8))
left = np.zeros(len(models))  # running left edge of the next segment

for i, cat in enumerate(CATEGORIES):
    vals = fractions[:, i]
    ax.barh(y, vals, left=left, color=COLORS[cat], edgecolor="black", linewidth=1.0, alpha=0.8,
            height=0.6, label=cat.replace("_", " ").title())
    left += vals

ax.set_xlim(0, 1.0)
ax.set_xlabel("% of Evaluation Data")
# The data are fractions in [0, 1]; show the ticks as percentages so they
# agree with the axis label.
ax.xaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.set_yticks(y, labels=models)
ax.grid(axis="x", linestyle="--", alpha=0.25)
ax.legend(loc="upper center", bbox_to_anchor=(0.5, 1.25), ncol=3, frameon=True)
plt.tight_layout()
plt.savefig("finqa_baselines.pdf", format="pdf", bbox_inches="tight")
plt.show()

# Hyperparameters

In [None]:
# %matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Judgment categories, in the order their segments are stacked in each bar.
CATEGORIES = ["correct", "coherent", "deviated", "incorrect", "no_answer"]

# Fill colour per category, paired positionally with CATEGORIES.
COLORS = dict(zip(
    CATEGORIES,
    (
        "#4CAF50",  # correct: green
        "#FFF59D",  # coherent: pale yellow (lighter)
        "#FFB000",  # deviated: vivid amber (darker yellow)
        "#E74C3C",  # incorrect: red
        "#A93226",  # no_answer: dark red
    ),
))

def _to_float(x):
    try: return float(x)
    except: return 0.0

def _normalize_to_fractions(vals):
    s = sum(vals)
    if s == 0: return [0.0 for _ in vals]
    if 0.98 <= s <= 1.02: return vals           # already fractions
    if 98 <= s <= 102:    return [v/100 for v in vals]  # percentages
    return [v/s for v in vals]                  # counts -> fractions

def plot_judgment_bars(
    data_dict,
    model_order=None,
    figsize=(12, 5),
    edgecolor="black",
    show_values=False,
    value_fmt="{:.0%}",
    group_sizes=None,      # e.g., [2,3,2] -> lines after 2nd and 5th bars
    separators=None,       # e.g., [1.5, 4.5] -> exact x positions between bars
    vline_kwargs=None      # style overrides for vertical lines
):
    """Draw one stacked bar per entry showing the share of each judgment category.

    Args:
        data_dict: Mapping of bar label -> mapping of category name -> value.
            Missing categories count as 0. Values are passed through
            ``_to_float`` and each row is rescaled with
            ``_normalize_to_fractions``.
        model_order: Optional iterable of labels giving the bar order;
            labels not present in ``data_dict`` are dropped. Defaults to the
            insertion order of ``data_dict``.
        figsize: Figure size handed to ``plt.subplots``.
        edgecolor: Edge colour of the bar segments.
        show_values: If true, write the formatted fraction inside every
            segment taller than 0.03.
        value_fmt: ``str.format`` template used for those in-bar values.
        group_sizes: Sizes of consecutive bar groups; a dashed line is drawn
            after every group except the last.
        separators: Extra separator positions. ``int`` entries mean "after
            bar i" (drawn at ``i + 0.5``); anything else is used as an
            absolute x coordinate.
        vline_kwargs: Style overrides for the separator lines.

    Raises:
        ValueError: If there is no bar to plot.
    """
    # Imported here so this function does not depend on a file-level import.
    from matplotlib.ticker import PercentFormatter

    models = list(data_dict.keys()) if model_order is None else [m for m in model_order if m in data_dict]
    if not models:
        raise ValueError("No models found to plot.")

    # One row per bar, one column per category, each row rescaled to fractions.
    fractions = np.array([
        _normalize_to_fractions([_to_float(data_dict[m].get(cat, 0)) for cat in CATEGORIES])
        for m in models
    ])

    x = np.arange(len(models))  # bars are centred on consecutive integers
    width = 0.8
    fig, ax = plt.subplots(figsize=figsize)

    bottom = np.zeros(len(models))  # running top of the stack for every bar
    for idx, cat in enumerate(CATEGORIES):
        heights = fractions[:, idx]
        bars = ax.bar(x, heights, width, bottom=bottom,
                      color=COLORS[cat], edgecolor=edgecolor, linewidth=1.0, alpha=0.8,
                      label=cat.replace("_", " ").title())
        if show_values:
            for rect, h in zip(bars, heights):
                if h > 0.03:  # skip segments too thin to hold a label
                    ax.text(rect.get_x()+rect.get_width()/2, rect.get_y()+h/2,
                            value_fmt.format(h), ha="center", va="center", fontsize=9)
        bottom += heights

    ax.set_ylim(0, 1.0)
    ax.set_ylabel("% of Evaluation Data")
    # The data are fractions in [0, 1]; show the ticks as percentages so they
    # agree with the axis label and the in-bar values.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
    ax.set_xticks(x, models)
    ax.grid(axis="y", linestyle="--", alpha=0.25)
    ax.legend(loc="upper center", bbox_to_anchor=(0.5, 1.15), ncol=5, frameon=True)

    # ----- Vertical grey dashed lines -----
    style = dict(color="grey", linestyle="--", linewidth=1.5, alpha=0.8)
    if vline_kwargs:
        style.update(vline_kwargs)

    positions = []
    if group_sizes:
        # Bars are centred on integers, so the boundary after the first c
        # bars lies at c - 0.5. The last size is excluded: no line is drawn
        # after the final group.
        cum = np.cumsum(group_sizes[:-1])
        positions.extend([c - 0.5 for c in cum])
    if separators:
        # ints mean "after bar i" (i + 0.5); other values are absolute x
        for s in separators:
            if isinstance(s, int):
                positions.append(s + 0.5)
            else:
                positions.append(float(s))

    for xpos in positions:
        ax.axvline(xpos, **style)

    plt.tight_layout()
    # plt.savefig("chunk_size.pdf", format="pdf", bbox_inches="tight")
    plt.show()

# -------- Example usage --------
# Each key is a two-line tick label.
# NOTE(review): presumably "<chunk size>\n<overlap>", going by the
# commented-out "chunk_size.pdf" file name; confirm.
example_data = {
    "2048\n128": {
        "correct": 78,
        "coherent": 8,
        "deviated": 22,
        "incorrect": 27,
        "no_answer": 15
    },
    "8192\n512": {
        "correct": 78,
        "coherent": 12,
        "deviated": 20,
        "incorrect": 36,
        "no_answer": 4
    },
    "32768\n4096": {
        "correct": 94,
        "coherent": 7,
        "deviated": 27,
        "incorrect": 19,
        "no_answer": 3
    },
    "32768\n0": {
        "correct": 95,
        "coherent": 14,
        "deviated": 19,
        "incorrect": 18,
        "no_answer": 4
    },
    "65536\n4096": {
        "correct": 88,
        "coherent": 12,
        "deviated": 20,
        "incorrect": 27,
        "no_answer": 3
    },
    "120k\n4096": {
        "correct": 92,
        "coherent": 7,
        "deviated": 25,
        "incorrect": 20,
        "no_answer": 6
    },
}

# Option A: group sizes -> lines after the 2nd and 4th bars (2 + 2 + 2).
# The sizes now add up to the six bars above. The last entry draws no line,
# so the figure is the same as with the previous [2, 2, 1].
plot_judgment_bars(example_data, group_sizes=[2, 2, 2], show_values=True)

# Option B: explicit x positions -> lines between bars 1|2 and 4|5
# (0-based bar indices)
# plot_judgment_bars(example_data, separators=[1.5, 4.5], show_values=True)

# Performance per Question Type

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Accuracy per question type for three model configurations.
model_A = {
    "metrics-generated": {"accuracy": 0.78},
    "domain-relevant": {"accuracy": 0.52},
    "novel-generated": {"accuracy": 0.58}
}

model_B = {
    "metrics-generated": {"accuracy": 0.84},
    "domain-relevant": {"accuracy": 0.52},
    "novel-generated": {"accuracy": 0.60}
}

model_C = {
    "metrics-generated": {"accuracy": 0.84},
    "domain-relevant": {"accuracy": 0.70},
    "novel-generated": {"accuracy": 0.70}
}

# Legend label -> per-question-type results, in plotting order.
models = {
    "MapReduce gpt-4o-mini": model_A,
    "Long-Context gpt-4.1-nano": model_B,
    "Long-Context gpt-4.1-mini": model_C
}

# Categories (question types), in x-axis order.
categories = ["metrics-generated", "domain-relevant", "novel-generated"]

# One list of accuracies per model, aligned with `categories`.
accuracies = {
    model_name: [per_type[cat]["accuracy"] for cat in categories]
    for model_name, per_type in models.items()
}

# Bar plot settings
x = np.arange(len(categories))  # left-most bar position of every category
width = 0.25  # bar width

# Use Seaborn color palette
colors = sns.color_palette("Set2", n_colors=len(models))

# This style name was introduced in matplotlib 3.6 (older versions: "seaborn-muted").
plt.style.use("seaborn-v0_8-muted")
fig, ax = plt.subplots(figsize=(9,6))

# Plot each model: bars of the i-th model are shifted right by i bar widths.
for i, (model_name, accs) in enumerate(accuracies.items()):
    bars = ax.bar(x + i*width, accs, width, label=model_name, color=colors[i], alpha=0.85)

    # Write the value just above every bar.
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),  # offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=9, color="black")

# Formatting
ax.set_ylabel("Accuracy", fontsize=12)
ax.set_ylim(0, 1.05)
ax.set_title("Model Accuracy by Question Type on FinanceBench", fontsize=14, weight="bold")
# Centre each tick under its cluster of bars. The previous `x + width` was
# only the centre for exactly three models; this form works for any number
# and gives the same positions for three.
ax.set_xticks(x + width * (len(accuracies) - 1) / 2)
ax.set_xticklabels(categories, fontsize=11)
ax.legend(title="Models", fontsize=10)

sns.despine()  # cleaner axes

plt.tight_layout()
plt.savefig("question_type.pdf", format="pdf", bbox_inches="tight")
plt.show()
