In [None]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import altair as alt
import pickle as pkl
import matplotlib

In [None]:
# Load the pickled MIND articles object from articles_mind.pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open("../data/mind/articles_mind.pickle", "rb") as articles_file:
    articles = pkl.load(articles_file)

In [None]:
# Load the pickled evaluation results for the k@10 run.
# NOTE(review): `results` is used with .map below, so it is presumably a
# pandas DataFrame whose cells hold per-sample score sequences; confirm.
with open("../results/mind_results_k@10.pkl", "rb") as results_file:
    results = pkl.load(results_file)

# Number of entries stored for NAML under the topic calibration metric.
print(len(results['topic_calibration']['naml']))

# Reduce every cell to its mean; anything that is not a non-empty list/array
# becomes NaN.
results_mean = results.map(
    lambda cell: np.nan
    if not (isinstance(cell, (list, np.ndarray)) and len(cell) > 0)
    else np.mean(cell)
)

display(results_mean)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Legend labels keyed by the recommender ids that appear in the CSVs.
legend_name_map = {
    'pop': 'Popularity',
    'random': 'Random',
    'npa': 'NPA',
    'nrms': 'NRMS',
    'lstur': 'LSTUR',
    'naml': 'NAML',
}
# Plotting (and colour assignment) order of the recommenders.
ordered_recommenders = ['lstur', 'naml', 'nrms', 'npa', 'pop', 'random']
# One subplot per entry: axis label and the trade-off CSV to read.
metric_specs = [
    {'label': 'Topic Calibration', 'x_label': 'Topic Calibration', 'path': Path('../results/mind_topic_tradeoff_k@10.csv')},
    {'label': 'Subtopic Calibration', 'x_label': 'Subtopic Calibration', 'path': Path('../results/mind_subtopic_tradeoff_k@10.csv')},
    {'label': 'Complexity Calibration', 'x_label': 'Complexity Calibration', 'path': Path('../results/mind_complexity_tradeoff_k@10.csv')},
    {'label': 'Fragmentation', 'x_label': 'Fragmentation', 'path': Path('../results/mind_fragmentation_tradeoff_k@10.csv')},
    {'label': 'Activation', 'x_label': 'Activation', 'path': Path('../results/mind_activation_tradeoff_k@10.csv')},
]

palette = matplotlib.colormaps['tab10']
color_map = dict(zip(ordered_recommenders, map(palette, range(len(ordered_recommenders)))))

fig, axes = plt.subplots(2, 3, figsize=(18, 12), sharey=True)
axes = axes.flatten()
legend_handles = {}

# Points whose (divergence, ndcg) round to the same jitter_margin-sized bucket
# are treated as overlapping; each later one is pushed right by a further
# jitter_step.
jitter_step = 2.5e-3
jitter_margin = 1e-2

for ax, spec in zip(axes, metric_specs):
    tradeoff_df = (
        pd.read_csv(spec['path'])
        .dropna(subset=['ndcg', 'divergence'])
        .sort_values(['recommender', 'lambda'])
    )
    plot_df = tradeoff_df.loc[tradeoff_df['recommender'] != 'incorrect_random']

    for rec in ordered_recommenders:
        group = plot_df[plot_df['recommender'] == rec]
        if group.empty:
            continue
        display_name = legend_name_map.get(rec, rec)

        # Shift near-coincident points to the right so every lambda stays visible.
        shifted_divergence = []
        bucket_hits = {}
        for div_value, ndcg_value in zip(group['divergence'], group['ndcg']):
            bucket = (
                round(div_value / jitter_margin),
                round(ndcg_value / jitter_margin),
            )
            hits = bucket_hits.get(bucket, 0)
            shifted_divergence.append(div_value + jitter_step * hits if hits else div_value)
            bucket_hits[bucket] = hits + 1
        jittered = group.assign(divergence=shifted_divergence)

        line, = ax.plot(
            jittered['divergence'],
            jittered['ndcg'],
            color=color_map[rec],
            label=display_name,
            marker='o',
            markersize=8,
            linewidth=3.5,
        )
        legend_handles[display_name] = line

    ax.set_xlabel(spec['x_label'], fontsize=18)
    ax.set_ylabel('NDCG@10', fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.grid(True, linestyle='--', alpha=0.5)

# Hide the grid slots that have no metric assigned.
for ax in axes[len(metric_specs):]:
    ax.axis('off')

# Single shared legend underneath the grid.
fig.legend(
    list(legend_handles.values()),
    list(legend_handles.keys()),
    title='Recommender',
    title_fontsize=16,
    fontsize=16,
    ncol=3,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.04),
)
plt.tight_layout(rect=[0, 0.08, 1, 1])
plt.subplots_adjust(bottom=0.12)
plt.show()




In [None]:
# Use Altair's default data transformer (rather than vegafusion) to avoid
# memory issues, with a 10000-row limit.
alt.data_transformers.enable('default', max_rows=10000)

metrics = [
    'topic_calibration',
    'subtopic_calibration',
    'complexity_calibration',
    'activation',
    'fragmentation',
    'representation',
    'alternative_voices',
]
# Human-readable chart titles for each metric key.
metric_titles = {
    'topic_calibration': 'Topic Calibration Score',
    'subtopic_calibration': 'Subtopic Calibration Score',
    'complexity_calibration': 'Complexity Calibration Score',
    'activation': 'Activation Score',
    'fragmentation': 'Fragmentation Score',
    'representation': 'Representation Score',
    'alternative_voices': 'Alternative Voice Score',
}

# Custom colours, in order: blue, orange, red
custom_colors = ['#1f77b4', '#ff7f0e', '#d62728']

# Long-form records: every value (for the density plots) and up to five
# sampled values per metric/method (for the marker overlay).
all_data = []
all_samples = []

method_pairs = (
    ('naml', 'NAML'),
    ('random', 'Random'),
    ('incorrect_random', 'Original Random'),
)

for metric in metrics:
    metric_label = metric_titles[metric]
    for method, method_name in method_pairs:
        values = results[metric][method]
        all_data.extend(
            {'value': v, 'type': method_name, 'metric': metric_label}
            for v in values
        )

        # Re-seed before every draw so each metric/method pair starts from the
        # same random state.
        # NOTE(review): presumably so a given sample_id picks the same position
        # across methods when the value lists have equal length; confirm.
        np.random.seed(1)
        # Draw five values without replacement, or keep everything if fewer exist.
        sample_values = (
            np.random.choice(values, 5, replace=False) if len(values) >= 5 else values
        )
        all_samples.extend(
            {'value': v, 'type': method_name, 'metric': metric_label, 'sample_id': i}
            for i, v in enumerate(sample_values)
        )

df_all = pd.DataFrame(all_data)
df_samples = pd.DataFrame(all_samples)

# --- Layer before faceting ---
# Build one layered chart (density areas + sampled points) per metric.

# Fixed method order shared by the colour scales and the point rows.
method_order = ['NAML', 'Random', 'Original Random']

# Upper bound on the rows handed to the density transform. It equals the
# max_rows limit configured on the Altair data transformer at the top of this
# cell. Using one constant for both the threshold and the sample size matters:
# sampling more rows than the frame holds raises ValueError in pandas.
max_density_rows = 10000

# Create a list to hold layered charts for each metric
layered_charts = []

for metric in metrics:
    metric_title = metric_titles[metric]
    # Filter data for this metric
    df_metric = df_all[df_all['metric'] == metric_title]
    df_samples_metric = df_samples[df_samples['metric'] == metric_title]

    # Down-sample (reproducibly) only when the frame exceeds the cap; smaller
    # frames are used in full.
    if len(df_metric) > max_density_rows:
        df_metric_density = df_metric.sample(max_density_rows, random_state=42)
    else:
        df_metric_density = df_metric

    # Density extent and bandwidth are derived from the observed value range;
    # a constant column falls back to a fixed bandwidth of 0.01.
    value_min = df_metric_density['value'].min()
    value_max = df_metric_density['value'].max()
    bandwidth = (value_max - value_min) / 20 if value_max > value_min else 0.01

    # Density chart for this metric (one semi-transparent area per method)
    density = alt.Chart(df_metric_density).transform_density(
        'value',
        as_=['value', 'density'],
        groupby=['type'],
        extent=[value_min, value_max],
        bandwidth=bandwidth
    ).mark_area(opacity=0.3).encode(
        # NOTE(review): title is set on X and then set to None on the axis;
        # kept as-is to leave the rendered chart unchanged.
        x=alt.X('value:Q', title=metric_title, axis=alt.Axis(title=None)),
        y=alt.Y('density:Q', title='Probability Density', stack=None),
        color=alt.Color(
            'type:N',
            title='Method',
            scale=alt.Scale(
                domain=method_order,
                range=custom_colors
            ),
            legend=alt.Legend(
                titleFontSize=16,
                labelFontSize=14,
                symbolSize=150,
                padding=10,
                orient='right',
                direction='vertical',
                legendX=20,
                legendY=20,
                fillColor='white',
                symbolOpacity=1,
            )
        )
    )

    # Sample points for this metric: one row per method, one shape per sample_id
    sample_points = alt.Chart(df_samples_metric).mark_point(
        filled=True,
        size=100,
        opacity=1
    ).encode(
        x=alt.X('value:Q', title=None),
        y=alt.Y('type:N',
                title=None,
                axis=None,
                sort=method_order,
                scale=alt.Scale(padding=40)
               ),
        shape=alt.Shape(
            'sample_id:O',
            scale=alt.Scale(domain=list(range(5)), range=['circle','square','triangle','diamond','cross']),
            legend=None
        ),
        color=alt.Color(
            'type:N',
            legend=None,
            scale=alt.Scale(
                domain=method_order,
                range=custom_colors
            )
        )
    )

    # Layer density and sample points for this metric
    layered = (density + sample_points).resolve_scale(
        y='independent', x='independent', color='shared'
    ).properties(
        width=300, height=200, title=metric_title
    )
    layered_charts.append(layered)

# Arrange the per-metric charts in a two-column grid: pair neighbours
# side by side and let a trailing odd chart occupy its own row.
grid_rows = []
for left in range(0, len(layered_charts), 2):
    right = left + 1
    if right < len(layered_charts):
        grid_rows.append(layered_charts[left] | layered_charts[right])
    else:
        grid_rows.append(layered_charts[left])

chart = (
    alt.vconcat(*grid_rows)
    .configure_title(fontSize=20)
    .configure_axis(titleFontSize=16, labelFontSize=14)
)

# Export as PNG; the modest scale factor keeps memory use down.
chart.save('../results/mind_all_metrics_distributions.png', scale_factor=1.5)

In [None]:
metrics = ['topic_calibration', 'subtopic_calibration', 'complexity_calibration', 'activation', 'tf_idf_ild', 'sentbert_ild', 'gini', 'ndcg_values']

# Perform significance testing between two methods with an independent
# two-sample t-test per metric. The pair compared is NRMS vs. Random; to test a
# different pair, change the two keys below.
compared_method = 'nrms'
baseline_method = 'random'
significance_level = 0.01

for metric in metrics:
    compared_scores = results[metric][compared_method]
    baseline_scores = results[metric][baseline_method]
    t, p = ttest_ind(compared_scores, baseline_scores)
    print(f'{metric} t: {t}, p: {p}')
    # A NaN p-value compares False here and is reported as "not significant".
    if p < significance_level:
        print(f'{metric} is significant')
    else:
        print(f'{metric} is not significant')
    print()


In [None]:
# This code plots the mean of all metrics for each method as the number of samples increases.
# For every metric and method, the values are split into consecutive windows
# (one per "seed"); within each window the running mean over the first n values
# is computed, and the running means are then averaged across windows.

metrics = ['topic_calibration', 'subtopic_calibration', 'complexity_calibration', 'activation',
          'tf_idf_ild', 'sentbert_ild', 'gini', 'ndcg_values']

# (key in `results`, legend label), in the order the frames are concatenated.
methods = [
    ('random', 'Random'),
    ('nrms', 'NRMS'),
    ('naml', 'NAML'),
    ('lstur', 'LSTUR'),
]
n_seeds = 10
samples_per_seed = 200
# NOTE(review): this reads the first n_seeds * samples_per_seed values of every
# method; shorter inputs would yield truncated or empty windows. Confirm.

for metric_to_plot in metrics:
    # Window sizes 1..samples_per_seed; these are also the x-axis values.
    x = np.arange(1, samples_per_seed + 1)

    method_frames = []
    for method_key, method_label in methods:
        values = results[metric_to_plot][method_key]

        # running_means[seed, i] = mean of the first x[i] values of that seed's window
        running_means = np.array([
            [
                np.mean(values[seed * samples_per_seed:seed * samples_per_seed + n])
                for n in x
            ]
            for seed in range(n_seeds)
        ])

        method_frames.append(pd.DataFrame({
            # Use the real sample counts (1..N); a 0-based range would shift
            # the "Number of samples" axis by one.
            'x': x,
            'mean': np.mean(running_means, axis=0),
            # Half-width of the band: two standard errors across seeds.
            'std': 2 * np.std(running_means, axis=0) / np.sqrt(running_means.shape[0]),
            # Column is named 'metric' but holds the method label used for colour.
            'metric': method_label,
        }))

    # Combine all DataFrames into one
    df = pd.concat(method_frames, ignore_index=True)

    df['upper'] = df['mean'] + df['std']
    df['lower'] = df['mean'] - df['std']

    # Pad the y-domain by 10% of the band's overall span on both sides.
    band_span = df['upper'].max() - df['lower'].min()
    y_domain = [df['lower'].min() - 0.1 * band_span, df['upper'].max() + 0.1 * band_span]

    # Create the base line chart
    base = alt.Chart(df).mark_line().encode(
        x=alt.X('x', title='Number of samples', axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
        y=alt.Y('mean', title=metric_to_plot.replace('_', ' ').title(),
                scale=alt.Scale(domain=y_domain),
                axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
        color=alt.Color('metric:N', scale=alt.Scale(scheme='category10'),
        legend=alt.Legend(
                title='Method',
                orient='right',
                labelFontSize=14,
                titleFontSize=16,
                padding=10,
                symbolOpacity=1,
                symbolFillColor='transparent',
                direction='vertical',
            )),
    ).properties(
        width=500,
        height=200,
    )

    # Create error bands
    error_bands = alt.Chart(df).mark_area(opacity=0.2).encode(
        x='x',
        y='lower:Q',
        y2='upper:Q',
        color='metric:N'
    )

    # Combine and save
    final_chart = (base + error_bands).properties(
        width=500,
        height=200,
    )

    # Save at high resolution with metric-specific filename
    filename = f'../results/mind_converging_{metric_to_plot}.png'
    final_chart.save(filename, scale_factor=3.0)
    display(final_chart)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Annotated NDCG-vs-metric trade-off curves, one panel per available CSV.
# The directory matches the '../results' location used by the other cells.
results_dir = Path('../results')
metric_files = {
    'Topic': results_dir / 'mind_topic_tradeoff_k@10.csv',
    'Subtopic': results_dir / 'mind_subtopic_tradeoff_k@10.csv',
    'Activation': results_dir / 'mind_activation_tradeoff_k@10.csv',
}
# Only plot the metrics whose CSV is actually present.
available = [(label, path) for label, path in metric_files.items() if path.exists()]
if not available:
    raise FileNotFoundError(f'No tradeoff CSVs found in {results_dir}/.')

fig, axes = plt.subplots(1, len(available), figsize=(8 * len(available), 5), squeeze=False)
for ax, (label, path) in zip(axes.flat, available):
    tradeoff_df = pd.read_csv(path)
    # The other cells read these files with a 'divergence' column; fall back
    # to 'calibration' for CSVs that still use the older column name.
    x_col = 'divergence' if 'divergence' in tradeoff_df.columns else 'calibration'
    tradeoff_df = tradeoff_df.dropna(subset=['ndcg', x_col])
    tradeoff_df = tradeoff_df.sort_values(['recommender', 'lambda'])

    for recommender, group in tradeoff_df.groupby('recommender'):
        ax.plot(
            group[x_col],
            group['ndcg'],
            marker='o',
            label=recommender,
        )
        # Label every point with the lambda that produced it.
        for _, row in group.iterrows():
            ax.annotate(
                f"λ={row['lambda']:.2f}",
                (row[x_col], row['ndcg']),
                textcoords="offset points",
                xytext=(4, 4),
                fontsize=8,
            )

    ax.set_xlabel(f'{label} Calibration Divergence (JSD, higher is better)')
    ax.set_ylabel('NDCG')
    ax.set_title(f'NDCG vs {label} Calibration Tradeoff (Top-10 Reranking)')
    ax.grid(True, linestyle='--', alpha=0.4)
    ax.legend(title='Recommender', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Production reranker trade-off curve for the topic metric.
tradeoff_path = Path('../results/mind_topic_production_tradeoff_k@10.csv')
if not tradeoff_path.exists():
    print(f"Production reranker results not found at {tradeoff_path}")
    print("Run: uv run python evaluate_production_reranker.py")
else:
    # Keep complete rows only and order the points by lambda so the line
    # follows the sweep.
    tradeoff_df = (
        pd.read_csv(tradeoff_path)
        .dropna(subset=['ndcg', 'divergence'])
        .sort_values(['lambda'])
    )

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.plot(
        tradeoff_df['divergence'],
        tradeoff_df['ndcg'],
        color='#2ca02c',
        label='Production Reranker (NRMS)',
        linestyle='-',
        linewidth=3,
        marker='o',
        markersize=12,
    )

    # Tag each point with its lambda value.
    for lam, div_value, ndcg_value in zip(
        tradeoff_df['lambda'], tradeoff_df['divergence'], tradeoff_df['ndcg']
    ):
        ax.annotate(
            f"λ={lam:.2f}",
            (div_value, ndcg_value),
            textcoords="offset points",
            xytext=(8, 8),
            fontsize=14,
            fontweight='bold',
        )

    ax.set_xlabel('Topic Calibration Divergence', fontsize=20)
    ax.set_ylabel('NDCG@10 (GT)', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend(fontsize=18, loc='upper right')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Production reranker trade-off curve for the subtopic metric.
tradeoff_path = Path('../results/mind_subtopic_production_tradeoff_k@10.csv')
if not tradeoff_path.exists():
    print(f"Production reranker results not found at {tradeoff_path}")
else:
    # Keep complete rows only and order the points by lambda so the line
    # follows the sweep.
    tradeoff_df = (
        pd.read_csv(tradeoff_path)
        .dropna(subset=['ndcg', 'divergence'])
        .sort_values(['lambda'])
    )

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.plot(
        tradeoff_df['divergence'],
        tradeoff_df['ndcg'],
        color='#2ca02c',
        label='Production Reranker (NRMS)',
        linestyle='-',
        linewidth=3,
        marker='o',
        markersize=12,
    )

    # Tag each point with its lambda value.
    for lam, div_value, ndcg_value in zip(
        tradeoff_df['lambda'], tradeoff_df['divergence'], tradeoff_df['ndcg']
    ):
        ax.annotate(
            f"λ={lam:.2f}",
            (div_value, ndcg_value),
            textcoords="offset points",
            xytext=(8, 8),
            fontsize=14,
            fontweight='bold',
        )

    ax.set_xlabel('Subtopic Calibration Divergence', fontsize=20)
    ax.set_ylabel('NDCG@10 (GT)', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend(fontsize=18, loc='upper right')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Production reranker trade-off curve for the activation metric.
tradeoff_path = Path('../results/mind_activation_production_tradeoff_k@10.csv')
if not tradeoff_path.exists():
    print(f"Production reranker results not found at {tradeoff_path}")
else:
    # Keep complete rows only and order the points by lambda so the line
    # follows the sweep.
    tradeoff_df = (
        pd.read_csv(tradeoff_path)
        .dropna(subset=['ndcg', 'divergence'])
        .sort_values(['lambda'])
    )

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.plot(
        tradeoff_df['divergence'],
        tradeoff_df['ndcg'],
        color='#2ca02c',
        label='Production Reranker (NRMS)',
        linestyle='-',
        linewidth=3,
        marker='o',
        markersize=12,
    )

    # Tag each point with its lambda value.
    for lam, div_value, ndcg_value in zip(
        tradeoff_df['lambda'], tradeoff_df['divergence'], tradeoff_df['ndcg']
    ):
        ax.annotate(
            f"λ={lam:.2f}",
            (div_value, ndcg_value),
            textcoords="offset points",
            xytext=(8, 8),
            fontsize=14,
            fontweight='bold',
        )

    ax.set_xlabel('Activation Divergence', fontsize=20)
    ax.set_ylabel('NDCG@10 (GT)', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend(fontsize=18, loc='upper right')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Production reranker trade-off curve for the complexity metric.
tradeoff_path = Path('../results/mind_complexity_production_tradeoff_k@10.csv')
if not tradeoff_path.exists():
    print(f"Production reranker results not found at {tradeoff_path}")
else:
    # Keep complete rows only and order the points by lambda so the line
    # follows the sweep.
    tradeoff_df = (
        pd.read_csv(tradeoff_path)
        .dropna(subset=['ndcg', 'divergence'])
        .sort_values(['lambda'])
    )

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.plot(
        tradeoff_df['divergence'],
        tradeoff_df['ndcg'],
        color='#2ca02c',
        label='Production Reranker (NRMS)',
        linestyle='-',
        linewidth=3,
        marker='o',
        markersize=12,
    )

    # Tag each point with its lambda value.
    for lam, div_value, ndcg_value in zip(
        tradeoff_df['lambda'], tradeoff_df['divergence'], tradeoff_df['ndcg']
    ):
        ax.annotate(
            f"λ={lam:.2f}",
            (div_value, ndcg_value),
            textcoords="offset points",
            xytext=(8, 8),
            fontsize=14,
            fontweight='bold',
        )

    ax.set_xlabel('Complexity Calibration Divergence', fontsize=20)
    ax.set_ylabel('NDCG@10 (GT)', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.grid(True, linestyle='--', alpha=0.6)
    ax.legend(fontsize=18, loc='upper right')
    plt.tight_layout()
    plt.show()


## Comparison: Production vs Research Rerankers

The plot below compares the production reranker (which optimizes using predicted relevance) against the research reranker (which optimizes using ground-truth relevance) on the topic calibration trade-off.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Overlay the production and research trade-off curves for the topic metric.
prod_path = Path('../results/mind_topic_production_tradeoff_k@10.csv')
research_path = Path('../results/mind_topic_tradeoff_k@10.csv')

fig, ax = plt.subplots(figsize=(12, 10))

# Plot research reranker (baseline recommenders)
if research_path.exists():
    research_df = pd.read_csv(research_path)
    research_df = research_df.dropna(subset=['ndcg', 'divergence'])
    research_df = research_df[research_df['recommender'] != 'incorrect_random']
    # Order each recommender's points by lambda so the dashed line follows the
    # sweep instead of the CSV row order.
    research_df = research_df.sort_values(['recommender', 'lambda'])

    for recommender, group in research_df.groupby('recommender'):
        ax.plot(
            group['divergence'],
            group['ndcg'],
            marker='o',
            linestyle='--',
            linewidth=2,
            markersize=8,
            alpha=0.6,
            label=f'{recommender.upper()} (Research)',
        )
else:
    print(f"Research reranker results not found at {research_path}")

# Plot production reranker
if prod_path.exists():
    prod_df = pd.read_csv(prod_path)
    prod_df = prod_df.dropna(subset=['ndcg', 'divergence'])
    prod_df = prod_df.sort_values(['lambda'])

    ax.plot(
        prod_df['divergence'],
        prod_df['ndcg'],
        marker='o',
        linestyle='-',
        linewidth=4,
        markersize=14,
        label='NRMS (Production)',
        color='#2ca02c'
    )

    # Annotate only the production curve with its lambda values.
    for _, row in prod_df.iterrows():
        ax.annotate(
            f"λ={row['lambda']:.2f}",
            (row['divergence'], row['ndcg']),
            textcoords="offset points",
            xytext=(10, 10),
            fontsize=14,
            fontweight='bold',
            color='#2ca02c'
        )
else:
    print(f"Production reranker results not found at {prod_path}")

ax.set_xlabel('Topic Calibration Divergence', fontsize=20)
ax.set_ylabel('NDCG@10 (GT)', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=16)
ax.grid(True, linestyle='--', alpha=0.6)
# Skip the legend when neither file was found and nothing was drawn.
if ax.get_legend_handles_labels()[0]:
    ax.legend(fontsize=16, loc='upper right', framealpha=0.95)
plt.tight_layout()
plt.show()
