In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import ast
import numpy as np
import altair as alt
import re

In [2]:
def parse_list_float64(s):
    """Parse the string form of a list of floats into a list of Python floats.

    Two serialisations are understood:

    * ``"[np.float64(0.1), np.float64(0.2)]"`` -- every element wrapped in an
      ``np.float64(...)`` constructor; the payload of each wrapper is converted.
    * ``"[0.1, 0.2]"`` -- a plain comma-separated list, used as a fallback
      when the string contains no ``np.float64(...)`` wrapper at all.

    Parameters
    ----------
    s : str
        Text representation of the list.

    Returns
    -------
    list of float
        The parsed values in their original order. An empty list is returned
        when nothing parseable is found.
    """
    # Extract the payload of every np.float64(...) construct.
    wrapped = re.findall(r'np\.float64\(([^)]+)\)', s)
    if wrapped:
        return [float(num) for num in wrapped]

    # No wrappers present: try a plain "[a, b, c]" list so bare floats are not
    # silently turned into an empty result.
    tokens = s.strip().strip('[]()').split(',')
    try:
        return [float(tok) for tok in tokens if tok.strip()]
    except ValueError:
        # Not a list of numbers; keep the historical "nothing found" result.
        return []

In [None]:
# Load the RADio results table and turn every stringified list column back
# into a real Python list.
results = pd.read_csv('results/ebnerd_results_k@10.csv', index_col=0)

# Column name -> function that decodes one cell of that column.
# Columns handled by parse_list_float64 are the ones whose cells are decoded
# from np.float64(...) wrappers; the rest are plain Python literals.
column_parsers = {
    'topic_calibrations': ast.literal_eval,
    'subtopic_calibrations': ast.literal_eval,
    'complexity_calibrations': ast.literal_eval,
    'fragmentations': ast.literal_eval,
    'activations': ast.literal_eval,
    'representations': ast.literal_eval,
    'alternative_voices': ast.literal_eval,
    'tf_idf_ild_values': parse_list_float64,
    'sentbert_ild_values': ast.literal_eval,
    'gini_values': ast.literal_eval,
    'ndcg_values': parse_list_float64,
}

for column_name, decode in column_parsers.items():
    results[column_name] = results[column_name].apply(decode)

In [None]:
# Density plot of the per-sample topic calibration scores of every method,
# overlaid with five individual sample points per method.
alt.data_transformers.enable("vegafusion")

# Row label in `results` -> display name used in the chart.
method_labels = {
    'naml': 'NAML',
    'nrms': 'NRMS',
    'lstur': 'LSTUR',
    'random': 'Random',
    'incorrect_random': 'Original Random',
}
method_order = list(method_labels.values())

# Long-format frame: one row per score, tagged with its method name.
df_all = pd.concat([
    pd.DataFrame({'value': results['topic_calibrations'][row], 'type': label})
    for row, label in method_labels.items()
])

# Five reproducible example points per method, numbered 0..4 within a method.
# The groups are sampled explicitly instead of through groupby(...).apply(...):
# applying a function to groups that still contain the grouping column is
# deprecated since pandas 2.2, and excluding that column would drop `type`.
sample = pd.concat(
    [
        group.sample(5, random_state=1)
             .assign(sample_id=lambda df: np.arange(len(df)))
        for _, group in df_all.groupby('type')
    ],
    ignore_index=True,
)

# The KDE extent and bandwidth are both derived from the overall value range.
value_min = df_all.value.min()
value_max = df_all.value.max()

density = alt.Chart(df_all).transform_density(
    'value',
    as_=['value', 'density'],
    groupby=['type'],
    extent=[value_min, value_max],
    bandwidth=(value_max - value_min) / 20
).mark_area(opacity=0.3).encode(
    x=alt.X('value:Q', title='Topic Calibration Score'),
    y=alt.Y('density:Q', title='Probability Density', stack=None),
    color=alt.Color(
        'type:N',
        title='Method',
        scale=alt.Scale(domain=method_order),
        legend=alt.Legend(
            titleFontSize=16,
            labelFontSize=14,
            symbolSize=150,
            padding=10,
            orient='right',
            direction='vertical',
            # NOTE: Vega-Lite only honours legendX/legendY when orient='none';
            # with orient='right' these two values have no effect.
            legendX=20,
            legendY=20,
            fillColor='white',
            symbolOpacity=1,
        )
    )
)

# sample-point layer
sample_points = alt.Chart(sample).mark_point(
    filled=True,
    size=100,
    opacity=1
).encode(
    x='value:Q',
    y=alt.Y('type:N',
            title=None,
            axis=None,
            sort=method_order,
            scale=alt.Scale(padding=40)  # Add padding to lower the Y values
           ),
    shape=alt.Shape(
        'sample_id:O',
        scale=alt.Scale(domain=list(range(5)), range=['circle','square','triangle','diamond','cross']),
        legend=None
    ),
    color=alt.Color('type:N', legend=None)
)

# combine + axis + title styling
chart = (density + sample_points).properties(
    width=600, height=300
).configure_title(
    fontSize=20,
).configure_axis(
    titleFontSize=16,
    labelFontSize=14
)

# Save with higher resolution (scale factor increases the resolution)
chart.save('results/ebnerd_topic_calibration_distributions.png', scale_factor=3.0)
chart

In [None]:
# Density plot of the per-sample subtopic calibration scores of every method,
# overlaid with five individual sample points per method.
alt.data_transformers.enable("vegafusion")

# Row label in `results` -> display name used in the chart.
method_labels = {
    'naml': 'NAML',
    'nrms': 'NRMS',
    'lstur': 'LSTUR',
    'random': 'Random',
    'incorrect_random': 'Original Random',
}
method_order = list(method_labels.values())

# Long-format frame: one row per score, tagged with its method name.
df_all = pd.concat([
    pd.DataFrame({'value': results['subtopic_calibrations'][row], 'type': label})
    for row, label in method_labels.items()
])

# Five reproducible example points per method, numbered 0..4 within a method.
# The groups are sampled explicitly instead of through groupby(...).apply(...):
# applying a function to groups that still contain the grouping column is
# deprecated since pandas 2.2, and excluding that column would drop `type`.
sample = pd.concat(
    [
        group.sample(5, random_state=1)
             .assign(sample_id=lambda df: np.arange(len(df)))
        for _, group in df_all.groupby('type')
    ],
    ignore_index=True,
)

# The KDE extent and bandwidth are both derived from the overall value range.
value_min = df_all.value.min()
value_max = df_all.value.max()

density = alt.Chart(df_all).transform_density(
    'value',
    as_=['value', 'density'],
    groupby=['type'],
    extent=[value_min, value_max],
    bandwidth=(value_max - value_min) / 20
).mark_area(opacity=0.3).encode(
    x=alt.X('value:Q', title='Subtopic Calibration Score'),
    y=alt.Y('density:Q', title='Probability Density', stack=None),
    color=alt.Color(
        'type:N',
        title='Method',
        scale=alt.Scale(domain=method_order),
        legend=alt.Legend(
            titleFontSize=16,
            labelFontSize=14,
            symbolSize=150,
            padding=10,
            orient='right',
            direction='vertical',
            # NOTE: Vega-Lite only honours legendX/legendY when orient='none';
            # with orient='right' these two values have no effect.
            legendX=20,
            legendY=20,
            fillColor='white',
            symbolOpacity=1,
        )
    )
)

# sample-point layer
sample_points = alt.Chart(sample).mark_point(
    filled=True,
    size=100,
    opacity=1
).encode(
    x='value:Q',
    y=alt.Y('type:N',
            title=None,
            axis=None,
            sort=method_order,
            scale=alt.Scale(padding=40)  # Add padding to lower the Y values
           ),
    shape=alt.Shape(
        'sample_id:O',
        scale=alt.Scale(domain=list(range(5)), range=['circle','square','triangle','diamond','cross']),
        legend=None
    ),
    color=alt.Color('type:N', legend=None)
)

# combine + axis + title styling
chart = (density + sample_points).properties(
    width=600, height=300
).configure_title(
    fontSize=20,
).configure_axis(
    titleFontSize=16,
    labelFontSize=14
)

# Save with higher resolution (scale factor increases the resolution)
chart.save('results/ebnerd_topic_subcalibration_distributions.png', scale_factor=3.0)
chart

In [None]:
# Density plot of the per-sample activation scores of every method,
# overlaid with five individual sample points per method.
alt.data_transformers.enable("vegafusion")

# Row label in `results` -> display name used in the chart.
method_labels = {
    'naml': 'NAML',
    'nrms': 'NRMS',
    'lstur': 'LSTUR',
    'random': 'Random',
    'incorrect_random': 'Original Random',
}
method_order = list(method_labels.values())

# Long-format frame: one row per score, tagged with its method name.
df_all = pd.concat([
    pd.DataFrame({'value': results['activations'][row], 'type': label})
    for row, label in method_labels.items()
])

# Five reproducible example points per method, numbered 0..4 within a method.
# The groups are sampled explicitly instead of through groupby(...).apply(...):
# applying a function to groups that still contain the grouping column is
# deprecated since pandas 2.2, and excluding that column would drop `type`.
sample = pd.concat(
    [
        group.sample(5, random_state=1)
             .assign(sample_id=lambda df: np.arange(len(df)))
        for _, group in df_all.groupby('type')
    ],
    ignore_index=True,
)

# The KDE extent and bandwidth are both derived from the overall value range.
value_min = df_all.value.min()
value_max = df_all.value.max()

density = alt.Chart(df_all).transform_density(
    'value',
    as_=['value', 'density'],
    groupby=['type'],
    extent=[value_min, value_max],
    bandwidth=(value_max - value_min) / 20
).mark_area(opacity=0.3).encode(
    x=alt.X('value:Q', title='Activation Score'),
    y=alt.Y('density:Q', title='Probability Density', stack=None),
    color=alt.Color(
        'type:N',
        title='Method',
        scale=alt.Scale(domain=method_order),
        legend=alt.Legend(
            titleFontSize=16,
            labelFontSize=14,
            symbolSize=150,
            padding=10,
            orient='right',
            direction='vertical',
            # NOTE: Vega-Lite only honours legendX/legendY when orient='none';
            # with orient='right' these two values have no effect.
            legendX=20,
            legendY=20,
            fillColor='white',
            symbolOpacity=1,
        )
    )
)

# sample-point layer
sample_points = alt.Chart(sample).mark_point(
    filled=True,
    size=100,
    opacity=1
).encode(
    x='value:Q',
    y=alt.Y('type:N',
            title=None,
            axis=None,
            sort=method_order,
            scale=alt.Scale(padding=40)  # Add padding to lower the Y values
           ),
    shape=alt.Shape(
        'sample_id:O',
        scale=alt.Scale(domain=list(range(5)), range=['circle','square','triangle','diamond','cross']),
        legend=None
    ),
    color=alt.Color('type:N', legend=None)
)

# combine + axis + title styling
chart = (density + sample_points).properties(
    width=600, height=300
).configure_title(
    fontSize=20,
).configure_axis(
    titleFontSize=16,
    labelFontSize=14
)

# Save with higher resolution (scale factor increases the resolution)
chart.save('results/ebnerd_activation_distributions.png', scale_factor=3.0)
chart

In [None]:
# Collapse every list-valued cell of `results` to its mean; cells that are
# not lists are passed through unchanged.
def _mean_if_list(cell):
    """Return the mean of `cell` when it is a list, otherwise `cell` itself."""
    return np.mean(cell) if isinstance(cell, list) else cell

# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use the new name when available and fall back otherwise.
_elementwise = results.map if hasattr(pd.DataFrame, 'map') else results.applymap
aggregated_results = _elementwise(_mean_if_list)
display(aggregated_results)

In [None]:
metrics = ['topic_calibrations', 'subtopic_calibrations', 'complexity_calibrations', 'activations', 'tf_idf_ild_values', 'sentbert_ild_values', 'gini_values', 'ndcg_values']

# Perform significance testing between two methods (rows of `results`).
# In this case, we compare LSTUR and random; change the two row labels below
# to compare another pair of methods.
# NOTE(review): this cell previously read the row 'pop' while being named and
# documented as LSTUR -- confirm 'lstur' is the intended row.
method_a = 'lstur'
method_b = 'random'

# Significance threshold applied to each metric's p-value.
alpha = 0.01

for metric in metrics:
    scores_a = results[metric][method_a]
    scores_b = results[metric][method_b]
    # Independent two-sample t-test with scipy's default settings.
    t, p = ttest_ind(scores_a, scores_b)
    print(f'{metric} t: {t}, p: {p}')
    if p < alpha:
        print(f'{metric} is significant')
    else:
        print(f'{metric} is not significant')
    print()


In [None]:
# This code plots the mean of the metric of interest for each method as the
# number of samples increases. The per-sample values of a method are cut into
# `n_runs` consecutive blocks of `run_length` values; within each block the
# running mean over the first 1..run_length values is computed, and the chart
# shows the average of those running means across blocks with an error band.
# NOTE(review): presumably each block of 200 consecutive values corresponds to
# one seed/run -- verify against how the results CSV was written.

metric_to_plot = 'topic_calibrations'
metric_title = 'Topic Calibration'  # y-axis title; keep in sync with metric_to_plot

n_runs = 10
run_length = 200

# Number of samples included in each running mean (1..run_length).
x = np.arange(1, run_length + 1)

# Row label in `results` -> display name, in legend order.
method_labels = {
    'random': 'Random',
    'nrms': 'NRMS',
    'naml': 'NAML',
    'lstur': 'LSTUR',
}

frames = []
for row, label in method_labels.items():
    values = results[metric_to_plot][row]

    # running[r, i] is the mean of the first x[i] values of block r.
    running = np.array([
        [np.mean(values[run * run_length: run * run_length + n]) for n in x]
        for run in range(n_runs)
    ])

    mean_curve = np.mean(running, axis=0)
    # Half-width of the band: two standard errors of the mean across blocks.
    # NOTE(review): np.std defaults to ddof=0; confirm that is intended for
    # only `n_runs` blocks.
    band = 2 * np.std(running, axis=0) / np.sqrt(running.shape[0])

    frames.append(pd.DataFrame({
        # Use the real sample counts (1..run_length); a 0-based position
        # index would shift the curve one sample to the left.
        'x': x,
        'mean': mean_curve,
        'std': band,  # column name kept; it holds 2 x standard error
        'metric': label,
    }))

# Combine all DataFrames into one
df = pd.concat(frames, ignore_index=True)

df['upper'] = df['mean'] + df['std']
df['lower'] = df['mean'] - df['std']

# y-axis domain: the band's full range plus 10% padding on either side.
y_min = df['lower'].min()
y_max = df['upper'].max()
y_padding = 0.1 * (y_max - y_min)

# Mean line per method
base = alt.Chart(df).mark_line().encode(
    x=alt.X('x', title='Number of samples', axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
    y=alt.Y('mean', title=metric_title,
            scale=alt.Scale(domain=[y_min - y_padding, y_max + y_padding]),
            axis=alt.Axis(labelFontSize=14, titleFontSize=16)),
    color=alt.Color('metric:N', scale=alt.Scale(scheme='category10'),
    legend=alt.Legend(
            title='Method',
            orient='bottom-right',
            fillColor='white',
            labelFontSize=14,
            titleFontSize=16,
            padding=10,
            columns=4,
            direction='vertical',
        )),
).properties(
    width=500,
    height=200,
)

# Error band between the lower and upper columns
error_bands = alt.Chart(df).mark_area(opacity=0.2).encode(
    x='x',
    y='lower:Q',
    y2='upper:Q',
    color='metric:N'
)

# Combine line and band
final_chart = (base + error_bands).properties(
    width=500,
    height=200,
)

# Save at high resolution; the file name follows the plotted metric so that
# switching `metric_to_plot` does not overwrite another metric's figure.
final_chart.save(f'results/ebnerd_converging_{metric_to_plot}.png', scale_factor=3.0)
final_chart