In [1]:
import pandas as pd
import numpy as np
import pickle
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Display labels for the strategy abbreviations found in the 'strategy' column.
# The insertion order matters: it decides which Set1 color each label gets below.
strategies_abv = {
    "Random":"Random",
    'MostUncertainCls': 'Most Uncertain Cluster',
    "Uncertainty": 'Uncertainty-based',
    "ClusteredUnc": "Clustered Uncertainty-based",
    'UncertaintyCls': 'Uncertainty-based Clustering',
}

# One fixed color per display label, shared by every figure.
colors = plotly.colors.qualitative.Set1
color_dict = dict(zip(strategies_abv.values(), colors))

use_user_feedback = [False, True]
user_names =  ['user1', 'user2', 'user3',  'sportive_user1', 'unhealthy_user1', 'vegan_user2', 'elder_user2', 'sportive_user3', 'unhealthy_user3', 'random']
strategy_names = ['Random', 'Diversity Sampling', 'Most Uncertain Cluster', 'Uncertainty-based', 'Clustered Uncertainty-based', 'Uncertainty-based Clustering']
metrics = ["f1", "mcc", "pref_acc"]
metric_map = {"f1":"F1 Score",  "mcc":"MCC", "pref_acc":"Explanation Accuracy"}


def _load_results(path):
    """Load one pickled results table and aggregate it.

    Returns a pair: the loaded table with its 'strategy' column replaced by
    display labels, and the per-(user, model, feedback_eval, strategy) mean of
    the columns listed in ``metrics``, restricted to rows whose label is in
    ``strategy_names``.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, "rb") as handle:
        table = pickle.load(handle)

    # Abbreviations without an entry in strategies_abv become NaN here and are
    # therefore removed by the isin() filter below.
    table['strategy'] = table['strategy'].map(strategies_abv)
    kept = table[table['strategy'].isin(strategy_names)]
    means = kept.groupby(["user", "model", "feedback_eval", "strategy"])[metrics].mean()
    return table, means


all_results1, results1 = _load_results("log/foodkg_results.pkl")
all_results2, results2 = _load_results("log/culinarydb_results.pkl")

In [3]:
# Figure size passed to fig.update_layout(width=..., height=...) by the plot helpers.
width = 800
height = 400
# Base font size; the plot helpers scale it for ticks and legend text.
fontsize = 19

def line_plots(data, x_add, x_dtick, x_title, y_title, fig_title=None):
    """Show one line per strategy in two side-by-side panels.

    One panel is drawn per entry of ``use_user_feedback``; the panel for flag
    ``fe`` plots the ``data.loc[str(fe)]`` slice.

    Parameters
    ----------
    data : object whose ``.loc['False']`` / ``.loc['True']`` slices yield
        ``(strategy label, score sequence)`` pairs through ``.items()``.
        The first element of each score sequence is not plotted.
    x_add : x value of the first plotted point; following points are spaced
        by 1.
    x_dtick : tick spacing for the x axes.
    x_title, y_title : axis titles shared by both panels.
    fig_title : currently unused; the file export that would use it is
        commented out at the end of the function.
    """
    axis_line = dict(showline=True, linecolor='gray')

    fig = make_subplots(
        rows=1, cols=2,
        shared_xaxes=True, shared_yaxes=True,
        x_title=x_title, y_title=y_title,
        subplot_titles=["No User Feedback", "User Feedback"],
        horizontal_spacing=0.05,
    )

    for col, flag in enumerate(use_user_feedback, start=1):
        # Only traces of the second panel appear in the legend, so every
        # strategy is listed once.
        in_legend = col == 2

        for name, scores in data.loc[str(flag)].items():
            y = scores[1:]
            x = np.arange(x_add, x_add + len(y))
            trace = go.Scatter(
                x=x, y=y, name=name, mode='lines',
                marker_color=color_dict[name], showlegend=in_legend,
            )
            fig.add_trace(trace, row=1, col=col)

    fig.update_yaxes(
        nticks=10,
        zeroline=True, zerolinecolor='lightgray',
        showgrid=True, gridcolor='lightgray',
        tickfont=dict(size=fontsize*0.9),
        **axis_line,
    )
    fig.update_xaxes(dtick=x_dtick, tickfont=dict(size=fontsize), **axis_line)

    legend_style = dict(
        orientation="h", xanchor="auto", y=1.3, x=0.5,
        title_text='', font=dict(size=fontsize*0.8),
    )
    fig.update_layout(
        width=width, height=height,
        margin=dict(l=70, r=20, b=70, t=0, pad=0),
        legend=legend_style,
        plot_bgcolor='white',
        font=dict(family="Times New Roman", size=fontsize, color='black'),
    )
    # Subplot and axis titles are annotations; give them the base font size.
    fig.update_annotations(font_size=fontsize)
    fig.show()

    # Export is disabled:
    # if fig_title:
    #     fig.write_image(fig_title + ".pdf")


In [4]:
# Plot the mean MCC curve of every strategy, once per dataset.
metric = 'mcc'

# Settings shared by both figures.
x_dtick = 5
x_title = 'Step'
y_title = metric_map[metric]
x_add = 1

for dataset_name, df in zip(['FoodKG', "Allrecipes"], [results1, results2]):
    fig_title = f"{dataset_name}_{metric_map[metric]}"
    # Average over every index level except feedback flag and strategy.
    data = df.groupby(["feedback_eval", "strategy"])[metric].mean()
    line_plots(data, x_add, x_dtick, x_title, y_title, fig_title)

In [5]:
# Horizontal gap between each strategy's curve and the Random curve:
# for every Random score, find the (fractional) position on the strategy's
# curve where that score lies, and subtract it from Random's own position.
metric = 'mcc'
data = results1.groupby(["feedback_eval", "strategy"])[metric].mean()

horizontal_diff = {}

for use_feedback in ('False', 'True'):
    scores_rand = data.loc[use_feedback, 'Random']

    for strategy, scores in data.loc[use_feedback].items():
        if strategy == 'Random':
            continue

        positions = np.arange(len(scores))
        # NOTE(review): np.interp expects its second argument to be increasing;
        # this presumably relies on the score curves being (nearly) monotone -
        # confirm, since a dip in `scores` makes the lookup unreliable.
        matched = np.interp(scores_rand, scores, positions)
        # Drop the first position before storing.
        horizontal_diff[(strategy, use_feedback)] = (positions - matched)[1:]

# Keep rows 10, 20, ..., 50 and show one row per (strategy, feedback flag).
df = pd.DataFrame(horizontal_diff).loc[np.arange(10,60,10)].T.sort_index().round(2)
#df.to_excel('foodkg_step_diff.xlsx')
df

Unnamed: 0,Unnamed: 1,10,20,30,40,50
Clustered Uncertainty-based,False,10.03,12.1,18.7,24.56,30.99
Clustered Uncertainty-based,True,2.39,7.41,14.73,20.99,28.86
Most Uncertain Cluster,False,10.03,11.36,14.67,17.77,22.42
Most Uncertain Cluster,True,-0.12,1.89,8.5,15.17,22.91
Uncertainty-based,False,10.03,12.28,18.62,25.49,32.79
Uncertainty-based,True,2.49,7.37,14.8,22.77,30.64
Uncertainty-based Clustering,False,10.03,9.37,13.24,17.1,19.58
Uncertainty-based Clustering,True,-0.52,-1.96,-0.38,3.09,7.48


In [6]:
def line_plots2(data, x_add, x_dtick, x_title, y_title, fig_title=None):
    """Show the first seven points of every strategy curve in two panels.

    Same layout as ``line_plots`` (one panel per entry of
    ``use_user_feedback``), but only ``row[:7]`` of each score sequence is
    drawn and the x ticks carry the labels
    'Step 1', 'Step 3.1', 1, 2, 3, 4, 5 at positions 0..6.

    Parameters
    ----------
    data : object whose ``.loc['False']`` / ``.loc['True']`` slices yield
        ``(strategy label, score sequence)`` pairs through ``.items()``.
    x_add : x value of the first plotted point; following points are spaced
        by 1. The tick labels sit at 0..6, so they line up with the points
        when ``x_add`` is 0.
    x_dtick : forwarded as ``dtick`` to the x axes.
    x_title, y_title : axis titles shared by both panels.
    fig_title : currently unused; the file export that would use it is
        commented out at the end of the function.
    """
    fig = make_subplots(rows=1, cols=2,
                        shared_yaxes=True, shared_xaxes=True,
                        y_title=y_title, x_title=x_title,
                        subplot_titles=["No User Feedback", "User Feedback"], vertical_spacing=0.5,
                        horizontal_spacing =0.05)

    for i, fe in enumerate(use_user_feedback):

        sub_data = data.loc[str(fe)]

        # Series.iteritems() was removed in pandas 2.0; .items() yields the
        # same (label, value) pairs and matches what line_plots already uses.
        for name, row in sub_data.items():

            y = row[:7]
            x = np.arange(x_add, len(y)+x_add)
            # Only the second panel contributes legend entries, so each
            # strategy is listed once.
            showlegend = i == 1
            fig.add_trace(go.Scatter(x=x, y=y, name=name, mode='lines', marker_color = color_dict[name], showlegend=showlegend), row=1, col=i+1)

    fig.update_yaxes(
                    nticks = 10,
                    zeroline=True, zerolinecolor='lightgray',
                    showline=True, showgrid=True, gridcolor='lightgray', linecolor='gray',
                    tickfont=dict(size=fontsize*0.9))

    # NOTE(review): with explicit tickvals/ticktext the dtick value is
    # presumably ignored by plotly - confirm before relying on x_dtick here.
    fig.update_xaxes(showline=True, linecolor='gray', dtick=x_dtick, tickfont=dict(size=fontsize*0.8), title_standoff = 25,
                     ticktext = ['Step 1', 'Step 3.1', 1, 2, 3, 4, 5],
                     tickvals=np.arange(7))

    fig.update_layout(
        width=width, height=height,
        margin=dict(l=70,r=20,b=70,t=0,pad=0),
        legend=dict(orientation="h", xanchor="auto",y=1.3, x=0.5, title_text='', font=dict(size=fontsize*0.8)),
        plot_bgcolor='white',
        font=dict(family="Times New Roman" , size=fontsize, color='black'),
    )
    # Subplot and axis titles are annotations; give them the base font size.
    fig.update_annotations(font_size=fontsize)

    fig.show()

    # Export is disabled:
    # if fig_title:
    #     fig.write_image(fig_title + ".pdf")

In [7]:
# Plot only the first points of the mean MCC curves, once per dataset.
metric = 'mcc'

# Settings shared by both figures.
x_dtick = 1
x_title = 'Step'
y_title = metric_map[metric]
x_add = 0

for dataset_name, df in zip(['FoodKG', "Allrecipes"], [results1, results2]):
    fig_title = f"{dataset_name}_{metric_map[metric]}(1to5)"
    # Average over every index level except feedback flag and strategy.
    data = df.groupby(["feedback_eval", "strategy"])[metric].mean()
    line_plots2(data, x_add, x_dtick, x_title, y_title, fig_title)

In [8]:
# Table of mean MCC at the first seven positions and at the last position
# of each (strategy, feedback flag) curve.
picked_positions = [0, 1, 2, 3, 4, 5, 6 ,-1]
row_order = ['Clustered Uncertainty-based', 'Uncertainty-based', 'Uncertainty-based Clustering', 'Most Uncertain Cluster', 'Random']
column_labels = ["Step 1", "Step 3.1", "t=1", "t=2", "t=3", "t=4", "t=5", "t=50"]

mean_curves = results1.groupby(["strategy", "feedback_eval"])['mcc'].mean()
picked = mean_curves.apply(lambda arr: arr[picked_positions])

# Going through a plain dict keyed by the index tuples yields a two-level
# index without level names; one row per (strategy, feedback flag) after .T.
wide = pd.DataFrame(dict(picked.items())).T

df = wide.loc[row_order]
df.columns = column_labels
df

Unnamed: 0,Unnamed: 1,Step 1,Step 3.1,t=1,t=2,t=3,t=4,t=5,t=50
Clustered Uncertainty-based,False,0.132995,0.307007,0.313637,0.315831,0.32188,0.325913,0.330719,0.592958
Clustered Uncertainty-based,True,0.132995,0.307007,0.340776,0.358761,0.374139,0.388275,0.403133,0.709284
Uncertainty-based,False,0.132995,0.307007,0.313637,0.315831,0.32188,0.325913,0.330719,0.58748
Uncertainty-based,True,0.132995,0.307007,0.340776,0.358761,0.374139,0.388275,0.403133,0.709715
Uncertainty-based Clustering,False,0.132995,0.307007,0.305792,0.311489,0.31472,0.318447,0.320524,0.525811
Uncertainty-based Clustering,True,0.132995,0.307007,0.331519,0.351679,0.364763,0.3717,0.379121,0.655507
Most Uncertain Cluster,False,0.132995,0.307007,0.31241,0.318111,0.328234,0.329425,0.330651,0.537848
Most Uncertain Cluster,True,0.132995,0.307007,0.332344,0.348106,0.360315,0.370496,0.381383,0.698553
Random,False,0.120023,0.265358,0.267098,0.270315,0.274577,0.28134,0.282109,0.465729
Random,True,0.120023,0.265358,0.297424,0.311616,0.327986,0.343416,0.363412,0.64279


In [9]:
# df.to_excel('foodkg_results.xlsx')

In [10]:
# Relative change from the 'False' rows to the 'True' rows of the summary
# table, averaged over all strategies except Random.
# Dropping with level=0 and grouping directly on index level 1 keeps the
# strategy labels out of the aggregated columns: the earlier
# reset_index() + groupby(['level_1']).mean() form tried to average the string
# strategy column (a TypeError on pandas >= 2.0) and triggered the
# non-lexsorted drop PerformanceWarning.
df.drop('Random', level=0).groupby(level=1).mean().pct_change()


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



Unnamed: 0_level_0,Step 1,Step 3.1,t=1,t=2,t=3,t=4,t=5,t=50
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,,,,,,,,
True,0.0,0.0,0.08024,0.123722,0.145053,0.168537,0.193628,0.235713


In [11]:
# Ratio between the best value in column "t=50" and Random's value in the
# same column, printed for the 'False' rows first and the 'True' rows second.
i = "t=50"
random_rows = df.loc['Random']
for flag in ('False', 'True'):
    best = df.xs(flag, level=1)[i].max()
    print(best / random_rows.loc[flag][i])

1.2731817978871771
1.1041162589004874


In [12]:
# Per column: largest ratio between any 'True' row and Random's 'True' row.
rows_with_feedback = df.xs('True', level=1)
random_with_feedback = df.loc['Random'].loc['True']
(rows_with_feedback / random_with_feedback).round(3).max()

Step 1      1.108
Step 3.1    1.157
t=1         1.146
t=2         1.151
t=3         1.141
t=4         1.131
t=5         1.109
t=50        1.104
dtype: float64

In [13]:
# Last value of each mean MCC curve, then the relative change from the
# 'False' row to the 'True' row for every strategy.
mean_curves = results1.groupby(["strategy", "feedback_eval"])['mcc'].mean()
df = mean_curves.apply(lambda arr: arr[-1])

# unstack() moves the feedback flag into the columns; transposing puts it back
# on the rows so pct_change() compares 'True' against 'False'.
final_by_flag = df.unstack().T
final_by_flag.pct_change()

strategy,Clustered Uncertainty-based,Most Uncertain Cluster,Random,Uncertainty-based,Uncertainty-based Clustering
feedback_eval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,,,,,
True,0.19618,0.298793,0.38018,0.208066,0.24666
