In [1]:
# pip install altair vega_datasets pandas
# NOTE(review): `!cd` runs in a throwaway subshell, so it does NOT change the
# kernel's working directory -- the `!pwd` output recorded for this cell still
# shows the notebook's own directory. The relative "../../bug_data/..." paths
# used by later cells are resolved against that directory, so switching to
# `%cd` here would require updating those paths as well.
!cd ~/SemanticDebugger/
# Show the effective working directory and the interpreter found on PATH.
!pwd
!which python

/private/home/yuchenlin/SemanticDebugger/semanticdebugger/debug_algs
/usr/bin/python


In [3]:
import numpy as np 
import pandas as pd
import json 
import os
import altair as alt 
import warnings
warnings.filterwarnings('ignore')



============================================
-------------------------------------------

# Knowledge Retention Accuracy

In [24]:

def get_result_data(path):
    """Load a result JSON file and derive a run label from its path.

    The label has the form ``"<lr>;<epochs>"`` (e.g. ``"3e-5;e5"``) and is
    taken from the ``_<lr>_<epochs>`` tokens embedded in the path. Both path
    layouts used in this notebook are supported:

    * ``.../nq_dev_0706_3e-5_e5_offline_eval/alltime_result.json``
    * ``.../nq_dev_0625_1e-5_e3_result.json``

    If no such tokens are found, the label falls back to the 5th- and
    4th-from-last ``"_"``-separated pieces of the path (the original
    positional behaviour).

    Args:
        path: Path to the JSON result file.

    Returns:
        A ``(prefix, online_debug_results)`` tuple: the run label and the
        decoded JSON content.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    import re  # local import so this cell does not depend on the import cell

    if not os.path.exists(path):
        raise FileNotFoundError(f"Result file not found: {path}")

    # A learning rate in scientific notation followed by an "e<N>" epoch tag.
    # The last match wins so that unrelated parent directories cannot shadow
    # the run directory / file name.
    run_tags = re.findall(r"_(\d+(?:\.\d+)?e-?\d+)_(e\d+)(?=[_/\\]|$)", path)
    if run_tags:
        lr, num_epoch = run_tags[-1]
    else:
        pieces = path.split("_")
        lr, num_epoch = pieces[-5], pieces[-4]

    # Context manager so the file handle is closed deterministically.
    with open(path) as result_file:
        online_debug_results = json.load(result_file)

    prefix = f"{lr};{num_epoch}"
    return prefix, online_debug_results

def get_forgetting_data(path, add_upperbound=False):
    """Build plot-ready rows of the retained-knowledge EM score per timecode.

    Args:
        path: Result JSON file, loaded through ``get_result_data``. Each
            top-level key is a timecode whose value holds
            ``["eval_results_overall_forget"]["metric_results"]["EM"]``.
        add_upperbound: When true, every timecode additionally gets a
            ``"reference"`` row with ``em=1``.

    Returns:
        A list of dicts with the keys ``prefix``, ``timecode`` and ``em``.
    """
    run_label, results_by_timecode = get_result_data(path)

    rows = []
    for raw_timecode, entry in results_by_timecode.items():
        step = int(raw_timecode)
        metrics = entry["eval_results_overall_forget"]["metric_results"]
        rows.append({"prefix": run_label, "timecode": step, "em": metrics["EM"]})
        if not add_upperbound:
            continue
        # Constant ceiling used as a visual reference line.
        rows.append({"prefix": "reference", "timecode": step, "em": 1})

    return rows
    

# Gather the retention rows of all four runs; only the first call contributes
# the "reference" rows so they appear exactly once in the combined table.
forgetting_data = (
    get_forgetting_data("../../bug_data/output/nq_dev_0706_3e-5_e5_offline_eval/alltime_result.json", add_upperbound=True)
    + get_forgetting_data("../../bug_data/output/nq_dev_0706_3e-5_e3_offline_eval/alltime_result.json")
    + get_forgetting_data("../../bug_data/output/nq_dev_0706_1e-5_e5_offline_eval/alltime_result.json")
    + get_forgetting_data("../../bug_data/output/nq_dev_0706_1e-5_e3_offline_eval/alltime_result.json")
)
forgetting_data_df = pd.DataFrame(forgetting_data)
forgetting_data_df.head()


Unnamed: 0,prefix,timecode,em
0,3e-5;e5,0,1.0
1,reference,0,1.0
2,3e-5;e5,1,0.86
3,reference,1,1.0
4,3e-5;e5,2,0.906667


In [25]:

# Knowledge-retention plot: EM per timecode, one line per run plus the
# "reference" line.
x = alt.X("timecode", type="ordinal", title="Timecode")

# Fixed run -> colour mapping so the same run keeps its colour across figures.
scale = alt.Scale(domain=['1e-5;e3', '1e-5;e5', '3e-5;e3', '3e-5;e5', 'reference'], range=['red', 'orange', 'purple', 'blue', 'green'])
color = alt.Color('prefix:N', scale=scale)

# The chart is built once (the original cell constructed it twice with
# identical arguments and also created an unused single-colour `em_line`).
fig = alt.Chart(forgetting_data_df).mark_line(opacity=0.7, interpolate="natural", point=True).encode(
    x=x,
    y=alt.Y("em:Q", stack=None, title="EM", scale=alt.Scale(domain=[0.4, 1])),
    color=color,
).properties(title="Knowledge Retain in EM acc. (forgetting measure) ")

fig = alt.layer(fig).resolve_scale()
fig.properties(width=1000).configure_axis(
    labelFontSize=18,
    titleFontSize=16,
).configure_legend(
    titleFontSize=0,
    labelFontSize=20,
    orient='right',
    strokeColor='gray',
    fillColor='#EEEEEE',
    padding=10,
    cornerRadius=10,
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    orient="top",
    align="center",
    color='black',
)

In [30]:
# Overall Error-Fixing Rate for curves


def get_overall_error_fixing_rate_overtime(filepath, add_upper_bound=False):
    """Build plot-ready rows of the overall error-fixing score per timecode.

    Args:
        filepath: JSON file whose top-level keys are timecodes and whose
            values hold ``["eval_results_overall_bug"]["metric_results"]``
            with ``"EM"`` and ``"QA-F1"`` entries. The run label is taken from
            the 5th- and 4th-from-last ``"_"``-separated pieces of the path.
        add_upper_bound: When true, also emit ``"reference"`` rows whose EM
            grows linearly as ``timecode / len(data)``.

    Returns:
        A list of dicts with the keys ``timecode``, ``em``, ``prefix`` and
        (for rows read from the file) ``f1``. If the file has no entry for
        timecode 0, a synthetic ``em=0`` origin row is appended for the run,
        and for the reference series only when ``add_upper_bound`` is set.
    """
    # Context manager so the file handle is closed deterministically.
    with open(filepath) as result_file:
        data = json.load(result_file)

    pieces = filepath.split("_")
    prefix = f"{pieces[-5]};{pieces[-4]}"

    overall_alltime_error_fixing_rate = []
    has_origin = False
    for timecode, item in data.items():
        results = item["eval_results_overall_bug"]["metric_results"]
        timecode = int(timecode)
        has_origin = has_origin or timecode == 0
        dp = {
            "timecode": timecode,
            "em": results["EM"],
            "f1": results["QA-F1"],
            "prefix": prefix,
        }
        overall_alltime_error_fixing_rate.append(dp)
        if add_upper_bound:
            # NOTE(review): the copy keeps the run's own "f1" value on the
            # reference row; only "em" is a real reference value here.
            reference = dp.copy()
            reference["prefix"] = "reference"
            reference["em"] = 1/len(data)*timecode
            overall_alltime_error_fixing_rate.append(reference)

    # Anchor the curves at (0, 0) only when the file itself has no timecode 0;
    # otherwise the series would contain two points at the same x position.
    if not has_origin:
        overall_alltime_error_fixing_rate.append({"timecode": 0, "em": 0, "prefix": prefix})
        # The reference origin is emitted only by the call that owns the
        # reference series, so combining several runs does not duplicate it.
        if add_upper_bound:
            overall_alltime_error_fixing_rate.append({"timecode": 0, "em": 0, "prefix": "reference"})
    return overall_alltime_error_fixing_rate

# Combine the error-fixing rows of all four runs; only the first call asks for
# the per-timecode "reference" rows.
overall_alltime_error_fixing_rate = (
    get_overall_error_fixing_rate_overtime(filepath="../../bug_data/output/nq_dev_0706_3e-5_e5_offline_eval/alltime_result.json", add_upper_bound=True)
    + get_overall_error_fixing_rate_overtime(filepath="../../bug_data/output/nq_dev_0706_1e-5_e5_offline_eval/alltime_result.json")
    + get_overall_error_fixing_rate_overtime(filepath="../../bug_data/output/nq_dev_0706_3e-5_e3_offline_eval/alltime_result.json")
    + get_overall_error_fixing_rate_overtime(filepath="../../bug_data/output/nq_dev_0706_1e-5_e3_offline_eval/alltime_result.json")
)

overall_alltime_error_fixing_rate_df = pd.DataFrame(overall_alltime_error_fixing_rate)

overall_alltime_error_fixing_rate_df.head()




Unnamed: 0,timecode,em,f1,prefix
0,0,0.013,0.144492,3e-5;e5
1,0,0.0,0.144492,reference
2,1,0.087,0.243224,3e-5;e5
3,1,0.022222,0.243224,reference
4,2,0.156,0.291512,3e-5;e5


In [19]:

# Overall bug-fixing plot: EM per timecode, one line per run plus the
# "reference" line.
x = alt.X("timecode", type="ordinal", title="Timecode")

# Fixed run -> colour mapping so the same run keeps its colour across figures.
scale = alt.Scale(domain=['1e-5;e3', '1e-5;e5', '3e-5;e3', '3e-5;e5', 'reference'], range=['red', 'orange', 'purple', 'blue', 'green'])
color = alt.Color('prefix:N', scale=scale)

# The unused `em_line` chart was dropped: it was never displayed and was fed
# the raw list of rows instead of the DataFrame.
fig = alt.Chart(overall_alltime_error_fixing_rate_df).mark_line(opacity=0.7, interpolate="natural", point=True).encode(
    x=x,
    y=alt.Y("em:Q", stack=None, title="EM", scale=alt.Scale(domain=[0.0, 1])),
    color=color,
).properties(title="Overall Bug-Fixing Rate in EM acc. ")

fig.properties(width=1000).configure_axis(
    labelFontSize=18,
    titleFontSize=16,
).configure_legend(
    titleFontSize=0,
    labelFontSize=20,
    orient='right',
    strokeColor='gray',
    fillColor='#EEEEEE',
    padding=10,
    cornerRadius=10,
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    orient="top",
    align="center",
    color='black',
)

## Retain-Forget F1

In [23]:
# Retain/fix F1: harmonic mean of the retained-knowledge EM and the
# error-fixing EM, computed for every run at every timecode.
retain_df = forgetting_data_df
errorfix_df = overall_alltime_error_fixing_rate_df

prefixes = ['1e-5;e3', '1e-5;e5', '3e-5;e3', '3e-5;e5', 'reference']


def _em_at(df, prefix, timecode):
    """Return the first ``em`` value stored for (prefix, timecode), or None."""
    # One combined mask instead of chained `df[m1][m2]` indexing, which makes
    # pandas re-align the second mask against the filtered frame.
    matches = df.loc[(df["timecode"] == timecode) & (df["prefix"] == prefix), "em"]
    return None if matches.empty else matches.iloc[0]


# Take the range from the data instead of a hard-coded 51 so the cell does not
# fail when the result files cover fewer timecodes.
last_timecode = int(min(retain_df["timecode"].max(), errorfix_df["timecode"].max()))

overall_f1_overtime = []

for prefix in prefixes:
    for timecode in range(0, last_timecode + 1):
        if timecode == 0:
            # Starting point of every curve: nothing fixed, everything retained.
            ef_acc = 0
            rt_acc = 1.0
        else:
            ef_acc = _em_at(errorfix_df, prefix, timecode)
            rt_acc = _em_at(retain_df, prefix, timecode)
            if ef_acc is None or rt_acc is None:
                # No measurement for this (run, timecode) pair in one of the
                # two tables; skip it rather than fail.
                continue

        total = rt_acc + ef_acc
        # The harmonic mean is undefined when both scores are 0; report 0.
        f1 = 2*(ef_acc*rt_acc)/total if total else 0.0
        overall_f1_overtime.append({
            "prefix": prefix,
            "timecode": timecode,
            "ef_acc": ef_acc,
            "rt_acc": rt_acc,
            "f1": f1,
        })
overall_f1_overtime_df = pd.DataFrame(overall_f1_overtime)


x = alt.X("timecode", type="ordinal", title="Timecode")

scale = alt.Scale(domain=['1e-5;e3', '1e-5;e5', '3e-5;e3', '3e-5;e5', 'reference'], range=['red', 'orange', 'purple', 'blue', 'green'])
color = alt.Color('prefix:N', scale=scale)

fig = alt.Chart(overall_f1_overtime_df).mark_line(opacity=0.7, interpolate="natural", point=True).encode(
    x=x,
    y=alt.Y("f1:Q", stack=None, title="F1", scale=alt.Scale(domain=[0.0, 1])),
    color=color,
).properties(title="Overall F1 (retain and bugfixing). ")

fig.properties(width=1000).configure_axis(
    labelFontSize=18,
    titleFontSize=16,
).configure_legend(
    titleFontSize=0,
    labelFontSize=20,
    orient='right',
    strokeColor='gray',
    fillColor='#EEEEEE',
    padding=10,
    cornerRadius=10,
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    orient="top",
    align="center",
    color='black',
)


# Error Fixing 

In [178]:
def get_error_fixing_data(path, bsz=20):
    """Build per-timecode error-fixing statistics for one run.

    Args:
        path: Result JSON file, loaded through ``get_result_data``. It must
            provide the parallel lists ``res_on_bugs`` (pairs of
            before/after metric dicts with an ``"EM"`` entry),
            ``em_fixed_bugs``, ``f1_fixed_bugs``, ``em_prefixed_bugs`` and
            ``f1_prefixed_bugs``.
        bsz: Number of bugs per timecode used as the denominator of the
            rates. Defaults to 20, the value previously hard-coded here.

    Returns:
        A list of dicts, one per timecode (the position in the lists), with:
        ``prefix``, ``timecode``, ``ip_efr`` (``len(em_prefixed) / bsz``),
        ``ir_efr`` (``len(em_fixed) / (bsz - len(em_prefixed))``, or NaN when
        that denominator is not positive), ``em_before`` and ``em_after``.
    """
    prefix, odr = get_result_data(path)

    # The F1 lists are kept in the zip so the number of rows is still bounded
    # by the shortest of the five lists, exactly as before.
    per_timecode = zip(
        odr["res_on_bugs"],
        odr["em_fixed_bugs"],
        odr["f1_fixed_bugs"],
        odr["em_prefixed_bugs"],
        odr["f1_prefixed_bugs"],
    )

    error_fixing_data = []
    for timecode, ((before, after), em_fixed, _f1_fixed, em_prefixed, _f1_prefixed) in enumerate(per_timecode):
        num_prefixed = len(em_prefixed)
        num_remaining = bsz - num_prefixed
        inter_prefix_efr = num_prefixed/bsz
        # When nothing is left to fix the rate is undefined; use NaN instead
        # of raising ZeroDivisionError.
        if num_remaining > 0:
            inter_respon_efr = len(em_fixed)/num_remaining
        else:
            inter_respon_efr = float("nan")
        dp = dict(prefix=prefix, timecode=timecode, ip_efr=inter_prefix_efr, ir_efr=inter_respon_efr)
        dp['em_before'] = before["EM"]
        dp['em_after'] = after["EM"]
        error_fixing_data.append(dp)
    return error_fixing_data


# Per-timecode error-fixing statistics of the four runs, concatenated in the
# order the runs are listed.
error_fixing_data = (
    get_error_fixing_data("../../bug_data/output/nq_dev_0625_1e-5_e3_result.json")
    + get_error_fixing_data("../../bug_data/output/nq_dev_0625_3e-5_e3_result.json")
    + get_error_fixing_data("../../bug_data/output/nq_dev_0625_1e-5_e5_result.json")
    + get_error_fixing_data("../../bug_data/output/nq_dev_0625_3e-5_e5_result.json")
)
error_fixing_data_df = pd.DataFrame(error_fixing_data)
error_fixing_data_df.head()



Unnamed: 0,prefix,timecode,ip_efr,ir_efr,em_before,em_after
0,1e-5;e3,0,0.0,0.25,0.0,0.25
1,1e-5;e3,1,0.05,0.210526,0.05,0.25
2,1e-5;e3,2,0.05,0.421053,0.05,0.45
3,1e-5;e3,3,0.1,0.333333,0.1,0.4
4,1e-5;e3,4,0.05,0.210526,0.05,0.25


In [179]:
# Responsive error-fixing rate (`ir_efr`) per timecode, one line per run.
x = alt.X("timecode", type="ordinal", title="Timecode")

# Same run -> colour mapping as the other figures ('1e-5;e5' is orange; green
# is reserved for the "reference" series used elsewhere).
scale = alt.Scale(domain=['1e-5;e3', '1e-5;e5', '3e-5;e3', '3e-5;e5'], range=['red', 'orange', 'purple', 'blue'])
color = alt.Color('prefix:N', scale=scale)

# The unused `y_em` / `em_line` objects were dropped; they made this cell
# depend on `forgetting_data_df` although the figure never used it.
fig = alt.Chart(error_fixing_data_df).mark_line(opacity=0.7, interpolate="natural", point=True).encode(
    x=x,
    y=alt.Y("ir_efr:Q", stack=None, title="EM", scale=alt.Scale(domain=[0.0, 1])),
    color=color,
).properties(title="Responsive Error Fixing Rate Over Time. (in EM) ")

fig.properties(width=1000).configure_axis(
    labelFontSize=18,
    titleFontSize=16,
).configure_legend(
    titleFontSize=0,
    labelFontSize=20,
    orient='right',
    strokeColor='gray',
    fillColor='#EEEEEE',
    padding=10,
    cornerRadius=10,
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    orient="top",
    align="center",
    color='black',
)

In [180]:
# Error pre-fixing rate (`ip_efr`) per timecode, one line per run.
x = alt.X("timecode", type="ordinal", title="Timecode")

# Same run -> colour mapping as the other figures ('1e-5;e5' is orange; green
# is reserved for the "reference" series used elsewhere).
scale = alt.Scale(domain=['1e-5;e3', '1e-5;e5', '3e-5;e3', '3e-5;e5'], range=['red', 'orange', 'purple', 'blue'])
color = alt.Color('prefix:N', scale=scale)

# The unused `y_em` / `em_line` objects were dropped; they made this cell
# depend on `forgetting_data_df` although the figure never used it.
fig = alt.Chart(error_fixing_data_df).mark_line(opacity=0.7, interpolate="natural", point=True).encode(
    x=x,
    y=alt.Y("ip_efr:Q", stack=None, title="EM", scale=alt.Scale(domain=[0.0, 0.4])),
    color=color,
).properties(title="Error Pre-Fixing Rate Over Time. (in EM) ")

fig.properties(width=1000).configure_axis(
    labelFontSize=18,
    titleFontSize=16,
).configure_legend(
    titleFontSize=0,
    labelFontSize=20,
    orient='right',
    strokeColor='gray',
    fillColor='#EEEEEE',
    padding=10,
    cornerRadius=10,
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    orient="top",
    align="center",
    color='black',
)

In [181]:
# Overview for a single run: EM on the current errors before/after fixing
# (bars) layered with the retained-knowledge EM curve (line, own y axis).
# NOTE(review): the bars come from the "*_0625_*_result.json" files and the
# curve from the "*_0706_*_offline_eval" files -- confirm that these runs are
# meant to be compared under the same run label.
prefix = "1e-5;e5"

# Long format: one "before" and one "after" row per timecode of the run.
reformatted_rows = []
for item in error_fixing_data:
    if item["prefix"] != prefix:
        continue
    for status in ("before", "after"):
        reformatted_rows.append({
            "timecode": item["timecode"],
            "status": status,
            "em": item[f"em_{status}"],
        })

reformatted_data = pd.DataFrame(reformatted_rows)

# Defined here so the cell does not rely on `x` leaking from an earlier cell.
x = alt.X("timecode", type="ordinal", title="Timecode")

scale = alt.Scale(domain=['before', 'after'], range=['red', 'green'])
color = alt.Color('status:N', scale=scale)

errorfix_bar = alt.Chart(reformatted_data).mark_bar(opacity=0.3).encode(
    x="timecode:O",
    y=alt.Y("em:Q", stack=None, title="EM on Current Errors", scale=alt.Scale(domain=[0.0, 1])),
    color=color,
)

forget_curve = alt.Chart(forgetting_data_df[forgetting_data_df["prefix"]==prefix]).mark_line(opacity=1, interpolate="natural", point=True).encode(
    x=x,
    y=alt.Y("em:Q", stack=None, title="EM on Passes", scale=alt.Scale(domain=[0.45, 1])),
    color=alt.value('red'),
)

# Independent y scales: the bars and the curve use different EM ranges.
fig = alt.layer(errorfix_bar, forget_curve).resolve_scale(
    y='independent'
).properties(width=1000, title=f"All Measures about '{prefix}'").configure_axis(
    labelFontSize=18,
    titleFontSize=16
)
fig.configure_legend(
    titleFontSize=0,
    labelFontSize=20,
    orient='right',
    strokeColor='gray',
    fillColor='#EEEEEE',
    padding=10,
    cornerRadius=10,
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='middle',
    orient="top",
    align="center",
    color='black',
)