In [None]:
!pip install editdistance

In [2]:

import sys
sys.path.append("./../")
import json
import numpy as np
from scipy.stats import bootstrap
import os
import pandas as pd
from IPython.display import display,HTML
import editdistance
import difflib
# Text color (hex code) used when rendering each known result column as HTML.
colors = dict(
    question="#FFA500",          # orange
    correct_answer="#00A000",    # dark green
    incorrect_answer="#CC3333",  # dark red
    gt_answer="#1E90FF",         # dodger blue
    edit_distance="#BA55D3",     # medium purple
)

def color_text(val, color):
    """Build the CSS declaration that sets the text color to ``color``.

    Args:
        val: Ignored by this function (presumably kept so the signature fits
            a per-cell styling callback -- TODO confirm against callers).
        color: Any CSS color value, e.g. ``"#FFA500"``.

    Returns:
        The string ``"color: <color>;"``.
    """
    return "color: {};".format(color)


def render_html(val):
    """Return ``val`` unchanged (identity pass-through).

    Args:
        val: Any value.

    Returns:
        The very same object that was passed in.
    """
    return val

def diff_html(a: str, b: str) -> str:
    """Render a word-level diff from ``a`` to ``b`` as an HTML fragment.

    Both strings are split on whitespace and compared with ``difflib.ndiff``.
    Unchanged words are emitted as plain text, words only present in ``b``
    are wrapped in a green ``<span>`` and words only present in ``a`` in a
    red ``<span>``. The pieces are joined with ``<br>``, i.e. one word per
    displayed line.

    Every word is HTML-escaped before being embedded, so text that merely
    looks like markup (for example a ``<s>`` token) is shown literally
    instead of being interpreted as a tag.

    Args:
        a: The "before" text (its exclusive words are shown as deletions).
        b: The "after" text (its exclusive words are shown as additions).

    Returns:
        An HTML string; empty when both inputs contain no words.
    """
    import html  # function-scope import keeps this block self-contained

    line_color = {
        "+": "background-color: #DFF0D8; color: green;",  # Green for additions
        "-": "background-color: #F2DEDE; color: red;"     # Red for deletions
    }

    styled = []
    for entry in difflib.ndiff(a.split(), b.split()):
        # ndiff entries are "<marker> <token>": a one-character marker, a
        # space, then the token itself.
        marker = entry[:1]
        token = html.escape(entry[2:], quote=False)

        if marker == " ":  # No change
            styled.append(token)
        elif marker in line_color:  # "+" addition or "-" deletion
            styled.append(f'<span style="{line_color[marker]}">{token}</span>')
        # Any other entry (ndiff's "? " intraline hint lines) is dropped.

    return "<br>".join(styled)  # Join with line breaks for HTML display

def style_dataframe(df, rename_columns, sort_by_edit_distance=True):
    """Turn a results DataFrame into a colored HTML table.

    The input frame is never modified; all styling happens on a copy.

    Args:
        df: DataFrame to render. Columns whose names are keys of the
            module-level ``colors`` mapping are wrapped in a colored
            ``<span>``; ``git_diff_correct_incorrect`` is wrapped in a
            left-aligned ``<div>`` and is expected to already contain HTML.
            Any other column is left untouched.
        rename_columns: Mapping ``old name -> displayed name`` applied to the
            column headers after styling.
        sort_by_edit_distance: When true and an ``edit_distance`` column
            exists, rows are sorted by it in ascending order. A frame without
            that column (e.g. one built from an empty dict) is rendered
            unsorted instead of raising ``KeyError``.

    Returns:
        The table as an HTML string with ``id="responsive_table"``.
    """
    import html  # function-scope import keeps this block self-contained

    # Copy first: column assignment below would otherwise write into the
    # caller's frame whenever no sort (which itself copies) takes place.
    styled = df.copy()

    # Sort while edit_distance still holds raw values, before it is wrapped
    # into HTML strings below.
    if sort_by_edit_distance and "edit_distance" in styled.columns:
        styled = styled.sort_values(by=["edit_distance"], ascending=True)

    colored_columns = (
        "question",
        "correct_answer",
        "incorrect_answer",
        "gt_answer",
        "edit_distance",
    )

    for col in styled.columns:
        if col in colored_columns:
            color = colors[col]
            # Escape the cell text: to_html(escape=False) below would
            # otherwise let markup-like text in a cell be parsed as HTML.
            styled[col] = styled[col].apply(
                lambda x, color=color: f'<span style="color: {color};">{html.escape(str(x), quote=False)}</span>'
            )
        elif col == "git_diff_correct_incorrect":
            # Already HTML: only add a left-aligned wrapper.
            styled[col] = styled[col].apply(lambda x: f'<div style="text-align: left;">{x}</div>')

    # Rename columns based on the provided mapping
    styled = styled.rename(columns=rename_columns)
    # escape=False keeps the <span>/<div> wrappers added above as real markup.
    return styled.to_html(escape=False, index=False, table_id="responsive_table")

# CSS snippet prepended to each rendered table. It targets the element with
# id "responsive_table" (the table_id that style_dataframe passes to to_html)
# and stretches it to the full width of the output area.
responsive_style = """
<style>
    #responsive_table {
        width: 100%;
        table-layout: auto;  /* Allows the table to adjust column widths */
    }
</style>
"""

In [3]:

# (experiment name, path to that run's test_results.json), one pair per run.
experiment_runs = (
    (
        "offline_star_exp/no_pause_peft_temp_1.0_part2",
        "/dlabscratch1/baldwin/pause2/PauseToken/logs/train/runs/2024-11-05_14-18-49/test_results.json",
    ),
    (
        "offline_star_exp/pause_temp_1.0_part2",
        "/dlabscratch1/baldwin/pause2/PauseToken/logs/train/runs/2024-11-05_08-30-36/test_results.json",
    ),
)

# Parallel lists: names[i] is the label of the results file at paths[i].
names = [run_name for run_name, _ in experiment_runs]
paths = [run_path for _, run_path in experiment_runs]

In [4]:
# def clean(text):
#     texts = text.split("####")
#     if len(texts) == 1:
#         return text
#     last_txt = "#### " + "".join(texts[-1].split(" "))
#     return "####".join(texts[:-1]) + last_txt
def strip_pause_tokens(text, pause_token="<|pause|>"):
    """Remove every occurrence of ``pause_token`` from ``text``.

    Args:
        text: String to clean.
        pause_token: Literal substring to delete (default ``"<|pause|>"``).

    Returns:
        ``text`` with all occurrences of ``pause_token`` deleted; nothing is
        inserted in their place.
    """
    without_pauses = text.replace(pause_token, "")
    return without_pauses

# Parse each run's results file and index the parsed JSON by experiment name.
name2data = {}
for name, path in zip(names, paths):
    with open(path, "r") as f:
        raw_json = f.read()
    data = json.loads(raw_json)

    # Disabled post-processing, kept for reference:
    # for entry in data:
    #     if "clean_output" in entry:
    #         entry["answer"] = clean(entry["clean_output"])
    #     else:
    #         entry["answer"] = entry["output"]
    name2data[name] = data

In [None]:
# Mean test accuracy per run with a bootstrap confidence interval,
# printed from the highest to the lowest mean.

# Fixed seed so the reported intervals are identical on every re-run.
BOOTSTRAP_SEED = 0

reses = []
for name, data in name2data.items():
    # One "test/accuracy" value per entry of the run's results.
    results = np.array([entry["test/accuracy"] for entry in data])

    # bootstrap() takes a sequence of samples; there is exactly one here.
    ci = bootstrap((results,), np.mean, random_state=BOOTSTRAP_SEED).confidence_interval

    reses.append((f"{name}: {np.mean(results)}, {ci}, {len(data)} samples", results))

# Highest mean accuracy first.
reses.sort(key=lambda x: x[1].mean(), reverse=True)
for res, val in reses:
    print(res)

In [6]:
# Column-name -> list-of-values accumulators (plain dicts, later turned into
# DataFrames). Four independent dict objects are created.
df_pause_benefits, df_exclusives_pause, df_pause_detriment, df_exclusives_no_pause = (
    {},
    {},
    {},
    {},
)

In [None]:
def append_res_to_dict(df, question, correct_answer, incorrect_answer, gt_answer):
    """Append one comparison row to a column-oriented accumulator.

    Args:
        df: Mapping of column name to list of values, modified in place
            (despite the name, the notebook passes plain dicts). Missing
            columns are created as empty lists first.
        question: Value stored in the ``question`` column.
        correct_answer: Text of the answer judged correct.
        incorrect_answer: Text of the answer judged incorrect.
        gt_answer: Value stored in the ``gt_answer`` column.

    Besides the four raw values, two derived columns are filled:
    ``edit_distance`` (``editdistance.eval`` on both answers with pause
    tokens stripped) and ``git_diff_correct_incorrect`` (HTML diff from the
    incorrect to the correct answer, pause tokens included).
    """
    all_columns = (
        "question",
        "correct_answer",
        "incorrect_answer",
        "gt_answer",
        "edit_distance",
        "git_diff_correct_incorrect",
    )
    # Make sure every column exists before anything is appended.
    for column in all_columns:
        df.setdefault(column, [])

    raw_values = {
        "question": question,
        "correct_answer": correct_answer,
        "incorrect_answer": incorrect_answer,
        "gt_answer": gt_answer,
    }
    for column, value in raw_values.items():
        df[column].append(value)

    # The distance ignores pause tokens ...
    distance = editdistance.eval(
        strip_pause_tokens(correct_answer),
        strip_pause_tokens(incorrect_answer),
    )
    df["edit_distance"].append(distance)

    # ... whereas the rendered diff is computed on the unmodified answers.
    diff_markup = diff_html(incorrect_answer, correct_answer)
    df["git_diff_correct_incorrect"].append(diff_markup)
    
# Pairwise comparison of two runs: collect the samples on which exactly one of
# them is judged correct. Rows of both result lists are paired by position
# (assumes both files list the same samples in the same order -- TODO confirm).
ll_name2data = name2data
name1 = "offline_star_exp/pause_temp_1.0_part2"          # run with pause tokens
name2 = "offline_star_exp/no_pause_peft_temp_1.0_part2"  # run without pause tokens
judge1 = [entry["test/accuracy"] for entry in ll_name2data[name1]]
judge2 = [entry["test/accuracy"] for entry in ll_name2data[name2]]

# The accumulators are created in an earlier cell; empty them here so that
# re-running this cell does not append every row a second time.
for accumulator in (df_exclusives_pause, df_pause_benefits, df_exclusives_no_pause, df_pause_detriment):
    accumulator.clear()

j1_c = 0                 # samples only name1 got right
j2_c = 0                 # samples only name2 got right
j1_c_pause_inserted = 0  # ... of the j1_c samples, those whose name1 output contains a pause token
j2_c_pause_inserted = 0  # ... of the j2_c samples, those whose name1 output contains a pause token
for index, (j1, j2) in enumerate(zip(judge1, judge2)):
    pause_entry = ll_name2data[name1][index]
    no_pause_entry = ll_name2data[name2][index]

    if j1 and not j2:
        j1_c += 1
        # (question, correct answer, incorrect answer, ground truth)
        row = (
            pause_entry["input"],
            pause_entry["predicted_output"],
            no_pause_entry["predicted_output"],
            pause_entry["ground_truth"],
        )
        append_res_to_dict(df_exclusives_pause, *row)
        if "<|pause|>" in pause_entry["predicted_output"]:
            append_res_to_dict(df_pause_benefits, *row)
            j1_c_pause_inserted += 1

    if j2 and not j1:
        j2_c += 1
        # Same layout, with the roles of the two runs swapped.
        row = (
            pause_entry["input"],
            no_pause_entry["predicted_output"],
            pause_entry["predicted_output"],
            pause_entry["ground_truth"],
        )
        append_res_to_dict(df_exclusives_no_pause, *row)
        if "<|pause|>" in pause_entry["predicted_output"]:
            append_res_to_dict(df_pause_detriment, *row)
            j2_c_pause_inserted += 1

print(name1, "samples exculisively correct", j1_c)
print(name2, "samples exculisively correct", j2_c)
print(name1, "samples exculisively correct with pause inserted", j1_c_pause_inserted)
print(f"samples correct with {name2} and not {name1} where pause is inserted", j2_c_pause_inserted)

# All Samples Where Pause Model Predicted Correctly Using at Least One Pause Token and Non-Pause Model Predicted Incorrectly (sorted by edit distance)

In [None]:


# Table of the rows collected in df_pause_benefits, with headers spelling out
# which model produced which answer.
pause_benefits_html = style_dataframe(
    pd.DataFrame(df_pause_benefits),
    rename_columns={
        "correct_answer": "correct answer from pause model",
        "incorrect_answer": "incorrect answer from non pause model",
        "edit_distance": "edit distance of incorrect answer w.r.t correct answer (excluding pause tokens)",
    },
)
display(HTML(responsive_style + pause_benefits_html))

# All Samples Where Pause Model Predicted Incorrectly Using at Least One Pause Token and Non-Pause Model Predicted Correctly (sorted by edit distance)

In [None]:
# Table of the rows collected in df_pause_detriment, with headers spelling out
# which model produced which answer.
pause_detriment_html = style_dataframe(
    pd.DataFrame(df_pause_detriment),
    rename_columns={
        "correct_answer": "correct answer from non pause model",
        "incorrect_answer": "incorrect answer from pause model",
        "edit_distance": "edit distance of incorrect answer w.r.t correct answer (excluding pause tokens)",
    },
)
display(HTML(responsive_style + pause_detriment_html))

# All Samples Where Pause Model Predicted Correctly and Non-Pause Model Predicted Incorrectly (sorted by edit distance)

In [None]:
# Table of the rows collected in df_exclusives_pause, with headers spelling
# out which model produced which answer.
exclusives_pause_html = style_dataframe(
    pd.DataFrame(df_exclusives_pause),
    {
        "correct_answer": "correct answer from pause model",
        "incorrect_answer": "incorrect answer from non pause model",
        "edit_distance": "edit distance of incorrect answer w.r.t correct answer (excluding pause tokens)",
    },
)
display(HTML(responsive_style + exclusives_pause_html))

# All Samples Where Pause Model Predicted Incorrectly and Non-Pause Model Predicted Correctly (sorted by edit distance)

In [None]:
# Table of the rows collected in df_exclusives_no_pause, with headers spelling
# out which model produced which answer.
exclusives_no_pause_html = style_dataframe(
    pd.DataFrame(df_exclusives_no_pause),
    {
        "correct_answer": "correct answer from non pause model",
        "incorrect_answer": "incorrect answer from pause model",
        "edit_distance": "edit distance of incorrect answer w.r.t correct answer (excluding pause tokens)",
    },
)
display(HTML(responsive_style + exclusives_no_pause_html))