# Prompt Versioning & Failure Log  
Track iterative prompt changes, record failures, and analyze robustness.

In [None]:
!pip -q install pandas ipywidgets openai tiktoken

In [None]:
import os, pandas as pd, openai, tiktoken, datetime, ipywidgets as w
from IPython.display import display

# Keep a real key if one is already exported; otherwise install the 'sk-'
# placeholder so the variable always exists. The placeholder is not a valid
# key and must be replaced before any API call can succeed.
os.environ.setdefault('OPENAI_API_KEY', 'sk-')

LOG_PATH = 'prompt_fail_log.csv'

# Column layout of the failure log; one row is appended per logged run.
LOG_COLUMNS = [
    'timestamp', 'prompt_id', 'version', 'prompt_text',
    'decoding_cfg', 'expected_behavior', 'observed_behavior', 'token_count',
]


def load_log(path):
    """Load the failure log stored at *path*.

    Args:
        path: Location of the CSV log file.

    Returns:
        The DataFrame read from *path*, or an empty DataFrame with
        ``LOG_COLUMNS`` when the file does not exist or is completely empty.
    """
    try:
        return pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A zero-byte file (e.g. created by `touch` or left behind by an
        # interrupted write) has no header to parse; treat it like a missing
        # log instead of failing the whole cell.
        return pd.DataFrame(columns=LOG_COLUMNS)


log_df = load_log(LOG_PATH)

def save_log():
    """Write the in-memory log (``log_df``) to ``LOG_PATH`` as CSV, omitting the index."""
    log_df.to_csv(path_or_buf=LOG_PATH, index=False)

def _text_area(label, height, text=''):
    """Build a full-width ``Textarea`` with the given label, CSS height and initial text."""
    return w.Textarea(
        value=text,
        description=label,
        layout=w.Layout(width='100%', height=height),
    )


# --- Input form -------------------------------------------------------------
# Identification of the prompt being iterated on.
prompt_id = w.Text(description='Prompt ID:', value='math_demo')
version = w.IntText(description='Version:', value=1)

# Prompt under test, the behavior it should show, and the decoding settings
# written as comma-separated ``key=value`` pairs.
prompt_text = _text_area('Prompt:', '80px', 'Provide a poetic summary of climate change.')
expected = _text_area('Expected:', '60px', 'Should output <50 words, poetic style.')
cfg_box = _text_area('Decoding cfg:', '40px', 'temperature=0.7, top_p=0.95')

# Starts empty; the click handler writes the model output (or error) here.
observed = _text_area('Observed:', '60px')

run_btn = w.Button(description='Run & Log')
# Output area in which the click handler shows the tail of the log.
table_out = w.Output()

def _parse_decoding_cfg(text):
    """Parse ``key=value, key=value`` text into a dict of decoding kwargs.

    Items without an ``=`` are ignored. Values must be Python literals
    (numbers, strings, booleans, ``None``, ...). Because the text is split on
    commas first, values that themselves contain commas are not supported.

    Args:
        text: Raw contents of the decoding-config box.

    Returns:
        Mapping of stripped keys to parsed literal values.

    Raises:
        ValueError: If a value is not a valid Python literal.
    """
    import ast  # local import keeps this block self-contained

    parsed = {}
    for item in text.split(','):
        key, sep, raw = item.partition('=')
        if not sep:
            continue
        key, raw = key.strip(), raw.strip()
        try:
            # SECURITY: this used to be eval(), which executes arbitrary code
            # typed into the widget. literal_eval only accepts literals.
            parsed[key] = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError) as err:
            raise ValueError(
                f'invalid decoding cfg value for {key!r}: {raw!r}'
            ) from err
    return parsed


def run_and_log(_):
    """Button handler: send the prompt to the model and append the result to the log.

    Reads the form widgets, calls the chat-completions API with the parsed
    decoding settings, shows the reply (or ``Error: ...``) in the Observed
    box, appends one row to ``log_df``, saves it, and refreshes the table.

    If the decoding config cannot be parsed, the problem is shown in the
    Observed box and nothing is sent or logged.

    Args:
        _: The clicked button (unused).
    """
    global log_df
    model = 'gpt-4o-mini'
    prompt = prompt_text.value

    try:
        cfg = _parse_decoding_cfg(cfg_box.value)
    except ValueError as err:
        observed.value = f'Error: {err}'
        return

    n_tok = len(tiktoken.encoding_for_model(model).encode(prompt))

    try:
        request = dict(
            model=model,
            messages=[{'role': 'user', 'content': prompt}],
            max_tokens=200,
            **cfg,
        )
        if hasattr(openai, 'OpenAI'):
            # openai>=1.0 removed openai.ChatCompletion; use the client API.
            resp = openai.OpenAI().chat.completions.create(**request)
        else:
            # Legacy (<1.0) SDK.
            resp = openai.ChatCompletion.create(**request)
        # content can be None (e.g. no text in the reply); log it as ''.
        out_text = (resp.choices[0].message.content or '').strip()
    except Exception as e:
        # Best-effort by design: an API failure is itself a loggable outcome.
        out_text = f'Error: {e}'
    observed.value = out_text

    # Naive-UTC ISO timestamp: same format as the deprecated utcnow(), so new
    # rows stay consistent with rows already in the CSV.
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    new_row = {
        'timestamp': now_utc.isoformat(),
        'prompt_id': prompt_id.value,
        'version': version.value,
        'prompt_text': prompt,
        'decoding_cfg': cfg,
        'expected_behavior': expected.value,
        'observed_behavior': out_text,
        'token_count': n_tok
    }
    log_df = pd.concat([log_df, pd.DataFrame([new_row])], ignore_index=True)
    save_log()
    with table_out:
        table_out.clear_output()
        display(log_df.tail())

# Register the click handler, then render the form top to bottom: inputs,
# the run button, the observed output, and the log table.
run_btn.on_click(run_and_log)
form_rows = [
    prompt_id,
    version,
    prompt_text,
    expected,
    cfg_box,
    run_btn,
    observed,
    table_out,
]
display(w.VBox(form_rows))
print("Use this template to iterate, log failures, and track fixes.")

### Tips  
* Increment the **version** number with each change.  
* Filter the CSV later to compute a **robustness score** (success rate), e.g. the share of logged rows whose `observed_behavior` satisfies `expected_behavior`.  
* Store this file in Git for full history.