In [3]:
import IPython.display as display
import pandas as pd
import numpy as np
import sys, os, time


def draw(view):
    """Render an HTML string as the rich output of the current cell.

    `view` is wrapped in `display.HTML` and passed to `display.display`.
    """
    html_object = display.HTML(view)
    display.display(html_object)

def draw_cols(cols, with_refs=False, *, question=None, index=None, references=None):
    """Render one sample as a bordered card with side-by-side text columns.

    Args:
        cols: sequence of ``(title, text)`` pairs; each pair becomes one
            equal-width column.
        with_refs: when true, append a "REFERENCES" section below the columns.
        question: heading of the card. When omitted, the notebook-level
            variable ``question`` is used, as the original implementation did.
        index: value shown after "Q:". When omitted, the notebook-level
            variable ``i`` is used.
        references: iterable of reference strings for the "REFERENCES"
            section. When omitted (and ``with_refs`` is true),
            ``x["references_raw"]`` is read from the notebook-level ``x``.

    Titles, texts, the question, the index and the references are
    HTML-escaped so that markup-like content (e.g. ``<...>`` tokens inside a
    prompt) is displayed literally instead of being interpreted by the browser.
    """
    from html import escape

    # Backward compatibility: callers that pass nothing keep relying on the
    # variables set by the surrounding notebook loop.
    notebook_vars = globals()
    if question is None:
        question = notebook_vars.get("question", "")
    if index is None:
        index = notebook_vars.get("i", "")
    if with_refs and references is None:
        references = x["references_raw"]

    # Equal share of the width per column; guard against an empty `cols`.
    col_len = 100 // len(cols) if cols else 100

    columns_html = ''.join(
        f'<div style="width: {col_len}%; padding:10px;"> '
        f'<div style="text-align:center;color:blue;">{escape(str(title))}</div> '
        f'<pre class="qq">{escape(str(text))}</pre></div>'
        for title, text in cols
    )

    refs_html = ''
    if with_refs:
        refs_body = "<br>".join(escape(str(ref)) for ref in references)
        refs_html = f'''<br><br>{"=" * 100}<br><br>
                 <pre>REFERENCES:<br><br>{refs_body}
                 </pre>'''

    draw(f'''
        <style>
            .qq {{
                word-break: keep-all;
                white-space: pre-wrap;
            }}
        </style>

        <div style="border:1px solid grey;padding:5px;border-radius:8px;">
            <div>ID: ID &nbsp Q: {escape(str(index))}</div>
            <h5 style="font-weight: bold;">{escape(str(question))}</h5>
            <div style="display:flex; align-items:top;">'''
         + columns_html
         + '</div>'
         + refs_html
         + '</div>')
    draw("<br>")


In [8]:
sys.path.append("..")
from evaluation.extract import extract, extract_data


#
# SHOWS THE IMPACT OF "system" PROMPT ON GENERATION (SEE IDK METRIC)
#

# Runs to compare: display name -> path of the JSON file with the generations.
files = {
    "v0 (without)": "../_data/albert-light_train-v0.json",
    "without system prompt": "../_data/albert-light_train-v1_without_system_prompt.json",
    "with system prompt": "../_data/albert-light_train.json",
}
corpuses = []  # filtered texts of each run, in the order of `files`
frames = {}  # run name -> frame with one row of metrics per answer

# Generate corpus from _data/albert-light_train.json :
# cat _data/albert-light_train.json  |  jq 'map(select(.answer != "nan")) | map({query:.query, prompt:.prompt, answer: .answer})' > notebooks/albert-light.json
for name, f in files.items():
    corpus = pd.read_json(f)
    # Drop rows without a usable answer (missing value or the literal string "nan").
    has_answer = corpus["answer"].notna() & (corpus["answer"] != "nan")
    corpus = corpus[has_answer]
    corpuses.append(corpus)

    # Per-answer metrics; iterate the column directly instead of doing one
    # chained `.loc[i]["answer"]` lookup per row.
    frames[name] = pd.DataFrame(
        [extract(answer, how="binary") for answer in corpus["answer"]]
    )

# Show evaluation metrics
for name, frame in frames.items():
    words = frame["words"]
    print(f'''
    Total "{name}" DOC: {len(frame)}
    min/max anwser: {words.min()}/{words.max()} 
    ''')


# One row per run, one column per metric (mean over the answers of the run).
summary = pd.DataFrame(
    {name: frame.mean(0).round(2) for name, frame in frames.items()}
).T
draw(summary.to_html())

# Keep only the last run ("with system prompt") for the cells below.
corpus = corpuses[-1]
del frames
del corpuses


    Total "v0 (without)" DOC: 1000
    min/max anwser: 4/740 
    

    Total "without system prompt" DOC: 4985
    min/max anwser: 4/499 
    

    Total "with system prompt" DOC: 4719
    min/max anwser: 4/514 
    


Unnamed: 0,words,ttr,emails,urls,phones,dates,hours,prices_,number_artefacts,prompt_artefacts,loop,idk
v0 (without),94.08,0.7,0.0,0.03,0.02,0.02,0.0,0.06,0.18,0.0,0.0,0.18
without system prompt,97.03,0.7,0.0,0.02,0.02,0.02,0.0,0.06,0.21,0.0,0.0,0.17
with system prompt,160.76,0.58,0.0,0.08,0.02,0.03,0.0,0.09,0.29,0.0,0.01,0.02


In [5]:
# Show some samples

def has_loop(text: str, min_repeats: int = 2) -> bool:
    """Tell whether a text repeats itself, sentence-wise or line-wise.

    The text is split twice: into sentences (on a period followed by a space)
    and into lines (on newline characters). Pieces that are empty or
    whitespace-only are ignored. For each split, the number of surplus
    duplicates is the piece count minus the count of distinct pieces.

    Args:
        text: the text to inspect.
        min_repeats: minimum number of surplus duplicates, in either split,
            for the text to be reported as looping. Defaults to 2.

    Returns:
        True when either split has at least ``min_repeats`` surplus
        duplicates, False otherwise.
    """
    def surplus_duplicates(pieces):
        # Blank pieces (e.g. from consecutive newlines) must not count as repeats.
        kept = [piece for piece in pieces if piece.strip() != ""]
        return len(kept) - len(set(kept))

    return (
        surplus_duplicates(text.split(". ")) >= min_repeats
        or surplus_duplicates(text.split("\n")) >= min_repeats
    )

# Row selection: enable exactly one of the two `_corpus` definitions.
#_corpus = corpus.iloc[np.arange(5)]  # first five rows
looping_mask = corpus["answer"].apply(has_loop)
_corpus = corpus[looping_mask]  # only the answers flagged by has_loop


# NOTE: draw_cols reads `i` and `question` (and `x` when with_refs=True) from
# the notebook globals, so the loop variable names below must stay as they are.
for i, x in _corpus.iterrows():
    question = x["query"]
    cols = [(field, x[field]) for field in ("prompt", "answer")]
    draw_cols(cols, with_refs=False)
