In [7]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from datasets import load_dataset, concatenate_datasets

def format_for_belar(row):
    """Add the ``context``, ``prompt`` and ``ground_truth`` fields to a record.

    The values are copied from the record's existing fields:

    * ``context``      <- ``row["selftext"]``
    * ``prompt``       <- ``row["title"]``
    * ``ground_truth`` <- ``row["answers"]["text"]``

    The record is modified in place and the same object is returned, which is
    the shape expected by a non-batched ``Dataset.map`` callback.
    """
    # Flat one-to-one copies: (new field, source field).
    for target, source in (("context", "selftext"), ("prompt", "title")):
        row[target] = row[source]

    # The reference answers sit one level down, under the "text" key.
    row["ground_truth"] = row["answers"]["text"]
    return row
    
# Load ELI5, then reshape its "test_eli5" split into the three columns used
# by the rest of the notebook.
d = load_dataset("eli5")
ds = (
    d["test_eli5"]
    .map(format_for_belar, batched=False)  # one record per call
    .select_columns(["context", "prompt", "ground_truth"])
)

Found cached dataset eli5 (/home/jjmachan/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


  0%|          | 0/9 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-f3427cd7a8a8674f.arrow


In [5]:
# Draw a reproducible 500-row sample: shuffle with a fixed seed, then keep
# the first 500 rows of the shuffled order.
ds = ds.shuffle(seed=42)
ds = ds.select(range(500))
ds.shape

(500, 3)

In [8]:
# Sanity check: list the columns currently present in the sampled dataset.
ds.column_names

['context', 'prompt', 'ground_truth']

In [10]:
import concurrent.futures as f
from langchain.llms import OpenAI

# Prompt template filled once per example by `get_answers`; `{context}` and
# `{prompt}` are substituted with str.format.
prompt = """
{context}
with the above context explain like I'm five: {prompt}
"""

# LangChain OpenAI wrapper constructed with its default settings; it is
# invoked as a callable on each rendered prompt.
llm = OpenAI()

def get_answers(row):
    """Generate one answer per question for a batch of examples.

    ``row`` is a batch (this function is used with ``batched=True``):
    ``row["prompt"]`` and ``row["context"]`` are parallel sequences. Each
    question/context pair is rendered through the module-level ``prompt``
    template and sent to the module-level ``llm`` callable on a thread pool.

    The answers are stored under ``row["generated_answers"]`` in the same
    order as the questions, and the (mutated) batch is returned.
    """
    questions, contexts = row["prompt"], row["context"]

    # Render every prompt up front; contexts are matched to questions by position.
    rendered_prompts = [
        prompt.format(context=contexts[idx], prompt=question)
        for idx, question in enumerate(questions)
    ]

    with f.ThreadPoolExecutor(max_workers=10) as executor:
        # Executor.map yields results in input order, so answers stay aligned
        # with their questions even though the calls run concurrently.
        row["generated_answers"] = list(executor.map(llm, rendered_prompts))

    return row
    
# Generate answers batch by batch. batch_size equals the max_workers=10 used
# inside get_answers, so each batch holds at most one prompt per worker thread.
ds = ds.map(get_answers, batched=True, batch_size=10)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

## Evaluate

In [54]:
from belar.metrics import Rouge1, Evaluation, Rouge2, RougeL

In [9]:
# Inspect the column names before they are renamed for evaluation.
ds.column_names

['context', 'question', 'answers', 'answer_generated']

In [15]:
def rename(row):
    """Add the evaluation field names to a single record.

    New fields written onto ``row`` (existing fields are left in place):

    * ``prompt``         <- ``row["question"]``
    * ``references``     <- a fresh empty list
    * ``ground_truth``   <- ``row["answers"]``
    * ``generated_text`` <- ``row["answer_generated"]``

    The record is modified in place and returned.

    NOTE(review): this expects the columns ``question``, ``answers`` and
    ``answer_generated``, whereas the generation step earlier in this notebook
    writes ``prompt``, ``ground_truth`` and ``generated_answers``. Confirm
    which dataset this cell is meant to run on.
    """
    row["prompt"] = row["question"]
    row["references"] = []

    # Remaining fields are straight copies: (new field, source field).
    for new_key, old_key in (
        ("ground_truth", "answers"),
        ("generated_text", "answer_generated"),
    ):
        row[new_key] = row[old_key]

    return row

# Apply the renaming record by record, then keep only the evaluation columns.
ds = ds.map(rename)
ds = ds.select_columns(
    ["prompt", "context", "references", "ground_truth", "generated_text"]
)
ds



Dataset({
    features: ['context', 'prompt', 'references', 'ground_truth', 'generated_text'],
    num_rows: 100
})

In [47]:
# Upload the prepared dataset to the dataset hub under this repository id.
ds.push_to_hub("explodinggradients/eli5-test")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [81]:
# Build an evaluation that runs belar's Rouge1, Rouge2 and RougeL metrics.
e = Evaluation(metrics=[Rouge1, Rouge2, RougeL])
# First argument: a dataset holding only the "ground_truth" column; second:
# the generated texts. The displayed result is a Dataset with one score
# column per metric.
e.eval(ds.select_columns(["ground_truth"]), ds["generated_text"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['ground_truth', 'generated_text', 'rouge1_score', 'rouge2_score', 'rougeL_score'],
    num_rows: 100
})