In [1]:
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
import nest_asyncio
nest_asyncio.apply()

In [2]:
# only run this if you have an editable install
%load_ext autoreload
%autoreload 2

## Clean dataset

Steps:
1. Remove failed_wikis
2. Group by question_id to get unique questions, and for each question_id select one correct and one incorrect answer to be used

In [3]:
from datasets import load_dataset, concatenate_datasets, Dataset
import pandas as pd
import time
import asyncio
import os

In [4]:
# Dataset name and split handed to `load_dataset`; only the test split is used.
DATASET_URL = "wiki_qa"
SPLIT = "test"
ds = load_dataset(DATASET_URL, split=SPLIT)
# Bare last expression so the notebook displays the dataset summary.
ds

Found cached dataset wiki_qa (/home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)


Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 6165
})

In [5]:
# load failed
import json

# Read the JSON as UTF-8 explicitly: the entries are compared against
# `document_title` values later, and the platform's default locale encoding could
# otherwise mis-decode titles containing non-ASCII characters.
with open('failed_wikis', encoding='utf-8') as f:
    failed = json.load(f)

len(failed)

16

In [6]:
def clean_failed(row):
    """Filter predicate: keep `row` only if its document title is not listed in `failed`."""
    return row['document_title'] not in failed

# Drop every row whose document title appears in `failed`; the predicate is
# evaluated one row at a time (batched=False).
cleaned_ds1 = ds.filter(clean_failed, batched=False)
cleaned_ds1.shape

Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-07ce3fbd049c5d55.arrow


(6049, 5)

In [7]:
df = cleaned_ds1.to_pandas()
df.head()

Unnamed: 0,question_id,question,document_title,answer,label
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigration to the United States refer...,0
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,The term African in the scope of this article ...,0
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,From the Immigration and Nationality Act of 19...,0
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigrants in the United States come f...,0
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,"They include people from different national, l...",0


In [8]:
# number of distinct question ids (one group per id)
df.groupby("question_id").ngroups

617

In [9]:
# number of unique question id's that have both a correct and an incorrect answer
def count(df):
    """Tell whether a question group has both answer kinds.

    Returns a truthy value only when the group contains at least one non-null
    answer labelled 1 (correct) and at least one non-null answer labelled 0
    (incorrect).
    """
    labels = df["label"]
    n_correct = df.loc[labels == 1, "answer"].count()
    n_incorrect = df.loc[labels == 0, "answer"].count()
    return n_correct > 0 and n_incorrect > 0

df.groupby("question_id").apply(count).sum()

232

In [10]:
def clean(df, random_state=None):
    """Reduce one question group to a single (question, correct, incorrect) triple.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one question; must have the columns ``question``, ``answer``
        and ``label`` (1 = correct answer, 0 = incorrect answer).
    random_state : optional
        Forwarded to ``Series.sample`` for each of the three draws. Pass a seed
        (e.g. ``df.groupby(...).apply(clean, random_state=0)``) to make the
        selection reproducible; the default ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.Series or None
        Series indexed by ``question``, ``correct_answer`` and
        ``incorrect_answer``; ``None`` when the group lacks a correct or an
        incorrect answer (or is empty).
    """
    questions = df["question"]
    correct_answers = df.loc[df["label"] == 1, "answer"]
    incorrect_answers = df.loc[df["label"] == 0, "answer"]

    # Series.sample() raises ValueError on an empty Series; test for that case
    # explicitly instead of using the exception as control flow.
    if questions.empty or correct_answers.empty or incorrect_answers.empty:
        return None

    picked = [
        candidates.sample(random_state=random_state).iloc[0]
        for candidates in (questions, correct_answers, incorrect_answers)
    ]
    return pd.Series(
        data=picked,
        index=["question", "correct_answer", "incorrect_answer"],
    )

In [11]:
# Apply `clean` to each question_id group, then discard rows with missing values.
# The resulting 232 rows match the number of questions that have both a correct
# and an incorrect answer.
cleaned_df = df.groupby("question_id").apply(clean).dropna()

In [12]:
cleaned_df.head()

Unnamed: 0_level_0,question,correct_answer,incorrect_answer
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu...",From the Immigration and Nationality Act of 19...
Q1012,what are points on a mortgage,"Points, sometimes also called a ""discount poin...",Discount points may be different from originat...
Q102,how does interlibrary loan work,The user makes a request with their local libr...,Although books and journal articles are the mo...
Q1027,WHAT IS A FY QUARTER,"A fiscal year (or financial year, or sometimes...",Fiscal years vary between businesses and count...
Q1032,who wrote a rose is a rose is a rose,"The sentence ""Rose is a rose is a rose is a ro...",I know that in daily life we don't go around s...


In [18]:
cleaned_df.shape

(232, 3)

In [13]:
# Unpack one cleaned row into question, correct answer and incorrect answer
# (the column order of `cleaned_df`) to eyeball an example.
q, c, i = cleaned_df.iloc[2]
q, c, i

('how does interlibrary loan work',
 'The user makes a request with their local library, which, acting as an intermediary, identifies owners of the desired item, places the request, receives the item, makes it available to the user, and arranges for its return.',
 'Although books and journal articles are the most frequently requested items, some libraries will lend audio recordings, video recordings, maps, sheet music, and microforms of all kinds.')

## get generated_answer and context

In [15]:
# load the index
from llama_index import StorageContext, load_index_from_storage, ServiceContext

# CHANGE SERVICE_CONTEXT HERE!!!
openai_sc = ServiceContext.from_defaults()
service_context = openai_sc

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./storage")

# load index
# The service context is passed explicitly: without it the loaded index uses the
# library defaults and editing `service_context` above would have no effect.
index = load_index_from_storage(storage_context, service_context=service_context)

In [16]:
from llama_index import (
    GPTVectorStoreIndex,
    ResponseSynthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor

# configure retriever: fetch the 2 most similar nodes for each query
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2,
)

# configure response synthesizer
# `service_context` (defined in the index-loading cell) is passed so that the LLM
# used to synthesize answers follows that configuration instead of the library
# default. Retrieved nodes scoring below the 0.7 similarity cutoff are dropped
# before synthesis.
response_synthesizer = ResponseSynthesizer.from_args(
    service_context=service_context,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ],
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [19]:
# Convert back to a Hugging Face Dataset. The `question_id` index of `cleaned_df`
# is carried over as a column, hence 4 columns here rather than 3.
ds_from_df = Dataset.from_pandas(cleaned_df)
ds_from_df.shape

(232, 4)

In [20]:
def get_response(row):
    """Run the row's question through `query_engine` and record the outcome.

    Adds two fields to `row` and returns it:
    - ``generated_with_rag``: the response text, or "" when the engine
      produced no response;
    - ``context``: the text of each source node, or [] when there was no
      response.
    """
    result = query_engine.query(row["question"])
    answered = result.response is not None
    row["generated_with_rag"] = result.response if answered else ""
    row["context"] = (
        [scored.node.text for scored in result.source_nodes] if answered else []
    )
    return row

# Query the RAG pipeline once per example (non-batched map); this adds the
# `generated_with_rag` and `context` columns.
ds_with_relevency = ds_from_df.map(get_response)

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

In [45]:
ds_with_relevency.shape

(232, 6)

## generated_without_rag

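Using the OpenAI completion API directly (no retrieval); the calls below are synchronous and send several prompts per request.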

In [54]:
# Prompt used for the answers generated without retrieval: it deliberately asks
# the model to include at least one factual mistake (dates, names, entities).
QUESTION_PROMPT = """\
Answer the question, each answer should contain atleast one incorrect statements. Make mistakes in dates,names or other entities.
Question: {question}
Answer:
"""

In [55]:
# NOTE(review): `q` is whatever the kernel last bound to that name. The saved
# output shows a different question than the one unpacked from
# `cleaned_df.iloc[2]` earlier, so this cell relied on out-of-order execution —
# confirm before re-running.
print(QUESTION_PROMPT.format(question=q))

Answer the question, each answer should contain atleast one incorrect statements. Make mistakes in dates,names or other entities.
Question: where does a flea live
Answer:



In [56]:
import logging
import openai
import os
import backoff

# Read the key from the environment so it never ends up in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# TODO better way of logging backoffs
# Attach the stream handler only once: re-running this cell would otherwise stack
# handlers and print every backoff message several times.
backoff_logger = logging.getLogger("backoff")
if not any(type(h) is logging.StreamHandler for h in backoff_logger.handlers):
    backoff_logger.addHandler(logging.StreamHandler())


# Every call has to cope with the API errors listed at
# https://platform.openai.com/docs/guides/error-codes/api-errors
# The transient ones (server errors, rate limiting, timeouts, connection
# failures, overloaded service) are retried with exponential backoff.
_RETRYABLE_OPENAI_ERRORS = (
    openai.APIError,
    openai.error.RateLimitError,
    openai.error.Timeout,
    openai.error.APIConnectionError,
    openai.error.ServiceUnavailableError,
)


@backoff.on_exception(backoff.expo, _RETRYABLE_OPENAI_ERRORS, max_tries=5)
def llm(prompts: list[str], **kwargs):
    """Send a batch of prompts to the OpenAI completion endpoint.

    Parameters
    ----------
    prompts:
        Prompts to complete; all of them are sent in a single request.
    **kwargs:
        Optional overrides for ``model``, ``temperature``, ``top_p``,
        ``frequency_penalty``, ``presence_penalty``, ``max_tokens``,
        ``logprobs`` and ``n``. Any other keyword is ignored.

    Returns
    -------
    The raw response object returned by ``openai.Completion.create``.

    Raises
    ------
    One of the retryable OpenAI errors once the 5 attempts are exhausted
    (backoff re-raises the last exception); any other exception propagates
    immediately.

    TODOs

    - what happens when backoff fails?
    """
    response = openai.Completion.create(
        model=kwargs.get("model", "text-davinci-003"),
        prompt=prompts,
        temperature=kwargs.get("temperature", 0),
        top_p=kwargs.get("top_p", 1),
        frequency_penalty=kwargs.get("frequency_penalty", 0.0),
        presence_penalty=kwargs.get("presence_penalty", 0.0),
        max_tokens=kwargs.get("max_tokens", 500),
        logprobs=kwargs.get("logprobs", 1),
        n=kwargs.get("n", 1),
    )

    return response

def gen_answers(row):
    """Generate one answer per question of a batch, without retrieval.

    `row` is a batch as passed by ``Dataset.map(..., batched=True)``:
    ``row["question"]`` is a list of questions. The generated texts are stored
    under ``generated_without_rag`` in the same order as the questions, and the
    batch is returned.
    """
    prompts = [
        QUESTION_PROMPT.format(question=question) for question in row["question"]
    ]
    completion = llm(prompts=prompts)

    # Place each completion by its `index` field (the position of the prompt it
    # answers) rather than relying on the order of `choices` in the response.
    answers = [""] * len(prompts)
    for choice in completion.choices:
        answers[choice["index"]] = choice["text"]
    row["generated_without_rag"] = answers

    return row

In [59]:
# Batched map: each call to `gen_answers` receives up to 5 questions and sends
# them to the completion endpoint in one request.
final_ds = ds_with_relevency.map(
    gen_answers, batched=True, batch_size=5
)

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

## Upload Dataset

In [60]:
# Publish the finished dataset to the Hugging Face Hub under this repository id.
final_ds.push_to_hub("explodinggradients/ragas-wikiqa")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


In [61]:
final_ds

Dataset({
    features: ['question', 'correct_answer', 'incorrect_answer', 'question_id', 'generated_with_rag', 'context', 'generated_without_rag'],
    num_rows: 232
})