In [1]:
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
import nest_asyncio
nest_asyncio.apply()

In [2]:
# only run this if your have an editable install
%load_ext autoreload
%autoreload 2

## Clean dataset

steps
1. Remove failed_wikis
2. group by question_id to get unqiue questions and for each question_id select a correct and incorrect answer to be used

In [83]:
from datasets import load_dataset, concatenate_datasets, Dataset
import pandas as pd
import time
import asyncio
import os

In [4]:
DATASET_URL = "wiki_qa"
SPLIT = "test"
ds = load_dataset(DATASET_URL, split=SPLIT)
ds

Found cached dataset wiki_qa (/home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)


Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 6165
})

In [5]:
# load failed
import json

with open('failed_wikis') as f:
    failed = json.load(f)
    
len(failed)

16

In [6]:
def clean_failed(row):
    if row['document_title'] in failed:
        return False
    return True

cleaned_ds1 = ds.filter(clean_failed, batched=False)
cleaned_ds1.shape

Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-07ce3fbd049c5d55.arrow


(6049, 5)

In [7]:
df = cleaned_ds1.to_pandas()
df.head()

Unnamed: 0,question_id,question,document_title,answer,label
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigration to the United States refer...,0
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,The term African in the scope of this article ...,0
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,From the Immigration and Nationality Act of 19...,0
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigrants in the United States come f...,0
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,"They include people from different national, l...",0


In [8]:
# number of unique question id's
len(df.groupby("question_id")["question"])

617

In [9]:
# number of unique question id's without correct and incorrect answer
def count(df):
    has_corr = df.loc[df["label"] == 1]["answer"].count() > 0
    has_in_corr = df.loc[df["label"] == 0]["answer"].count() > 0
    return has_corr and has_in_corr

df.groupby("question_id").apply(count).sum()

232

In [11]:
def clean(df):
    try:
        ques = df["question"].sample().iloc[0]
        corr = df.loc[df["label"] == 1]["answer"].sample().iloc[0]
        in_corr = df.loc[df["label"] == 0]["answer"].sample().iloc[0]
    except ValueError:
        return None
    return pd.Series(
        data=[ques, corr, in_corr], 
        index="question,correct_answer,incorrect_answer".split(',')
    )

In [12]:
cleaned_df = df.groupby("question_id").apply(clean).dropna()

In [13]:
cleaned_df.head()

Unnamed: 0_level_0,question,correct_answer,incorrect_answer
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu...",From the Immigration and Nationality Act of 19...
Q1012,what are points on a mortgage,"Points, sometimes also called a ""discount poin...",Discount points may be different from originat...
Q102,how does interlibrary loan work,The user makes a request with their local libr...,The lending library usually sets the due date ...
Q1027,WHAT IS A FY QUARTER,"A fiscal year (or financial year, or sometimes...",Fiscal years vary between businesses and count...
Q1032,who wrote a rose is a rose is a rose,"The sentence ""Rose is a rose is a rose is a ro...","In Stein's view, the sentence expresses the fa..."


In [14]:
q, c, i = cleaned_df.iloc[2]
q, c, i

('how does interlibrary loan work',
 'The user makes a request with their local library, which, acting as an intermediary, identifies owners of the desired item, places the request, receives the item, makes it available to the user, and arranges for its return.',
 'The lending library usually sets the due date and overdue fees of the material borrowed.')

## get generated_answer and context

In [15]:
# load the index
from llama_index import StorageContext, load_index_from_storage, ServiceContext

# CHANGE SERVICE_CONTEXT HERE!!!
openai_sc = ServiceContext.from_defaults()
service_context = openai_sc

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./storage")

# load index
index = load_index_from_storage(storage_context)

In [16]:
from llama_index import (
    GPTVectorStoreIndex,
    ResponseSynthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor

# configure retriever
retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=3,
)

# configure response synthesizer
response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ]
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# query
response = query_engine.aquery(q)
print(response)

<coroutine object BaseQueryEngine.aquery at 0x7f587c97bf40>


In [49]:
from tqdm.asyncio import tqdm_asyncio

In [147]:
async def main():
    await tqdm_asyncio.gather(*[query_engine.aquery(i) 
                           for i in cleaned_df["question"][:4]])

In [148]:
start = time.time()

responses = asyncio.run(main())

end = time.time() - start
print(f"total time {end:0.2f} seconds")

100%|████████████████████████████████████████████████████████████| 4/4 [00:56<00:00, 14.14s/it]

total time 56.57 seconds





In [76]:
ds_from_df = Dataset.from_pandas(cleaned_df.iloc[:25])
ds_from_df.shape

(25, 4)

In [77]:
def get_response(row):
    r = query_engine.query(row["question"])
    if r.response is None:
        r_str = ""
        c = []
    else:
        r_str = r.response
        c = [sn.node.text for sn in r.source_nodes]
    row["generated_with_rag"] = r_str
    row["context"] = c
    return row

ds_with_relevency = ds_from_df.map(get_response)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [78]:
ds_with_relevency.shape

(25, 6)

In [105]:
ds_with_relevency[16]

{'question': 'what relates to erosion',
 'correct_answer': 'Water and wind erosion are now the two primary causes of land degradation ; combined, they are responsible for 84% of degraded acreage, making excessive erosion one of the most significant global environmental problems.',
 'incorrect_answer': 'Industrial agriculture , deforestation , roads , anthropogenic climate change and urban sprawl are amongst the most significant human activities in regard to their effect on stimulating erosion.',
 'question_id': 'Q1157',
 'generated_with_rag': '\nErosion is related to rainfall, surface runoff, rivers and streams, vegetative cover, topography, tectonics, development, and thermal erosion.',
 'context': ['erosion, via their effects on vegetation and soil properties. In general, given similar vegetation and ecosystems, areas with more precipitation (especially high-intensity rainfall), more wind, or more storms are expected to have more erosion.\nIn some areas of the world (e.g. the mid-wes

## generated_without_rag

using openai with asyncio

In [81]:
import openai
from aiohttp import ClientSession

openai.aiosession.set(ClientSession())
openai.api_key = os.getenv("OPENAI_API_KEY")

<Token var=<ContextVar name='aiohttp-session' default=None at 0x7f58fc3cd3f0> at 0x7f586f776100>

In [97]:
QUESTION_PROMPT = """\
Answer the following questions to the best of your ability.

{question}
"""

In [139]:
async def gen_answer(q_id, q):
    completion = await openai.Completion.acreate(
      model="text-davinci-003",
      prompt=QUESTION_PROMPT.format(question=q),
      max_tokens=100,
      temperature=1
    )
    return q_id, completion.choices[0].text

async def main(qs):
    return await asyncio.gather(*[gen_answer(q_id, q) for q_id, q in qs])

In [145]:
qs = list(zip(
    ds_with_relevency["question_id"], 
    ds_with_relevency["question"]
))

start = time.perf_counter()
# beware of rate limiting
r = asyncio.run(main(qs))
end = time.perf_counter() - start

print(f"total time {end:0.2f} seconds")

total time 14.61 seconds


In [146]:
len(r)

25

In [153]:
r_dict = {q_id: a for q_id, a in r}

def add_gen_without_rag(row):
    q_id = row["question_id"]
    row["generated_without_rag"] = r_dict[q_id]
    return row
    
final_ds = ds_with_relevency.map(add_gen_without_rag)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [154]:
# At the end of your program, close the http session
await openai.aiosession.get().close()

## Upload Dataset

In [158]:
final_ds.push_to_hub("explodinggradients/ragas-wikiqa")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [159]:
final_ds

Dataset({
    features: ['question', 'correct_answer', 'incorrect_answer', 'question_id', 'generated_with_rag', 'context', 'generated_without_rag'],
    num_rows: 25
})