In [1]:
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
import nest_asyncio
nest_asyncio.apply()

In [2]:
# only run this if you have an editable install
%load_ext autoreload
%autoreload 2

## Clean dataset

steps
1. Remove failed_wikis
2. Group by question_id to get unique questions, and for each question_id select one correct and one incorrect answer to use

In [3]:
from datasets import load_dataset, concatenate_datasets, Dataset
import pandas as pd
import time
import asyncio
import os

In [4]:
# Dataset identifier (a name, not an actual URL) and the split to load with
# `datasets.load_dataset`; the cell output lists the available columns.
DATASET_URL = "wiki_qa"
SPLIT = "test"
ds = load_dataset(DATASET_URL, split=SPLIT)
ds

Found cached dataset wiki_qa (/home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)


Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 6165
})

In [5]:
# Load the entries to exclude (step 1 of the cleaning plan); `clean_failed`
# drops every row whose `document_title` is found in this collection.
import json

# Explicit encoding: JSON is UTF-8 by spec, whereas open() would otherwise
# fall back to the platform's locale encoding.
with open('failed_wikis', encoding='utf-8') as f:
    failed = json.load(f)

len(failed)

16

In [6]:
def clean_failed(row):
    """Filter predicate: keep a row only if its document title is not in `failed`."""
    return row['document_title'] not in failed

# Row-wise filter (batched=False): `clean_failed` is called with one example
# at a time and its boolean result decides whether the row is kept.
cleaned_ds1 = ds.filter(clean_failed, batched=False)
cleaned_ds1.shape

Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-07ce3fbd049c5d55.arrow


(6049, 5)

In [7]:
df = cleaned_ds1.to_pandas()
df.head()

Unnamed: 0,question_id,question,document_title,answer,label
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigration to the United States refer...,0
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,The term African in the scope of this article ...,0
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,From the Immigration and Nationality Act of 19...,0
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,African immigrants in the United States come f...,0
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,African immigration to the United States,"They include people from different national, l...",0


In [8]:
# number of unique question ids
df["question_id"].nunique()

617

In [9]:
# number of unique question ids that have both a correct and an incorrect answer
def count(df):
    """Tell whether a question group has answers of both kinds.

    Truthy only when `df` holds at least one non-null `answer` with
    `label == 1` and at least one non-null `answer` with `label == 0`.
    """
    labels = df["label"]
    answers = df["answer"]
    has_corr = answers[labels == 1].count() > 0
    has_in_corr = answers[labels == 0].count() > 0
    return has_corr and has_in_corr

df.groupby("question_id").apply(count).sum()

232

In [10]:
def clean(df, random_state=None):
    """Pick one question, one correct and one incorrect answer from a group.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one question. Must provide the columns `question`,
        `answer` and `label`; rows with `label == 1` supply the correct
        answer and rows with `label == 0` the incorrect one.
    random_state : optional
        Forwarded to every `Series.sample` call. The default ``None`` keeps
        the previous unseeded behaviour; pass a seed for reproducible picks.

    Returns
    -------
    pandas.Series or None
        Series indexed by `question`, `correct_answer`, `incorrect_answer`,
        or ``None`` when the group has no correct or no incorrect answer.
    """
    correct = df.loc[df["label"] == 1, "answer"]
    incorrect = df.loc[df["label"] == 0, "answer"]
    # Test for emptiness explicitly instead of catching the ValueError that
    # `.sample()` raises on an empty Series: a blanket `except ValueError`
    # would also hide unrelated errors.
    if correct.empty or incorrect.empty:
        return None
    return pd.Series(
        data=[
            df["question"].sample(random_state=random_state).iloc[0],
            correct.sample(random_state=random_state).iloc[0],
            incorrect.sample(random_state=random_state).iloc[0],
        ],
        index=["question", "correct_answer", "incorrect_answer"],
    )

In [11]:
cleaned_df = df.groupby("question_id").apply(clean).dropna()

In [12]:
cleaned_df.head()

Unnamed: 0_level_0,question,correct_answer,incorrect_answer
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,"As such, African immigrants are to be distingu...",African immigrants in the United States come f...
Q1012,what are points on a mortgage,"Points, sometimes also called a ""discount poin...",Loan qualification based on monthly income ver...
Q102,how does interlibrary loan work,"Interlibrary loan (abbreviated ILL, and someti...",The lending library usually sets the due date ...
Q1027,WHAT IS A FY QUARTER,"A fiscal year (or financial year, or sometimes...","Nevertheless, the fiscal year is identical to ..."
Q1032,who wrote a rose is a rose is a rose,"The sentence ""Rose is a rose is a rose is a ro...",For later periods in literature this would no ...


In [13]:
# Unpack the third row positionally: question, correct answer, incorrect
# answer (the column order produced by `clean`).
# NOTE: `i` here holds the incorrect answer, not an index.
q, c, i = cleaned_df.iloc[2]
q, c, i

('how does interlibrary loan work',
 'Interlibrary loan (abbreviated ILL, and sometimes called interloan, document delivery, or document supply) is a service whereby a user of one library can borrow books or receive photocopies of documents that are owned by another library.',
 'The lending library usually sets the due date and overdue fees of the material borrowed.')

## get generated_answer and context

In [14]:
# load the index
from llama_index import StorageContext, load_index_from_storage, ServiceContext

# CHANGE SERVICE_CONTEXT HERE!!!
openai_sc = ServiceContext.from_defaults()
service_context = openai_sc

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./storage")

# load index; the service context is passed explicitly so that changing it
# above actually takes effect (it used to be created but never used)
index = load_index_from_storage(storage_context, service_context=service_context)

In [31]:
from llama_index import (
    GPTVectorStoreIndex,
    ResponseSynthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor

# configure retriever
# NOTE(review): similarity_top_k=2 presumably caps retrieval at two nodes per
# query -- confirm against the installed llama_index version.
retriever = VectorIndexRetriever(
    index=index, 
    similarity_top_k=2,
)

# configure response synthesizer
# NOTE(review): the 0.7 cutoff presumably discards retrieved nodes scoring
# below it before synthesis -- confirm; it is a hand-picked constant.
response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.7)
    ]
)

# assemble query engine
# `query_engine` is the global used by the query cells further down
# (`main` and `get_response`).
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

In [49]:
from tqdm.asyncio import tqdm_asyncio

In [147]:
async def main(num_questions=4):
    """Query the RAG engine concurrently for the first questions of `cleaned_df`.

    Parameters
    ----------
    num_questions : int, optional
        How many leading questions to send (default 4, the previous
        hard-coded value).

    Returns
    -------
    The result of `tqdm_asyncio.gather` over the `aquery` calls. The gathered
    value used to be dropped, so callers always received ``None``.
    """
    questions = cleaned_df["question"][:num_questions]
    return await tqdm_asyncio.gather(
        *[query_engine.aquery(question) for question in questions]
    )

In [148]:
# perf_counter() is monotonic and intended for measuring elapsed time; it is
# also what the OpenAI batch-timing cell further down uses.
start = time.perf_counter()

responses = asyncio.run(main())

end = time.perf_counter() - start  # elapsed seconds, despite the name
print(f"total time {end:0.2f} seconds")

100%|████████████████████████████████████████████████████████████| 4/4 [00:56<00:00, 14.14s/it]

total time 56.57 seconds





In [32]:
# Work on the first 25 questions only. The `question_id` index is carried over
# as a regular column, which is why the shape below reports 4 columns.
ds_from_df = Dataset.from_pandas(cleaned_df.iloc[:25])
ds_from_df.shape

(25, 4)

In [33]:
def get_response(row):
    """Run the RAG query engine on the row's question and attach its outputs.

    Adds two fields to `row` and returns it:
    - `generated_with_rag`: the response text, or "" when the engine
      produced no response;
    - `context`: the texts of the response's source nodes, or [] when the
      engine produced no response.
    """
    result = query_engine.query(row["question"])
    answered = result.response is not None
    row["generated_with_rag"] = result.response if answered else ""
    row["context"] = (
        [scored.node.text for scored in result.source_nodes] if answered else []
    )
    return row

ds_with_relevency = ds_from_df.map(get_response)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [34]:
ds_with_relevency.shape

(25, 6)

In [114]:
# Pick one example row to experiment with.
# NOTE: `q`, `q_id` and `c` stay in the global namespace and are read by later
# cells (the prompt preview and the single `gen_answer` call).
i = 11
q, q_id, c = ds_with_relevency[i]['question'], ds_with_relevency[i]['question_id'], ds_with_relevency[i]["context"]
# merge the list of retrieved context chunks into one string
c = ' '.join(c)
q, q_id, c

('where does a flea live',
 'Q1100',
 'Flea, the common name for the order Siphonaptera, includes 2,500 species of small flightless insects that live as external parasites of mammals and birds. Fleas live by ingesting the blood of their hosts. Adult fleas grow to about 3 millimetres (1⁄8 inch) long, are usually brown, and have bodies that are "flattened" sideways or narrow, enabling them to move through their hosts\' fur or feathers. They lack wings; their hind legs are extremely well adapted for jumping. Their claws keep them from being dislodged, and their mouthparts are adapted for piercing skin and sucking blood. They can leap 50 times their body length, a feat second only to jumps made by another group of insects, the superfamily of froghoppers. Flea larvae are worm-like, with no limbs; they have chewing mouthparts and feed on organic debris left on their hosts\' skin.\nGenetic evidence indicates that fleas are a specialised lineage of parasitic scorpionflies (Mecoptera) sensu lat

In [36]:
ds_with_relevency[0], len(ds_with_relevency[0]['context'])

({'question': 'HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US',
  'correct_answer': 'As such, African immigrants are to be distinguished from African American people, the latter of whom are descendants of mostly West and Central Africans who were involuntarily brought to the United States by means of the historic Atlantic slave trade .',
  'incorrect_answer': 'African immigrants in the United States come from almost all regions in Africa and do not constitute a homogeneous group.',
  'question_id': 'Q0',
  'generated_with_rag': '\nAfrican Americans were immigrated to the United States primarily through the Immigration and Nationality Act of 1965, which repealed the national quotas that had been in effect since 1921 and 1924. This act provided a separate category for refugees and greater opportunity for family reunification. Additionally, the Diversity Visa Program, or green card lottery, was created by the Immigration Act of 1990, which allowed people born in countries with low rates 

## generated_without_rag

using openai with asyncio

In [135]:
import openai
from aiohttp import ClientSession

# Install an aiohttp session for the async OpenAI calls; it is closed
# explicitly in a later cell once all requests are done.
openai.aiosession.set(ClientSession())
# Read the API key from the environment rather than hard-coding it.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [140]:
# Prompt template for the no-RAG generation: it deliberately asks the model
# for an answer in which some facts are wrong. The only placeholder is
# {question}.
QUESTION_PROMPT = """\
answer the following question but make sure the some of the facts are incorrect

{question}
"""

In [141]:
# Preview the rendered prompt. The template only has a {question} placeholder,
# so no context argument is passed.
print(QUESTION_PROMPT.format(question=q))

answer the following question but make sure the some of the facts are incorrect

where does a flea live



In [142]:
async def gen_answer(q_id, q):
    """Generate an answer for one question with the OpenAI completion API.

    Parameters
    ----------
    q_id
        Question identifier; returned unchanged so the answer can be matched
        back to its row.
    q
        Question text substituted into `QUESTION_PROMPT`.

    Returns
    -------
    tuple
        ``(q_id, text)`` where `text` is the text of the first completion choice.
    """
    completion = await openai.Completion.acreate(
        model="text-davinci-003",
        # The template only has a {question} placeholder. The former
        # `context=c` argument was ignored by format() yet made this function
        # depend on a global `c` being defined.
        prompt=QUESTION_PROMPT.format(question=q),
        max_tokens=200,
        temperature=1,
    )
    return q_id, completion.choices[0].text

async def main(qs):
    """Answer every (question_id, question) pair in `qs` concurrently.

    Returns the list produced by `asyncio.gather` over `gen_answer` calls.
    Note: this shares its name with the zero-argument `main()` of the RAG
    query cell, so whichever definition ran last is the one in effect.
    """
    pending = [gen_answer(q_id, q) for q_id, q in qs]
    return await asyncio.gather(*pending)

In [143]:
asyncio.run(gen_answer(q_id, q))

('Q1100',
 '\nA flea typically lives on its host, such as a dog or cat, but can also live in carpets, on bedding, and in grass.')

In [125]:
# (question_id, question) pairs, in dataset order
qs = list(zip(
    ds_with_relevency["question_id"], 
    ds_with_relevency["question"]
))

start = time.perf_counter()
# beware of rate limiting: `main` gathers all requests concurrently
r = asyncio.run(main(qs))
end = time.perf_counter() - start  # elapsed seconds, despite the name

print(f"total time {end:0.2f} seconds")

total time 12.35 seconds


In [126]:
len(r)

25

In [127]:
# question_id -> generated answer, built from the (q_id, answer) pairs in `r`
r_dict = dict(r)

def add_gen_without_rag(row):
    """Attach the answer stored in `r_dict` under the row's question id.

    Writes it to `row["generated_without_rag"]` and returns the row.
    """
    row["generated_without_rag"] = r_dict[row["question_id"]]
    return row
    
final_ds = ds_with_relevency.map(add_gen_without_rag)

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [128]:
# At the end of your program, close the http session
await openai.aiosession.get().close()

## Upload Dataset

In [129]:
final_ds.push_to_hub("explodinggradients/ragas-wikiqa")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/617 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


In [130]:
final_ds

Dataset({
    features: ['question', 'correct_answer', 'incorrect_answer', 'question_id', 'generated_with_rag', 'context', 'generated_without_rag'],
    num_rows: 25
})