In [1]:
# Only run this if you have an editable install.
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset, concatenate_datasets

# Dataset name and split handed to `load_dataset`.
# NOTE: despite its name, DATASET_URL holds a dataset identifier, not a URL.
DATASET_URL = "wiki_qa"
SPLIT = "test"
ds = load_dataset(DATASET_URL, split=SPLIT)
# Bare expression so the notebook shows the dataset summary as the cell output.
ds

Found cached dataset wiki_qa (/home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c)


Dataset({
    features: ['question_id', 'question', 'document_title', 'answer', 'label'],
    num_rows: 6165
})

### Download the Wikipedia pages

In [6]:
import wikipedia as wiki

# Unique document titles in the dataset, sorted; each one is fetched by the download loop.
wiki_doc_titles = sorted(set(ds["document_title"]))


def get_wiki_doc(title):
    """Fetch one Wikipedia page and save its text to ``./data/<title>.txt``.

    Args:
        title: Title of the Wikipedia page to fetch; must be a ``str``.

    Returns:
        True when the page content was written to disk, False when the lookup
        raised ``wiki.PageError`` or ``wiki.DisambiguationError``.
    """
    import os

    assert isinstance(title, str)
    try:
        doc = wiki.WikipediaPage(title=title)
    except (wiki.PageError, wiki.DisambiguationError):
        return False
    # Create the output directory on first use instead of failing with
    # FileNotFoundError on a fresh checkout.
    os.makedirs("./data", exist_ok=True)
    # Write UTF-8 explicitly so the result does not depend on the platform's
    # locale encoding (page text routinely contains non-ASCII characters).
    # NOTE(review): a title containing a path separator would still produce an
    # invalid file name — confirm no such titles occur in the dataset.
    with open(f"./data/{title}.txt", "w", encoding="utf-8") as f:
        f.write(doc.content)
    return True

In [71]:
from tqdm import tqdm

# Try every title in turn and remember the ones that could not be downloaded
# (`get_wiki_doc` returns False for those).
failed = []
for title in tqdm(wiki_doc_titles):
    if get_wiki_doc(title) is not False:
        continue
    failed.append(title)

 20%|███████████▏                                            | 124/619 [04:55<11:16,  1.37s/it]

[Crater lake] failed!


 23%|████████████▎                                         | 141/619 [07:28<3:47:03, 28.50s/it]

[Deep Blue Sea] failed!


 37%|████████████████████▉                                   | 232/619 [11:44<09:56,  1.54s/it]

[Hannibal (film)] failed!


 43%|████████████████████████                                | 266/619 [14:43<08:17,  1.41s/it]

[IÂ²C] failed!


 46%|█████████████████████████▌                              | 283/619 [15:06<07:38,  1.36s/it]

[June bug] failed!


 48%|██████████████████████████▋                             | 295/619 [15:22<06:13,  1.15s/it]

[La NiÃ±a] failed!


 53%|█████████████████████████████▌                          | 327/619 [16:01<05:20,  1.10s/it]

[List of youngest birth mothers] failed!


 66%|████████████████████████████████████▉                   | 408/619 [17:48<04:27,  1.27s/it]

[Our Song] failed!


 66%|█████████████████████████████████████▏                  | 411/619 [17:52<04:28,  1.29s/it]

[Parcel] failed!


 69%|██████████████████████████████████████▌                 | 426/619 [18:41<33:02, 10.27s/it]

[PokÃ©mon] failed!


 73%|████████████████████████████████████████▋               | 450/619 [19:14<04:22,  1.55s/it]

[Range (mathematics)] failed!


 80%|████████████████████████████████████████████▋           | 494/619 [20:17<02:59,  1.44s/it]

[Sixth Army] failed!


 82%|█████████████████████████████████████████████▊          | 506/619 [21:37<08:56,  4.75s/it]

[Spades] failed!


 86%|████████████████████████████████████████████████▎       | 534/619 [22:16<01:50,  1.30s/it]

[Systemic] failed!


 87%|████████████████████████████████████████████████▌       | 537/619 [22:20<01:36,  1.18s/it]

[Tamari] failed!


 89%|██████████████████████████████████████████████████      | 553/619 [22:42<01:26,  1.32s/it]

[The Bells] failed!


100%|████████████████████████████████████████████████████████| 619/619 [24:20<00:00,  2.36s/it]


In [73]:
# Inspect the titles whose download failed.
failed

['Crater lake',
 'Deep Blue Sea',
 'Hannibal (film)',
 'IÂ²C',
 'June bug',
 'La NiÃ±a',
 'List of youngest birth mothers',
 'Our Song',
 'Parcel',
 'PokÃ©mon',
 'Range (mathematics)',
 'Sixth Army',
 'Spades',
 'Systemic',
 'Tamari',
 'The Bells']

In [88]:
import json

# Persist the failed titles as a JSON array in the file `failed_wikis`.
failed_json = json.dumps(failed)
with open("failed_wikis", "w") as out_file:
    out_file.write(failed_json)

In [3]:
import json

# Restore the list of failed titles from the `failed_wikis` JSON file.
with open("failed_wikis") as in_file:
    failed = json.loads(in_file.read())

len(failed)

16

## Clean Dataset

- Remove the wiki pages that failed to download, along with their questions.

In [12]:
def clean_failed_and_incorrect(row):
    """Filter predicate: keep a row unless its document could not be downloaded.

    Args:
        row: Mapping with a "document_title" entry.

    Returns:
        False when the row's title is listed in the global `failed`,
        True otherwise.
    """
    return row["document_title"] not in failed

In [13]:
# Keep only the rows for which `clean_failed_and_incorrect` returns True;
# the predicate is called with one row at a time (batched=False).
cleaned_ds1 = ds.filter(clean_failed_and_incorrect, batched=False)
# (rows, columns) of the filtered dataset.
cleaned_ds1.shape

Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c/cache-61607228d22e9a55.arrow


(6049, 5)

In [9]:
from collections import Counter

# Number of questions per remaining document title.
# Counter tallies the column in a single pass, so the column is read once
# rather than once per distinct title.
count_wiki_questions = dict(Counter(cleaned_ds1["document_title"]))
len(count_wiki_questions)

603

In [14]:
# Number of distinct document titles left after filtering.
len(set(cleaned_ds1["document_title"]))

603

## Baseline

First, load the selected documents.

In [15]:
from llama_index import Document

# Read the saved page text for every remaining title (sorted) and wrap each
# one in a llama_index Document.
selected_titles = sorted(set(cleaned_ds1["document_title"]))
docs = []
for title in selected_titles:
    with open(f"./data/{title}.txt") as page_file:
        page_text = page_file.read()
    docs.append(Document(text=page_text))

len(docs)

603

In [27]:
from llama_index import GPTVectorStoreIndex
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext, StorageContext

# Service context backed by a HuggingFace embedding model: langchain's
# HuggingFaceEmbeddings (constructed with its defaults) wrapped for llama_index.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
hf_sc = ServiceContext.from_defaults(embed_model=embed_model)

# Service context built with no overrides, i.e. the library defaults
# (OpenAI embeddings, per the original note on this cell).
openai_sc = ServiceContext.from_defaults()

In [24]:
# Split the loaded documents into nodes (chunks).
from llama_index.node_parser import SimpleNodeParser
from langchain.text_splitter import TokenTextSplitter

# chunk_size=300 with chunk_overlap=50 (presumably counted in tokens, given
# the splitter's name — TODO confirm).
spliter = TokenTextSplitter(chunk_size=300, chunk_overlap=50)

parser = SimpleNodeParser(text_splitter=spliter)

nodes = parser.get_nodes_from_documents(documents=docs)

# NOTE(review): in the cells visible here, `nodes` is only inspected; the index
# is built from `docs`, not from these nodes — confirm that is intended.
len(nodes)

16966

In [25]:
nodes[0]

Node(text='The 18th century lasted from January 1, 1701 (MDCCI) to December 31, 1800 (MDCCC). During the 18th century, elements of Enlightenment thinking culminated in the American, French, and Haitian Revolutions. During the century, slave trading and human trafficking expanded across the shores of the Atlantic, while declining in Russia, China, and Korea. Revolutions began to challenge the legitimacy of monarchical and aristocratic power structures, including the structures and beliefs that supported slavery. The Industrial Revolution began during mid-century, leading to radical changes in human society and the environment. \nWestern historians have occasionally defined the 18th century otherwise for the purposes of their work. For example, the "short" 18th century may be defined as 1715–1789, denoting the period of time between the death of Louis XIV of France and the start of the French Revolution, with an emphasis on directly interconnected events. To historians who expand the cen

In [28]:
# CHANGE SERVICE_CONTEXT HERE!!!
# The context chosen here is used for both the index and the query engine below.
service_context = openai_sc

# create index
# NOTE(review): built directly from `docs`; the `nodes` list produced by the
# node-parser cell is not passed in here.
index = GPTVectorStoreIndex.from_documents(
    documents=docs,
    service_context=service_context,
)

# Query engine over the index, using the same service context.
qe = index.as_query_engine(
    mode="embedding", verbose=True, service_context=service_context
)

In [29]:
# Save the index under ./storage; the loading cell reads it back from the same directory.
index.storage_context.persist(persist_dir="./storage")

When loading, make sure `service_context` is initialized and configured the same way as when the index was built.

In [17]:
# Load the index persisted under ./storage.
from llama_index import StorageContext, load_index_from_storage, ServiceContext

# CHANGE SERVICE_CONTEXT HERE!!!
# Must be configured the same way as the context the index was built with.
openai_sc = ServiceContext.from_defaults()
service_context = openai_sc

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./storage")

# Load the index with the selected service context. If it is not passed, the
# loaded index is not tied to the selection above, so switching to another
# context in this cell would not affect the index itself.
index = load_index_from_storage(storage_context, service_context=service_context)

# Query engine over the loaded index, using the same service context.
qe = index.as_query_engine(
    mode="embedding", verbose=True, service_context=service_context, use_async=True
)

In [43]:
from llama_index import (
    GPTVectorStoreIndex,
    ResponseSynthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor

# configure retriever
# Retrieves from `index` with similarity_top_k=3.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)

# configure response synthesizer
# A SimilarityPostprocessor with similarity_cutoff=0.7 is attached as a node
# postprocessor (presumably drops weaker matches before the answer is
# synthesized — TODO confirm against the library docs).
response_synthesizer = ResponseSynthesizer.from_args(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)]
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# Run one example question end to end and print the answer.
response = query_engine.query("where do the mohawks live?")
print(response)


The Mohawks live in settlements in northern New York State and southeastern Canada, including the reserves of Kanièn:ke, Kanaʼtsioharè:ke, Ahkwesáhsne, Kahnawà:ke, Kanehsatà:ke, Tioweró:ton, Kenhtè꞉ke, Wáhta, and Ohswé:ken.


In [41]:
# The answer text carried by the response object.
response.response

'\nThe Mohawks live in settlements in northern New York State and southeastern Canada, including the reserves of Kanièn:ke, Kanaʼtsioharè:ke, Ahkwesáhsne, Kahnawà:ke, Kanehsatà:ke, Tioweró:ton, Kenhtè꞉ke, Wáhta, and Ohswé:ken.'

In [61]:
# Pick one example row and pull out its id, question and reference answer.
# NOTE(review): `final_ds` is not defined in any cell visible here — confirm
# where it is created before relying on Restart & Run All.
i = 45
sample = final_ds[i]
q_id, q, a = sample["question_id"], sample["question"], sample["answer"]
q_id, q, a

('Q2004',
 'who is flo from progressive',
 'Flo debuted in 2008 through television commercials and has since appeared in radio advertisements and web banners .')

In [46]:
from IPython.display import Markdown, display

# Run the sampled question `q` through the query engine `qe`.
r = qe.query(q)

In [51]:
# Render the question, the generated answer and the reference answer together
# as one Markdown block.
comparison_md = f"""
<b>{q_id}: {q}</b>

<b>Generated Answer: </b>
<i>{r}</i>

<b>Original Answer: </b>
<i>{a}</i>
"""
display(Markdown(comparison_md))


<b>Q1506: where do the mohawks live</b>

<b>Generated Answer: </b>
<i>
The Mohawks live in settlements in northern New York State and southeastern Canada, including Kanièn:ke, Kanaʼtsioharè:ke, Ahkwesáhsne, Kahnawà:ke, Kanehsatà:ke, Tioweró:ton, Kenhtè꞉ke, Wáhta, and Ohswé:ken.</i>

<b>Original Answer: </b>
<i>Their traditional homeland stretched southward of the Mohawk River , eastward to the Green Mountains of Vermont , westward to the border with the Oneida Nation 's traditional homeland territory, and northward to the St Lawrence River.</i>


In [16]:
def generate_response(row):
    """Answer a row's question with the global query engine and annotate the row.

    Args:
        row: Mutable mapping with "question" and "answer" entries.

    Returns:
        The same `row` object, with these keys added (in this order):
        "generated_text" (the engine's answer), "retrieved_context" (text of
        each source node), "prompt" (copy of the question) and "ground_truth"
        (copy of the answer).
    """
    question = row["question"]
    result = qe.query(question)

    contexts = []
    for scored_node in result.source_nodes:
        contexts.append(scored_node.node.text)

    row["generated_text"] = result.response
    row["retrieved_context"] = contexts

    # column names expected by ragas
    row["prompt"] = question
    row["ground_truth"] = row["answer"]
    return row


# Try the function on the first row of `final_ds` before mapping it over every row.
generate_response(final_ds[0])

{'question_id': 'Q33',
 'question': 'how are antibodies used in',
 'document_title': 'antibody',
 'answer': 'An antibody (Ab), also known as an immunoglobulin (Ig), is a large Y-shaped protein produced by B-cells that is used by the immune system to identify and neutralize foreign objects such as bacteria and viruses .',
 'label': 1,
 'generated_text': 'research?\n\nAntibodies are used in research to identify and locate intracellular and extracellular proteins, differentiate cell types by the proteins they express, separate proteins and anything bound to them (co-immunoprecipitation) from other molecules in a cell lysate, identify proteins separated by electrophoresis, examine protein expression in tissue sections or to locate proteins within cells with the assistance of a microscope, detect and quantify proteins with ELISA and ELISpot techniques, and act as a guide for drugs to reach their target.',
 'retrieved_context': ['by dilution cloning to generate cell clones that all produce t

In [28]:
# Apply `generate_response` to every row of `final_ds` (one query-engine call per row).
final_ds_1 = final_ds.map(generate_response)

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [29]:
# Upload the generated dataset to the dataset hub under the repo id below
# (presumably requires Hub credentials configured outside this notebook).
final_ds_1.push_to_hub("explodinggradients/ragas-wikiqa")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]