# Dataset
lets load the dataset and verified testset

In [42]:
from datasets import load_dataset

qasper_dataset = load_dataset("allenai/qasper")
qasper_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 888
    })
    validation: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 281
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 416
    })
})

In [43]:
val_ids = qasper_dataset['validation']['id']
val_ids[:10]

['1912.01214',
 '1810.08699',
 '1609.00425',
 '1801.05147',
 '1811.00383',
 '1909.09067',
 '1704.06194',
 '1909.00512',
 '2003.03106',
 '1708.01464']

In [44]:
ls ./data/

1609.00425.pdf  1708.01464.pdf  1810.08699.pdf  1909.00512.pdf  1912.01214.pdf
1704.06194.pdf  1801.05147.pdf  1811.00383.pdf  1909.09067.pdf  2003.03106.pdf


In [45]:
import pandas as pd

df = pd.read_csv("./valid_first_10_ragas.csv", index_col=0)
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,What are some linguistic signals of dogmatic l...,"['ers, often in a strongly opinionated way (“y...","Present tense, past tense, negative emotion, s...",simple,True
1,How does the previous vowel affect the pronunc...,[' ein English that affects the pro-\nnunciati...,The previous vowel affects the pronunciation o...,simple,True
2,What are the two natural language processing t...,['arXiv:1909.09067v1 [cs.CL] 19 Sep 2019A Co...,The two natural language processing tasks that...,simple,True
3,What is the Cloze objective of MLM in cross-li...,['Figure 2: The overview of BRidge Language Mo...,The Cloze objective of MLM in cross-lingual pr...,simple,True
4,What is the main challenge that leads to the f...,['Cross-lingual Pre-training Based Transfer fo...,The main challenge that leads to the failure o...,simple,True
...,...,...,...,...,...
59,How does the increased context-specificity in ...,['Figure 3: The intra-sentence similarity is t...,"In ELMo, words in the same sentence are more s...",conditional,True
60,What is the dataset used for Armenian named en...,['arXiv:1810.08699v1 [cs.CL] 19 Oct 2018pioN...,The dataset used for Armenian named entity rec...,conditional,True
61,What role do grapheme-phoneme mappings play in...,['with the output of the wFST-based Phonetisau...,Grapheme-phoneme mappings play a crucial role ...,conditional,True
62,Why is pre-ordering the assisting language cru...,[' 1999) which enables the model to learn\nthe...,Pre-ordering the assisting language is crucial...,conditional,True


# Baseline LlamaIndex

In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

In [105]:
index.storage_context.persist()

In [3]:
i = 0
query_engine = index.as_query_engine()
response = query_engine.query(df.question[i])
print(f"""\
Query: {df.question[i]}
Answer: {response}
Ground Truth: {df.ground_truth[i]}
""")

Query: What are some linguistic signals of dogmatic language?
Answer: Pronouns, verb tense, sentiment (positive or negative), emotion (positive or negative), swearing, interrogative language, and negation are some linguistic signals of dogmatic language.
Ground Truth: Present tense, past tense, negative emotion, swearing, interrogative language, and negation are some linguistic signals of dogmatic language.



# Baseline GPT3.5

In [4]:
from llama_index.llms.openai import OpenAI

llm = OpenAI()

prompt = """\
Answer the following question.
Question: {question}
Answer:
"""

async def query_llm(query):
    resp = await llm.acomplete(prompt.format(question=query))
    return resp.text

In [5]:
i = 0
response = await query_llm(df.question[i])
print(f"""\
Query: {df.question[i]}
Answer: {response}
Ground Truth: {df.ground_truth[i]}
""")

Query: What are some linguistic signals of dogmatic language?
Answer: Some linguistic signals of dogmatic language include the use of absolute terms such as "always," "never," "must," and "should," as well as a lack of willingness to consider alternative viewpoints or evidence. Dogmatic language often involves making definitive statements without room for discussion or debate, and may be characterized by a tone of certainty or superiority. Additionally, dogmatic language may involve personal attacks or dismissive language towards those who hold differing opinions.
Ground Truth: Present tense, past tense, negative emotion, swearing, interrogative language, and negation are some linguistic signals of dogmatic language.



# Evaluate a few Examples

lets evaluate a few examples and this is a good time to introduce our first metric.

## Answer Correctness
but what is it? Que slides ->

In [96]:
from ragas.metrics import answer_correctness, faithfulness
from ragas import evaluate

from datasets import Dataset

In [97]:
i = 11

q = df.question[i]
gt = df.ground_truth[i]

rag_response = query_engine.query(q)
llm_response = await query_llm(q)

In [98]:
print("query: ", q)
print("ground_truth:", gt)

query:  How does the pre-ordering of assisting language sentences help bridge the word order gap in low resource language translation?
ground_truth: The pre-ordering of assisting language sentences helps bridge the word order gap in low resource language translation by matching the word order of the source language.


In [99]:
# evaluate single
row = {
    "question": [q],
    "ground_truth": [gt],
    "rag_answer": [rag_response.response],
    "llm_answer": [llm_response],
}
ds = Dataset.from_dict(row)

In [100]:
# attaching a tracer
# langsmith
from langchain.callbacks.tracers import LangChainTracer

tracer = LangChainTracer(project_name="notes")

In [101]:
print("rag_response:", rag_response.response)

rag_response: Pre-ordering the assisting language sentences to match the word order of the source language ensures that the context of words in the parallel source and assisting language sentences are similar. This leads to consistent contextual representations across the source languages, helping to bridge the word order gap in low resource language translation.


In [102]:
r = evaluate(
    ds, 
    metrics=[answer_correctness], column_map={"answer": "rag_answer"},
    callbacks=[tracer],
)
r

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

{'answer_correctness': 0.8405}

In [103]:
print("llm_response:", llm_response)

llm_response: Pre-ordering of assisting language sentences helps bridge the word order gap in low resource language translation by rearranging the words in the assisting language to match the word order of the target low-resource language before translation. This makes the translation process easier and more accurate as it reduces the complexity of the translation model. It also helps to preserve the meaning of the original sentence during translation.


In [104]:
r = evaluate(ds, metrics=[answer_correctness], column_map={"answer": "llm_answer"})
r

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

{'answer_correctness': 0.8871}

# Evaluation

In [14]:
from tqdm import tqdm

In [66]:
# build dataset - RAG
rag_contexts = []
rag_answer = []
for q in tqdm(df.question):
    resp = await query_engine.aquery(q)
    rag_answer.append(resp.response)
    rag_contexts.append(
        [n.node.text for n in resp.source_nodes]
    )

100%|██████████████████████████████████████████████████████████| 64/64 [01:57<00:00,  1.83s/it]


In [67]:
# build dataset - LLM gpt-3.5
llm_answers = []
for q in tqdm(df.question):
    resp = await query_llm(q)
    llm_answers.append(resp)

100%|██████████████████████████████████████████████████████████| 64/64 [04:50<00:00,  4.54s/it]


In [68]:
# convert to dataset
dataset_dict = {
    "question": df.question,
    "rag_answer": rag_answer,
    "rag_contexts": rag_contexts,
    "llm_answer": llm_answers,
    "llm_contexts": [['']] * len(llm_answers),
    "ground_truth": df.ground_truth
}

ds = Dataset.from_dict(dataset_dict)

In [69]:
# evaluate RAG
rag_eval_result = evaluate(
    ds, 
    metrics=[answer_correctness], column_map={"answer": "rag_answer", "contexts": "rag_contexts"},
    callbacks=[tracer],
    raise_exceptions=False,
)
rag_eval_result

Evaluating:   0%|          | 0/64 [00:00<?, ?it/s]

{'answer_correctness': 0.7069}

In [70]:
# evaluate LLM
llm_eval_result = evaluate(
    ds, 
    metrics=[answer_correctness], 
    column_map={"answer": "llm_answer", "contexts": "llm_contexts"}
)
llm_eval_result

Evaluating:   0%|          | 0/64 [00:00<?, ?it/s]

{'answer_correctness': 0.5967}

Hmmm, seems like GPT already knows a lot about this dataset - lets try GPT-4 then?

In [30]:
llm = OpenAI(model="gpt-4")

async def query_llm(query):
    resp = await llm.acomplete(prompt.format(question=query))
    return resp.text

In [34]:
# build dataset - LLM get-4
gpt4_answers = []
for q in tqdm(df.question):
    resp = await query_llm(q)
    gpt4_answers.append(resp)

100%|██████████████████████████████████████████████████████████| 64/64 [05:08<00:00,  4.81s/it]


In [36]:
# convert to dataset
dataset_dict = {
    "question": df.question,
    "rag_answer": rag_answer,
    "rag_contexts": rag_contexts,
    "llm_answer": llm_answers,
    "llm_contexts": [['']] * len(llm_answers),
    "gpt4_answer": gpt4_answers,
    "gpt4_contexts": [['']] * len(gpt4_answers),
    "ground_truth": df.ground_truth,
}

ds = Dataset.from_dict(dataset_dict)

In [39]:
# evaluate LLM
gpt4_eval_result = evaluate(
    ds, 
    metrics=[answer_correctness], 
    column_map={"answer": "gpt4_answer", "contexts": "gpt4_contexts"}
)
gpt4_eval_result

Evaluating:   0%|          | 0/64 [00:00<?, ?it/s]

{'answer_correctness': 0.6203}

# Analyse Results

In [72]:
rag_eval_df = rag_eval_result.to_pandas()
llm_eval_df = llm_eval_result.to_pandas()
dataset_df = ds.to_pandas()

dataset_df["answer_correctness_llm"] = llm_eval_df.answer_correctness
dataset_df["answer_correctness_rag"] = rag_eval_df.answer_correctness
dataset_df.head()

Unnamed: 0,question,rag_answer,rag_contexts,llm_answer,llm_contexts,ground_truth,answer_correctness_llm,answer_correctness_rag
0,What are some linguistic signals of dogmatic l...,"Pronouns, verb tense, sentiment (positive or n...","[From a behavioral standpoint,\ndogmatic peopl...",1. Absolutist Language: Dogmatic language ofte...,[],"Present tense, past tense, negative emotion, s...",0.456591,0.822189
1,How does the previous vowel affect the pronunc...,The previous vowel in English can be affected ...,"[But despite\nbeing glottographic, in few writ...",The pronunciation of a word in English can be ...,[],The previous vowel affects the pronunciation o...,0.412959,0.607144
2,What are the two natural language processing t...,Text simplification and readability assessment...,"[As\npart of a rule-based approach, the operat...",Text simplification and Machine translation,[],The two natural language processing tasks that...,0.218641,0.740867
3,What is the Cloze objective of MLM in cross-li...,The Cloze objective of MLM in cross-lingual pr...,"[In the pretraining phase, we ﬁrst pretrain ML...",The Cloze objective of MLM (Masked Language Mo...,[],The Cloze objective of MLM in cross-lingual pr...,0.616323,0.749505
4,What is the main challenge that leads to the f...,The main challenge that leads to the failure o...,[tialize a low-resource source →target model (...,The main challenge that leads to the failure o...,[],The main challenge that leads to the failure o...,0.612409,0.614842


In [73]:
dataset_df.to_csv("baseline_eval.csv")

You can check the [spreadsheet](https://docs.google.com/spreadsheets/d/1FJ55p6QrP7WqBsOGF4lwhYZAeHZbZmWqY_7MgfbvjL4/edit?usp=sharing) for full analysis.