# Improvements we want to try

Things we want to try
- improve retriever with reranker
- improve generation with gpt4

In [9]:
import os
from llama_index.core import StorageContext, load_index_from_storage

# load old baseline
def load_index(dir):
    if not os.path.exists(dir):
        raise FileNotFoundError("Saved index not found!")
        
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=dir)
    index = load_index_from_storage(storage_context)
    return index

In [10]:
baseline_index = load_index("./rags/baseline/")

In [19]:
# load testset
import pandas as pd

df = pd.read_csv("./valid_first_10_ragas.csv", index_col=0)
df = df.dropna().reset_index(drop=True)

# select a subset
df = df.iloc[:4]
df

Unnamed: 0,question,contexts,ground_truth,evolution_type,episode_done
0,What are some linguistic signals of dogmatic l...,"['ers, often in a strongly opinionated way (“y...","Present tense, past tense, negative emotion, s...",simple,True
1,How does the previous vowel affect the pronunc...,[' ein English that affects the pro-\nnunciati...,The previous vowel affects the pronunciation o...,simple,True
2,What are the two natural language processing t...,['arXiv:1909.09067v1 [cs.CL] 19 Sep 2019A Co...,The two natural language processing tasks that...,simple,True
3,What is the Cloze objective of MLM in cross-li...,['Figure 2: The overview of BRidge Language Mo...,The Cloze objective of MLM in cross-lingual pr...,simple,True


# Improve Retreiver

lets use Cohere reranker module

In [13]:
import os
from llama_index.postprocessor.cohere_rerank import CohereRerank


api_key = os.environ["COHERE_API_KEY"]
cohere_rerank = CohereRerank(api_key=api_key, top_n=2)

In [54]:
i = 2
print(f"{df.question[i]}")

What are the two natural language processing tasks that deal with the concept of simplified language?


In [55]:
# improved
query_engine_cohere = baseline_index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[cohere_rerank],
)
improved_ret = query_engine_cohere.retrieve

resp = query_engine_cohere.query(df.question[i])
print(resp)
print(resp.source_nodes[0])

Automatic readability assessment and automatic text simplification.
Node ID: ee6895fb-9704-4772-873a-3a3be7515948
Text: arXiv:1909.09067v1  [cs.CL]  19 Sep 2019A Corpus for Automatic
Readability Assessment and Text Simp liﬁcation of German Alessia
Battisti Institute of Computational Linguistics University of Zurich
Andreasstrasse 15, 8050 Zurich alessia.battisti@uzh.chSarah Ebling
Institute of Computational Linguistics University of Zurich
Andreasstrasse 15, 8050...
Score:  0.999



In [56]:
# baseline
query_engine_baseline = baseline_index.as_query_engine()
baseline_ret = baseline_index.as_retriever()

resp = query_engine_baseline.query(df.question[i])
print(resp)
nodes = baseline_ret.retrieve(df.question[0])
print(nodes[0])

Automatic text simplification and readability assessment.
Node ID: 959781a9-7849-4b2a-b213-89f25189a72a
Text: From a behavioral standpoint, dogmatic people solve problems
differently, spend- ing less time framing a problem and expressing
more certainty in their solution (Lohman, 2010). Here we similarly
examine how user behaviors on Reddit re- late to a language model of
dogmatism. Ertel sought to capture dogmatism linguistically, though a
small lexicon...
Score:  0.871



## Check a few Examples

In [22]:
from ragas.metrics import context_precision, context_recall
from ragas import evaluate

from datasets import Dataset

In [34]:
i = 2

q = df.question[i]
gt = df.ground_truth[i]

baseline_response = query_engine_baseline.query(q)
improved_response = query_engine_cohere.query(q)

In [35]:
# evaluate single
row = {
    "question": [q],
    "ground_truth": [gt],
    "baseline_contexts": [[n.node.text for n in baseline_response.source_nodes]],
    "improved_contexts": [[n.node.text for n in improved_response.source_nodes]],
}
ds = Dataset.from_dict(row)

In [36]:
# attaching a tracer
# langsmith
from langchain.callbacks.tracers import LangChainTracer

tracer = LangChainTracer(project_name="notes")

In [37]:
# baseline
r = evaluate(
    ds, 
    metrics=[context_precision, context_recall], column_map={"contexts": "baseline_contexts"},
    callbacks=[tracer],
)
r

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

{'context_precision': 1.0000, 'context_recall': 1.0000}

In [38]:
# baseline
r = evaluate(
    ds, 
    metrics=[context_precision, context_recall], column_map={"contexts": "improved_contexts"},
    callbacks=[tracer],
)
r

Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

{'context_precision': 1.0000, 'context_recall': 1.0000}

## Evaluate

In [59]:
# build dataset - baseline

from tqdm import tqdm 

baseline_contexts = []
for q in tqdm(df.question):
    resp = await query_engine_baseline.aretrieve(q)
    baseline_contexts.append(
        [n.node.text for n in resp]
    )

100%|████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  4.97it/s]


In [61]:
# build dataset - improved

from tqdm import tqdm 

improved_contexts = []

for q in tqdm(df.question):
    resp = await query_engine_cohere.aquery(q)
    improved_contexts.append(
        [n.node.text for n in resp.source_nodes]
    )

100%|████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.60s/it]


In [62]:
# convert to dataset
dataset_dict = {
    "question": df.question,
    "baseline_contexts": baseline_contexts,
    "improved_contexts": improved_contexts,
    "ground_truth": df.ground_truth
}

ds = Dataset.from_dict(dataset_dict)

In [63]:
# evaluate baseline
rag_eval_result = evaluate(
    ds, 
    metrics=[context_precision, context_recall], 
    column_map={"contexts": "baseline_contexts"},
    callbacks=[tracer],
    raise_exceptions=False,
)
rag_eval_result

Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

{'context_precision': 0.8750, 'context_recall': 1.0000}

In [65]:
# evaluate improved
rag_eval_result = evaluate(
    ds, 
    metrics=[context_precision, context_recall], 
    column_map={"contexts": "improved_contexts"},
    callbacks=[tracer],
    raise_exceptions=False,
)
rag_eval_result

Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

{'context_precision': 1.0000, 'context_recall': 1.0000}

# Improve LLM

lets try GPT-4 RAG with `faithfulness` and `answer_correctness`

In [67]:
from llama_index.llms.openai import OpenAI

gpt4 = OpenAI(model="gpt-4")

# Local settings
qe_gpt4 = baseline_index.as_query_engine(llm=gpt4)

In [68]:
# build dataset - gpt4-rag

improved_answer = []
improved_contexts = []

for q in tqdm(df.question.iloc[:4]):
    resp = await qe_gpt4.aquery(q)
    improved_answer.append(resp.response)
    improved_contexts.append(
        [n.node.text for n in resp.source_nodes]
    )

100%|████████████████████████████████████████████████████████████| 4/4 [00:17<00:00,  4.44s/it]


In [69]:
# build dataset - baseline-rag

rag_contexts = []
rag_answer = []
for q in tqdm(df.question.iloc[:4]):
    resp = await query_engine_baseline.aquery(q)
    rag_answer.append(resp.response)
    rag_contexts.append(
        [n.node.text for n in resp.source_nodes]
    )

100%|████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.23s/it]


In [70]:
# convert to dataset
dataset_dict = {
    "question": df.question,
    "baseline_answer": rag_answer,
    "baseline_contexts": rag_contexts,
    "gpt4_answer": improved_answer,
    "gpt4_contexts": improved_contexts,
    "ground_truth": df.ground_truth
}

ds = Dataset.from_dict(dataset_dict)

In [73]:
from ragas.metrics import answer_similarity

In [74]:
# evaluate baseline
rag_eval_result = evaluate(
    ds, 
    metrics=[faithfulness, answer_correctness, answer_similarity], 
    column_map={"contexts": "baseline_contexts", "answer": "baseline_answer"},
    callbacks=[tracer],
    raise_exceptions=False,
)
rag_eval_result

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

{'faithfulness': 0.8393, 'answer_correctness': 0.7185, 'answer_similarity': 0.9632}

In [75]:
# evaluate gpt4
rag_eval_result = evaluate(
    ds, 
    metrics=[faithfulness, answer_correctness, answer_similarity], 
    column_map={"contexts": "gpt4_contexts", "answer": "gpt4_answer"},
    callbacks=[tracer],
    raise_exceptions=False,
)
rag_eval_result

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

{'faithfulness': 0.8750, 'answer_correctness': 0.6627, 'answer_similarity': 0.9465}

In [76]:
answer_correctness.weights = [1, 0]

In [77]:
# evaluate gpt4
rag_eval_result = evaluate(
    ds, 
    metrics=[faithfulness, answer_correctness, answer_similarity], 
    column_map={"contexts": "gpt4_contexts", "answer": "gpt4_answer"},
    callbacks=[tracer],
    raise_exceptions=False,
)
rag_eval_result

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

{'faithfulness': 0.8750, 'answer_correctness': 0.5431, 'answer_similarity': 0.9465}

In [79]:
tmp_df = rag_eval_result.to_pandas()
tmp_df

Unnamed: 0,question,baseline_answer,baseline_contexts,answer,contexts,ground_truth,faithfulness,answer_correctness,answer_similarity
0,What are some linguistic signals of dogmatic l...,"Pronouns, verb tense, sentiment (positive or n...","[From a behavioral standpoint,\ndogmatic peopl...",Linguistic signals of dogmatic language includ...,"[From a behavioral standpoint,\ndogmatic peopl...","Present tense, past tense, negative emotion, s...",1.0,0.705882,0.949174
1,How does the previous vowel affect the pronunc...,The pronunciation of words in English can be a...,"[But despite\nbeing glottographic, in few writ...","In English, the pronunciation of a vowel can b...","[But despite\nbeing glottographic, in few writ...",The previous vowel affects the pronunciation o...,1.0,0.4,0.890755
2,What are the two natural language processing t...,Text simplification and readability assessment...,"[As\npart of a rule-based approach, the operat...",The two natural language processing tasks that...,"[As\npart of a rule-based approach, the operat...",The two natural language processing tasks that...,0.5,0.666667,0.984958
3,What is the Cloze objective of MLM in cross-li...,The Cloze objective of MLM in cross-lingual pr...,"[In the pretraining phase, we ﬁrst pretrain ML...",The Cloze objective of Masked Language Modelin...,"[In the pretraining phase, we ﬁrst pretrain ML...",The Cloze objective of MLM in cross-lingual pr...,1.0,0.4,0.961062
