#  🔢 Imports

In [46]:
import pandas as pd
import numpy as np
import dspy
from dotenv import load_dotenv
import os
import requests

load_dotenv()

True

## ⚙️ Setting up dspy

In [2]:
turbo = dspy.OpenAI(model="gpt-3.5-turbo")
# rm module is currently not available.

dspy.settings.configure(lm=turbo)

## 📀 Data to train it on

In [7]:
data_qa = pd.read_csv("question_answers_eval.csv")
data_qa = data_qa.sample(frac=1).reset_index(drop=True)

In [8]:
data_qa

Unnamed: 0,Study Title,Question,Answer,Link,Source
0,Cascade signal amplification strategy for the ...,What is the significance of sensitive and sele...,Sensitive and selective analysis of low conten...,https://pubmed.ncbi.nlm.nih.gov/38438228,cancer_pubmed
1,Holomics - a user-friendly R shiny application...,Can you explain the workflow provided by Holom...,The workflow provided by Holomics starts with ...,https://pubmed.ncbi.nlm.nih.gov/38438871,bioinformatics_pubmed
2,"The genetic basis of hydrocephalus: genes, pat...",What is the status of pharmacologic treatments...,No effective pharmacologic treatments for hydr...,https://pubmed.ncbi.nlm.nih.gov/38439105,brain hemorrhage_pubmed
3,Sarcopenia is associated with hypomethylation ...,What is the role of TWEAK and FN14 in the path...,Tumor necrosis factor (TNF)-like weak inducer ...,https://pubmed.ncbi.nlm.nih.gov/38437928,diabetes_pubmed
4,Metformin,Does the research mention anything about the o...,"No, the research does not mention anything abo...",CHEMBL6329,chembl
...,...,...,...,...,...
471,Amlodipine,Is Lisinopril an oral medication or not?,"No, Lisinoprol is not an oral medication.",CHEMBL265667,chembl
472,Simvastatin,"Sure, here are five questions and answers base...",The molecular formula for Metformin is not pro...,CHEMBL6329,chembl
473,Simvastatin,Is Simvastatin approved for therapeutic use?,"According to the information provided, Simvast...",CHEMBL267864,chembl
474,Aspirin,Is Aspirin an oral medication?,"According to the information provided, Aspirin...",CHEMBL265667,chembl


In [18]:
sample_100_data = []
for idx, row in enumerate(data_qa.iterrows()):
    dspy_example = dspy.Example(
        {
            "question": row[1]["Question"],
            "answer": row[1]["Answer"],
            "context": row[1]["Study Title"],
        }
    ).with_inputs("question", "context")
    sample_100_data.append(dspy_example)
    if idx == 100:
        break

## 🚙 Utils: Brave search API for now 

In [None]:
import collections


def call_brave_search_api(search_text: str) -> collections.defaultdict[list]:
    endpoint = "{url_address}?count={count}&q={search_text}&search_lang=en&extra_snippets=True".format(
        url_address="https://api.search.brave.com/res/v1/web/search",
        count=5,
        search_text=search_text,
    )

    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": str(os.environ["BRAVE_SUBSCRIPTION_KEY"]),
    }
    results = collections.defaultdict(list)

    try:
        response = requests.get(endpoint, headers=headers)
        response.raise_for_status()
        web_response = response.json().get("web").get("results")
        i = 0
        if web_response:
            for resp in web_response:
                detailed_text = resp.get("description") + "".join(
                    resp.get("extra_snippets") if resp.get("extra_snippets") else ""
                )
                results[i] = {
                    "text": detailed_text,
                    "url": resp["url"],
                    "page_age": resp.get("page_age"),
                }
                i = i + 1

    except Exception as ex:
        raise ex
    return results

In [60]:
op = call_brave_search_api(dev_example.context)

In [75]:
context.replace("<strong>", "**").replace("</strong>", "**")

"**DNA** is increasingly being used as the engineering material of choice for the construction of nanoscale circuits, structures, and motors. Many of these enzyme-free constructions function by **DNA** **strand** **displacement** reactions. The kinetics **of** **strand** **displacement** can be modulated by **toeholds**, short ...DNA is increasingly being used as the engineering material of choice for the construction of nanoscale circuits, structures, and motors. Many of these enzyme-free constructions function by DNA strand displacement reactions. The kinetics of strand displacement can be modulated by toeholds, short single-stranded segments of DNA that colocalize reactant DNA molecules.Hanwen Lu, Binrong Ding, Liujuan Tong, Fan Wu, Xinyao Yi, Jianxiu Wang. Toehold-Mediated Strand Displacement Reaction for Dual-Signal Electrochemical Assay of Apolipoprotein E Genotyping. ACS Sensors 2020, 5 (9) , 2959-2965. https://doi.org/10.1021/acssensors.0c01511Gurbrinder Ghotra, Bach Kim Nguyen

In [76]:
context

"<strong>DNA</strong> is increasingly being used as the engineering material of choice for the construction of nanoscale circuits, structures, and motors. Many of these enzyme-free constructions function by <strong>DNA</strong> <strong>strand</strong> <strong>displacement</strong> reactions. The kinetics <strong>of</strong> <strong>strand</strong> <strong>displacement</strong> can be modulated by <strong>toeholds</strong>, short ...DNA is increasingly being used as the engineering material of choice for the construction of nanoscale circuits, structures, and motors. Many of these enzyme-free constructions function by DNA strand displacement reactions. The kinetics of strand displacement can be modulated by toeholds, short single-stranded segments of DNA that colocalize reactant DNA molecules.Hanwen Lu, Binrong Ding, Liujuan Tong, Fan Wu, Xinyao Yi, Jianxiu Wang. Toehold-Mediated Strand Displacement Reaction for Dual-Signal Electrochemical Assay of Apolipoprotein E Genotyping. ACS Senso

In [78]:
def create_context(data):
    data = call_brave_search_api(data)
    context = " \n \n ".join([con["text"][:1000] for con in data.values()])
    context = context.replace("<strong>", "**").replace("</strong>", "**")
    return context

In [79]:
# setting up and testing the basic signature
class QA(dspy.Signature):
    """Answer biomedical questions with help of context title given."""

    question = dspy.InputField()
    context = dspy.InputField()
    answer = dspy.OutputField(desc="Specific information about the question asked.")


# testing out QA
generate_answer = dspy.Predict(QA)
dev_example = sample_100_data[0]
# Call the predictor on a particular input.
pred = generate_answer(
    question=dev_example.question, context=create_context((dev_example.context))
)

# Print the input and the prediction.
print(f"Question: {dev_example.question}")
print(f"Predicted Answer: {pred.answer}")

Question: What is the significance of sensitive and selective analysis of low content nucleic acid sequences?
Predicted Answer: Sensitive and selective analysis of low content nucleic acid sequences is significant because it allows for the detection of specific DNA sequences, which is crucial for identifying animal species, detecting meat adulteration, and diagnosing diseases like Alzheimer's. Techniques like toehold-mediated strand displacement (TMSD) and rolling circle amplification (RCA) enable highly sensitive detection of target DNA sequences, even at low concentrations, making them valuable tools in biomedical research and diagnostics.


In [80]:
# actual answer
dev_example.answer

'Sensitive and selective analysis of low content nucleic acid sequences plays an important role in pathogen analysis, disease diagnosis and biomedicine.'

In [81]:
# what prompt did it gave to model?
turbo.inspect_history(n=1)





Answer biomedical questions with help of context title given.

---

Follow the following format.

Question: ${question}
Context: ${context}
Answer: Specific information about the question asked.

---

Question: What is the significance of sensitive and selective analysis of low content nucleic acid sequences?
Context:
**DNA** is increasingly being used as the engineering material of choice for the construction of nanoscale circuits, structures, and motors. Many of these enzyme-free constructions function by **DNA** **strand** **displacement** reactions. The kinetics **of** **strand** **displacement** can be modulated by **toeholds**, short ...DNA is increasingly being used as the engineering material of choice for the construction of nanoscale circuits, structures, and motors. Many of these enzyme-free constructions function by DNA strand displacement reactions. The kinetics of strand displacement can be modulated by toeholds, short single-stranded segments of DNA that colocalize r

In [82]:
class QA_module_dspy(dspy.Module):
    """
    Answer biomedical questions with help of context title given.
    Context: Title of Research paper.
    """

    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought(QA)

    def _get_context(self, context):
        return create_context(context)

    def forward(self, question, context):
        prediction = self.generate_answer(
            question=question, context=self._get_context(context)
        )
        return dspy.Prediction(answer=prediction.answer)

In [87]:
class Assess(dspy.Signature):
    """Assess the answer given by LLM with acutal answer."""

    assessment_question = dspy.InputField()
    assessed_answer = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="Yes or No")


gpt4T = dspy.OpenAI(model="gpt-4-1106-preview", max_tokens=1000, model_type="chat")


def metric(gold, pred, trace=None):
    question, answer, predicted_output = gold.question, gold.answer, pred.answer

    is_it_good_enough = (
        "Is the answer given by LLM is good enough to answer the given question"
    )
    correct = f"The text should answer `{question}` with `{answer}`. Does the assessed text is relavant to the answer given?"

    with dspy.context(lm=gpt4T):
        correct = dspy.Predict(Assess)(
            assessed_answer=question, assessment_question=correct
        )
        is_it_good_enough = dspy.Predict(Assess)(
            assessed_answer=predicted_output, assessment_question=is_it_good_enough
        )

    correct, is_it_good_enough = [
        m.assessment_answer.lower() == "yes" for m in [correct, is_it_good_enough]
    ]
    print(correct, is_it_good_enough)
    score = (
        (correct + is_it_good_enough)
        if correct and (len(predicted_output) <= 280)
        else 0
    )
    if trace is not None:
        return score >= 2
    return score / 2.0

In [89]:
from dspy.teleprompt import BootstrapFewShot

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=metric)

# Compile!
compiled_rag = teleprompter.compile(QA_module_dspy(), trainset=sample_100_data[:5])

 20%|██        | 1/5 [00:04<00:17,  4.37s/it]

False True
0.0


 40%|████      | 2/5 [00:08<00:13,  4.51s/it]

False True
0.0


 60%|██████    | 3/5 [00:12<00:08,  4.17s/it]

False True
0.0


 80%|████████  | 4/5 [00:19<00:05,  5.09s/it]

False True
0.0


100%|██████████| 5/5 [00:24<00:00,  4.91s/it]

False False
0.0
Bootstrapped 0 full traces after 5 examples in round 0.





In [90]:
turbo.inspect_history(n=1)





Answer biomedical questions with help of context title given.

---

Follow the following format.

Question: ${question}

Context: ${context}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: Specific information about the question asked.

---

Question: What is the role of TWEAK and FN14 in the pathogenesis of sarcopenia?
Context: Sarcopenia is associated with hypomethylation of TWEAK and increased plasma levels of TWEAK and its downstream inflammatory factor TNF-α in older adults: A case-control study.
Answer: Tumor necrosis factor (TNF)-like weak inducer of apoptosis (TWEAK) and its receptor fibroblast growth factor inducible 14 (FN14) are known to play important roles in the pathogenesis of sarcopenia.

---

Question: What is the significance of sensitive and selective analysis of low content nucleic acid sequences?
Context: Cascade signal amplification strategy for the electrochemical aptasensing of nucleic acid: Combination of dual-output 

In [91]:
compiled_rag.save("compiled_rag_QA_random.json")