### 🔽 Requirements

In [None]:
!pip install groq openai bs4 tqdm pandas 

# 🔗 Imports

In [None]:
import ast
import json
import os
import re
import time
import xml.etree.ElementTree as ET
from html.parser import HTMLParser

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from groq import Groq
from openai import OpenAI
from tqdm.auto import tqdm
load_dotenv()


## 🤲🏻 Utils

In [None]:

def get_groq_completion(prompt):
    """Generate a chat completion for ``prompt`` through the Groq API.

    Args:
        prompt (str): The prompt, sent as a single user message.

    Returns:
        The ``message.content`` of the first choice in the reply.

    Raises:
        KeyError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    # os.environ is a mapping, not a callable, so it has to be indexed.
    try:
        groq_api_key = os.environ["GROQ_API_KEY"]
    except KeyError:
        raise KeyError("GROQ_API_KEY environment variable is not set") from None

    client = Groq(api_key=groq_api_key)

    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="mixtral-8x7b-32768",
    )

    return chat_completion.choices[0].message.content

# 🦠 bioRxiv data generation

In [None]:
def fetch_biorxiv_data(server, interval, cursor=0, format="json", timeout=10):
    """
    Fetch one page of paper details from the bioRxiv API.

    Args:
        server (str): 'biorxiv' or 'medrxiv'.
        interval (str): Date interval in 'YYYY-MM-DD/YYYY-MM-DD' or number of recent days/articles 'Nd' or 'N'.
        cursor (int): Cursor for pagination.
        format (str): Format segment of the request URL, 'json' or 'xml'. The
            reply is always decoded with ``response.json()``, so only 'json'
            can be decoded successfully.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: JSON response from the API.

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    url = f"https://api.biorxiv.org/details/{server}/{interval}/{cursor}/{format}"
    # Without a timeout the call can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching data: {response.status_code}")
    return response.json()


def extract_data(json_data):
    """
    Keep only the fields of interest from a bioRxiv API response.

    Args:
        json_data (dict): API response holding the papers under "collection".

    Returns:
        list: One dictionary per paper with the keys doi, title, authors,
        date and abstract, in the order the papers appear in the response.
    """
    wanted_fields = ("doi", "title", "authors", "date", "abstract")
    return [
        {field: entry[field] for field in wanted_fields}
        for entry in json_data["collection"]
    ]


def get_biorxiv_data(server, start_date, end_date, max_results=100):
    """
    Get data from bioRxiv within a date range.

    Pages are requested through ``fetch_biorxiv_data`` until ``max_results``
    papers have been collected or a short page signals the end of the data.

    Args:
        server (str): 'biorxiv' or 'medrxiv'.
        start_date (str): Start date in 'YYYY-MM-DD'.
        end_date (str): End date in 'YYYY-MM-DD'.
        max_results (int): Maximum number of results to return.

    Returns:
        DataFrame: Pandas DataFrame with at most ``max_results`` rows.
    """
    page_size = 100  # the loop treats a page with fewer entries as the last one
    interval = f"{start_date}/{end_date}"
    cursor = 0
    all_papers = []

    while len(all_papers) < max_results:
        data = fetch_biorxiv_data(server, interval, cursor)
        papers = extract_data(data)
        all_papers.extend(papers)
        cursor += page_size
        if len(papers) < page_size:
            break

    # Whole pages are appended, so the list can overshoot the requested limit
    # (e.g. max_results=150 collects 200); trim it to honour the limit.
    return pd.DataFrame(all_papers[:max_results])


# Example usage: fetch bioRxiv paper details for January 2023.
server = "biorxiv"
start_date = "2023-01-01"
end_date = "2023-01-31"
max_results = 200

# Runs the paginated API requests and collects the papers in a DataFrame.
df = get_biorxiv_data(server, start_date, end_date, max_results)
# %%
# Persist the fetched papers; the QA-generation step below reads this file back.
df.to_csv("biorxiv_data_abstract.csv", index=False)


####################### biorxiv_data QA generation #######################

# Reload the saved papers from disk, replacing the in-memory frame.
df = pd.read_csv("./biorxiv_data_abstract.csv")
# OpenAI client; the key is read from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def generate_questions(df, client):
    """Ask the chat model for five question/answer pairs per bioRxiv paper.

    Args:
        df (DataFrame): Rows providing the 'title' and 'abstract' columns.
        client: Object exposing ``chat.completions.create`` (an OpenAI client
            at the call site in this file).

    Returns:
        list: One entry per row, in iteration order: the JSON-decoded reply,
        or None when anything raised while handling that row.
    """
    system_message = {
        "role": "system",
        "content": "you are expert researcher in healthcare domain.",
    }
    generated = []
    for row_label, paper in tqdm(df.iterrows()):
        try:
            prompt = f"""
            While researching for the latest research papers from biorxiv we came across the following research paper: {paper['title']}, 
            Abstract of the research paper is {paper['abstract']}
            Can you generate 5 questions and answers for the following research paper that researcher might ask?

            ---

            - Answers should be list of dictionaries (JSON mode). 
            - Answers should be as mentioned in research paper don't add additional knowledge. 
            - Questions should have high intellectual and information value don't ask childish questoins like what's the name of research paper and stuff. 
            - Answers should be only from available information in abstract of the paper provided.

            ---
            Answer:
            """
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_message, {"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
            )
            reply_text = completion.choices[0].message.content
            decoded_reply = json.loads(reply_text)
        except Exception as err:
            # Wait before moving on to the next row, then record the failure.
            time.sleep(30)
            print(f"Error in generating questions for index {row_label}: {err}")
            generated.append(None)
        else:
            generated.append(decoded_reply)
    return generated


answer = generate_questions(df.head(50), client)

# %%

# Each reply is searched in its Python repr (``str(ans)``), which is why the
# keys appear single-quoted in the patterns.  The patterns are raw strings so
# that ``\s`` reaches the regex engine instead of being an invalid string escape.
question_pattern = r"'question'\s*:\s*'([^']+)'"
double_quoted_question_pattern = r"'question'\s*:\s*\"([^\"]+)"
answer_pattern = r"'answer':\s*'([^']+)'"

df["QAs"] = None
for idx, ans in enumerate(answer):
    reply_text = str(ans)
    questions = re.findall(question_pattern, reply_text)
    if not questions:
        # Fall back to double-quoted question values.
        questions = re.findall(double_quoted_question_pattern, reply_text)
    answers = re.findall(answer_pattern, reply_text)

    # Plain comparison instead of ``assert``: asserts are stripped under -O.
    if len(questions) == len(answers):
        QAs = [
            {"Question": question, "Answer": reply}
            for question, reply in zip(questions, answers)
        ]
        df.at[idx, "QAs"] = str(QAs)
    else:
        print(f"Questions and Answers are not equal for index {idx}")
        print(f"Questions: {questions}")
        print(f"Answers: {answers}")
        df.at[idx, "QAs"] = None

# %%
df.to_csv("bioarxiv_abstracts.csv", index=False)
# %%
########### making DF to store into the gsheet ############
col_names = ["Study Title", "Question", "Answer", "link", "source"]
qa_rows = []
for i, row in df.iterrows():
    qa_text = row["QAs"]
    # Rows without parsed QAs hold None (or NaN after a CSV round trip).
    if not isinstance(qa_text, str):
        continue
    # The stored text derives from model output, so it is parsed with
    # ast.literal_eval, which only accepts Python literals, instead of eval.
    for j in ast.literal_eval(qa_text):
        qa_rows.append(
            {
                "Study Title": row["title"],
                "Question": j["Question"],
                "Answer": j["Answer"],
                "link": row["doi"],
                "source": "bioarxiv",
            }
        )
# Build the frame once instead of concatenating onto it for every QA pair.
store_df = pd.DataFrame(qa_rows, columns=col_names)
# %%
store_df.reset_index(drop=True).to_csv("bioarxiv_append_finale.csv", index=False)

# %%


# 🧪 Chembl data extraction and generation 

In [None]:

def search_chembl(query, retries=5, timeout=10):
    """
    Search ChEMBL for compounds related to a query

    Args:
        query (str): Search query for ChEMBL data
        retries (int, optional): number of attempts before giving up. Defaults to 5.
        timeout (int, optional): request time out in case server doesn't respond. Defaults to 10.

    Returns:
        dict | None: the decoded JSON response, or None when every attempt failed.
    """
    base_url = "https://www.ebi.ac.uk/chembl/api/data/molecule"
    params = {"search": query, "format": "json"}

    for attempt in range(retries):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            print(f"Request failed with status code: {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 reply whose body is not valid JSON.
            print(f"Attempt {attempt + 1} of {retries}: {e}")

        # Back off after every failed attempt, whether it was an HTTP error
        # status or an exception, but do not wait once no attempts are left.
        if attempt < retries - 1:
            time.sleep(int(2.5**attempt))

    return None


def parse_chembl_data(response_json, drug_name):
    """
    Parse the JSON response from ChEMBL

    Only the first five entries under "molecules" are kept.

    Args:
        response_json (dict): JSON response from ChEMBL API
        drug_name (str): name stored under the "drug" key of every record

    Returns:
        List[dict]: returns a list of compounds with relevant data
    """
    compounds = []
    molecules = (response_json or {}).get("molecules") or []

    for item in molecules[:5]:
        # ``item.get(key, {})`` still yields None when the key is present with
        # a null value, so fall back to an empty dict with ``or`` instead.
        properties = item.get("molecule_properties") or {}
        structures = item.get("molecule_structures") or {}

        molecule_data = {
            "drug": drug_name,
            "molecule_chembl_id": item.get("molecule_chembl_id"),
            "Molecule Properties": {
                "alogp": properties.get("alogp"),
                "aromatic_rings": properties.get("aromatic_rings"),
            },
            "Molecule ID and Structure": {
                "canonical_smiles": structures.get("canonical_smiles"),
            },
            "Chemical and Physical Nature": {
                "molecule_type": item.get("molecule_type", ""),
                "chirality": item.get("chirality", ""),
            },
            "Development and Approval Status": {
                "first_approval": item.get("first_approval", ""),
                "black_box_warning": item.get("black_box_warning", ""),
            },
            "Other Relevant Information": {
                "oral": item.get("oral", ""),
                "therapeutic_flag": item.get("therapeutic_flag", ""),
            },
        }
        compounds.append(molecule_data)

    return compounds


# %%


def get_chembl_document(timeout=10):
    """
    Fetch the ChEMBL document listing.

    Args:
        timeout (int, optional): seconds to wait for the server. Defaults to 10.

    Returns:
        The decoded JSON response when the server reports a JSON body,
        otherwise the string "Response was not in JSON format."
    """
    url = "https://www.ebi.ac.uk/chembl/api/data/document/"

    headers = {"Accept": "application/json"}  # Specify that you want JSON

    response = requests.get(url, headers=headers, timeout=timeout)

    # The header may carry parameters (e.g. "application/json; charset=utf-8"),
    # so compare only the media type rather than the whole header value.
    content_type = response.headers.get("Content-Type") or ""
    media_type = content_type.split(";", 1)[0].strip().lower()

    if media_type == "application/json":
        return response.json()  # Return the JSON response
    return "Response was not in JSON format."


# Example usage: look up each drug name on ChEMBL and gather the parsed
# compound records into one DataFrame.
drugs = [
    "Aspirin",
    "Paracetamol (Acetaminophen)",
    "Ibuprofen",
    "Metformin",
    "Atorvastatin",
    "Simvastatin",
    "Lisinopril",
    "Amlodipine",
    "Amoxicillin",
    "Ciprofloxacin",
    "Doxycycline",
    "Azithromycin",
    "Prednisone",
    "Warfarin",
    "Insulin Glargine",
    "Losartan",
    "Omeprazole",
    "Fluoxetine (Prozac)",
    "Sertraline",
    "Alprazolam",
]
drug_data_df = pd.DataFrame()
for query in drugs:
    response_json = search_chembl(query)
    # search_chembl returns None when all retries failed; such drugs are skipped.
    if response_json:
        compound_data = parse_chembl_data(response_json, drug_name=query)
        # NOTE: this rebinds the module-level name `df` on every iteration.
        df = pd.DataFrame(compound_data)
        drug_data_df = pd.concat([drug_data_df, df])
# %%

# Each per-drug frame starts its index at 0; renumber the rows consecutively.
drug_data_df = drug_data_df.reset_index(drop=True)


def generate_questions(df, client):
    """Ask the Groq model for five question/answer pairs per ChEMBL record.

    Args:
        df (DataFrame): Rows providing at least 'drug' and 'molecule_chembl_id';
            the whole row is embedded in the prompt as a dictionary.
        client: Not used by this function; completions go through
            ``get_groq_completion``.

    Returns:
        list: One entry per row, in iteration order: the raw completion text,
        or None when anything raised while handling that row.
    """
    responses = []
    total_rows = len(df)
    rows = tqdm(df.iterrows(), total=total_rows)
    for request_number, (i, row) in enumerate(rows, start=1):
        try:
            prompt = f"""
            While researching for about the {row['drug']} we came across the following information from chembl
            {str(row.to_dict())}
            Can you generate 5 questions and answers for the following research paper that researcher might ask?

            ---

            - Answers should be list of. 
            - Answers should be as mentioned in research paper don't add additional knowledge. 
            - Questions should have high intellectual and information value don't ask childish questoins like what's the name of research paper and stuff. 
            - Answers should be only from available information in abstract of the paper provided.

            ---

            example: 
            Q: What is the molecule_chembl_id for the {row['drug']}?
            A: {row['molecule_chembl_id']}
            ---
            """

            answer = get_groq_completion(prompt)
        except Exception as e:
            time.sleep(30)
            print(f"Error in generating questions for index {i}: {e}")
            responses.append(None)
            continue

        responses.append(answer)
        # Pause after every 20th request. Counting requests (rather than
        # testing the index label) avoids a 60 s wait right after the very
        # first request, and no pause is needed once the last row is done.
        if request_number % 20 == 0 and request_number < total_rows:
            time.sleep(60)
    return responses


# %%
answer = generate_questions(drug_data_df.head(50), "")
# %%

drug_data_df["QAs"] = None
for idx, ans in enumerate(answer):
    if ans is None:
        # Generation failed for this row: keep QAs as None rather than
        # storing the dictionary parsed for a previous row.
        continue

    QA_dict = {}
    # The text before the first "Q:" is preamble; every later chunk is
    # expected to look like "<question> A: <answer>".
    for chunk in ans.split("Q:")[1:]:
        question_data, separator, answer_data = chunk.partition("A:")
        if not separator:
            # No answer marker in this chunk, so there is nothing to pair up.
            continue
        question_data = question_data.replace("\n", "").strip()
        answer_data = answer_data.replace("\n", "").strip()
        QA_dict[question_data] = answer_data
    drug_data_df.at[idx, "QAs"] = str(QA_dict)
# %%
drug_data_df.to_csv("chembl_data.csv", index=False)
# %%
col_names = ["drug", "Question", "Answer", "chembl_id", "source"]
qa_rows = []
for i, row in drug_data_df.iterrows():
    qa_text = row["QAs"]
    # Rows without parsed QAs hold None (or NaN after a CSV round trip).
    if not isinstance(qa_text, str):
        continue
    # The stored text derives from model output, so it is parsed with
    # ast.literal_eval, which only accepts Python literals, instead of eval.
    for k, v in ast.literal_eval(qa_text).items():
        qa_rows.append(
            {
                "drug": row["drug"],
                "Question": k,
                "Answer": v,
                "chembl_id": row["molecule_chembl_id"],
                "source": "chembl",
            }
        )
# Build the frame once instead of concatenating onto it for every QA pair.
store_df = pd.DataFrame(qa_rows, columns=col_names)

# %%
store_df.to_csv("chembl_append_csv_finale.csv", index=False)
# %%


# 🥼 Pubmed data extraction and generation

In [None]:

def search_pubmed(query, retries=5, timeout=10):
    """
    Search PubMed for papers related to a query

    Args:
        query (str): Search query for pubmed data
        retries (int, optional): number of attempts before giving up. Defaults to 5.
        timeout (int, optional): request time out in case server doesn't respond to our given request. Defaults to 10.

    Returns:
        dict | None: the decoded JSON search response (at most 12 ids are
        requested via ``retmax``), or None when every attempt failed.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {"db": "pubmed", "term": query, "retmode": "json", "retmax": 12}

    for attempt in range(retries):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            print(f"Request failed with status code: {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 reply whose body is not valid JSON.
            print(f"Attempt {attempt + 1} of {retries}: {e}")

        # Back off after every failed attempt, whether it was an HTTP error
        # status or an exception, but do not wait once no attempts are left.
        if attempt < retries - 1:
            time.sleep(int(2.5**attempt))

    return None


def get_abstracts_from_pubmed(paper_ids, timeout=10):
    """
    Get abstracts for a list of paper IDs from PubMed

    One request is made per paper ID.

    Args:
        paper_ids (List[str]): List of paper IDs
        timeout (int, optional): seconds to wait for each request. Defaults to 10.

    Returns:
        List[str]: one entry per paper ID, in input order: the raw XML reply,
        or the placeholder "Error fetching abstract" when the request failed.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    abstracts = []
    for paper_id in tqdm(paper_ids):
        params = {"db": "pubmed", "id": paper_id, "retmode": "xml"}
        try:
            # Without a timeout a single stalled request would block the loop.
            response = requests.get(base_url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"Error fetching abstract: {e}")
            abstracts.append("Error fetching abstract")
            continue

        if response.status_code == 200:
            # The raw XML is returned as-is; parsing is left to the caller.
            abstracts.append(response.text)
        else:
            abstracts.append("Error fetching abstract")

    return abstracts


def _format_pubmed_author(author):
    """Return a display name for an <Author> element, or None if it has none."""
    fore_name = author.findtext("ForeName")
    last_name = author.findtext("LastName")
    name_parts = [part for part in (fore_name, last_name) if part]
    if name_parts:
        return " ".join(name_parts)
    # Group authors carry a CollectiveName instead of personal names.
    return author.findtext("CollectiveName") or None


def parse_pubmed_xml(xml_data):
    """
    Parse XML data from PubMed

    Args:
        xml_data (str): XML data from PubMed

    Returns:
        List[dict]: one dictionary per <PubmedArticle> with the keys PMID,
        Title, Abstract and Authors. Articles lacking a PMID or title are
        reported with a message and skipped.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_data`` is not well-formed XML.
    """
    root = ET.fromstring(xml_data)
    articles = []

    for article in root.findall(".//PubmedArticle"):
        try:
            # Structured abstracts are split into several <AbstractText>
            # sections; keep all sections of the first <Abstract>.
            abstract_element = article.find(".//Abstract")
            if abstract_element is not None:
                sections = abstract_element.findall("AbstractText")
            else:
                first_section = article.find(".//AbstractText")
                sections = [] if first_section is None else [first_section]
            # itertext() also collects text nested inside inline markup,
            # which ``.text`` alone would cut off.
            section_texts = ["".join(section.itertext()).strip() for section in sections]
            section_texts = [text for text in section_texts if text]

            author_names = [
                _format_pubmed_author(author)
                for author in article.findall(".//Author")
            ]

            article_data = {
                "PMID": article.find(".//PMID").text,
                "Title": "".join(article.find(".//ArticleTitle").itertext()),
                "Abstract": " ".join(section_texts) if section_texts else "No abstract",
                # Authors without any usable name are left out instead of
                # causing the whole article to be dropped.
                "Authors": [name for name in author_names if name],
            }
            articles.append(article_data)
        except AttributeError as e:
            # Raised when a required element (PMID or ArticleTitle) is missing.
            print("cannot parse article:", e)

    return articles


# Search for papers related to brain damage
def get_json_data(search_diseases: list = []):
    """
    Collect parsed PubMed records for a list of search terms.

    For each term the PubMed search is run, every returned paper is fetched
    and parsed, and the first parsed article is tagged with the search term
    ("disease") and its PubMed URL ("link").

    Args:
        search_diseases (List[str], optional): terms to search on PubMed. An
            empty list (the default) selects a built-in set of five terms.

    Returns:
        List[dict]: the tagged article dictionaries for all terms, in search order.
    """
    if not len(search_diseases):
        # Rebinding (not mutating) keeps the shared default list untouched.
        search_diseases = [
            "brain damage",
            "bioinformatics",
            "cancer",
            "diabetes",
            "brain hemorrhage",
        ]

    collected = []
    for term in tqdm(search_diseases):
        hits = search_pubmed(term)
        if not (hits and "esearchresult" in hits):
            print("Error in searching PubMed")
            continue

        id_list = hits["esearchresult"]["idlist"]
        for raw_xml in get_abstracts_from_pubmed(id_list):
            try:
                record = parse_pubmed_xml(raw_xml)[0]
                record["disease"] = term
                record["link"] = f"https://pubmed.ncbi.nlm.nih.gov/{record['PMID']}"
            except Exception as err:
                print("cannot parse abstract:", err)
            else:
                collected.append(record)
    return collected


################ PARSE AND SAVE TO DATAFRAME ################
# Called without arguments, so get_json_data falls back to its built-in terms.
final_data = get_json_data()
# %%
df = pd.DataFrame(final_data)
# Shuffle the rows (no random_state is given, so the order differs between
# runs) and renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)

################ QUESTION ANSWERS GENERATION ################

# OpenAI client; the key is read from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def generate_questions(df, client):
    """Ask the chat model for five question/answer pairs per PubMed article.

    Args:
        df (DataFrame): Rows providing the 'disease', 'Title' and 'Abstract' columns.
        client: Object exposing ``chat.completions.create`` (an OpenAI client
            at the call site in this file).

    Returns:
        list: One entry per row, in iteration order: the JSON-decoded reply,
        or None when anything raised while handling that row.
    """
    system_message = {
        "role": "system",
        "content": "you are expert researcher in healthcare domain.",
    }
    collected = []
    for row_label, article in tqdm(df.iterrows()):
        try:
            prompt = f"""
            While researching for the {article['disease']} we came across the following research paper: {article['Title']}, 
            Abstract of the research paper is {article['Abstract']}
            Can you generate 5 questions and answers for the following research paper that researcher might ask?

            ---

            - Answers should be list of dictionaries (JSON mode). 
            - Answers should be as mentioned in research paper don't add additional knowledge. 
            - Questions should have high intellectual and information value don't ask childish questoins like what's the name of research paper and stuff. 
            - Answers should be only from available information in abstract of the paper provided.

            ---
            Answer:
            """
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[system_message, {"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
            )
            reply_text = completion.choices[0].message.content
            decoded_reply = json.loads(reply_text)
        except Exception as err:
            # Wait before moving on to the next row, then record the failure.
            time.sleep(30)
            print(f"Error in generating questions for index {row_label}: {err}")
            collected.append(None)
        else:
            collected.append(decoded_reply)
    return collected


answer = generate_questions(df.head(50), client)

# %%
# Each reply is searched in its Python repr (``str(ans)``), which is why the
# keys appear single-quoted in the patterns.  The patterns are raw strings so
# that ``\s`` reaches the regex engine instead of being an invalid string escape.
question_pattern = r"'question'\s*:\s*'([^']+)'"
double_quoted_question_pattern = r"'question'\s*:\s*\"([^\"]+)"
answer_pattern = r"'answer':\s*'([^']+)'"

df["QAs"] = None
for idx, ans in enumerate(answer):
    reply_text = str(ans)
    questions = re.findall(question_pattern, reply_text)
    if not questions:
        # Fall back to double-quoted question values.
        questions = re.findall(double_quoted_question_pattern, reply_text)
    answers = re.findall(answer_pattern, reply_text)

    # Plain comparison instead of ``assert``: asserts are stripped under -O.
    if len(questions) == len(answers):
        QAs = [
            {"Question": question, "Answer": reply}
            for question, reply in zip(questions, answers)
        ]
        df.at[idx, "QAs"] = str(QAs)
    else:
        print(f"Questions and Answers are not equal for index {idx}")
        print(f"Questions: {questions}")
        print(f"Answers: {answers}")
        df.at[idx, "QAs"] = None

# %%
df.to_csv("pubmed_abstracts.csv", index=False)
# %%
########### making DF to store into the gsheet ############
col_names = ["Study Title", "Question", "Answer", "link", "source"]
qa_rows = []
for i, row in df.iterrows():
    qa_text = row["QAs"]
    # Rows without parsed QAs hold None (or NaN after a CSV round trip).
    if not isinstance(qa_text, str):
        continue
    # The stored text derives from model output, so it is parsed with
    # ast.literal_eval, which only accepts Python literals, instead of eval.
    for j in ast.literal_eval(qa_text):
        qa_rows.append(
            {
                "Study Title": row["Title"],
                "Question": j["Question"],
                "Answer": j["Answer"],
                "link": row["link"],
                "source": row["disease"] + "_pubmed",
            }
        )
# Build the frame once instead of concatenating onto it for every QA pair.
store_df = pd.DataFrame(qa_rows, columns=col_names)
# %%
store_df.reset_index(drop=True).to_csv("pubmed_append_csv_finale.csv", index=False)

# %%


# 🩺 Clinical trials data storage and generation 

In [None]:


conditions_trial_list = [
    "heart attack",
    "Lung cancer",
    "Diabetes",
    "covid-19",
    "high blood pressure",
    "asthma",
]

link = "https://clinicaltrials.gov/api/rss?cond=heart+attack&dateField=StudyFirstPostDate"
# NOTE(review): neither `response` nor `conditions_trial_list` is read by the
# code below; the request is kept, with a timeout so it cannot block forever.
response = requests.get(link, timeout=10)

# %%
# Study title -> study page URL for the trials to build questions about.
map_dict = {
    "Exposure, Dose, Body Burden and Health Effects of Lead": "https://clinicaltrials.gov/study/NCT00013819?cond=Lead%20Poisoning&limit=10&rank=5",
    "Homeopathic Preparation Plumbum Metallicum for Lead Poisoning": "https://clinicaltrials.gov/study/NCT00931905?cond=Lead%20Poisoning&limit=10&rank=4",
    "The Combined Effect of 2,3-Dimercaptosuccinic Acid and Multi-Nutrients on Children in Lead Poisoning": "https://clinicaltrials.gov/study/NCT00374894?cond=Lead%20Poisoning&limit=10&rank=6",
}

question_rows = []
for i, link in map_dict.items():
    question_formats = [
        f"What is the purpose of the study {i}",
        f"How many participants was enrolled in {i}",
        f"What treatment was given to the participants in {i}",
        f"Where did the study {i} take place",
        f"What is the type of following study: {i}",
    ]

    for q in question_formats:
        # Answers are left empty in the exported sheet.
        question_rows.append(
            {
                "Study Title": i,
                "Question": q,
                "Answer": None,
                "link": link,
            }
        )

# Build the frame once instead of concatenating onto an empty frame per row.
fin_df = pd.DataFrame(
    question_rows, columns=["Study Title", "Question", "Answer", "link"]
)
fin_df.to_csv("questions.csv", index=False)

