## Downloading Data from PhysioNet

In [9]:
#for this step use your physionet account(userid and password)
!wget -r -N -c -np --user jashrajm --ask-password https://physionet.org/files/archehr-qa-bionlp-task-2025/1.1/

Password for user ‘jashrajm’: 
--2025-03-27 05:14:52--  https://physionet.org/files/archehr-qa-bionlp-task-2025/1.1/
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized
Authentication selected: Basic realm="PhysioNet", charset="UTF-8"
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘physionet.org/files/archehr-qa-bionlp-task-2025/1.1/index.html’

physionet.org/files     [ <=>                ]     721  --.-KB/s    in 0s      

Last-modified header missing -- time-stamps turned off.
2025-03-27 05:14:54 (180 MB/s) - ‘physionet.org/files/archehr-qa-bionlp-task-2025/1.1/index.html’ saved [721]

Loading robots.txt; please ignore errors.
--2025-03-27 05:14:54--  https://physionet.org/robots.txt
Reusing existing connection to physionet.org:443.
HTTP request sent, awaitin

## Importing Required Libraries

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import json

## Loading and Parsing XML File

In [None]:
# Location of the dev-split XML; the literal is split only to keep lines short.
xml_file = (
    "/content/physionet.org/files/"
    "archehr-qa-bionlp-task-2025/1.1/dev/archehr-qa.xml"
)
# Keep both the parsed tree and its root element available to later cells.
tree = ET.parse(xml_file)
root = tree.getroot()

## Extracting Data from XML

In [3]:
def _id_text_map(parent, path):
    """Map each matching element's ``id`` attribute to its stripped text.

    Elements without text are skipped; a missing ``id`` falls back to "N/A".
    """
    return {
        elem.get("id", "N/A"): elem.text.strip()
        for elem in parent.findall(path)
        if elem.text
    }


# One dict per <case> element.
data = []

for case in root.findall("case"):
    row_data = {"case_id": case.get("id", "N/A")}

    # findtext() returns the default when the child is missing and "" when the
    # child exists but is empty, so an empty element no longer raises
    # AttributeError from calling .strip() on None.
    for tag in ("patient_narrative", "clinician_question", "note_excerpt"):
        row_data[tag] = case.findtext(tag, default="N/A").strip()

    # Phrases inside <patient_question>, keyed by phrase id.
    phrases_dict = _id_text_map(case, "patient_question/phrase")
    row_data["patient_question"] = phrases_dict

    # Sentences inside <note_excerpt_sentences>, keyed by sentence id.
    sentences_dict = _id_text_map(case, "note_excerpt_sentences/sentence")
    row_data["note_excerpt_sentences"] = sentences_dict

    data.append(row_data)

## Creating a Pandas DataFrame

In [4]:
# One row per extracted case record; show the first three rows as a preview.
df = pd.DataFrame.from_records(data)
df.head(n=3)

Unnamed: 0,case_id,patient_narrative,clinician_question,note_excerpt,patient_question,note_excerpt_sentences
0,1,I had severe abdomen pain and was hospitalised...,Why was ERCP recommended to him over continuin...,Brief Hospital Course:\n\nDuring the ERCP a pa...,{'0': 'My question is if the sludge was there ...,"{'0': 'Brief Hospital Course:', '1': 'During t..."
1,2,I just wrote about my dad given multiple shots...,Why was he given lasix and his oxygen flow rat...,Brief Hospital Course:\n\nAcute diastolic hear...,{'0': 'dad given multiple shots of lasciks aft...,"{'0': 'Brief Hospital Course:', '1': 'Acute di..."
2,3,my son fell and lost conciousness for a couple...,What is the expected course of recovery for him?,Discharge Instructions:\nYou were admitted to ...,{'0': 'he is continously irritated and has hea...,{'0': 'Discharge Instructions: You were admitt...


## Loading JSON File for Sentence Relevance

In [None]:
# Path to the dev-split key file that is loaded into json_data.
json_file = "/content/physionet.org/files/archehr-qa-bionlp-task-2025/1.1/dev/archehr-qa_key.json"
# Read explicitly as UTF-8: open()'s default encoding depends on the platform
# locale and can mis-decode non-ASCII characters in the JSON file.
with open(json_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

## Processing JSON Data into a Dictionary

In [5]:
# Build {case_id: {relevance_label: [sentence_id, ...]}} from the loaded key data.
case_relevance_dict = {}
for item in json_data:
    # Store keys as strings so that lookups performed with str(case_id) still
    # match if the JSON encodes case ids as numbers.
    case_id = str(item["case_id"])
    relevance_dict = {"not-relevant": [], "supplementary": [], "essential": []}

    for answer in item["answers"]:
        sentence_id = answer["sentence_id"]
        relevance = answer["relevance"]
        # Labels other than the three known ones are ignored.
        if relevance in relevance_dict:
            relevance_dict[relevance].append(sentence_id)

    case_relevance_dict[case_id] = relevance_dict


## Function to Retrieve Relevance Labels

In [None]:
def get_relevance(case_id, relevance_type):
    """Return the sentence ids recorded for a case under one relevance label.

    Args:
        case_id: Case identifier; converted with ``str()`` before the lookup.
        relevance_type: Relevance label to fetch, e.g. "essential".

    Returns:
        A new list of sentence ids, or an empty list when the case or the
        label is not present in ``case_relevance_dict``.
    """
    labels_for_case = case_relevance_dict.get(str(case_id), {})
    # Copy so callers cannot mutate the lists stored in case_relevance_dict.
    return list(labels_for_case.get(relevance_type, []))

## Adding Relevance Labels to DataFrame

In [None]:
# Add one list-valued column per relevance label, looked up by each row's case_id.
for label in ("not-relevant", "supplementary", "essential"):
    df[label] = df["case_id"].apply(lambda cid: get_relevance(cid, label))

In [6]:
# Preview the first three rows of df.
df.head(3)

Unnamed: 0,case_id,patient_narrative,clinician_question,note_excerpt,patient_question,note_excerpt_sentences,not-relevant,supplementary,essential
0,1,I had severe abdomen pain and was hospitalised...,Why was ERCP recommended to him over continuin...,Brief Hospital Course:\n\nDuring the ERCP a pa...,{'0': 'My question is if the sludge was there ...,"{'0': 'Brief Hospital Course:', '1': 'During t...","[0, 2, 3, 4, 8]",[],"[1, 5, 6, 7]"
1,2,I just wrote about my dad given multiple shots...,Why was he given lasix and his oxygen flow rat...,Brief Hospital Course:\n\nAcute diastolic hear...,{'0': 'dad given multiple shots of lasciks aft...,"{'0': 'Brief Hospital Course:', '1': 'Acute di...","[0, 2, 3, 5, 6, 8, 9, 10]",[],"[1, 4, 7]"
2,3,my son fell and lost conciousness for a couple...,What is the expected course of recovery for him?,Discharge Instructions:\nYou were admitted to ...,{'0': 'he is continously irritated and has hea...,{'0': 'Discharge Instructions: You were admitt...,"[0, 1, 2, 3, 6]","[7, 8, 9]","[4, 5]"
